From 20e9fc55feb58dd1f766a494c530684011291ff3 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sun, 24 May 2020 22:19:22 -0700
Subject: [PATCH 001/770] [MCDwarf] Delete unneeded DW_AT_prototyped for
 DW_TAG_label

---
 llvm/lib/MC/MCDwarf.cpp        | 4 ----
 llvm/test/MC/MachO/gen-dwarf.s | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index 0b7fc45540189..71b8f0e28e1cd 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -846,7 +846,6 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
   EmitAbbrev(MCOS, dwarf::DW_AT_decl_file, dwarf::DW_FORM_data4);
   EmitAbbrev(MCOS, dwarf::DW_AT_decl_line, dwarf::DW_FORM_data4);
   EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
-  EmitAbbrev(MCOS, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag);
   EmitAbbrev(MCOS, 0, 0);
 
   // DW_TAG_unspecified_parameters DIE abbrev (3).
@@ -1087,9 +1086,6 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
                                              MCSymbolRefExpr::VK_None, context);
     MCOS->emitValue(AT_low_pc, AddrSize);
 
-    // DW_AT_prototyped, a one byte flag value of 0 saying we have no prototype.
-    MCOS->emitInt8(0);
-
     // The DW_TAG_unspecified_parameters DIE abbrev (3).
     MCOS->emitULEB128IntValue(3);
 
diff --git a/llvm/test/MC/MachO/gen-dwarf.s b/llvm/test/MC/MachO/gen-dwarf.s
index 6d39d278e8184..58f8a7ccf8994 100644
--- a/llvm/test/MC/MachO/gen-dwarf.s
+++ b/llvm/test/MC/MachO/gen-dwarf.s
@@ -30,7 +30,6 @@ _x:	.long 1
 // CHECK: 	DW_AT_decl_file	DW_FORM_data4
 // CHECK: 	DW_AT_decl_line	DW_FORM_data4
 // CHECK: 	DW_AT_low_pc	DW_FORM_addr
-// CHECK: 	DW_AT_prototyped	DW_FORM_flag
 
 // CHECK: [3] DW_TAG_unspecified_parameters	DW_CHILDREN_no
 
@@ -53,7 +52,6 @@ _x:	.long 1
 // CHECK:      DW_AT_decl_file ([[FILE:".*gen-dwarf.s"]])
 // CHECK:      DW_AT_decl_line (5)
 // CHECK:      DW_AT_low_pc (0x0000000000000000)
-// CHECK:      DW_AT_prototyped (0x00)
 
 // CHECK:      DW_TAG_unspecified_parameters
 
@@ -64,7 +62,6 @@ _x:	.long 1
 // CHECK:      DW_AT_decl_file ([[FILE]])
 // CHECK:      DW_AT_decl_line (9)
 // CHECK:      DW_AT_low_pc (0x0000000000000007)
-// CHECK:      DW_AT_prototyped (0x00)
 
 // CHECK:      DW_TAG_unspecified_parameters
 
@@ -75,7 +72,6 @@ _x:	.long 1
 // CHECK:      DW_AT_decl_file ([[FILE]])
 // CHECK:      DW_AT_decl_line (10)
 // CHECK:      DW_AT_low_pc (0x0000000000000007)
-// CHECK:      DW_AT_prototyped (0x00)
 
 // CHECK:      DW_TAG_unspecified_parameters
 

From 1b79509f97b6c9595027b53d3d67f174d0ae1c78 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sun, 24 May 2020 22:30:59 -0700
Subject: [PATCH 002/770] [MCDwarf] Delete unneeded
 DW_TAG_unspecified_parameters

---
 llvm/lib/MC/MCDwarf.cpp                        | 14 +-------------
 llvm/test/MC/ARM/dwarf-asm-multiple-sections.s |  4 ++--
 llvm/test/MC/MachO/gen-dwarf.s                 | 16 +---------------
 3 files changed, 4 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index 71b8f0e28e1cd..d75b55c6f8d26 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -841,19 +841,13 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
   // DW_TAG_label DIE abbrev (2).
   MCOS->emitULEB128IntValue(2);
   MCOS->emitULEB128IntValue(dwarf::DW_TAG_label);
-  MCOS->emitInt8(dwarf::DW_CHILDREN_yes);
+  MCOS->emitInt8(dwarf::DW_CHILDREN_no);
   EmitAbbrev(MCOS, dwarf::DW_AT_name, dwarf::DW_FORM_string);
   EmitAbbrev(MCOS, dwarf::DW_AT_decl_file, dwarf::DW_FORM_data4);
   EmitAbbrev(MCOS, dwarf::DW_AT_decl_line, dwarf::DW_FORM_data4);
   EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
   EmitAbbrev(MCOS, 0, 0);
 
-  // DW_TAG_unspecified_parameters DIE abbrev (3).
-  MCOS->emitULEB128IntValue(3);
-  MCOS->emitULEB128IntValue(dwarf::DW_TAG_unspecified_parameters);
-  MCOS->emitInt8(dwarf::DW_CHILDREN_no);
-  EmitAbbrev(MCOS, 0, 0);
-
   // Terminate the abbreviations for this compilation unit.
   MCOS->emitInt8(0);
 }
@@ -1085,12 +1079,6 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
     const MCExpr *AT_low_pc = MCSymbolRefExpr::create(Entry.getLabel(),
                                              MCSymbolRefExpr::VK_None, context);
     MCOS->emitValue(AT_low_pc, AddrSize);
-
-    // The DW_TAG_unspecified_parameters DIE abbrev (3).
-    MCOS->emitULEB128IntValue(3);
-
-    // Add the NULL DIE terminating the DW_TAG_unspecified_parameters DIE's.
-    MCOS->emitInt8(0);
   }
 
   // Add the NULL DIE terminating the Compile Unit DIE's.
diff --git a/llvm/test/MC/ARM/dwarf-asm-multiple-sections.s b/llvm/test/MC/ARM/dwarf-asm-multiple-sections.s
index ffcdfda397524..2f32681b36271 100644
--- a/llvm/test/MC/ARM/dwarf-asm-multiple-sections.s
+++ b/llvm/test/MC/ARM/dwarf-asm-multiple-sections.s
@@ -41,10 +41,10 @@ b:
 // DWARF4:  DW_AT_ranges [DW_FORM_sec_offset]      (0x00000000
 // DWARF5:  DW_AT_ranges [DW_FORM_sec_offset]      (0x0000000c
 
-// DWARF: 0x{{[0-9a-f]+}}:   DW_TAG_label [2] *
+// DWARF: 0x{{[0-9a-f]+}}:   DW_TAG_label [2]
 // DWARF-NEXT: DW_AT_name [DW_FORM_string]     ("a")
 
-// DWARF: 0x{{[0-9a-f]+}}:   DW_TAG_label [2] *
+// DWARF: 0x{{[0-9a-f]+}}:   DW_TAG_label [2]
 // DWARF-NEXT: DW_AT_name [DW_FORM_string]     ("b")
 
 
diff --git a/llvm/test/MC/MachO/gen-dwarf.s b/llvm/test/MC/MachO/gen-dwarf.s
index 58f8a7ccf8994..5bf6cac3428e8 100644
--- a/llvm/test/MC/MachO/gen-dwarf.s
+++ b/llvm/test/MC/MachO/gen-dwarf.s
@@ -25,14 +25,12 @@ _x:	.long 1
 // CHECK: 	DW_AT_producer	DW_FORM_string
 // CHECK: 	DW_AT_language	DW_FORM_data2
 
-// CHECK: [2] DW_TAG_label	DW_CHILDREN_yes
+// CHECK: [2] DW_TAG_label	DW_CHILDREN_no
 // CHECK: 	DW_AT_name	DW_FORM_string
 // CHECK: 	DW_AT_decl_file	DW_FORM_data4
 // CHECK: 	DW_AT_decl_line	DW_FORM_data4
 // CHECK: 	DW_AT_low_pc	DW_FORM_addr
 
-// CHECK: [3] DW_TAG_unspecified_parameters	DW_CHILDREN_no
-
 
 // CHECK: .debug_info contents:
 
@@ -53,30 +51,18 @@ _x:	.long 1
 // CHECK:      DW_AT_decl_line (5)
 // CHECK:      DW_AT_low_pc (0x0000000000000000)
 
-// CHECK:      DW_TAG_unspecified_parameters
-
-// CHECK:      NULL
-
 // CHECK:    DW_TAG_label
 // CHECK:      DW_AT_name ("foo")
 // CHECK:      DW_AT_decl_file ([[FILE]])
 // CHECK:      DW_AT_decl_line (9)
 // CHECK:      DW_AT_low_pc (0x0000000000000007)
 
-// CHECK:      DW_TAG_unspecified_parameters
-
-// CHECK:      NULL
-
 // CHECK:    DW_TAG_label
 // CHECK:      DW_AT_name ("baz")
 // CHECK:      DW_AT_decl_file ([[FILE]])
 // CHECK:      DW_AT_decl_line (10)
 // CHECK:      DW_AT_low_pc (0x0000000000000007)
 
-// CHECK:      DW_TAG_unspecified_parameters
-
-// CHECK:      NULL
-
 // CHECK:    NULL
 
 // CHECK: .debug_aranges contents:

From 760f45eacadbabf9634fb81d7ccaa16c269cf19e Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias@hieta.se>
Date: Mon, 25 May 2020 10:28:17 +0300
Subject: [PATCH 003/770] [CMake] Properly handle the LTO cache arguments for
 MinGW

We want to make sure that LINKER_IS_LLD_LINK is properly set - in
this case it shouldn't be set when building for MinGW.

Then we want to make the test for it correct and finally include
the option to build with thinlto cache since the MinGW driver now
supports that.

Differential Revision: https://reviews.llvm.org/D80493
---
 llvm/cmake/modules/HandleLLVMOptions.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index b50100f4d63ad..d5c924ca1a2c8 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -13,7 +13,7 @@ include(CheckCXXCompilerFlag)
 include(CheckSymbolExists)
 include(CMakeDependentOption)
 
-if(CMAKE_LINKER MATCHES "lld-link" OR (WIN32 AND LLVM_USE_LINKER STREQUAL "lld") OR LLVM_ENABLE_LLD)
+if(CMAKE_LINKER MATCHES "lld-link" OR (MSVC AND (LLVM_USE_LINKER STREQUAL "lld" OR LLVM_ENABLE_LLD)))
   set(LINKER_IS_LLD_LINK TRUE)
 else()
   set(LINKER_IS_LLD_LINK FALSE)
@@ -941,7 +941,7 @@ if (LLVM_BUILD_INSTRUMENTED AND LLVM_BUILD_INSTRUMENTED_COVERAGE)
   message(FATAL_ERROR "LLVM_BUILD_INSTRUMENTED and LLVM_BUILD_INSTRUMENTED_COVERAGE cannot both be specified")
 endif()
 
-if(LLVM_ENABLE_LTO AND LLVM_ON_WIN32 AND NOT LINKER_IS_LLD_LINK)
+if(LLVM_ENABLE_LTO AND LLVM_ON_WIN32 AND NOT LINKER_IS_LLD_LINK AND NOT MINGW)
   message(FATAL_ERROR "When compiling for Windows, LLVM_ENABLE_LTO requires using lld as the linker (point CMAKE_LINKER at lld-link.exe)")
 endif()
 if(uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
@@ -956,7 +956,7 @@ if(uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
   if(APPLE)
     append("-Wl,-cache_path_lto,${PROJECT_BINARY_DIR}/lto.cache"
            CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
-  elseif(UNIX AND LLVM_USE_LINKER STREQUAL "lld")
+  elseif((UNIX OR MINGW) AND LLVM_USE_LINKER STREQUAL "lld")
     append("-Wl,--thinlto-cache-dir=${PROJECT_BINARY_DIR}/lto.cache"
            CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
   elseif(LLVM_USE_LINKER STREQUAL "gold")

From 5b7ff6f07ffbcbcfad24f39faad5858cc379fad0 Mon Sep 17 00:00:00 2001
From: "Kazushi (Jam) Marukawa" <marukawa@nec.com>
Date: Mon, 25 May 2020 09:48:51 +0200
Subject: [PATCH 004/770] [VE][NFC] Correct sjlj_exception test

Summary: '|&' works with bash only, so it should not be used in regression
tests.

Differential Revision: https://reviews.llvm.org/D80501
---
 llvm/test/CodeGen/VE/sjlj_except.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/VE/sjlj_except.ll b/llvm/test/CodeGen/VE/sjlj_except.ll
index 582ee6eb1e6a8..4d2558571bf4f 100644
--- a/llvm/test/CodeGen/VE/sjlj_except.ll
+++ b/llvm/test/CodeGen/VE/sjlj_except.ll
@@ -1,5 +1,5 @@
-; RUN: llc  -mtriple=x86_64-unknown-unknown --exception-model=sjlj --print-after=sjljehprepare < %s |& FileCheck --check-prefix=CHECK-X86 %s
-; RUN: (llc  -mtriple=ve-unknown-unknown --exception-model=sjlj  --print-after=sjljehprepare < %s || true) |& FileCheck --check-prefix=CHECK-VE %s
+; RUN: llc  -mtriple=x86_64-unknown-unknown --exception-model=sjlj --print-after=sjljehprepare < %s 2>&1 | FileCheck --check-prefix=CHECK-X86 %s
+; RUN: (llc  -mtriple=ve-unknown-unknown --exception-model=sjlj  --print-after=sjljehprepare < %s || true) 2>&1 | FileCheck --check-prefix=CHECK-VE %s
 
 @SomeGlobal = external dso_local global i8
 

From b752a2743ab0d24d8da5d97c07fbdb996df78b1f Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Mon, 25 May 2020 10:19:34 +0200
Subject: [PATCH 005/770] [clangd] Log use of heuristic go-to-def. NFC

Generally:
 - found results using this method -> log
 - no results using this method -> vlog
 - method wasn't applied because ineligible -> no log
---
 clang-tools-extra/clangd/XRefs.cpp | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp
index 1d82763b6a3cf..1fc0e0348d093 100644
--- a/clang-tools-extra/clangd/XRefs.cpp
+++ b/clang-tools-extra/clangd/XRefs.cpp
@@ -438,8 +438,11 @@ locateSymbolTextually(const SpelledWord &Word, ParsedAST &AST,
     ScoredResults.push_back({Score, std::move(Located)});
   });
 
-  if (TooMany)
+  if (TooMany) {
+    vlog("Heuristic index lookup for {0} returned too many candidates, ignored",
+         Word.Text);
     return {};
+  }
 
   llvm::sort(ScoredResults,
              [](const ScoredLocatedSymbol &A, const ScoredLocatedSymbol &B) {
@@ -448,6 +451,10 @@ locateSymbolTextually(const SpelledWord &Word, ParsedAST &AST,
   std::vector<LocatedSymbol> Results;
   for (auto &Res : std::move(ScoredResults))
     Results.push_back(std::move(Res.second));
+  if (Results.empty())
+    vlog("No heuristic index definition for {0}", Word.Text);
+  else
+    log("Found definition heuristically in index for {0}", Word.Text);
   return Results;
 }
 
@@ -570,13 +577,22 @@ std::vector<LocatedSymbol> locateSymbolAt(ParsedAST &AST, Position Pos,
     // Is the same word nearby a real identifier that might refer to something?
     if (const syntax::Token *NearbyIdent =
             findNearbyIdentifier(*Word, AST.getTokens())) {
-      if (auto Macro = locateMacroReferent(*NearbyIdent, AST, *MainFilePath))
+      if (auto Macro = locateMacroReferent(*NearbyIdent, AST, *MainFilePath)) {
+        log("Found macro definition heuristically using nearby identifier {0}",
+            Word->Text);
         return {*std::move(Macro)};
+      }
       ASTResults =
           locateASTReferent(NearbyIdent->location(), NearbyIdent, AST,
                             *MainFilePath, Index, /*NodeKind=*/nullptr);
-      if (!ASTResults.empty())
+      if (!ASTResults.empty()) {
+        log("Found definition heuristically using nearby identifier {0}",
+            NearbyIdent->text(SM));
         return ASTResults;
+      } else {
+        vlog("No definition found using nearby identifier {0} at {1}",
+             Word->Text, Word->Location.printToString(SM));
+      }
     }
     // No nearby word, or it didn't refer to anything either. Try the index.
     auto TextualResults =

From 3895148d7cd8ff76220f8f8209ec06369a8e816f Mon Sep 17 00:00:00 2001
From: Joachim Protze <protze@itc.rwth-aachen.de>
Date: Mon, 25 May 2020 10:19:35 +0200
Subject: [PATCH 006/770] [OpenMP] Fix a race in task queue reallocation

__kmp_realloc_task_deque implicitly assumes that the task queue is full
(ntasks == size), therefore tail = size in line 319.
An assertion is added to document this assumption.

The first check for a full queue is before the locking and might not hold
when the lock is taken. So, we need to check again for this condition when
we have the lock.

Reviewed By: AndreyChurbanov

Differential Revision: https://reviews.llvm.org/D80480
---
 openmp/runtime/src/kmp_tasking.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index c928517410608..6e584731a85fe 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -298,6 +298,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
 static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                      kmp_thread_data_t *thread_data) {
   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
+  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
   kmp_int32 new_size = 2 * size;
 
   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
@@ -381,8 +382,11 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
     } else {
       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
       locked = 1;
-      // expand deque to push the task which is not allowed to execute
-      __kmp_realloc_task_deque(thread, thread_data);
+      if (TCR_4(thread_data->td.td_deque_ntasks) >=
+          TASK_DEQUE_SIZE(thread_data->td)) {
+        // expand deque to push the task which is not allowed to execute
+        __kmp_realloc_task_deque(thread, thread_data);
+      }
     }
   }
   // Lock the deque for the task push operation
@@ -3659,7 +3663,11 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
       return result;
 
     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
-    __kmp_realloc_task_deque(thread, thread_data);
+    if (TCR_4(thread_data->td.td_deque_ntasks) >=
+        TASK_DEQUE_SIZE(thread_data->td)) {
+      // expand deque to push the task which is not allowed to execute
+      __kmp_realloc_task_deque(thread, thread_data);
+    }
 
   } else {
 

From 840450549c9199150cbdee29acef756c19660ca1 Mon Sep 17 00:00:00 2001
From: Ayal Zaks <ayal.zaks@intel.com>
Date: Sun, 24 May 2020 16:22:39 +0300
Subject: [PATCH 007/770] [LV] Clamp MaxVF to power of 2.

If a loop has a constant trip count known to be a multiple of MaxVF (times user
UF), LV infers that no tail will be generated for any chosen VF. This relies on
the chosen VFs being powers of 2 bounded by MaxVF, and assumes MaxVF is a power
of 2. Make sure the latter holds, in particular when MaxVF is set by a memory
dependence distance which may not be a power of 2.

Differential Revision: https://reviews.llvm.org/D80491
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +
 .../LoopVectorize/memdep-fold-tail.ll         | 108 ++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6c2a3e42de48c..df1529a2f7b9e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5003,6 +5003,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
   }
 
   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
+  assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
   if (TC > 0 && TC % MaxVFtimesIC == 0) {
     // Accept MaxVF if we do not have a tail.
@@ -5051,6 +5052,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
 
   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
 
+  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
+  WidestRegister = PowerOf2Floor(WidestRegister);
+
   unsigned MaxVectorSize = WidestRegister / WidestType;
 
   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
diff --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
new file mode 100644
index 0000000000000..4fe0d12253506
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilog -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Vectorization with dependence checks.
+
+; Check that a non-power-of-2 MaxVF, calculated based on maximum safe distance,
+; does not lead fold-tail to think that no tail will be generated for any chosen
+; (power of 2) VF.
+; Dependence distance here is 3 iterations.
+; Tiny trip count of 15 is divisible by 3, but any (even) VF will have a tail.
+
+;unsigned char a [15+3];
+;void maxvf3(){
+;  for (int j = 0; j < 15; ++j) {
+;    a[j] = 69;
+;    a[j+3] = 7;
+;  }
+;}
+
+@a = common local_unnamed_addr global [18 x i8] zeroinitializer, align 16
+
+define void @maxvf3() {
+; CHECK-LABEL: @maxvf3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 14, i32 14>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP2]]
+; CHECK-NEXT:    store i8 69, i8* [[TMP3]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP5]]
+; CHECK-NEXT:    store i8 69, i8* [[TMP6]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw <2 x i32> <i32 3, i32 3>, [[VEC_IND]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP9]]
+; CHECK-NEXT:    store i8 7, i8* [[TMP10]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP12]]
+; CHECK-NEXT:    store i8 7, i8* [[TMP13]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[AJ:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[J]]
+; CHECK-NEXT:    store i8 69, i8* [[AJ]], align 8
+; CHECK-NEXT:    [[JP3:%.*]] = add nuw nsw i32 3, [[J]]
+; CHECK-NEXT:    [[AJP3:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[JP3]]
+; CHECK-NEXT:    store i8 7, i8* [[AJP3]], align 8
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i32 [[J]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
+  %aj = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 %j
+  store i8 69, i8* %aj, align 8
+  %jp3 = add nuw nsw i32 3, %j
+  %ajp3 = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 %jp3
+  store i8 7, i8* %ajp3, align 8
+  %j.next = add nuw nsw i32 %j, 1
+  %exitcond = icmp eq i32 %j.next, 15
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}

From 447ea9b4f5f562c8fab7d11ecbb10ecd33155d5b Mon Sep 17 00:00:00 2001
From: mydeveloperday <mydeveloperday@gmail.com>
Date: Wed, 20 May 2020 07:22:01 +0100
Subject: [PATCH 008/770] [AST] default implementation is possible for
 non-member functions in C++20.

Summary:
Make RAV not visit the default function decl by default.
Also update some stale comments on FunctionDecl::isDefaulted.

Fixes https://github.com/clangd/clangd/issues/383

Reviewers: sammccall, rsmith

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80288
---
 clang/include/clang/AST/Decl.h                 | 10 ++++------
 clang/include/clang/AST/RecursiveASTVisitor.h  | 10 +++++-----
 .../RecursiveASTVisitorTests/CXXMethodDecl.cpp | 18 ++++++++++++++++++
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index d7136a4cd420b..2e1630827cce3 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -2125,19 +2125,17 @@ class FunctionDecl : public DeclaratorDecl,
   bool isTrivialForCall() const { return FunctionDeclBits.IsTrivialForCall; }
   void setTrivialForCall(bool IT) { FunctionDeclBits.IsTrivialForCall = IT; }
 
-  /// Whether this function is defaulted per C++0x. Only valid for
-  /// special member functions.
+  /// Whether this function is defaulted. Valid for e.g.
+  /// special member functions, defaulted comparisons (not methods!).
   bool isDefaulted() const { return FunctionDeclBits.IsDefaulted; }
   void setDefaulted(bool D = true) { FunctionDeclBits.IsDefaulted = D; }
 
-  /// Whether this function is explicitly defaulted per C++0x. Only valid
-  /// for special member functions.
+  /// Whether this function is explicitly defaulted.
   bool isExplicitlyDefaulted() const {
     return FunctionDeclBits.IsExplicitlyDefaulted;
   }
 
-  /// State that this function is explicitly defaulted per C++0x. Only valid
-  /// for special member functions.
+  /// State that this function is explicitly defaulted.
   void setExplicitlyDefaulted(bool ED = true) {
     FunctionDeclBits.IsExplicitlyDefaulted = ED;
   }
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index a264d1cf24b23..b30d456bd24a8 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2103,11 +2103,11 @@ bool RecursiveASTVisitor<Derived>::TraverseFunctionHelper(FunctionDecl *D) {
     }
   }
 
-  bool VisitBody = D->isThisDeclarationADefinition();
-  // If a method is set to default outside the class definition the compiler
-  // generates the method body and adds it to the AST.
-  if (const auto *MD = dyn_cast<CXXMethodDecl>(D))
-    VisitBody &= !MD->isDefaulted() || getDerived().shouldVisitImplicitCode();
+  bool VisitBody =
+      D->isThisDeclarationADefinition() &&
+      // Don't visit the function body if the function definition is generated
+      // by clang.
+      (!D->isDefaulted() || getDerived().shouldVisitImplicitCode());
 
   if (VisitBody) {
     TRY_TO(TraverseStmt(D->getBody())); // Function body.
diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/CXXMethodDecl.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/CXXMethodDecl.cpp
index 6441ea99dd2cc..90fa84bd44812 100644
--- a/clang/unittests/Tooling/RecursiveASTVisitorTests/CXXMethodDecl.cpp
+++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/CXXMethodDecl.cpp
@@ -55,4 +55,22 @@ TEST(RecursiveASTVisitor, CXXMethodDeclNoDefaultBodyVisited) {
     EXPECT_TRUE(Visitor.runOver(Code, CXXMethodDeclVisitor::Lang_CXX11));
   }
 }
+
+TEST(RecursiveASTVisitor, FunctionDeclNoDefaultBodyVisited) {
+  for (bool VisitImplCode : {false, true}) {
+    CXXMethodDeclVisitor Visitor(VisitImplCode);
+    if (VisitImplCode)
+      Visitor.ExpectMatch("declref", 4, 58, /*Times=*/2);
+    else
+      Visitor.DisallowMatch("declref", 4, 58);
+    llvm::StringRef Code = R"cpp(
+      struct s {
+        int x;
+        friend auto operator==(s a, s b) -> bool = default;
+      };
+      bool k = s() == s(); // make sure clang generates the "==" definition.
+    )cpp";
+    EXPECT_TRUE(Visitor.runOver(Code, CXXMethodDeclVisitor::Lang_CXX2a));
+  }
+}
 } // end anonymous namespace

From 72c5ea1d73bb89af6f82c14ddb0b7f4c2510bab7 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Tue, 5 May 2020 17:11:30 +0200
Subject: [PATCH 009/770] [clangd] Enable cross-file-rename by default.

Summary:
The cross-file rename feature is stable enough to enable it (has been
rolled out internally for a few weeks).

Reviewers: sammccall

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80507
---
 clang-tools-extra/clangd/tool/ClangdMain.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index 031f57f954cb2..cab6c97cf121e 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -274,11 +274,8 @@ list<std::string> TweakList{
 opt<bool> CrossFileRename{
     "cross-file-rename",
     cat(Features),
-    desc("Enable cross-file rename feature. Note that this feature is "
-         "experimental and may lead to broken code or incomplete rename "
-         "results"),
-    init(false),
-    Hidden,
+    desc("Enable cross-file rename feature."),
+    init(true),
 };
 
 opt<bool> RecoveryAST{

From 83bd2c4a06803fa9af7f92a474b1d37cb70397cc Mon Sep 17 00:00:00 2001
From: Jaroslav Sevcik <jarin@google.com>
Date: Mon, 25 May 2020 11:17:48 +0200
Subject: [PATCH 010/770] Prevent GetNumChildren from transitively walking
 pointer chains

Summary:

This is an attempt to fix https://bugs.llvm.org/show_bug.cgi?id=45988,
where SBValue::GetNumChildren returns 2, but SBValue::GetChildAtIndex(1) returns
an invalid value sentinel.

The root cause of this seems to be that GetNumChildren can return the number of
children of a wrong value. In particular, for pointers GetNumChildren just
recursively calls itself on the pointee type, so it effectively walks chains of
pointers. This is different from the logic of GetChildAtIndex, which only
recurses if pointee.IsAggregateType() returns true (IsAggregateType is false for
pointers and references), so it never follows chains of pointers.

This patch aims to make GetNumChildren (more) consistent with GetChildAtIndex by
only recursively calling GetNumChildren for aggregate types.

Ideally, GetNumChildren and GetChildAtIndex would share the code that decides
which pointers/references are followed, but that would be a more invasive change.

Reviewers: teemperor, jingham, clayborg

Reviewed By: teemperor, clayborg

Subscribers: clayborg, labath, shafik, lldb-commits

Tags: #lldb

Differential Revision: https://reviews.llvm.org/D80254
---
 .../TypeSystem/Clang/TypeSystemClang.cpp      | 34 +++++++------------
 .../pointer_num_children/Makefile             |  3 ++
 .../TestPointerNumChildren.py                 | 28 +++++++++++++++
 .../pointer_num_children/main.cpp             | 16 +++++++++
 4 files changed, 60 insertions(+), 21 deletions(-)
 create mode 100644 lldb/test/API/functionalities/pointer_num_children/Makefile
 create mode 100644 lldb/test/API/functionalities/pointer_num_children/TestPointerNumChildren.py
 create mode 100644 lldb/test/API/functionalities/pointer_num_children/main.cpp

diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 6e8946e23104f..c687251ed5dcb 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5172,12 +5172,15 @@ uint32_t TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
     }
     break;
 
+  case clang::Type::LValueReference:
+  case clang::Type::RValueReference:
   case clang::Type::ObjCObjectPointer: {
-    const clang::ObjCObjectPointerType *pointer_type =
-        llvm::cast<clang::ObjCObjectPointerType>(qual_type.getTypePtr());
-    clang::QualType pointee_type = pointer_type->getPointeeType();
-    uint32_t num_pointee_children =
-        GetType(pointee_type).GetNumChildren(omit_empty_base_classes, exe_ctx);
+    CompilerType pointee_clang_type(GetPointeeType(type));
+
+    uint32_t num_pointee_children = 0;
+    if (pointee_clang_type.IsAggregateType())
+      num_pointee_children =
+          pointee_clang_type.GetNumChildren(omit_empty_base_classes, exe_ctx);
     // If this type points to a simple type, then it has 1 child
     if (num_pointee_children == 0)
       num_children = 1;
@@ -5209,8 +5212,11 @@ uint32_t TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
     const clang::PointerType *pointer_type =
         llvm::cast<clang::PointerType>(qual_type.getTypePtr());
     clang::QualType pointee_type(pointer_type->getPointeeType());
-    uint32_t num_pointee_children =
-        GetType(pointee_type).GetNumChildren(omit_empty_base_classes, exe_ctx);
+    CompilerType pointee_clang_type(GetType(pointee_type));
+    uint32_t num_pointee_children = 0;
+    if (pointee_clang_type.IsAggregateType())
+      num_pointee_children =
+          pointee_clang_type.GetNumChildren(omit_empty_base_classes, exe_ctx);
     if (num_pointee_children == 0) {
       // We have a pointer to a pointee type that claims it has no children. We
       // will want to look at
@@ -5219,20 +5225,6 @@ uint32_t TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
       num_children = num_pointee_children;
   } break;
 
-  case clang::Type::LValueReference:
-  case clang::Type::RValueReference: {
-    const clang::ReferenceType *reference_type =
-        llvm::cast<clang::ReferenceType>(qual_type.getTypePtr());
-    clang::QualType pointee_type = reference_type->getPointeeType();
-    uint32_t num_pointee_children =
-        GetType(pointee_type).GetNumChildren(omit_empty_base_classes, exe_ctx);
-    // If this type points to a simple type, then it has 1 child
-    if (num_pointee_children == 0)
-      num_children = 1;
-    else
-      num_children = num_pointee_children;
-  } break;
-
   default:
     break;
   }
diff --git a/lldb/test/API/functionalities/pointer_num_children/Makefile b/lldb/test/API/functionalities/pointer_num_children/Makefile
new file mode 100644
index 0000000000000..99998b20bcb05
--- /dev/null
+++ b/lldb/test/API/functionalities/pointer_num_children/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/pointer_num_children/TestPointerNumChildren.py b/lldb/test/API/functionalities/pointer_num_children/TestPointerNumChildren.py
new file mode 100644
index 0000000000000..aaeaef75810cb
--- /dev/null
+++ b/lldb/test/API/functionalities/pointer_num_children/TestPointerNumChildren.py
@@ -0,0 +1,28 @@
+"""
+Test children counts of pointer values.
+"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestPointerNumChilden(TestBase):
+    mydir = TestBase.compute_mydir(__file__)
+
+    def test_pointer_num_children(self):
+        self.build()
+        lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp"))
+
+        result = self.frame().FindVariable("Ref")
+        self.assertEqual(1, result.GetNumChildren())
+        self.assertEqual(2, result.GetChildAtIndex(0).GetNumChildren())
+        self.assertEqual("42", result.GetChildAtIndex(0).GetChildAtIndex(0).GetValue())
+        self.assertEqual("56", result.GetChildAtIndex(0).GetChildAtIndex(1).GetValue())
+
+        result = self.frame().FindVariable("Ptr")
+        self.assertEqual(1, result.GetNumChildren())
+        self.assertEqual(2, result.GetChildAtIndex(0).GetNumChildren())
+        self.assertEqual("42", result.GetChildAtIndex(0).GetChildAtIndex(0).GetValue())
+        self.assertEqual("56", result.GetChildAtIndex(0).GetChildAtIndex(1).GetValue())
diff --git a/lldb/test/API/functionalities/pointer_num_children/main.cpp b/lldb/test/API/functionalities/pointer_num_children/main.cpp
new file mode 100644
index 0000000000000..a17182092a676
--- /dev/null
+++ b/lldb/test/API/functionalities/pointer_num_children/main.cpp
@@ -0,0 +1,16 @@
+struct Inner {
+  int a;
+  int b;
+};
+
+struct Outer {
+  Inner *inner;
+};
+
+int main() {
+  Inner inner{42, 56};
+  Outer outer{&inner};
+  Inner **Ptr = &(outer.inner);
+  Inner *&Ref = outer.inner;
+  return 0; // break here
+}

From fe22e5689e94370b8eadef4b7267201cc9fcb2e3 Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Mon, 25 May 2020 12:14:21 +0200
Subject: [PATCH 011/770] [lldb][NFC] Pass DeclarationName to NameSearchContext
 by value

DeclarationName is usually passed around by value as it's just a pointer.
---
 .../source/Plugins/ExpressionParser/Clang/NameSearchContext.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/NameSearchContext.h b/lldb/source/Plugins/ExpressionParser/Clang/NameSearchContext.h
index 52d2a19a404b3..dc8621dd6aba5 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/NameSearchContext.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/NameSearchContext.h
@@ -32,7 +32,7 @@ struct NameSearchContext {
   /// modules.
   ClangASTImporter::NamespaceMapSP m_namespace_map;
   /// The name being looked for.
-  const clang::DeclarationName &m_decl_name;
+  const clang::DeclarationName m_decl_name;
   /// The DeclContext to put declarations into.
   const clang::DeclContext *m_decl_context;
   /// All the types of functions that have been reported, so we don't
@@ -63,7 +63,7 @@ struct NameSearchContext {
   ///     The DeclContext to register Decls in.
   NameSearchContext(TypeSystemClang &clang_ts,
                     llvm::SmallVectorImpl<clang::NamedDecl *> &decls,
-                    clang::DeclarationName &name, const clang::DeclContext *dc)
+                    clang::DeclarationName name, const clang::DeclContext *dc)
       : m_clang_ts(clang_ts), m_decls(decls),
         m_namespace_map(std::make_shared<ClangASTImporter::NamespaceMap>()),
         m_decl_name(name), m_decl_context(dc) {

From b087b91c917087bc53d47282a16ee4af78bfe286 Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky <dmitry.preobrazhensky@amd.com>
Date: Mon, 25 May 2020 14:19:22 +0300
Subject: [PATCH 012/770] [AMDGPU][CODEGEN] Added 'A' constraint for inline
 assembler

Summary: 'A' constraint requires an immediate int or fp constant that can be inlined in an instruction encoding.

Reviewers: arsenm, rampitec

Differential Revision: https://reviews.llvm.org/D78494
---
 llvm/docs/LangRef.rst                         |   2 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  13 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  58 ++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   7 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  10 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   3 +
 .../test/CodeGen/AMDGPU/inline-constraints.ll | 277 +++++++++++++++++-
 7 files changed, 363 insertions(+), 7 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a1f3297d6454f..bf0627e441960 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4128,7 +4128,7 @@ AMDGPU:
 - ``[0-9]v``: The 32-bit VGPR register, number 0-9.
 - ``[0-9]s``: The 32-bit SGPR register, number 0-9.
 - ``[0-9]a``: The 32-bit AGPR register, number 0-9.
-
+- ``A``: An integer or a floating-point inline constant.
 
 All ARM modes:
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 36cc0ea20052e..81676d63643df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1339,7 +1339,18 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
     AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                        *MF->getSubtarget().getRegisterInfo());
     return false;
+  } else if (MO.isImm()) {
+    // llvm::format forwards to snprintf: each PRIx width must match its cast.
+    int64_t Val = MO.getImm();
+    if (AMDGPU::isInlinableIntLiteral(Val))
+      O << Val;
+    else if (isUInt<16>(Val))
+      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
+    else if (isUInt<32>(Val))
+      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
+    else
+      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
+    return false;
   }
-
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3ddf4ae70397d..2c147fa8947c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10886,11 +10886,69 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
     case 'v':
     case 'a':
       return C_RegisterClass;
+    case 'A':
+      return C_Other;
     }
   }
   return TargetLowering::getConstraintType(Constraint);
 }
 
+void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                    std::string &Constraint,
+                                                    std::vector<SDValue> &Ops,
+                                                    SelectionDAG &DAG) const {
+  if (Constraint.length() == 1 && Constraint[0] == 'A') {
+    LowerAsmOperandForConstraintA(Op, Ops, DAG);
+  } else {
+    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+  }
+}
+
+void SITargetLowering::LowerAsmOperandForConstraintA(SDValue Op,
+                                                     std::vector<SDValue> &Ops,
+                                                     SelectionDAG &DAG) const {
+  unsigned Size = Op.getScalarValueSizeInBits();
+  if (Size > 64)
+    return;
+
+  uint64_t Val;
+  bool IsConst = false;
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    Val = C->getSExtValue();
+    IsConst = true;
+  } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
+    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
+    IsConst = true;
+  } else if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
+    if (Size != 16 || Op.getNumOperands() != 2)
+      return;
+    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
+      return;
+    if (ConstantSDNode *C = V->getConstantSplatNode()) {
+      Val = C->getSExtValue();
+      IsConst = true;
+    } else if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
+      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
+      IsConst = true;
+    }
+  }
+
+  if (IsConst) {
+    bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
+    if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
+        (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
+        (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
+      // Clear unused bits of fp constants
+      if (!AMDGPU::isInlinableIntLiteral(Val)) {
+        unsigned UnusedBits = 64 - Size;
+        Val = (Val << UnusedBits) >> UnusedBits;
+      }
+      auto Res = DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64);
+      Ops.push_back(Res);
+    }
+  }
+}
+
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 226003423889c..7ef11eba4f9ce 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -383,6 +383,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
   ConstraintType getConstraintType(StringRef Constraint) const override;
+  void LowerAsmOperandForConstraint(SDValue Op,
+                                    std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+  void LowerAsmOperandForConstraintA(SDValue Op,
+                                     std::vector<SDValue> &Ops,
+                                     SelectionDAG &DAG) const;
   SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
                    SDValue V) const;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 409bef0065e29..cba9857e4d158 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1169,8 +1169,12 @@ unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
   return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
 }
 
+bool isInlinableIntLiteral(int64_t Literal) {
+  return Literal >= -16 && Literal <= 64;
+}
+
 bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
-  if (Literal >= -16 && Literal <= 64)
+  if (isInlinableIntLiteral(Literal))
     return true;
 
   uint64_t Val = static_cast<uint64_t>(Literal);
@@ -1187,7 +1191,7 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
 }
 
 bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
-  if (Literal >= -16 && Literal <= 64)
+  if (isInlinableIntLiteral(Literal))
     return true;
 
   // The actual type of the operand does not seem to matter as long
@@ -1216,7 +1220,7 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
   if (!HasInv2Pi)
     return false;
 
-  if (Literal >= -16 && Literal <= 64)
+  if (isInlinableIntLiteral(Literal))
     return true;
 
   uint16_t Val = static_cast<uint16_t>(Literal);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ef7b224138841..224f797b3ef84 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -776,6 +776,9 @@ struct SIModeRegisterDefaults {
   }
 };
 
+LLVM_READNONE
+bool isInlinableIntLiteral(int64_t Literal);
+
 } // end namespace AMDGPU
 } // end namespace llvm
 
diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index 6f1d35519f2fa..63585ebc9553f 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -1,5 +1,8 @@
-; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+; RUN: not llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+; RUN: not llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=VI %s
+
+; RUN: not llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs 2>&1 | FileCheck --check-prefix=NOGCN --check-prefix=NOSI %s
+; RUN: not llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs 2>&1 | FileCheck --check-prefix=NOGCN %s
 
 ; GCN-LABEL: {{^}}inline_reg_constraints:
 ; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
@@ -74,3 +77,273 @@ define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
   tail call void asm sideeffect "; use $0", "s"(double 1.0)
   ret void
 }
+
+;==============================================================================
+; 'A' constraint, 16-bit operand
+;==============================================================================
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_H0:
+; VI: v_mov_b32 {{v[0-9]+}}, 64
+define i32 @inline_A_constraint_H0() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 64)
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_H1:
+; VI: v_mov_b32 {{v[0-9]+}}, -16
+define i32 @inline_A_constraint_H1() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 -16)
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_H2:
+; VI: v_mov_b32 {{v[0-9]+}}, 0x3c00
+define i32 @inline_A_constraint_H2() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 1.0 to i16))
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_H3:
+; VI: v_mov_b32 {{v[0-9]+}}, 0xbc00
+define i32 @inline_A_constraint_H3() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half -1.0 to i16))
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_H4:
+; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
+define i32 @inline_A_constraint_H4() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half 0xH3118)
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_H5:
+; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
+define i32 @inline_A_constraint_H5() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 0xH3118 to i16))
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_H6:
+; VI: v_mov_b32 {{v[0-9]+}}, 0xb800
+define i32 @inline_A_constraint_H6() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half -0.5)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_H7() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 0xH3119 to i16))
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_H8() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 0xH3117 to i16))
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_H9() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 65)
+  ret i32 %v0
+}
+
+;==============================================================================
+; 'A' constraint, 32-bit operand
+;==============================================================================
+
+; GCN-LABEL: {{^}}inline_A_constraint_F0:
+; GCN: v_mov_b32 {{v[0-9]+}}, -16
+define i32 @inline_A_constraint_F0() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 -16)
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_F1:
+; GCN: v_mov_b32 {{v[0-9]+}}, 1
+define i32 @inline_A_constraint_F1() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 1)
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_F2:
+; GCN: v_mov_b32 {{v[0-9]+}}, 0xbf000000
+define i32 @inline_A_constraint_F2() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 bitcast (float -0.5 to i32))
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_F3:
+; GCN: v_mov_b32 {{v[0-9]+}}, 0x40000000
+define i32 @inline_A_constraint_F3() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 bitcast (float 2.0 to i32))
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_F4:
+; GCN: v_mov_b32 {{v[0-9]+}}, 0xc0800000
+define i32 @inline_A_constraint_F4() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(float -4.0)
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_F5:
+; VI: v_mov_b32 {{v[0-9]+}}, 0x3e22f983
+define i32 @inline_A_constraint_F5() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 1042479491)
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_F6:
+; GCN: v_mov_b32 {{v[0-9]+}}, 0x3f000000
+define i32 @inline_A_constraint_F6() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(float 0.5)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_F7() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 1042479490)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_F8() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 -17)
+  ret i32 %v0
+}
+
+;==============================================================================
+; 'A' constraint, 64-bit operand
+;==============================================================================
+
+; GCN-LABEL: {{^}}inline_A_constraint_D0:
+; GCN: v_mov_b32 {{v[0-9]+}}, -16
+define i32 @inline_A_constraint_D0() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i64 -16)
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_D1:
+; GCN: v_cvt_f32_f64 {{v[0-9]+}}, 0xc000000000000000
+define i32 @inline_A_constraint_D1() {
+  %v0 = tail call i32 asm "v_cvt_f32_f64 $0, $1", "=v,A"(i64 bitcast (double -2.0 to i64))
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_D2:
+; GCN: v_cvt_f32_f64 {{v[0-9]+}}, 0x3fe0000000000000
+define i32 @inline_A_constraint_D2() {
+  %v0 = tail call i32 asm "v_cvt_f32_f64 $0, $1", "=v,A"(double 0.5)
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_D3:
+; VI: v_cvt_f32_f64 {{v[0-9]+}}, 0x3fc45f306dc9c882
+define i32 @inline_A_constraint_D3() {
+  %v0 = tail call i32 asm "v_cvt_f32_f64 $0, $1", "=v,A"(double 0.15915494309189532)
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_D4:
+; VI: v_cvt_f32_f64 {{v[0-9]+}}, 0x3fc45f306dc9c882
+define i32 @inline_A_constraint_D4() {
+  %v0 = tail call i32 asm "v_cvt_f32_f64 $0, $1", "=v,A"(i64 bitcast (double 0.15915494309189532 to i64))
+  ret i32 %v0
+}
+
+; GCN-LABEL: {{^}}inline_A_constraint_D5:
+; GCN: v_cvt_f32_f64 {{v[0-9]+}}, 0xc000000000000000
+define i32 @inline_A_constraint_D5() {
+  %v0 = tail call i32 asm "v_cvt_f32_f64 $0, $1", "=v,A"(double -2.0)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_D8() {
+  %v0 = tail call i32 asm "v_cvt_f32_f64 $0, $1", "=v,A"(double 1.1)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_D9() {
+  %v0 = tail call i32 asm "v_cvt_f32_f64 $0, $1", "=v,A"(i64 bitcast (double 0.1 to i64))
+  ret i32 %v0
+}
+
+;==============================================================================
+; 'A' constraint, v2x16 operand
+;==============================================================================
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_V0:
+; VI: v_mov_b32 {{v[0-9]+}}, -4
+define i32 @inline_A_constraint_V0() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x i16> <i16 -4, i16 -4>)
+  ret i32 %v0
+}
+
+; NOSI: error: invalid operand for inline asm constraint 'A'
+; VI-LABEL: {{^}}inline_A_constraint_V1:
+; VI: v_mov_b32 {{v[0-9]+}}, 0xb800
+define i32 @inline_A_constraint_V1() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x half> <half -0.5, half -0.5>)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_V2() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x i16> <i16 -4, i16 undef>)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_V3() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x half> <half undef, half -0.5>)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_V4() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x i16> <i16 1, i16 2>)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_V5() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<4 x i16> <i16 0, i16 0, i16 0, i16 0>)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_V6() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(<2 x i32> <i32 0, i32 0>)
+  ret i32 %v0
+}
+
+;==============================================================================
+; 'A' constraint, type errors
+;==============================================================================
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_E1(i32 %x) {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i32 %x)
+  ret i32 %v0
+}
+
+; NOGCN: error: invalid operand for inline asm constraint 'A'
+define i32 @inline_A_constraint_E2() {
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i128 100000000000000000000)
+  ret i32 %v0
+}

From 8e62f3b658cc85bf0a42dec1326c5e87e848485c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 12:11:53 +0100
Subject: [PATCH 013/770] TargetInstrInfo.h - remove unnecessary includes. NFC.

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 30ab1039bef00..8c6d845215948 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -17,7 +17,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/None.h"
-#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MIRFormatter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
@@ -26,7 +25,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOutliner.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/BranchProbability.h"

From 0e83e67cd359aef475e5c3b86c1a5c932cfb1aba Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 12:20:58 +0100
Subject: [PATCH 014/770] SystemZInstrBuilder.h - remove unnecessary
 PseudoSourceValue.h include. NFC.

---
 llvm/lib/Target/SystemZ/SystemZInstrBuilder.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
index 57c10648612dc..9fc786f92635f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -17,7 +17,6 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 
 namespace llvm {
 

From 9fa58d1bf2f83a556c109f701aacfb92e2184c23 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 12:41:08 +0100
Subject: [PATCH 015/770] [DAG] Add SimplifyDemandedVectorElts binop
 SimplifyMultipleUseDemandedBits handling

For the supported binops (basic arithmetic, logicals + shifts), if we fail to simplify the demanded vector elts, then call SimplifyMultipleUseDemandedBits and try to peek through ops to remove unnecessary dependencies.

This helps with PR40502.

Differential Revision: https://reviews.llvm.org/D79003
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  75 ++++++++---
 llvm/test/CodeGen/AArch64/mul_by_elt.ll       |   2 +-
 llvm/test/CodeGen/X86/combine-pmuldq.ll       |   8 +-
 llvm/test/CodeGen/X86/combine-sdiv.ll         |  44 ++++---
 llvm/test/CodeGen/X86/oddsubvector.ll         | 116 +++++++-----------
 llvm/test/CodeGen/X86/vector-fshl-rot-128.ll  |  37 ++----
 llvm/test/CodeGen/X86/vector-fshl-rot-256.ll  |   7 +-
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll  |  37 ++----
 llvm/test/CodeGen/X86/vector-fshr-rot-256.ll  |   7 +-
 llvm/test/CodeGen/X86/vector-narrow-binop.ll  |   9 +-
 10 files changed, 166 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index eabfd7fc85cbb..b6fdddc46ede5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2225,6 +2225,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
     bool AssumeSingleUse) const {
   EVT VT = Op.getValueType();
+  unsigned Opcode = Op.getOpcode();
   APInt DemandedElts = OriginalDemandedElts;
   unsigned NumElts = DemandedElts.getBitWidth();
   assert(VT.isVector() && "Expected vector op");
@@ -2256,7 +2257,26 @@ bool TargetLowering::SimplifyDemandedVectorElts(
   SDLoc DL(Op);
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
 
-  switch (Op.getOpcode()) {
+  // Helper for demanding the specified elements and all the bits of both binary
+  // operands.
+  auto SimplifyDemandedVectorEltsBinOp = [&](SDValue Op0, SDValue Op1) {
+    unsigned NumBits0 = Op0.getScalarValueSizeInBits();
+    unsigned NumBits1 = Op1.getScalarValueSizeInBits();
+    APInt DemandedBits0 = APInt::getAllOnesValue(NumBits0);
+    APInt DemandedBits1 = APInt::getAllOnesValue(NumBits1);
+    SDValue NewOp0 = SimplifyMultipleUseDemandedBits(
+        Op0, DemandedBits0, DemandedElts, TLO.DAG, Depth + 1);
+    SDValue NewOp1 = SimplifyMultipleUseDemandedBits(
+        Op1, DemandedBits1, DemandedElts, TLO.DAG, Depth + 1);
+    if (NewOp0 || NewOp1) {
+      SDValue NewOp = TLO.DAG.getNode(
+          Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
+      return TLO.CombineTo(Op, NewOp);
+    }
+    return false;
+  };
+
+  switch (Opcode) {
   case ISD::SCALAR_TO_VECTOR: {
     if (!DemandedElts[0]) {
       KnownUndef.setAllBits();
@@ -2635,7 +2655,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     break;
   }
 
-  // TODO: There are more binop opcodes that could be handled here - MUL, MIN,
+  // TODO: There are more binop opcodes that could be handled here - MIN,
   // MAX, saturated math, etc.
   case ISD::OR:
   case ISD::XOR:
@@ -2646,17 +2666,26 @@ bool TargetLowering::SimplifyDemandedVectorElts(
   case ISD::FMUL:
   case ISD::FDIV:
   case ISD::FREM: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
     APInt UndefRHS, ZeroRHS;
-    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
-                                   ZeroRHS, TLO, Depth + 1))
+    if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
+                                   Depth + 1))
       return true;
     APInt UndefLHS, ZeroLHS;
-    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
-                                   ZeroLHS, TLO, Depth + 1))
+    if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
+                                   Depth + 1))
       return true;
 
     KnownZero = ZeroLHS & ZeroRHS;
     KnownUndef = getKnownUndefForVectorBinop(Op, TLO.DAG, UndefLHS, UndefRHS);
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    // TODO - use KnownUndef to relax the demandedelts?
+    if (!DemandedElts.isAllOnesValue())
+      if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+        return true;
     break;
   }
   case ISD::SHL:
@@ -2664,27 +2693,39 @@ bool TargetLowering::SimplifyDemandedVectorElts(
   case ISD::SRA:
   case ISD::ROTL:
   case ISD::ROTR: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
     APInt UndefRHS, ZeroRHS;
-    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
-                                   ZeroRHS, TLO, Depth + 1))
+    if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
+                                   Depth + 1))
       return true;
     APInt UndefLHS, ZeroLHS;
-    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
-                                   ZeroLHS, TLO, Depth + 1))
+    if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
+                                   Depth + 1))
       return true;
 
     KnownZero = ZeroLHS;
     KnownUndef = UndefLHS & UndefRHS; // TODO: use getKnownUndefForVectorBinop?
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    // TODO - use KnownUndef to relax the demandedelts?
+    if (!DemandedElts.isAllOnesValue())
+      if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+        return true;
     break;
   }
   case ISD::MUL:
   case ISD::AND: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
     APInt SrcUndef, SrcZero;
-    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
-                                   SrcZero, TLO, Depth + 1))
+    if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO,
+                                   Depth + 1))
       return true;
-    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
-                                   KnownZero, TLO, Depth + 1))
+    if (SimplifyDemandedVectorElts(Op0, DemandedElts, KnownUndef, KnownZero,
+                                   TLO, Depth + 1))
       return true;
 
     // If either side has a zero element, then the result element is zero, even
@@ -2694,6 +2735,12 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     KnownZero |= SrcZero;
     KnownUndef &= SrcUndef;
     KnownUndef &= ~KnownZero;
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    // TODO - use KnownUndef to relax the demandedelts?
+    if (!DemandedElts.isAllOnesValue())
+      if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+        return true;
     break;
   }
   case ISD::TRUNCATE:
diff --git a/llvm/test/CodeGen/AArch64/mul_by_elt.ll b/llvm/test/CodeGen/AArch64/mul_by_elt.ll
index c51ef8c379ccc..c9fc2dea28eba 100644
--- a/llvm/test/CodeGen/AArch64/mul_by_elt.ll
+++ b/llvm/test/CodeGen/AArch64/mul_by_elt.ll
@@ -133,7 +133,7 @@ define <4 x float> @splat0_before_fmul_fmul_constant(<4 x float> %a) {
 ; CHECK-LABEL: splat0_before_fmul_fmul_constant:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov v1.4s, #3.00000000
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.s[0]
+; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    fmov v1.4s, #6.00000000
 ; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.s[0]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 82387c936233c..0e448f3f3be06 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -187,7 +187,7 @@ define i32 @PR43159(<4 x i32>* %a0) {
 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    psubd %xmm3, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    pxor %xmm2, %xmm2
@@ -213,7 +213,7 @@ define i32 @PR43159(<4 x i32>* %a0) {
 ; AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
@@ -238,7 +238,7 @@ define i32 @PR43159(<4 x i32>* %a0) {
 ; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
@@ -263,7 +263,7 @@ define i32 @PR43159(<4 x i32>* %a0) {
 ; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; AVX512DQVL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 235a6f0f33421..618b0a8d26067 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1997,12 +1997,12 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
 ; SSE2-NEXT:    psrad $2, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    psubd %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2022,12 +2022,11 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psrad $2, %xmm3
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    psubd %xmm3, %xmm2
 ; SSE41-NEXT:    psrad $3, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2043,12 +2042,11 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2057,10 +2055,10 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2ORLATER-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2ORLATER-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2ORLATER-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
+; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
 ; AVX2ORLATER-NEXT:    retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2069,10 +2067,10 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
 ; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
 ; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
-; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; XOP-NEXT:    retq
   %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
   ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index b42578eafdfd8..8d3e01f86def6 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -192,82 +192,67 @@ define void @PR42833() {
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    addl .Lb${{.*}}(%rip), %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movaps {{.*#+}} xmm3 = <u,1,1,1>
-; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    paddd %xmm3, %xmm4
-; SSE2-NEXT:    pslld $23, %xmm3
-; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pmuludq %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm3, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3]
-; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm4
+; SSE2-NEXT:    psubd %xmm1, %xmm4
 ; SSE2-NEXT:    paddd %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    paddd %xmm0, %xmm5
+; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
 ; SSE2-NEXT:    movdqa %xmm1, .Lc$local+{{.*}}(%rip)
 ; SSE2-NEXT:    movaps %xmm5, .Lc$local+{{.*}}(%rip)
 ; SSE2-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm4
+; SSE2-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm3
 ; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm5
 ; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm6
 ; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm7
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE2-NEXT:    psubd %xmm0, %xmm7
-; SSE2-NEXT:    psubd %xmm4, %xmm6
+; SSE2-NEXT:    psubd %xmm3, %xmm6
 ; SSE2-NEXT:    psubd %xmm1, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, .Ld$local+{{.*}}(%rip)
 ; SSE2-NEXT:    movdqa %xmm6, .Ld$local+{{.*}}(%rip)
-; SSE2-NEXT:    movdqa %xmm3, .Ld$local+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm4, .Ld$local+{{.*}}(%rip)
 ; SSE2-NEXT:    movdqa %xmm7, .Ld$local+{{.*}}(%rip)
-; SSE2-NEXT:    paddd %xmm4, %xmm4
+; SSE2-NEXT:    paddd %xmm3, %xmm3
 ; SSE2-NEXT:    paddd %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, .Lc$local+{{.*}}(%rip)
-; SSE2-NEXT:    movdqa %xmm4, .Lc$local+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm3, .Lc$local+{{.*}}(%rip)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: PR42833:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm1
 ; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm0
-; SSE42-NEXT:    movd %xmm0, %eax
+; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm1
+; SSE42-NEXT:    movd %xmm1, %eax
 ; SSE42-NEXT:    addl .Lb${{.*}}(%rip), %eax
-; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = <u,1,1,1>
-; SSE42-NEXT:    pinsrd $0, %eax, %xmm2
-; SSE42-NEXT:    movdqa %xmm0, %xmm3
-; SSE42-NEXT:    paddd %xmm2, %xmm3
-; SSE42-NEXT:    pslld $23, %xmm2
-; SSE42-NEXT:    paddd {{.*}}(%rip), %xmm2
-; SSE42-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE42-NEXT:    pmulld %xmm0, %xmm2
-; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT:    movd %eax, %xmm2
+; SSE42-NEXT:    paddd %xmm1, %xmm2
 ; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm3
-; SSE42-NEXT:    psubd %xmm1, %xmm3
-; SSE42-NEXT:    paddd %xmm1, %xmm1
-; SSE42-NEXT:    movdqa %xmm1, .Lc$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa %xmm2, .Lc$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm1
+; SSE42-NEXT:    psubd %xmm0, %xmm3
+; SSE42-NEXT:    paddd %xmm0, %xmm0
+; SSE42-NEXT:    movdqa %xmm1, %xmm4
+; SSE42-NEXT:    paddd %xmm1, %xmm4
+; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
+; SSE42-NEXT:    movdqa %xmm0, .Lc$local+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm4, .Lc$local+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm0
 ; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm2
 ; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm4
 ; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm5
 ; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm6
-; SSE42-NEXT:    pinsrd $0, %eax, %xmm0
-; SSE42-NEXT:    psubd %xmm0, %xmm6
+; SSE42-NEXT:    pinsrd $0, %eax, %xmm1
+; SSE42-NEXT:    psubd %xmm1, %xmm6
 ; SSE42-NEXT:    psubd %xmm2, %xmm5
-; SSE42-NEXT:    psubd %xmm1, %xmm4
+; SSE42-NEXT:    psubd %xmm0, %xmm4
 ; SSE42-NEXT:    movdqa %xmm4, .Ld$local+{{.*}}(%rip)
 ; SSE42-NEXT:    movdqa %xmm5, .Ld$local+{{.*}}(%rip)
 ; SSE42-NEXT:    movdqa %xmm3, .Ld$local+{{.*}}(%rip)
 ; SSE42-NEXT:    movdqa %xmm6, .Ld$local+{{.*}}(%rip)
 ; SSE42-NEXT:    paddd %xmm2, %xmm2
-; SSE42-NEXT:    paddd %xmm1, %xmm1
-; SSE42-NEXT:    movdqa %xmm1, .Lc$local+{{.*}}(%rip)
+; SSE42-NEXT:    paddd %xmm0, %xmm0
+; SSE42-NEXT:    movdqa %xmm0, .Lc$local+{{.*}}(%rip)
 ; SSE42-NEXT:    movdqa %xmm2, .Lc$local+{{.*}}(%rip)
 ; SSE42-NEXT:    retq
 ;
@@ -276,17 +261,13 @@ define void @PR42833() {
 ; AVX1-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    addl .Lb${{.*}}(%rip), %eax
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,1,1,1>
-; AVX1-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
 ; AVX1-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm3
-; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpslld $1, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm2
 ; AVX1-NEXT:    vpsubd .Lc$local+{{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vmovups %ymm1, .Lc$local+{{.*}}(%rip)
@@ -316,10 +297,9 @@ define void @PR42833() {
 ; AVX2-NEXT:    vmovdqu .Lc$local+{{.*}}(%rip), %ymm0
 ; AVX2-NEXT:    addl .Lc$local+{{.*}}(%rip), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0],mem[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqu %ymm2, .Lc$local+{{.*}}(%rip)
 ; AVX2-NEXT:    vmovdqu .Lc$local+{{.*}}(%rip), %ymm2
 ; AVX2-NEXT:    vmovdqu .Ld$local+{{.*}}(%rip), %ymm3
@@ -341,10 +321,9 @@ define void @PR42833() {
 ; AVX512-NEXT:    vmovdqu64 .Lc$local+{{.*}}(%rip), %zmm1
 ; AVX512-NEXT:    addl .Lc$local+{{.*}}(%rip), %eax
 ; AVX512-NEXT:    vmovd %eax, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],mem[1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm3
-; AVX512-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
+; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
 ; AVX512-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm2
 ; AVX512-NEXT:    vmovdqu %ymm0, .Lc$local+{{.*}}(%rip)
 ; AVX512-NEXT:    vmovdqu .Lc$local+{{.*}}(%rip), %ymm0
@@ -364,14 +343,13 @@ define void @PR42833() {
 ; XOP-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm0
 ; XOP-NEXT:    vmovd %xmm0, %eax
 ; XOP-NEXT:    addl .Lb${{.*}}(%rip), %eax
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = <u,1,1,1>
-; XOP-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
-; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; XOP-NEXT:    vmovd %eax, %xmm1
+; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
 ; XOP-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm3
-; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm1
-; XOP-NEXT:    vpslld $1, %xmm3, %xmm3
-; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7]
+; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
+; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; XOP-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm2
 ; XOP-NEXT:    vpsubd .Lc$local+{{.*}}(%rip), %xmm2, %xmm2
 ; XOP-NEXT:    vmovups %ymm1, .Lc$local+{{.*}}(%rip)
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 43d02fca5d500..4d5b148b362e8 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -671,7 +671,6 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ; SSE-LABEL: splatvar_funnnel_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
 ; SSE-NEXT:    pxor %xmm3, %xmm3
 ; SSE-NEXT:    psubq %xmm1, %xmm3
@@ -683,31 +682,17 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
 ; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: splatvar_funnnel_v2i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: splatvar_funnnel_v2i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX2-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: splatvar_funnnel_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index e4c76d59f3339..e8fb824076f2f 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -514,9 +514,9 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
@@ -537,7 +537,6 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsllq %xmm3, %ymm0, %ymm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 406f9b614a43a..e923df1c01423 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -711,7 +711,6 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
 ; SSE-LABEL: splatvar_funnnel_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [63,63]
 ; SSE-NEXT:    pxor %xmm3, %xmm3
 ; SSE-NEXT:    psubq %xmm1, %xmm3
@@ -723,31 +722,17 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
 ; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: splatvar_funnnel_v2i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: splatvar_funnnel_v2i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; AVX2-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: splatvar_funnnel_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpsrlq %xmm3, %xmm0, %xmm3
+; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index ec18f8948771d..723a9dc51bc82 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -560,9 +560,9 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
@@ -583,7 +583,6 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpsrlq %xmm3, %ymm0, %ymm3
diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
index d76f0dc544589..383fde7038d04 100644
--- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
@@ -151,12 +151,11 @@ define <4 x double> @fmul_v2f64(<2 x  double> %x, <2 x double> %y) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movapd %xmm1, %xmm2
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-NEXT:    mulpd %xmm0, %xmm0
 ; SSE-NEXT:    mulpd %xmm2, %xmm2
-; SSE-NEXT:    addpd %xmm0, %xmm2
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    mulpd %xmm1, %xmm1
+; SSE-NEXT:    addpd %xmm2, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fmul_v2f64:

From 7b15dc1e0e8dfaf3efb608734421eac4e2399d6a Mon Sep 17 00:00:00 2001
From: Xing GUO <higuoxing@gmail.com>
Date: Mon, 25 May 2020 19:42:15 +0800
Subject: [PATCH 016/770] [ObjectYAML][DWARF] Remove unimplemented function.

```
StringMap<std::unique_ptr<MemoryBuffer>>
EmitDebugSections(llvm::DWARFYAML::Data &DI, bool ApplyFixups);
```
is unimplemented and unused.
---
 llvm/include/llvm/ObjectYAML/DWARFEmitter.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
index 092aa0040f95b..b6613265c7782 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
@@ -41,9 +41,6 @@ void EmitDebugLine(raw_ostream &OS, const Data &DI);
 Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
 EmitDebugSections(StringRef YAMLString, bool ApplyFixups = false,
                   bool IsLittleEndian = sys::IsLittleEndianHost);
-StringMap<std::unique_ptr<MemoryBuffer>>
-EmitDebugSections(llvm::DWARFYAML::Data &DI, bool ApplyFixups);
-
 } // end namespace DWARFYAML
 } // end namespace llvm
 

From 9ff361b099f16ce27c8af61806447df5bca52228 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 25 May 2020 12:25:03 +0100
Subject: [PATCH 017/770] [ARM] VMULH tests for when other parts are working.
 NFC

---
 llvm/test/CodeGen/Thumb2/mve-vmulh.ll | 529 ++++++++++++++++++++++++++
 1 file changed, 529 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-vmulh.ll

diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
new file mode 100644
index 0000000000000..36b3b1e6c312f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -0,0 +1,529 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
+
+define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
+; CHECK-LABEL: vmulhs_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmullb.s32 q2, q0, q1
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    asrs r0, r0, #31
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <2 x i32> %s0 to <2 x i64>
+  %s1s = sext <2 x i32> %s1 to <2 x i64>
+  %m = mul <2 x i64> %s0s, %s1s
+  %s = ashr <2 x i64> %m, <i64 32, i64 32>
+  %s2 = trunc <2 x i64> %s to <2 x i32>
+  ret <2 x i32> %s2
+}
+
+define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
+; CHECK-LABEL: vmulhu_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmullb.u32 q2, q0, q1
+; CHECK-NEXT:    vldr s1, .LCPI1_0
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vmov.f32 s3, s1
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 0x00000000 @ float 0
+entry:
+  %s0s = zext <2 x i32> %s0 to <2 x i64>
+  %s1s = zext <2 x i32> %s1 to <2 x i64>
+  %m = mul <2 x i64> %s0s, %s1s
+  %s = lshr <2 x i64> %m, <i64 32, i64 32>
+  %s2 = trunc <2 x i64> %s to <2 x i32>
+  ret <2 x i32> %s2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
+; CHECK-LABEL: vmulhs_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s12, s0
+; CHECK-NEXT:    vmov.f32 s14, s1
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.f32 s12, s6
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmullb.s32 q0, q1, q3
+; CHECK-NEXT:    smmul r0, r1, r0
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    smmul r1, r2, r1
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <4 x i32> %s0 to <4 x i64>
+  %s1s = sext <4 x i32> %s1 to <4 x i64>
+  %m = mul <4 x i64> %s0s, %s1s
+  %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+  %s2 = trunc <4 x i64> %s to <4 x i32>
+  ret <4 x i32> %s2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
+; CHECK-LABEL: vmulhu_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.f32 s12, s6
+; CHECK-NEXT:    vmov.f32 s16, s2
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov.f32 s18, s3
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vmullb.u32 q2, q4, q3
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmullb.u32 q3, q0, q1
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vmov.f32 s1, s15
+; CHECK-NEXT:    vmov.f32 s2, s9
+; CHECK-NEXT:    vmov.f32 s3, s11
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <4 x i32> %s0 to <4 x i64>
+  %s1s = zext <4 x i32> %s1 to <4 x i64>
+  %m = mul <4 x i64> %s0s, %s1s
+  %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+  %s2 = trunc <4 x i64> %s to <4 x i32>
+  ret <4 x i32> %s2
+}
+
+define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
+; CHECK-LABEL: vmulhs_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmullb.s16 q0, q0, q1
+; CHECK-NEXT:    vshr.s32 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <4 x i16> %s0 to <4 x i32>
+  %s1s = sext <4 x i16> %s1 to <4 x i32>
+  %m = mul <4 x i32> %s0s, %s1s
+  %s = ashr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <4 x i32> %s to <4 x i16>
+  ret <4 x i16> %s2
+}
+
+define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
+; CHECK-LABEL: vmulhu_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmullb.u16 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <4 x i16> %s0 to <4 x i32>
+  %s1s = zext <4 x i16> %s1 to <4 x i32>
+  %m = mul <4 x i32> %s0s, %s1s
+  %s = lshr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <4 x i32> %s to <4 x i16>
+  ret <4 x i16> %s2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
+; CHECK-LABEL: vmulhs_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmullb.s16 q2, q3, q2
+; CHECK-NEXT:    vshr.s32 q3, q2, #16
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmullb.s16 q0, q1, q3
+; CHECK-NEXT:    vshr.s32 q0, q0, #16
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <8 x i16> %s0 to <8 x i32>
+  %s1s = sext <8 x i16> %s1 to <8 x i32>
+  %m = mul <8 x i32> %s0s, %s1s
+  %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <8 x i32> %s to <8 x i16>
+  ret <8 x i16> %s2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
+; CHECK-LABEL: vmulhu_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmullb.u16 q2, q3, q2
+; CHECK-NEXT:    vshr.u32 q3, q2, #16
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmullb.u16 q0, q1, q3
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <8 x i16> %s0 to <8 x i32>
+  %s1s = zext <8 x i16> %s1 to <8 x i32>
+  %m = mul <8 x i32> %s0s, %s1s
+  %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %s2 = trunc <8 x i32> %s to <8 x i16>
+  ret <8 x i16> %s2
+}
+
+define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-LABEL: vmulhs_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmullb.s8 q0, q0, q1
+; CHECK-NEXT:    vshr.s16 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <8 x i8> %s0 to <8 x i16>
+  %s1s = sext <8 x i8> %s1 to <8 x i16>
+  %m = mul <8 x i16> %s0s, %s1s
+  %s = ashr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %s2 = trunc <8 x i16> %s to <8 x i8>
+  ret <8 x i8> %s2
+}
+
+define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-LABEL: vmulhu_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmullb.u8 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q0, q0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <8 x i8> %s0 to <8 x i16>
+  %s1s = zext <8 x i8> %s1 to <8 x i16>
+  %m = mul <8 x i16> %s0s, %s1s
+  %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %s2 = trunc <8 x i16> %s to <8 x i8>
+  ret <8 x i8> %s2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vmulhs_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmullb.s8 q2, q3, q2
+; CHECK-NEXT:    vshr.s16 q3, q2, #8
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.8 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.8 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.8 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.8 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.8 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.8 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmullb.s8 q0, q1, q3
+; CHECK-NEXT:    vshr.s16 q0, q0, #8
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.8 q2[8], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.8 q2[9], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.8 q2[10], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.8 q2[11], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.8 q2[12], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.8 q2[13], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.8 q2[14], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.8 q2[15], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = sext <16 x i8> %s0 to <16 x i16>
+  %s1s = sext <16 x i8> %s1 to <16 x i16>
+  %m = mul <16 x i16> %s0s, %s1s
+  %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %s2 = trunc <16 x i16> %s to <16 x i8>
+  ret <16 x i8> %s2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-LABEL: vmulhu_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmullb.u8 q2, q3, q2
+; CHECK-NEXT:    vshr.u16 q3, q2, #8
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.8 q2[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.8 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.8 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.8 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.8 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.8 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.8 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.8 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmullb.u8 q0, q1, q3
+; CHECK-NEXT:    vshr.u16 q0, q0, #8
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.8 q2[8], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.8 q2[9], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.8 q2[10], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.8 q2[11], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.8 q2[12], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.8 q2[13], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.8 q2[14], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.8 q2[15], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %s0s = zext <16 x i8> %s0 to <16 x i16>
+  %s1s = zext <16 x i8> %s1 to <16 x i16>
+  %m = mul <16 x i16> %s0s, %s1s
+  %s = lshr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %s2 = trunc <16 x i16> %s to <16 x i8>
+  ret <16 x i8> %s2
+}

From 5a4bcec8db420cf22b06720d45a9f9981b0297bf Mon Sep 17 00:00:00 2001
From: Stefan Pintilie <stefanp@ca.ibm.com>
Date: Mon, 25 May 2020 06:32:51 -0500
Subject: [PATCH 018/770] [PowerPC][NFC] Split PPCELFStreamer::emitInstruction

Split off PPCELFStreamer::emitPrefixedInstruction from
PPCELFStreamer::emitInstruction.

Differential Revision: https://reviews.llvm.org/D79626
---
 .../PowerPC/MCTargetDesc/PPCELFStreamer.cpp   | 26 +++++++++++--------
 .../PowerPC/MCTargetDesc/PPCELFStreamer.h     |  2 ++
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index c9760ed38bcc5..4373778cc96cc 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -44,17 +44,8 @@ PPCELFStreamer::PPCELFStreamer(MCContext &Context,
                     std::move(Emitter)), LastLabel(NULL) {
 }
 
-void PPCELFStreamer::emitInstruction(const MCInst &Inst,
-                                     const MCSubtargetInfo &STI) {
-  PPCMCCodeEmitter *Emitter =
-      static_cast<PPCMCCodeEmitter*>(getAssembler().getEmitterPtr());
-
-  // Special handling is only for prefixed instructions.
-  if (!Emitter->isPrefixedInstruction(Inst)) {
-    MCELFStreamer::emitInstruction(Inst, STI);
-    return;
-  }
-
+void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst,
+                                             const MCSubtargetInfo &STI) {
   // Prefixed instructions must not cross a 64-byte boundary (i.e. prefix is
   // before the boundary and the remaining 4-bytes are after the boundary). In
   // order to achieve this, a nop is added prior to any such boundary-crossing
@@ -93,6 +84,19 @@ void PPCELFStreamer::emitInstruction(const MCInst &Inst,
   }
 }
 
+void PPCELFStreamer::emitInstruction(const MCInst &Inst,
+                                     const MCSubtargetInfo &STI) {
+  PPCMCCodeEmitter *Emitter =
+      static_cast<PPCMCCodeEmitter*>(getAssembler().getEmitterPtr());
+
+  // Special handling is only for prefixed instructions.
+  if (!Emitter->isPrefixedInstruction(Inst)) {
+    MCELFStreamer::emitInstruction(Inst, STI);
+    return;
+  }
+  emitPrefixedInstruction(Inst, STI);
+}
+
 void PPCELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
   LastLabel = Symbol;
   LastLabelLoc = Loc;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
index 403681ed383aa..51863232d0719 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
@@ -41,6 +41,8 @@ class PPCELFStreamer : public MCELFStreamer {
 
   // EmitLabel updates LastLabel and LastLabelLoc when a new label is emitted.
   void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+private:
+  void emitPrefixedInstruction(const MCInst &Inst, const MCSubtargetInfo &STI);
 };
 
 MCELFStreamer *createPPCELFStreamer(MCContext &Context,

From 7293dd5b4033d94ce1397b192a93010e64b2d949 Mon Sep 17 00:00:00 2001
From: George Mitenkov <georgemitenk0v@gmail.com>
Date: Mon, 25 May 2020 07:42:17 -0400
Subject: [PATCH 019/770] Added pow intrinsic to LLVMIR dialect

Added pow intrinsic to LLVMIR dialect. Added a roundtrip test for it.

Differential Revision: https://reviews.llvm.org/D80248
---
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 1 +
 mlir/test/Dialect/LLVMIR/roundtrip.mlir     | 3 +++
 mlir/test/Target/llvmir-intrinsics.mlir     | 9 +++++++++
 3 files changed, 13 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 9a7836cd558cb..4be27b94e75e1 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -790,6 +790,7 @@ def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0]>,
                                LLVM_Type:$hint, LLVM_Type:$cache)>;
 def LLVM_SinOp : LLVM_UnaryIntrinsicOp<"sin">;
 def LLVM_SqrtOp : LLVM_UnaryIntrinsicOp<"sqrt">;
+def LLVM_PowOp : LLVM_BinarySameArgsIntrinsicOp<"pow">;
 
 //
 // Vector Reductions.
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index 79f9078e56778..d93de93882bea 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -100,6 +100,9 @@ func @ops(%arg0 : !llvm.i32, %arg1 : !llvm.float) {
 // CHECK: "llvm.intr.sin"(%arg1) : (!llvm.float) -> !llvm.float
   %30 = "llvm.intr.sin"(%arg1) : (!llvm.float) -> !llvm.float
 
+// CHECK: "llvm.intr.pow"(%arg1, %arg1) : (!llvm.float, !llvm.float) -> !llvm.float
+  %31 = "llvm.intr.pow"(%arg1, %arg1) : (!llvm.float, !llvm.float) -> !llvm.float
+
 // CHECK:  llvm.return
   llvm.return
 }
diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir
index c332bc23b814e..17c586e9a88b6 100644
--- a/mlir/test/Target/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/llvmir-intrinsics.mlir
@@ -108,6 +108,15 @@ llvm.func @copysign_test(%arg0: !llvm.float, %arg1: !llvm.float, %arg2: !llvm<"<
   llvm.return
 }
 
+// CHECK-LABEL: @pow_test
+llvm.func @pow_test(%arg0: !llvm.float, %arg1: !llvm.float, %arg2: !llvm<"<8 x float>">, %arg3: !llvm<"<8 x float>">) {
+  // CHECK: call float @llvm.pow.f32
+  "llvm.intr.pow"(%arg0, %arg1) : (!llvm.float, !llvm.float) -> !llvm.float
+  // CHECK: call <8 x float> @llvm.pow.v8f32
+  "llvm.intr.pow"(%arg2, %arg3) : (!llvm<"<8 x float>">, !llvm<"<8 x float>">) -> !llvm<"<8 x float>">
+  llvm.return
+}
+
 // CHECK-LABEL: @vector_reductions
 llvm.func @vector_reductions(%arg0: !llvm.float, %arg1: !llvm<"<8 x float>">, %arg2: !llvm<"<8 x i32>">) {
   // CHECK: call i32 @llvm.experimental.vector.reduce.add.v8i32

From 38366cf1676f9ac8d421586658e8bcd5ac4ab62d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 12:52:14 +0100
Subject: [PATCH 020/770] FunctionLoweringInfo.h - remove orphan
 addSEHHandlersForLPads declaration. NFC.

---
 llvm/include/llvm/CodeGen/FunctionLoweringInfo.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index bc5e4be674287..37e1e0de6510f 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -257,8 +257,6 @@ class FunctionLoweringInfo {
                                            const TargetRegisterClass *RC);
 
 private:
-  void addSEHHandlersForLPads(ArrayRef<const LandingPadInst *> LPads);
-
   /// LiveOutRegInfo - Information about live out vregs.
   IndexedMap<LiveOutInfo, VirtReg2IndexFunctor> LiveOutRegInfo;
 };

From 8f48814879c06bbf9f211fa5d959419f0d2d38b6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 12:56:44 +0100
Subject: [PATCH 021/770] FunctionLoweringInfo.h - move APInt.h dependency to
 FunctionLoweringInfo.cpp. NFC.

---
 llvm/include/llvm/CodeGen/FunctionLoweringInfo.h       | 2 +-
 llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 37e1e0de6510f..c99ca00eac29f 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -13,7 +13,7 @@
 
 #ifndef LLVM_CODEGEN_FUNCTIONLOWERINGINFO_H
 #define LLVM_CODEGEN_FUNCTIONLOWERINGINFO_H
-#include "llvm/ADT/APInt.h"
+
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IndexedMap.h"
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 4f46f19905319..7a5fd7d24c681 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"

From fa038e03504c7d0dfd438b1dfdd6da7081e75617 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 25 May 2020 07:50:45 -0400
Subject: [PATCH 022/770] [x86] favor vector constant load to avoid GPR to XMM
 transfer, part 2

This replaces the build_vector lowering code that was just added in D80013
and matches the pattern later from the x86-specific "vzext_movl".
That seems to result in the same or better improvements and gets rid
of the 'TODO' items from that patch.

AFAICT, we always shrink wider constant vectors to 128-bit on these
patterns, so we still get the implicit zero-extension to ymm/zmm
without wasting space on larger vector constants. There's a trade-off
there because that means we miss potential load-folding.

Similarly, we could load scalar constants here with implicit
zero-extension even to 128-bit. That saves constant space, but it
means we forego load-folding, and so it increases register pressure.
This seems like a good middle-ground between those 2 options.

Differential Revision: https://reviews.llvm.org/D80131
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 33 +++++++---
 llvm/test/CodeGen/X86/avx-load-store.ll       | 10 ++-
 llvm/test/CodeGen/X86/avx2-arith.ll           |  6 +-
 llvm/test/CodeGen/X86/combine-udiv.ll         | 40 ++++--------
 llvm/test/CodeGen/X86/fcmp-constant.ll        |  3 +-
 .../X86/insert-into-constant-vector.ll        | 48 ++++++--------
 llvm/test/CodeGen/X86/packss.ll               |  7 +-
 llvm/test/CodeGen/X86/pshufb-mask-comments.ll |  2 +-
 llvm/test/CodeGen/X86/ret-mmx.ll              |  2 +-
 llvm/test/CodeGen/X86/sad.ll                  | 65 +++++--------------
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    |  6 +-
 llvm/test/CodeGen/X86/vec_set-A.ll            |  2 +-
 llvm/test/CodeGen/X86/vec_shift2.ll           |  4 +-
 llvm/test/CodeGen/X86/vector-lzcnt-128.ll     | 12 ++--
 .../CodeGen/X86/vector-shuffle-256-v16.ll     |  3 +-
 .../CodeGen/X86/vector-shuffle-256-v32.ll     | 16 ++---
 .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 10 ++-
 .../CodeGen/X86/vector-shuffle-512-v32.ll     | 10 ++-
 .../CodeGen/X86/vector-shuffle-512-v64.ll     | 10 ++-
 .../test/CodeGen/X86/vector-shuffle-512-v8.ll | 10 ++-
 .../X86/vector-shuffle-combining-avx512f.ll   | 10 ++-
 .../X86/vector-shuffle-combining-xop.ll       | 29 +++------
 llvm/test/CodeGen/X86/vector-shuffle-v1.ll    | 12 ++--
 llvm/test/CodeGen/X86/vector-tzcnt-128.ll     | 24 +++----
 24 files changed, 147 insertions(+), 227 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5fc8448d1e725..eab9f14bec910 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10209,15 +10209,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       if (NumZero == 0)
         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
 
-      // Just load a vector integer constant. Loading is better for code size,
-      // avoids move GPR immediate --> XMM, and reduces register pressure.
-      if (IsAllConstants && VT.isInteger()) {
-        // TODO: Remove -1 restriction with demanded elements improvement?
-        // TODO: Insert 128-bit load into wider undef vector?
-        if (VT.is128BitVector() && !isAllOnesConstant(Item))
-          return SDValue();
-      }
-
       if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
           (EltVT == MVT::i64 && Subtarget.is64Bit())) {
         assert((VT.is128BitVector() || VT.is256BitVector() ||
@@ -35858,6 +35849,30 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
       }
     }
 
+    // Load a scalar integer constant directly to XMM instead of transferring an
+    // immediate value from GPR.
+    // vzext_movl (scalar_to_vector C) --> load [C,0...]
+    if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+        // Create a vector constant - scalar constant followed by zeros.
+        EVT ScalarVT = N0.getOperand(0).getValueType();
+        Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
+        unsigned NumElts = VT.getVectorNumElements();
+        Constant *Zero = ConstantInt::getNullValue(ScalarTy);
+        SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
+        ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
+
+        // Load the vector constant from constant pool.
+        MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+        SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
+        MachinePointerInfo MPI =
+            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+        Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+        return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
+                           MachineMemOperand::MOLoad);
+      }
+    }
+
     return SDValue();
   }
   case X86ISD::BLENDI: {
diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll
index 1b3c35855ae9a..718449d7a771f 100644
--- a/llvm/test/CodeGen/X86/avx-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx-load-store.ll
@@ -220,8 +220,7 @@ define void @f_f() nounwind {
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    jne .LBB9_4
 ; CHECK-NEXT:  # %bb.3: # %cif_mixed_test_all
-; CHECK-NEXT:    movl $-1, %eax
-; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
 ; CHECK-NEXT:    vmaskmovps %ymm0, %ymm0, (%rax)
 ; CHECK-NEXT:  .LBB9_4: # %cif_mixed_test_any_check
 ;
@@ -238,13 +237,12 @@ define void @f_f() nounwind {
 ; CHECK_O0-NEXT:    jne .LBB9_3
 ; CHECK_O0-NEXT:    jmp .LBB9_4
 ; CHECK_O0-NEXT:  .LBB9_3: # %cif_mixed_test_all
-; CHECK_O0-NEXT:    movl $-1, %eax
-; CHECK_O0-NEXT:    vmovd %eax, %xmm0
+; CHECK_O0-NEXT:    vmovdqa {{.*#+}} xmm0 = [4294967295,0,0,0]
 ; CHECK_O0-NEXT:    vmovdqa %xmm0, %xmm0
 ; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
-; CHECK_O0-NEXT:    # implicit-def: $rcx
+; CHECK_O0-NEXT:    # implicit-def: $rax
 ; CHECK_O0-NEXT:    # implicit-def: $ymm2
-; CHECK_O0-NEXT:    vmaskmovps %ymm2, %ymm1, (%rcx)
+; CHECK_O0-NEXT:    vmaskmovps %ymm2, %ymm1, (%rax)
 ; CHECK_O0-NEXT:  .LBB9_4: # %cif_mixed_test_any_check
 allocas:
   br i1 undef, label %cif_mask_all, label %cif_mask_mixed
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 39a81a2dc020a..b694b98d04c47 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -347,15 +347,13 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
 define <8 x i32> @mul_const9(<8 x i32> %x) {
 ; X32-LABEL: mul_const9:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl $2, %eax
-; X32-NEXT:    vmovd %eax, %xmm1
+; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
 ; X32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: mul_const9:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $2, %eax
-; X64-NEXT:    vmovd %eax, %xmm1
+; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
 ; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index c06719b91e273..e6d7aac926162 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -590,9 +590,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
 ;
 ; XOP-LABEL: combine_vec_udiv_nonuniform2:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; XOP-NEXT:    vmovd %eax, %xmm1
-; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
 ; XOP-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
 ; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
 ; XOP-NEXT:    retq
@@ -664,31 +662,17 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: combine_vec_udiv_nonuniform4:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: combine_vec_udiv_nonuniform4:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl $171, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsrlw $7, %xmm1, %xmm1
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: combine_vec_udiv_nonuniform4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT:    vpackuswb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpsrlw $7, %xmm1, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: combine_vec_udiv_nonuniform4:
 ; XOP:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/fcmp-constant.ll b/llvm/test/CodeGen/X86/fcmp-constant.ll
index 239830f980890..481a32b39dd37 100644
--- a/llvm/test/CodeGen/X86/fcmp-constant.ll
+++ b/llvm/test/CodeGen/X86/fcmp-constant.ll
@@ -92,8 +92,7 @@ define <2 x i64> @fcmp_ueq_v2f64_undef() {
 define <2 x i64> @fcmp_ueq_v2f64_undef_elt() {
 ; CHECK-LABEL: fcmp_ueq_v2f64_undef_elt:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    movq %rax, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [18446744073709551615,0]
 ; CHECK-NEXT:    retq
   %1 = fcmp ueq <2 x double> <double 0x3FF0000000000000, double 0xFFEFFFFFFFFFFFFF>, <double undef, double 0x3FF0000000000000>
   %2 = sext <2 x i1> %1 to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 18d57e9280557..779c91ab2575c 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -129,10 +129,8 @@ define <4 x i32> @elt3_v4i32(i32 %x) {
 define <2 x i64> @elt0_v2i64(i64 %x) {
 ; X32SSE-LABEL: elt0_v2i64:
 ; X32SSE:       # %bb.0:
-; X32SSE-NEXT:    movl $1, %eax
-; X32SSE-NEXT:    movd %eax, %xmm1
-; X32SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X32SSE-NEXT:    retl
 ;
 ; X64SSE2-LABEL: elt0_v2i64:
@@ -150,10 +148,8 @@ define <2 x i64> @elt0_v2i64(i64 %x) {
 ;
 ; X32AVX-LABEL: elt0_v2i64:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    movl $1, %eax
-; X32AVX-NEXT:    vmovd %eax, %xmm0
-; X32AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X32AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X32AVX-NEXT:    retl
 ;
 ; X64AVX-LABEL: elt0_v2i64:
@@ -365,10 +361,9 @@ define <8 x float> @elt6_v8f32(float %x) {
 define <8 x i64> @elt5_v8i64(i64 %x) {
 ; X32SSE-LABEL: elt5_v8i64:
 ; X32SSE:       # %bb.0:
-; X32SSE-NEXT:    movl $4, %eax
-; X32SSE-NEXT:    movd %eax, %xmm2
-; X32SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X32SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32SSE-NEXT:    movaps {{.*#+}} xmm2 = [4,0,0,0]
+; X32SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; X32SSE-NEXT:    movaps {{.*#+}} xmm0 = [42,0,1,0]
 ; X32SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,0,3,0]
 ; X32SSE-NEXT:    movaps {{.*#+}} xmm3 = [6,0,7,0]
@@ -395,10 +390,9 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ;
 ; X32AVX1-LABEL: elt5_v8i64:
 ; X32AVX1:       # %bb.0:
-; X32AVX1-NEXT:    movl $4, %eax
-; X32AVX1-NEXT:    vmovd %eax, %xmm0
-; X32AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X32AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X32AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X32AVX1-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
 ; X32AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
 ; X32AVX1-NEXT:    retl
@@ -413,11 +407,10 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ;
 ; X32AVX2-LABEL: elt5_v8i64:
 ; X32AVX2:       # %bb.0:
-; X32AVX2-NEXT:    movl $4, %eax
-; X32AVX2-NEXT:    vmovd %eax, %xmm0
-; X32AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X32AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32AVX2-NEXT:    vinserti128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
+; X32AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32AVX2-NEXT:    vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X32AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32AVX2-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
 ; X32AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
 ; X32AVX2-NEXT:    retl
 ;
@@ -431,13 +424,12 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ;
 ; X32AVX512F-LABEL: elt5_v8i64:
 ; X32AVX512F:       # %bb.0:
-; X32AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
-; X32AVX512F-NEXT:    movl $4, %eax
-; X32AVX512F-NEXT:    vmovd %eax, %xmm1
-; X32AVX512F-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X32AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32AVX512F-NEXT:    vinserti128 $1, {{\.LCPI.*}}, %ymm1, %ymm1
-; X32AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32AVX512F-NEXT:    vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
+; X32AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32AVX512F-NEXT:    vmovaps {{.*#+}} xmm2 = [4,0,0,0]
+; X32AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X32AVX512F-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm1, %ymm1
+; X32AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; X32AVX512F-NEXT:    retl
 ;
 ; X64AVX512F-LABEL: elt5_v8i64:
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index 9c8d1f301f62f..9a4025ab75e4b 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -159,13 +159,12 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ; X86-SSE-NEXT:    psllq $63, %xmm1
 ; X86-SSE-NEXT:    psllq $63, %xmm0
 ; X86-SSE-NEXT:    psrlq $63, %xmm0
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = <1,0,u,u>
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,0,0]
 ; X86-SSE-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE-NEXT:    pcmpeqd %xmm3, %xmm3
-; X86-SSE-NEXT:    paddq %xmm3, %xmm0
+; X86-SSE-NEXT:    psubq %xmm2, %xmm0
 ; X86-SSE-NEXT:    psrlq $63, %xmm1
 ; X86-SSE-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE-NEXT:    paddq %xmm3, %xmm1
+; X86-SSE-NEXT:    psubq %xmm2, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-SSE-NEXT:    packssdw %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
index 9a1a000db93c8..416b5c8cb0966 100644
--- a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -54,7 +54,7 @@ define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) {
 define <16 x i8> @test5(<16 x i8> %V) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1,0]
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1,0,0,0]
 ; CHECK-NEXT:    movaps %xmm1, (%rax)
 ; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1,1]
 ; CHECK-NEXT:    movaps %xmm1, (%rax)
diff --git a/llvm/test/CodeGen/X86/ret-mmx.ll b/llvm/test/CodeGen/X86/ret-mmx.ll
index bca47e182953f..8c2ded4ebb362 100644
--- a/llvm/test/CodeGen/X86/ret-mmx.ll
+++ b/llvm/test/CodeGen/X86/ret-mmx.ll
@@ -32,7 +32,7 @@ define <1 x i64> @t2() nounwind {
 define <2 x i32> @t3() nounwind {
 ; CHECK-LABEL: t3:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <1,0,u,u>
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,0,0,0]
 ; CHECK-NEXT:    retq
   ret <2 x i32> <i32 1, i32 0>
 }
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 66a8661698a53..006dd3d5ff178 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -544,8 +544,7 @@ define i32 @sad_2i8() nounwind {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT:    movl $65535, %ecx # imm = 0xFFFF
-; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB3_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -995,54 +994,20 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: sad_unroll_nonzero_initial:
-; AVX1:       # %bb.0: # %bb
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX1-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sad_unroll_nonzero_initial:
-; AVX2:       # %bb.0: # %bb
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX2-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT:    movl $1, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sad_unroll_nonzero_initial:
-; AVX512:       # %bb.0: # %bb
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX512-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT:    movl $1, %eax
-; AVX512-NEXT:    vmovd %eax, %xmm2
-; AVX512-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    retq
+; AVX-LABEL: sad_unroll_nonzero_initial:
+; AVX:       # %bb.0: # %bb
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
 bb:
   %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
   %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 1eee782f90cac..6c72adbc63175 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -325,7 +325,7 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE41:       # %bb.0:
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <2454267027,u,0,u>
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
 ; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -452,7 +452,7 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE41:       # %bb.0:
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <2454267027,u,0,u>
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
 ; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1314,7 +1314,7 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE41:       # %bb.0:
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <2454267027,u,0,u>
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
 ; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
diff --git a/llvm/test/CodeGen/X86/vec_set-A.ll b/llvm/test/CodeGen/X86/vec_set-A.ll
index e246ef047231f..c8ff250b5bfbc 100644
--- a/llvm/test/CodeGen/X86/vec_set-A.ll
+++ b/llvm/test/CodeGen/X86/vec_set-A.ll
@@ -10,7 +10,7 @@ define <2 x i64> @test1() nounwind {
 ;
 ; X64-LABEL: test1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,0]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,0,0,0]
 ; X64-NEXT:    retq
   ret <2 x i64> < i64 1, i64 0 >
 }
diff --git a/llvm/test/CodeGen/X86/vec_shift2.ll b/llvm/test/CodeGen/X86/vec_shift2.ll
index 1f386bb5a1daf..a38187f190f99 100644
--- a/llvm/test/CodeGen/X86/vec_shift2.ll
+++ b/llvm/test/CodeGen/X86/vec_shift2.ll
@@ -5,12 +5,12 @@
 define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind  {
 ; X32-LABEL: t1:
 ; X32:       # %bb.0:
-; X32-NEXT:    psrlw {{\.LCPI.*}}, %xmm0
+; X32-NEXT:    psrlw $14, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: t1:
 ; X64:       # %bb.0:
-; X64-NEXT:    psrlw {{.*}}(%rip), %xmm0
+; X64-NEXT:    psrlw $14, %xmm0
 ; X64-NEXT:    retq
 	%tmp1 = bitcast <2 x i64> %b1 to <8 x i16>
 	%tmp2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
index ea77de5393375..8c451b7215b07 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -1666,17 +1666,17 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 define <2 x i64> @foldv2i64() nounwind {
 ; SSE-LABEL: foldv2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0,0,0]
 ; SSE-NEXT:    retq
 ;
 ; NOBW-LABEL: foldv2i64:
 ; NOBW:       # %bb.0:
-; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0]
+; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
 ; NOBW-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: foldv2i64:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0]
+; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; X32-SSE-LABEL: foldv2i64:
@@ -1690,17 +1690,17 @@ define <2 x i64> @foldv2i64() nounwind {
 define <2 x i64> @foldv2i64u() nounwind {
 ; SSE-LABEL: foldv2i64u:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0,0,0]
 ; SSE-NEXT:    retq
 ;
 ; NOBW-LABEL: foldv2i64u:
 ; NOBW:       # %bb.0:
-; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0]
+; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
 ; NOBW-NEXT:    retq
 ;
 ; AVX512VLBWDQ-LABEL: foldv2i64u:
 ; AVX512VLBWDQ:       # %bb.0:
-; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0]
+; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
 ; AVX512VLBWDQ-NEXT:    retq
 ;
 ; X32-SSE-LABEL: foldv2i64u:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 9d04c701ce6c8..cd82cc89778b1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -713,8 +713,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movl $15, %eax
-; AVX512VL-NEXT:    vmovd %eax, %xmm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
 ; AVX512VL-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 0807aa6dab095..1cf048deca298 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1600,24 +1600,21 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX2-NEXT:    movl $15, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
 ; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; AVX512VLBW-NEXT:    movl $15, %eax
-; AVX512VLBW-NEXT:    vmovd %eax, %xmm1
 ; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    movl $31, %eax
-; AVX512VLVBMI-NEXT:    vmovd %eax, %xmm1
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [31,0,0,0]
 ; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
 ; AVX512VLVBMI-NEXT:    retq
 ;
@@ -1632,9 +1629,8 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; XOPAVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
 ; XOPAVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
-; XOPAVX2-NEXT:    movl $15, %eax
-; XOPAVX2-NEXT:    vmovd %eax, %xmm1
 ; XOPAVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -2791,7 +2787,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2805,7 +2801,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index db36bdb5d9400..4e62ffae6738c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -187,9 +187,8 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
 ;
 ; AVX2OR512VL-LABEL: shuffle_v8f32_70000000:
 ; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    movl $7, %eax
-; AVX2OR512VL-NEXT:    vmovd %eax, %xmm1
-; AVX2OR512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2OR512VL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x float> %shuffle
@@ -1510,9 +1509,8 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX2OR512VL-LABEL: shuffle_v8i32_70000000:
 ; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    movl $7, %eax
-; AVX2OR512VL-NEXT:    vmovd %eax, %xmm1
-; AVX2OR512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2OR512VL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i32> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index f798b55d7fd9d..59c4769017820 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -201,16 +201,14 @@ define <32 x i16> @shuffle_v32i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1
 define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
 ; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movl $65535, %eax ## imm = 0xFFFF
-; KNL-NEXT:    vmovd %eax, %xmm1
-; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
+; KNL-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    movl $65535, %eax ## imm = 0xFFFF
-; SKX-NEXT:    vmovd %eax, %xmm1
-; SKX-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
+; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    retq
   %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
   ret <32 x i16> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 191b29a19ce3a..2cf8ab11ce278 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -109,9 +109,8 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
 define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
 ; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movl $255, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm1
-; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovaps {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
@@ -121,9 +120,8 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    movl $255, %eax
-; AVX512DQ-NEXT:    vmovd %eax, %xmm1
-; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vmovaps {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512DQ-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 739ea6bae5352..f582a31a607dd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -142,9 +142,8 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_70000000:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    movl $7, %eax
-; ALL-NEXT:    vmovd %eax, %xmm1
-; ALL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; ALL-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x double> %shuffle
@@ -961,9 +960,8 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_70000000:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    movl $7, %eax
-; ALL-NEXT:    vmovd %eax, %xmm1
-; ALL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vmovaps {{.*#+}} xmm1 = [7,0,0,0]
+; ALL-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i64> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index 257bd03dee8ee..dd1f766dcf667 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -899,12 +899,10 @@ define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %
 define <8 x double> @combine_vpermi2var_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1, i64 %a2) {
 ; X86-LABEL: combine_vpermi2var_8f64_as_permpd:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl $2, %eax
-; X86-NEXT:    vmovd %eax, %xmm2
-; X86-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X86-NEXT:    vinserti128 $1, {{\.LCPI.*}}, %ymm2, %ymm2
-; X86-NEXT:    vinserti64x4 $1, {{\.LCPI.*}}, %zmm2, %zmm2
+; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; X86-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm2, %ymm2
+; X86-NEXT:    vinsertf64x4 $1, {{\.LCPI.*}}, %zmm2, %zmm2
 ; X86-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2
 ; X86-NEXT:    vpermpd {{.*#+}} zmm0 = zmm2[2,3,1,1,6,7,5,5]
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 9c507ad5443e7..4a10a20bc5ff7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -131,27 +131,14 @@ define <4 x double> @combine_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x doub
 }
 
 define <4 x double> @demandedelts_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x double> %a1, i64 %a2) {
-; X86-AVX-LABEL: demandedelts_vpermil2pd256_as_shufpd:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl $4, %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm2
-; X86-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; X86-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X86-AVX-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm2, %ymm2
-; X86-AVX-NEXT:    vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
-; X86-AVX-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3]
-; X86-AVX-NEXT:    retl
-;
-; X86-AVX2-LABEL: demandedelts_vpermil2pd256_as_shufpd:
-; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    movl $4, %eax
-; X86-AVX2-NEXT:    vmovd %eax, %xmm2
-; X86-AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; X86-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X86-AVX2-NEXT:    vinserti128 $1, {{\.LCPI.*}}, %ymm2, %ymm2
-; X86-AVX2-NEXT:    vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3]
-; X86-AVX2-NEXT:    retl
+; X86-LABEL: demandedelts_vpermil2pd256_as_shufpd:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; X86-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm2, %ymm2
+; X86-NEXT:    vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X86-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: demandedelts_vpermil2pd256_as_shufpd:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index c511971096200..8ab57bf78a6f2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -46,7 +46,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -60,8 +60,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT:    movq $-1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
 ; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -71,10 +70,9 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
-; VL_BW_DQ-NEXT:    movq $-1, %rax
-; VL_BW_DQ-NEXT:    vmovq %rax, %xmm0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
-; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
+; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
+; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
 ; VL_BW_DQ-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index 91301f319b02e..5a0507a71799c 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -1576,32 +1576,32 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 define <2 x i64> @foldv2i64() nounwind {
 ; SSE-LABEL: foldv2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: foldv2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: foldv2i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: foldv2i64:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: foldv2i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: foldv2i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: foldv2i64:
@@ -1615,32 +1615,32 @@ define <2 x i64> @foldv2i64() nounwind {
 define <2 x i64> @foldv2i64u() nounwind {
 ; SSE-LABEL: foldv2i64u:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: foldv2i64u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: foldv2i64u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: foldv2i64u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0]
+; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: foldv2i64u:

From 5bb632339ac53e72b81921b6db9c1f0c1fbf63bb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 13:09:30 +0100
Subject: [PATCH 023/770] InlineAdvisor.h - remove unnecessary
 PreservedAnalyses forward declaration. NFC.

This is directly defined in PassManager.h
---
 llvm/include/llvm/Analysis/InlineAdvisor.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h
index 118fd236bee48..ac8e7c20429de 100644
--- a/llvm/include/llvm/Analysis/InlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/InlineAdvisor.h
@@ -22,7 +22,6 @@ class CallBase;
 class Function;
 class Module;
 class OptimizationRemarkEmitter;
-class PreservedAnalyses;
 
 /// There are 3 scenarios we can use the InlineAdvisor:
 /// - Default - use manual heuristics.

From 8b4ecafee66c405ca33b9d2dc826c2d720160432 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 13:27:10 +0100
Subject: [PATCH 024/770] InstructionSimplify.h - remove unnecessary includes.
 NFC.

Remove unused User.h include.
Replace SetVector.h with forward declaration.
Sort the forward declarations + remove FastMathFlags (defined in Operator.h).
Fix implicit SetVector.h dependency in LowerConstantIntrinsics.cpp.
---
 llvm/include/llvm/Analysis/InstructionSimplify.h    | 13 ++++++-------
 .../Transforms/Scalar/LowerConstantIntrinsics.cpp   |  1 +
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h
index 7a9a1a81555b4..2a39a4e090870 100644
--- a/llvm/include/llvm/Analysis/InstructionSimplify.h
+++ b/llvm/include/llvm/Analysis/InstructionSimplify.h
@@ -31,28 +31,27 @@
 #ifndef LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H
 #define LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H
 
-#include "llvm/ADT/SetVector.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Operator.h"
-#include "llvm/IR/User.h"
 
 namespace llvm {
-class Function;
+
 template <typename T, typename... TArgs> class AnalysisManager;
 template <class T> class ArrayRef;
 class AssumptionCache;
+class BinaryOperator;
 class CallBase;
-class DominatorTree;
 class DataLayout;
-class FastMathFlags;
+class DominatorTree;
+class Function;
 struct LoopStandardAnalysisResults;
+class MDNode;
 class OptimizationRemarkEmitter;
 class Pass;
+template <class T, unsigned n> class SmallSetVector;
 class TargetLibraryInfo;
 class Type;
 class Value;
-class MDNode;
-class BinaryOperator;
 
 /// InstrInfoQuery provides an interface to query additional information for
 /// instructions like metadata or keywords like nsw, which provides conservative
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index 1dafa17e5c75e..f59280b60d8da 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"

From 03ec5b6bc4629b9ce4e11cbf54799995dbcb9c29 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 13:43:20 +0100
Subject: [PATCH 025/770] LoopInfo.h - remove unnecessary PHINode forward
 declaration. NFC.

This is directly defined in Instructions.h
---
 llvm/include/llvm/Analysis/LoopInfo.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index 0498020249856..35fe2a03a2a2d 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -60,7 +60,6 @@ class Loop;
 class InductionDescriptor;
 class MDNode;
 class MemorySSAUpdater;
-class PHINode;
 class ScalarEvolution;
 class raw_ostream;
 template <class N, bool IsPostDom> class DominatorTreeBase;

From 0e3faab6f0fa00668f97747a6a4afa1bc5647ef9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 13:44:27 +0100
Subject: [PATCH 026/770] MemoryBuiltins.h - remove unnecessary
 TargetLibraryInfo forward declaration. NFC.

We already have to include TargetLibraryInfo.h
---
 llvm/include/llvm/Analysis/MemoryBuiltins.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/MemoryBuiltins.h b/llvm/include/llvm/Analysis/MemoryBuiltins.h
index 13fc95e00f760..c5428726995e4 100644
--- a/llvm/include/llvm/Analysis/MemoryBuiltins.h
+++ b/llvm/include/llvm/Analysis/MemoryBuiltins.h
@@ -47,7 +47,6 @@ class LoadInst;
 class PHINode;
 class PointerType;
 class SelectInst;
-class TargetLibraryInfo;
 class Type;
 class UndefValue;
 class Value;

From 8eae32188bbaa4ac5943f8a98b3b7e4bbba55698 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Wed, 6 May 2020 13:42:01 +0200
Subject: [PATCH 027/770] Improve stack-clash implementation on x86

- test both the 32-bit and 64-bit versions
- probe the tail in dynamic-alloca
- generate more concise code

Differential Revision: https://reviews.llvm.org/D79482
---
 llvm/lib/Target/X86/X86FrameLowering.cpp      | 25 +++--
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 76 ++++++++-------
 .../CodeGen/X86/stack-clash-dynamic-alloca.ll | 95 ++++++++++++-------
 llvm/test/CodeGen/X86/stack-clash-large.ll    | 65 ++++++++-----
 llvm/test/CodeGen/X86/stack-clash-medium.ll   | 47 +++++----
 5 files changed, 180 insertions(+), 128 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 062cf7acc58d4..f320041b2de63 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -275,9 +275,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
   // allocation is split in smaller chunks anyway.
   if (EmitInlineStackProbe && !InEpilogue) {
 
-    // stack probing may involve looping, and control flow generations is
-    // disallowed at this point. Rely to later processing through
-    // `inlineStackProbe`.
+    // Delegate stack probing to the `inlineStackProbe` mechanism to avoid
+    // complications.
     MachineInstr *Stub = emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
 
     // Encode the static offset as a metadata attached to the stub.
@@ -645,6 +644,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
     uint64_t Offset) const {
+  assert(Offset && "null offset");
 
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
@@ -662,8 +662,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
   MF.insert(MBBIter, testMBB);
   MF.insert(MBBIter, tailMBB);
 
-  unsigned FinalStackPtr = Uses64BitFramePtr ? X86::R11 : X86::R11D;
-  BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FinalStackPtr)
+  Register FinalStackPtr = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackPtr)
       .addReg(StackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
 
@@ -693,7 +693,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
       .setMIFlag(MachineInstr::FrameSetup);
 
   // cmp with stack pointer bound
-  BuildMI(testMBB, DL, TII.get(IsLP64 ? X86::CMP64rr : X86::CMP32rr))
+  BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
       .addReg(StackPtr)
       .addReg(FinalStackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
@@ -701,23 +701,22 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
   // jump
   BuildMI(testMBB, DL, TII.get(X86::JCC_1))
       .addMBB(testMBB)
-      .addImm(X86::COND_NE)
+      .addImm(X86::COND_L)
       .setMIFlag(MachineInstr::FrameSetup);
   testMBB->addSuccessor(testMBB);
   testMBB->addSuccessor(tailMBB);
   testMBB->addLiveIn(FinalStackPtr);
 
-  // allocate a block and touch it
-
+  // BB management
   tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
   tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
   MBB.addSuccessor(testMBB);
 
+  // handle tail
   if (Offset % StackProbeSize) {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
-    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
-        .addReg(StackPtr)
-        .addImm(Offset % StackProbeSize)
+    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(TargetOpcode::COPY),
+            StackPtr)
+        .addReg(FinalStackPtr)
         .setMIFlag(MachineInstr::FrameSetup);
   }
 }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index eab9f14bec910..5101977a68edc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31612,14 +31612,26 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
   return SinkMBB;
 }
 
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::SUB64ri8;
+    return X86::SUB64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::SUB32ri8;
+    return X86::SUB32ri;
+  }
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
-                                           MachineBasicBlock *BB) const {
-  MachineFunction *MF = BB->getParent();
+                                           MachineBasicBlock *MBB) const {
+  MachineFunction *MF = MBB->getParent();
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
   DebugLoc DL = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
 
   const unsigned ProbeSize = getStackProbeSize(*MF);
 
@@ -31628,31 +31640,35 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
   MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 
-  MachineFunction::iterator MBBIter = ++BB->getIterator();
+  MachineFunction::iterator MBBIter = ++MBB->getIterator();
   MF->insert(MBBIter, testMBB);
   MF->insert(MBBIter, blockMBB);
   MF->insert(MBBIter, tailMBB);
 
-  unsigned sizeVReg = MI.getOperand(1).getReg();
+  Register sizeVReg = MI.getOperand(1).getReg();
 
-  const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg);
+  Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
 
-  unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass);
-  unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass);
+  Register TmpStackPtr = MRI.createVirtualRegister(
+      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+  Register FinalStackPtr = MRI.createVirtualRegister(
+      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
 
-  unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+  BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
+      .addReg(physSPReg);
+  {
+    const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
+    BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
+        .addReg(TmpStackPtr)
+        .addReg(sizeVReg);
+  }
 
   // test rsp size
-  BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg)
-      .addReg(sizeVReg)
-      .addMBB(BB)
-      .addReg(tmpSizeVReg2)
-      .addMBB(blockMBB);
 
   BuildMI(testMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64ri32 : X86::CMP32ri))
-      .addReg(tmpSizeVReg)
-      .addImm(ProbeSize);
+          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+      .addReg(physSPReg)
+      .addReg(FinalStackPtr);
 
   BuildMI(testMBB, DL, TII->get(X86::JCC_1))
       .addMBB(tailMBB)
@@ -31663,14 +31679,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
   // allocate a block and touch it
 
   BuildMI(blockMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
-          tmpSizeVReg2)
-      .addReg(tmpSizeVReg)
-      .addImm(ProbeSize);
-
-  BuildMI(blockMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
-          physSPReg)
+          TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
       .addReg(physSPReg)
       .addImm(ProbeSize);
 
@@ -31682,19 +31691,14 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
   BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
   blockMBB->addSuccessor(testMBB);
 
-  // allocate the tail and continue
-  BuildMI(tailMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr),
-          physSPReg)
-      .addReg(physSPReg)
-      .addReg(tmpSizeVReg);
+  // Replace original instruction by the expected stack ptr
   BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
-      .addReg(physSPReg);
+      .addReg(FinalStackPtr);
 
-  tailMBB->splice(tailMBB->end(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  tailMBB->transferSuccessorsAndUpdatePHIs(BB);
-  BB->addSuccessor(testMBB);
+  tailMBB->splice(tailMBB->end(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(testMBB);
 
   // Delete the original pseudo instruction.
   MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
index c0a199e16a941..140da42fc6fbb 100644
--- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
@@ -1,39 +1,7 @@
-; RUN: llc < %s | FileCheck %s
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s 
+; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s 
 
 define i32 @foo(i32 %n) local_unnamed_addr #0 {
-
-; CHECK-LABEL: foo:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:  	pushq	%rbp
-; CHECK-NEXT:  	.cfi_def_cfa_offset 16
-; CHECK-NEXT:  	.cfi_offset %rbp, -16
-; CHECK-NEXT:  	movq	%rsp, %rbp
-; CHECK-NEXT:  	.cfi_def_cfa_register %rbp
-; CHECK-NEXT:  	movl	%edi, %eax
-; CHECK-NEXT:  	leaq	15(,%rax,4), %rax
-; CHECK-NEXT:  	andq	$-16, %rax
-; CHECK-NEXT:  	cmpq	$4096, %rax # imm = 0x1000
-; CHECK-NEXT:  	jl	.LBB0_3
-; CHECK-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:  	subq	$4096, %rax # imm = 0x1000
-; CHECK-NEXT:  	subq	$4096, %rsp # imm = 0x1000
-; CHECK-NEXT:  	movq	$0, (%rsp)
-; CHECK-NEXT:  	cmpq	$4096, %rax # imm = 0x1000
-; CHECK-NEXT:  	jge	.LBB0_2
-; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:  	subq	%rax, %rsp
-; CHECK-NEXT:  	movq	%rsp, %rax
-; CHECK-NEXT:  	movl	$1, 4792(%rax)
-; CHECK-NEXT:  	movl	(%rax), %eax
-; CHECK-NEXT:  	movq	%rbp, %rsp
-; CHECK-NEXT:  	popq	%rbp
-; CHECK-NEXT:  .cfi_def_cfa %rsp, 8
-; CHECK-NEXT:   retq
-
   %a = alloca i32, i32 %n, align 16
   %b = getelementptr inbounds i32, i32* %a, i64 1198
   store volatile i32 1, i32* %b
@@ -42,3 +10,62 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 {
 }
 
 attributes #0 =  {"probe-stack"="inline-asm"}
+
+; CHECK-X86-64-LABEL: foo:
+; CHECK-X86-64:       # %bb.0:
+; CHECK-X86-64-NEXT:  	pushq	%rbp
+; CHECK-X86-64-NEXT:  	.cfi_def_cfa_offset 16
+; CHECK-X86-64-NEXT:  	.cfi_offset %rbp, -16
+; CHECK-X86-64-NEXT:  	movq	%rsp, %rbp
+; CHECK-X86-64-NEXT:  	.cfi_def_cfa_register %rbp
+; CHECK-X86-64-NEXT:  	movq    %rsp, %rax
+; CHECK-X86-64-NEXT:    movl    %edi, %ecx
+; CHECK-X86-64-NEXT:  	leaq 15(,%rcx,4), %rcx
+; CHECK-X86-64-NEXT:  	andq	$-16, %rcx
+; CHECK-X86-64-NEXT:  	subq	%rcx, %rax
+; CHECK-X86-64-NEXT:  	cmpq	%rax, %rsp
+; CHECK-X86-64-NEXT:  	jl	.LBB0_3
+; CHECK-X86-64-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-X86-64-NEXT:  	subq	$4096, %rsp # imm = 0x1000
+; CHECK-X86-64-NEXT:  	movq	$0, (%rsp)
+; CHECK-X86-64-NEXT:  	cmpq	%rax, %rsp
+; CHECK-X86-64-NEXT:  	jge	.LBB0_2
+; CHECK-X86-64-NEXT:  .LBB0_3:
+; CHECK-X86-64-NEXT:  	movq	%rax, %rsp
+; CHECK-X86-64-NEXT:  	movl	$1, 4792(%rax)
+; CHECK-X86-64-NEXT:  	movl	(%rax), %eax
+; CHECK-X86-64-NEXT:  	movq	%rbp, %rsp
+; CHECK-X86-64-NEXT:  	popq	%rbp
+; CHECK-X86-64-NEXT:  .cfi_def_cfa %rsp, 8
+; CHECK-X86-64-NEXT:   retq
+
+
+; CHECK-X86-32-LABEL: foo:
+; CHECK-X86-32:       # %bb.0:
+; CHECK-X86-32-NEXT:    pushl   %ebp
+; CHECK-X86-32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-X86-32-NEXT:    .cfi_offset %ebp, -8
+; CHECK-X86-32-NEXT:    movl    %esp, %ebp
+; CHECK-X86-32-NEXT:    .cfi_def_cfa_register %ebp
+; CHECK-X86-32-NEXT:    subl    $8, %esp
+; CHECK-X86-32-NEXT:    movl    8(%ebp), %ecx
+; CHECK-X86-32-NEXT:    movl    %esp, %eax
+; CHECK-X86-32-NEXT:    leal    15(,%ecx,4), %ecx
+; CHECK-X86-32-NEXT:    andl    $-16, %ecx
+; CHECK-X86-32-NEXT:    subl    %ecx, %eax
+; CHECK-X86-32-NEXT:    cmpl    %eax, %esp
+; CHECK-X86-32-NEXT:    jl  .LBB0_3
+; CHECK-X86-32-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-X86-32-NEXT:    subl    $4096, %esp # imm = 0x1000
+; CHECK-X86-32-NEXT:    movl    $0, (%esp)
+; CHECK-X86-32-NEXT:    cmpl    %eax, %esp
+; CHECK-X86-32-NEXT:    jge .LBB0_2
+; CHECK-X86-32-NEXT:  .LBB0_3:
+; CHECK-X86-32-NEXT:    movl    %eax, %esp
+; CHECK-X86-32-NEXT:    movl    $1, 4792(%eax)
+; CHECK-X86-32-NEXT:    movl    (%eax), %eax
+; CHECK-X86-32-NEXT:    movl    %ebp, %esp
+; CHECK-X86-32-NEXT:    popl    %ebp
+; CHECK-X86-32-NEXT:    .cfi_def_cfa %esp, 4
+; CHECK-X86-32-NEXT:    retl
+
diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll
index f9a5fdc17b84a..ccc52f213e377 100644
--- a/llvm/test/CodeGen/X86/stack-clash-large.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -1,31 +1,8 @@
-; RUN: llc < %s | FileCheck %s
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s 
+; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s 
 
 define i32 @foo() local_unnamed_addr #0 {
 
-; CHECK-LABEL: foo:
-; CHECK:        # %bb.0:
-; CHECK-NEXT:	movq	%rsp, %r11
-; CHECK-NEXT:	subq	$69632, %r11 # imm = 0x11000
-; CHECK-NEXT:   .LBB0_1:
-; CHECK-NEXT:	subq	$4096, %rsp # imm = 0x1000
-; CHECK-NEXT:	movq	$0, (%rsp)
-; CHECK-NEXT:	cmpq	%r11, %rsp
-; CHECK-NEXT:	jne	.LBB0_1
-; CHECK-NEXT:# %bb.2:
-; CHECK-NEXT:	subq	$2248, %rsp # imm = 0x8C8
-; CHECK-NEXT:	.cfi_def_cfa_offset 71888
-; CHECK-NEXT:	movl	$1, 264(%rsp)
-; CHECK-NEXT:	movl	$1, 28664(%rsp)
-; CHECK-NEXT:	movl	-128(%rsp), %eax
-; CHECK-NEXT:	addq	$71880, %rsp # imm = 0x118C8
-; CHECK-NEXT:	.cfi_def_cfa_offset 8
-; CHECK-NEXT:	retq
-
-
   %a = alloca i32, i64 18000, align 16
   %b0 = getelementptr inbounds i32, i32* %a, i64 98
   %b1 = getelementptr inbounds i32, i32* %a, i64 7198
@@ -36,3 +13,41 @@ define i32 @foo() local_unnamed_addr #0 {
 }
 
 attributes #0 =  {"probe-stack"="inline-asm"}
+
+; CHECK-X86-64-LABEL: foo:
+; CHECK-X86-64:        # %bb.0:
+; CHECK-X86-64-NEXT:	movq	%rsp, %r11
+; CHECK-X86-64-NEXT:	subq	$69632, %r11 # imm = 0x11000
+; CHECK-X86-64-NEXT:   .LBB0_1:
+; CHECK-X86-64-NEXT:	subq	$4096, %rsp # imm = 0x1000
+; CHECK-X86-64-NEXT:	movq	$0, (%rsp)
+; CHECK-X86-64-NEXT:	cmpq	%r11, %rsp
+; CHECK-X86-64-NEXT:	jl	.LBB0_1
+; CHECK-X86-64-NEXT:# %bb.2:
+; CHECK-X86-64-NEXT:	movq    %r11, %rsp
+; CHECK-X86-64-NEXT:	.cfi_def_cfa_offset 71888
+; CHECK-X86-64-NEXT:	movl	$1, 264(%rsp)
+; CHECK-X86-64-NEXT:	movl	$1, 28664(%rsp)
+; CHECK-X86-64-NEXT:	movl	-128(%rsp), %eax
+; CHECK-X86-64-NEXT:	addq	$71880, %rsp # imm = 0x118C8
+; CHECK-X86-64-NEXT:	.cfi_def_cfa_offset 8
+; CHECK-X86-64-NEXT:	retq
+
+; CHECK-X86-32-LABEL: foo:
+; CHECK-X86-32:      # %bb.0:
+; CHECK-X86-32-NEXT:    movl    %esp, %r11d
+; CHECK-X86-32-NEXT:    subl    $69632, %r11d # imm = 0x11000
+; CHECK-X86-32-NEXT:    .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-X86-32-NEXT:    subl    $4096, %esp # imm = 0x1000
+; CHECK-X86-32-NEXT:    movl    $0, (%esp)
+; CHECK-X86-32-NEXT:    cmpl    %r11d, %esp
+; CHECK-X86-32-NEXT:    jl  .LBB0_1
+; CHECK-X86-32-NEXT:# %bb.2:
+; CHECK-X86-32-NEXT:    movl    %r11d, %esp
+; CHECK-X86-32-NEXT:    .cfi_def_cfa_offset 72016
+; CHECK-X86-32-NEXT:    movl    $1, 392(%esp)
+; CHECK-X86-32-NEXT:    movl    $1, 28792(%esp)
+; CHECK-X86-32-NEXT:    movl    (%esp), %eax
+; CHECK-X86-32-NEXT:    addl    $72012, %esp # imm = 0x1194C
+; CHECK-X86-32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-X86-32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/stack-clash-medium.ll b/llvm/test/CodeGen/X86/stack-clash-medium.ll
index 05af3478cfc07..5a97074025f1f 100644
--- a/llvm/test/CodeGen/X86/stack-clash-medium.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-medium.ll
@@ -1,25 +1,7 @@
-; RUN: llc < %s | FileCheck %s
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s 
+; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s 
 
 define i32 @foo() local_unnamed_addr #0 {
-
-; CHECK-LABEL: foo:
-; CHECK:      # %bb.0:
-; CHECK-NEXT: subq	$4096, %rsp             # imm = 0x1000
-; CHECK-NEXT: movq	$0, (%rsp)
-; CHECK-NEXT: subq	$3784, %rsp             # imm = 0xEC8
-; CHECK-NEXT: .cfi_def_cfa_offset 7888
-; CHECK-NEXT: movl	$1, 672(%rsp)
-; CHECK-NEXT: movl	-128(%rsp), %eax
-; CHECK-NEXT: addq	$7880, %rsp             # imm = 0x1EC8
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-
-
-
   %a = alloca i32, i64 2000, align 16
   %b = getelementptr inbounds i32, i32* %a, i64 200
   store volatile i32 1, i32* %b
@@ -28,3 +10,28 @@ define i32 @foo() local_unnamed_addr #0 {
 }
 
 attributes #0 =  {"probe-stack"="inline-asm"}
+
+; CHECK-X86-64-LABEL: foo:
+; CHECK-X86-64:      # %bb.0:
+; CHECK-X86-64-NEXT: subq	$4096, %rsp             # imm = 0x1000
+; CHECK-X86-64-NEXT: movq	$0, (%rsp)
+; CHECK-X86-64-NEXT: subq	$3784, %rsp             # imm = 0xEC8
+; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 7888
+; CHECK-X86-64-NEXT: movl	$1, 672(%rsp)
+; CHECK-X86-64-NEXT: movl	-128(%rsp), %eax
+; CHECK-X86-64-NEXT: addq	$7880, %rsp             # imm = 0x1EC8
+; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 8
+; CHECK-X86-64-NEXT: retq
+
+
+; CHECK-X86-32-LABEL: foo:
+; CHECK-X86-32:      # %bb.0:
+; CHECK-X86-32-NEXT: subl	$4096, %esp # imm = 0x1000
+; CHECK-X86-32-NEXT: movl	$0, (%esp)
+; CHECK-X86-32-NEXT: subl	$3916, %esp # imm = 0xF4C
+; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 8016
+; CHECK-X86-32-NEXT: movl	$1, 800(%esp)
+; CHECK-X86-32-NEXT: movl	(%esp), %eax
+; CHECK-X86-32-NEXT: addl	$8012, %esp # imm = 0x1F4C
+; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 4
+; CHECK-X86-32-NEXT: retl

From 6ade4eb91811c7e7c59634d2de2767421d13a99b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 13:59:46 +0100
Subject: [PATCH 028/770] MemoryLocation.h - reduce Instructions.h include to
 Instruction.h include. NFC.

Add forward declarations for the few Instr classes we reference.
---
 llvm/include/llvm/Analysis/MemoryLocation.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/MemoryLocation.h b/llvm/include/llvm/Analysis/MemoryLocation.h
index 9e5e43bbd5f5b..ce70df66ab7a8 100644
--- a/llvm/include/llvm/Analysis/MemoryLocation.h
+++ b/llvm/include/llvm/Analysis/MemoryLocation.h
@@ -17,21 +17,25 @@
 
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/TypeSize.h"
 
 namespace llvm {
 
+class CallBase;
 class LoadInst;
 class StoreInst;
 class MemTransferInst;
 class MemIntrinsic;
+class AtomicCmpXchgInst;
 class AtomicMemTransferInst;
 class AtomicMemIntrinsic;
+class AtomicRMWInst;
 class AnyMemTransferInst;
 class AnyMemIntrinsic;
 class TargetLibraryInfo;
+class VAArgInst;
 
 // Represents the size of a MemoryLocation. Logically, it's an
 // Optional<uint63_t> that also carries a bit to represent whether the integer

From 82bee922afd65bf884abb9ea3db3fc7fede4e1cf Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 25 May 2020 15:12:08 +0200
Subject: [PATCH 029/770] Make FEATURE_AVX512VP2INTERSECT match between
 compiler-rt and LLVM

compiler-rt also doesn't support bits >= 64 as far as I know.
---
 llvm/include/llvm/Support/X86TargetParser.def | 2 +-
 llvm/lib/Support/Host.cpp                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def
index f4780c7d113e4..c826f590b71f3 100644
--- a/llvm/include/llvm/Support/X86TargetParser.def
+++ b/llvm/include/llvm/Support/X86TargetParser.def
@@ -162,7 +162,7 @@ X86_FEATURE_COMPAT(33, FEATURE_VPCLMULQDQ,      "vpclmulqdq")
 X86_FEATURE_COMPAT(34, FEATURE_AVX512VNNI,      "avx512vnni")
 X86_FEATURE_COMPAT(35, FEATURE_AVX512BITALG,    "avx512bitalg")
 X86_FEATURE_COMPAT(36, FEATURE_AVX512BF16,      "avx512bf16")
-X86_FEATURE_COMPAT(69, FEATURE_AVX512VP2INTERSECT, "avx512vp2intersect")
+X86_FEATURE_COMPAT(37, FEATURE_AVX512VP2INTERSECT, "avx512vp2intersect")
 // Features below here are not in libgcc/compiler-rt.
 X86_FEATURE       (64, FEATURE_MOVBE)
 X86_FEATURE       (65, FEATURE_ADX)
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index d9187efafbc19..d9b3cac5e8dc0 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -776,7 +776,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
 
     default: // Unknown family 6 CPU, try to guess.
       // TODO detect tigerlake host
-      if (Features3 & (1 << (X86::FEATURE_AVX512VP2INTERSECT - 64))) {
+      if (Features2 & (1 << (X86::FEATURE_AVX512VP2INTERSECT - 32))) {
         *Type = X86::INTEL_COREI7;
         *Subtype = X86::INTEL_COREI7_TIGERLAKE;
         break;

From 5d6c5b463cab7aeb74b20f51af88ba1d1658f8a8 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitneyt@ca.ibm.com>
Date: Mon, 25 May 2020 13:31:57 +0000
Subject: [PATCH 030/770] [LoopUtils] Use llvm::find

Summary: Fixes this build error:

llvm/lib/Transforms/Utils/LoopUtils.cpp:679:26: error: no matching
function for call to 'find'
      Loop::iterator I = find(ParentLoop->begin(), ParentLoop->end(),
L);
                         ^~~~
Authored By: orivej
Reviewer: Whitney
Reviewed By: Whitney
Subscribers: hiraditya, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D80473
---
 llvm/lib/Transforms/Utils/LoopUtils.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 8c7475eae6e32..d7ea9923ed2e9 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -713,11 +713,11 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
     // its parent. While removeLoop/removeChildLoop remove the given loop but
     // not relink its subloops, which is what we want.
     if (Loop *ParentLoop = L->getParentLoop()) {
-      Loop::iterator I = find(ParentLoop->begin(), ParentLoop->end(), L);
+      Loop::iterator I = find(*ParentLoop, L);
       assert(I != ParentLoop->end() && "Couldn't find loop");
       ParentLoop->removeChildLoop(I);
     } else {
-      Loop::iterator I = find(LI->begin(), LI->end(), L);
+      Loop::iterator I = find(*LI, L);
       assert(I != LI->end() && "Couldn't find loop");
       LI->removeLoop(I);
     }

From 356bf5ea5d91642b7a932a368804cef6733133c2 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Mon, 25 May 2020 15:56:48 +0200
Subject: [PATCH 031/770] Stack clash: update live-ins

This fixes http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-debian/builds/7150
---
 llvm/lib/Target/X86/X86FrameLowering.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index f320041b2de63..5ee7c87a42b9b 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -705,7 +705,6 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
       .setMIFlag(MachineInstr::FrameSetup);
   testMBB->addSuccessor(testMBB);
   testMBB->addSuccessor(tailMBB);
-  testMBB->addLiveIn(FinalStackPtr);
 
   // BB management
   tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
@@ -719,6 +718,10 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
         .addReg(FinalStackPtr)
         .setMIFlag(MachineInstr::FrameSetup);
   }
+
+  // Update Live In information
+  recomputeLiveIns(*testMBB);
+  recomputeLiveIns(*tailMBB);
 }
 
 void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(

From 46e5c5fe778b92b2a7e2c2ad3610e1da6794bd5e Mon Sep 17 00:00:00 2001
From: Denys Petrov <dpetrov@accesssoftek.com>
Date: Fri, 22 May 2020 13:10:37 +0300
Subject: [PATCH 032/770] [ManagedStatic] Fix build errors with clang-tblgen in
 Debug mode using MSVC 2019 v16.6

After updating MSVC 2019 from v16.4 to v16.6, I encountered build errors when compiling in Debug mode.
The errors concern the command-line arguments of clang-tblgen.exe and llvm-tblgen.exe.
The VS compiler had a bug: in Debug mode it dynamically created an object that has a constexpr ctor. This bug was fixed in VS2019 v16.5.
A workaround was implemented for that bug, and everything worked until v16.5 arrived.
Since v16.5 the workaround is no longer needed and itself causes build errors.
So this patch disables the workaround for VS2019 v16.5 and higher.

This relates to http://llvm.org/PR41367.

Differential Revision: https://reviews.llvm.org/D80433
---
 llvm/include/llvm/Support/ManagedStatic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Support/ManagedStatic.h b/llvm/include/llvm/Support/ManagedStatic.h
index bbd0d04ed0404..f2b41422f1315 100644
--- a/llvm/include/llvm/Support/ManagedStatic.h
+++ b/llvm/include/llvm/Support/ManagedStatic.h
@@ -40,8 +40,8 @@ template <typename T, size_t N> struct object_deleter<T[N]> {
 // constexpr, a dynamic initializer may be emitted depending on optimization
 // settings. For the affected versions of MSVC, use the old linker
 // initialization pattern of not providing a constructor and leaving the fields
-// uninitialized.
-#if !defined(_MSC_VER) || defined(__clang__)
+// uninitialized. See http://llvm.org/PR41367 for details.
+#if !defined(_MSC_VER) || (_MSC_VER >= 1925) || defined(__clang__)
 #define LLVM_USE_CONSTEXPR_CTOR
 #endif
 

From ba03bcbc4a21b92f6a4a54bd6e90417956da7952 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Mon, 25 May 2020 15:59:48 +0200
Subject: [PATCH 033/770] [lldb] Remove custom DWARF expression printing code

The llvm DWARFExpression dump is nearly identical, but better -- for
example it does not print a spurious space after zero-argument expressions.

Some parts of our code (variable locations) have already been switched
to llvm-based expression dumping. This switches the remainder: unwind
plans and some unit tests.
---
 .../include/lldb/Expression/DWARFExpression.h |   8 -
 lldb/source/Expression/DWARFExpression.cpp    | 229 -------------
 .../Plugins/SymbolFile/DWARF/DWARFDefines.cpp | 315 ------------------
 .../Plugins/SymbolFile/DWARF/DWARFDefines.h   |  40 ---
 lldb/source/Symbol/UnwindPlan.cpp             |  12 +-
 .../SymbolFile/Breakpad/stack-cfi-arm.yaml    |   3 +-
 .../Breakpad/stack-cfi-parsing.test           |   4 +-
 .../Breakpad/unwind-via-raSearch.test         |   4 +-
 .../Breakpad/unwind-via-stack-cfi.test        |   2 +-
 .../Breakpad/unwind-via-stack-win.test        |   4 +-
 .../Shell/Unwind/eh-frame-dwarf-unwind.test   |   2 +-
 .../Shell/Unwind/unwind-plan-dwarf-dump.test  |   2 +-
 .../Symbol/PostfixExpressionTest.cpp          |  41 ++-
 .../PdbFPOProgramToDWARFExpressionTests.cpp   |  34 +-
 14 files changed, 47 insertions(+), 653 deletions(-)

diff --git a/lldb/include/lldb/Expression/DWARFExpression.h b/lldb/include/lldb/Expression/DWARFExpression.h
index 302936172ed2c..6b63b186e3e43 100644
--- a/lldb/include/lldb/Expression/DWARFExpression.h
+++ b/lldb/include/lldb/Expression/DWARFExpression.h
@@ -217,14 +217,6 @@ class DWARFExpression {
                               lldb::addr_t func_load_addr, lldb::addr_t address,
                               ABI *abi);
 
-  static bool PrintDWARFExpression(Stream &s, const DataExtractor &data,
-                                   int address_size, int dwarf_ref_size,
-                                   bool location_expression);
-
-  static void PrintDWARFLocationList(Stream &s, const DWARFUnit *cu,
-                                     const DataExtractor &debug_loc_data,
-                                     lldb::offset_t offset);
-
   bool MatchesOperand(StackFrame &frame, const Instruction::Operand &op);
 
 private:
diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp
index e85b5f341fe5b..f8fc1db7ec29f 100644
--- a/lldb/source/Expression/DWARFExpression.cpp
+++ b/lldb/source/Expression/DWARFExpression.cpp
@@ -2566,235 +2566,6 @@ bool DWARFExpression::Evaluate(
   return true; // Return true on success
 }
 
-static bool print_dwarf_exp_op(Stream &s, const DataExtractor &data,
-                               lldb::offset_t *offset_ptr, int address_size,
-                               int dwarf_ref_size) {
-  uint8_t opcode = data.GetU8(offset_ptr);
-  DRC_class opcode_class;
-  uint64_t uint;
-  int64_t sint;
-
-  int size;
-
-  opcode_class = DW_OP_value_to_class(opcode) & (~DRC_DWARFv3);
-
-  s.Printf("%s ", DW_OP_value_to_name(opcode));
-
-  /* Does this take zero parameters?  If so we can shortcut this function.  */
-  if (opcode_class == DRC_ZEROOPERANDS)
-    return true;
-
-  if (opcode_class == DRC_TWOOPERANDS && opcode == DW_OP_bregx) {
-    uint = data.GetULEB128(offset_ptr);
-    sint = data.GetSLEB128(offset_ptr);
-    s.Printf("%" PRIu64 " %" PRIi64, uint, sint);
-    return true;
-  }
-  if (opcode_class == DRC_TWOOPERANDS && opcode == DW_OP_entry_value) {
-    uint = data.GetULEB128(offset_ptr);
-    s.Printf("%" PRIu64 " ", uint);
-    return true;
-  }
-  if (opcode_class != DRC_ONEOPERAND) {
-    s.Printf("UNKNOWN OP %u", opcode);
-    return false;
-  }
-
-  switch (opcode) {
-  case DW_OP_addr:
-    size = address_size;
-    break;
-  case DW_OP_const1u:
-    size = 1;
-    break;
-  case DW_OP_const1s:
-    size = -1;
-    break;
-  case DW_OP_const2u:
-    size = 2;
-    break;
-  case DW_OP_const2s:
-    size = -2;
-    break;
-  case DW_OP_const4u:
-    size = 4;
-    break;
-  case DW_OP_const4s:
-    size = -4;
-    break;
-  case DW_OP_const8u:
-    size = 8;
-    break;
-  case DW_OP_const8s:
-    size = -8;
-    break;
-  case DW_OP_constu:
-    size = 128;
-    break;
-  case DW_OP_consts:
-    size = -128;
-    break;
-  case DW_OP_fbreg:
-    size = -128;
-    break;
-  case DW_OP_breg0:
-  case DW_OP_breg1:
-  case DW_OP_breg2:
-  case DW_OP_breg3:
-  case DW_OP_breg4:
-  case DW_OP_breg5:
-  case DW_OP_breg6:
-  case DW_OP_breg7:
-  case DW_OP_breg8:
-  case DW_OP_breg9:
-  case DW_OP_breg10:
-  case DW_OP_breg11:
-  case DW_OP_breg12:
-  case DW_OP_breg13:
-  case DW_OP_breg14:
-  case DW_OP_breg15:
-  case DW_OP_breg16:
-  case DW_OP_breg17:
-  case DW_OP_breg18:
-  case DW_OP_breg19:
-  case DW_OP_breg20:
-  case DW_OP_breg21:
-  case DW_OP_breg22:
-  case DW_OP_breg23:
-  case DW_OP_breg24:
-  case DW_OP_breg25:
-  case DW_OP_breg26:
-  case DW_OP_breg27:
-  case DW_OP_breg28:
-  case DW_OP_breg29:
-  case DW_OP_breg30:
-  case DW_OP_breg31:
-    size = -128;
-    break;
-  case DW_OP_pick:
-  case DW_OP_deref_size:
-  case DW_OP_xderef_size:
-    size = 1;
-    break;
-  case DW_OP_skip:
-  case DW_OP_bra:
-    size = -2;
-    break;
-  case DW_OP_call2:
-    size = 2;
-    break;
-  case DW_OP_call4:
-    size = 4;
-    break;
-  case DW_OP_call_ref:
-    size = dwarf_ref_size;
-    break;
-  case DW_OP_addrx:
-  case DW_OP_piece:
-  case DW_OP_plus_uconst:
-  case DW_OP_regx:
-  case DW_OP_GNU_addr_index:
-  case DW_OP_GNU_const_index:
-  case DW_OP_entry_value:
-    size = 128;
-    break;
-  default:
-    s.Printf("UNKNOWN ONE-OPERAND OPCODE, #%u", opcode);
-    return false;
-  }
-
-  switch (size) {
-  case -1:
-    sint = (int8_t)data.GetU8(offset_ptr);
-    s.Printf("%+" PRIi64, sint);
-    break;
-  case -2:
-    sint = (int16_t)data.GetU16(offset_ptr);
-    s.Printf("%+" PRIi64, sint);
-    break;
-  case -4:
-    sint = (int32_t)data.GetU32(offset_ptr);
-    s.Printf("%+" PRIi64, sint);
-    break;
-  case -8:
-    sint = (int64_t)data.GetU64(offset_ptr);
-    s.Printf("%+" PRIi64, sint);
-    break;
-  case -128:
-    sint = data.GetSLEB128(offset_ptr);
-    s.Printf("%+" PRIi64, sint);
-    break;
-  case 1:
-    uint = data.GetU8(offset_ptr);
-    s.Printf("0x%2.2" PRIx64, uint);
-    break;
-  case 2:
-    uint = data.GetU16(offset_ptr);
-    s.Printf("0x%4.4" PRIx64, uint);
-    break;
-  case 4:
-    uint = data.GetU32(offset_ptr);
-    s.Printf("0x%8.8" PRIx64, uint);
-    break;
-  case 8:
-    uint = data.GetU64(offset_ptr);
-    s.Printf("0x%16.16" PRIx64, uint);
-    break;
-  case 128:
-    uint = data.GetULEB128(offset_ptr);
-    s.Printf("0x%" PRIx64, uint);
-    break;
-  }
-
-  return true;
-}
-
-bool DWARFExpression::PrintDWARFExpression(Stream &s, const DataExtractor &data,
-                                           int address_size, int dwarf_ref_size,
-                                           bool location_expression) {
-  int op_count = 0;
-  lldb::offset_t offset = 0;
-  while (data.ValidOffset(offset)) {
-    if (location_expression && op_count > 0)
-      return false;
-    if (op_count > 0)
-      s.PutCString(", ");
-    if (!print_dwarf_exp_op(s, data, &offset, address_size, dwarf_ref_size))
-      return false;
-    op_count++;
-  }
-
-  return true;
-}
-
-void DWARFExpression::PrintDWARFLocationList(
-    Stream &s, const DWARFUnit *cu, const DataExtractor &debug_loc_data,
-    lldb::offset_t offset) {
-  uint64_t start_addr, end_addr;
-  uint32_t addr_size = DWARFUnit::GetAddressByteSize(cu);
-  s.SetAddressByteSize(DWARFUnit::GetAddressByteSize(cu));
-  dw_addr_t base_addr = cu ? cu->GetBaseAddress() : 0;
-  while (debug_loc_data.ValidOffset(offset)) {
-    start_addr = debug_loc_data.GetMaxU64(&offset, addr_size);
-    end_addr = debug_loc_data.GetMaxU64(&offset, addr_size);
-
-    if (start_addr == 0 && end_addr == 0)
-      break;
-
-    s.PutCString("\n            ");
-    s.Indent();
-    if (cu)
-      DumpAddressRange(s.AsRawOstream(), start_addr + base_addr,
-                       end_addr + base_addr, cu->GetAddressByteSize(), nullptr,
-                       ": ");
-    uint32_t loc_length = debug_loc_data.GetU16(&offset);
-
-    DataExtractor locationData(debug_loc_data, offset, loc_length);
-    PrintDWARFExpression(s, locationData, addr_size, 4, false);
-    offset += loc_length;
-  }
-}
-
 static DataExtractor ToDataExtractor(const llvm::DWARFLocationExpression &loc,
                                      ByteOrder byte_order, uint32_t addr_size) {
   auto buffer_sp =
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp
index 29db1101d9971..4e99a295ce50f 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.cpp
@@ -58,321 +58,6 @@ const char *DW_OP_value_to_name(uint32_t val) {
   return llvmstr.data();
 }
 
-DRC_class DW_OP_value_to_class(uint32_t val) {
-  // FIXME: If we just used llvm's DWARFExpression printer, we could delete
-  // all this code (and more in lldb's DWARFExpression.cpp).
-  switch (val) {
-  case 0x03:
-    return DRC_ONEOPERAND;
-  case 0x06:
-    return DRC_ZEROOPERANDS;
-  case 0x08:
-    return DRC_ONEOPERAND;
-  case 0x09:
-    return DRC_ONEOPERAND;
-  case 0x0a:
-    return DRC_ONEOPERAND;
-  case 0x0b:
-    return DRC_ONEOPERAND;
-  case 0x0c:
-    return DRC_ONEOPERAND;
-  case 0x0d:
-    return DRC_ONEOPERAND;
-  case 0x0e:
-    return DRC_ONEOPERAND;
-  case 0x0f:
-    return DRC_ONEOPERAND;
-  case 0x10:
-    return DRC_ONEOPERAND;
-  case 0x11:
-    return DRC_ONEOPERAND;
-  case 0x12:
-    return DRC_ZEROOPERANDS;
-  case 0x13:
-    return DRC_ZEROOPERANDS;
-  case 0x14:
-    return DRC_ZEROOPERANDS;
-  case 0x15:
-    return DRC_ONEOPERAND;
-  case 0x16:
-    return DRC_ZEROOPERANDS;
-  case 0x17:
-    return DRC_ZEROOPERANDS;
-  case 0x18:
-    return DRC_ZEROOPERANDS;
-  case 0x19:
-    return DRC_ZEROOPERANDS;
-  case 0x1a:
-    return DRC_ZEROOPERANDS;
-  case 0x1b:
-    return DRC_ZEROOPERANDS;
-  case 0x1c:
-    return DRC_ZEROOPERANDS;
-  case 0x1d:
-    return DRC_ZEROOPERANDS;
-  case 0x1e:
-    return DRC_ZEROOPERANDS;
-  case 0x1f:
-    return DRC_ZEROOPERANDS;
-  case 0x20:
-    return DRC_ZEROOPERANDS;
-  case 0x21:
-    return DRC_ZEROOPERANDS;
-  case 0x22:
-    return DRC_ZEROOPERANDS;
-  case 0x23:
-    return DRC_ONEOPERAND;
-  case 0x24:
-    return DRC_ZEROOPERANDS;
-  case 0x25:
-    return DRC_ZEROOPERANDS;
-  case 0x26:
-    return DRC_ZEROOPERANDS;
-  case 0x27:
-    return DRC_ZEROOPERANDS;
-  case 0x2f:
-    return DRC_ONEOPERAND;
-  case 0x28:
-    return DRC_ONEOPERAND;
-  case 0x29:
-    return DRC_ZEROOPERANDS;
-  case 0x2a:
-    return DRC_ZEROOPERANDS;
-  case 0x2b:
-    return DRC_ZEROOPERANDS;
-  case 0x2c:
-    return DRC_ZEROOPERANDS;
-  case 0x2d:
-    return DRC_ZEROOPERANDS;
-  case 0x2e:
-    return DRC_ZEROOPERANDS;
-  case 0x30:
-    return DRC_ZEROOPERANDS;
-  case 0x31:
-    return DRC_ZEROOPERANDS;
-  case 0x32:
-    return DRC_ZEROOPERANDS;
-  case 0x33:
-    return DRC_ZEROOPERANDS;
-  case 0x34:
-    return DRC_ZEROOPERANDS;
-  case 0x35:
-    return DRC_ZEROOPERANDS;
-  case 0x36:
-    return DRC_ZEROOPERANDS;
-  case 0x37:
-    return DRC_ZEROOPERANDS;
-  case 0x38:
-    return DRC_ZEROOPERANDS;
-  case 0x39:
-    return DRC_ZEROOPERANDS;
-  case 0x3a:
-    return DRC_ZEROOPERANDS;
-  case 0x3b:
-    return DRC_ZEROOPERANDS;
-  case 0x3c:
-    return DRC_ZEROOPERANDS;
-  case 0x3d:
-    return DRC_ZEROOPERANDS;
-  case 0x3e:
-    return DRC_ZEROOPERANDS;
-  case 0x3f:
-    return DRC_ZEROOPERANDS;
-  case 0x40:
-    return DRC_ZEROOPERANDS;
-  case 0x41:
-    return DRC_ZEROOPERANDS;
-  case 0x42:
-    return DRC_ZEROOPERANDS;
-  case 0x43:
-    return DRC_ZEROOPERANDS;
-  case 0x44:
-    return DRC_ZEROOPERANDS;
-  case 0x45:
-    return DRC_ZEROOPERANDS;
-  case 0x46:
-    return DRC_ZEROOPERANDS;
-  case 0x47:
-    return DRC_ZEROOPERANDS;
-  case 0x48:
-    return DRC_ZEROOPERANDS;
-  case 0x49:
-    return DRC_ZEROOPERANDS;
-  case 0x4a:
-    return DRC_ZEROOPERANDS;
-  case 0x4b:
-    return DRC_ZEROOPERANDS;
-  case 0x4c:
-    return DRC_ZEROOPERANDS;
-  case 0x4d:
-    return DRC_ZEROOPERANDS;
-  case 0x4e:
-    return DRC_ZEROOPERANDS;
-  case 0x4f:
-    return DRC_ZEROOPERANDS;
-  case 0x50:
-    return DRC_ZEROOPERANDS;
-  case 0x51:
-    return DRC_ZEROOPERANDS;
-  case 0x52:
-    return DRC_ZEROOPERANDS;
-  case 0x53:
-    return DRC_ZEROOPERANDS;
-  case 0x54:
-    return DRC_ZEROOPERANDS;
-  case 0x55:
-    return DRC_ZEROOPERANDS;
-  case 0x56:
-    return DRC_ZEROOPERANDS;
-  case 0x57:
-    return DRC_ZEROOPERANDS;
-  case 0x58:
-    return DRC_ZEROOPERANDS;
-  case 0x59:
-    return DRC_ZEROOPERANDS;
-  case 0x5a:
-    return DRC_ZEROOPERANDS;
-  case 0x5b:
-    return DRC_ZEROOPERANDS;
-  case 0x5c:
-    return DRC_ZEROOPERANDS;
-  case 0x5d:
-    return DRC_ZEROOPERANDS;
-  case 0x5e:
-    return DRC_ZEROOPERANDS;
-  case 0x5f:
-    return DRC_ZEROOPERANDS;
-  case 0x60:
-    return DRC_ZEROOPERANDS;
-  case 0x61:
-    return DRC_ZEROOPERANDS;
-  case 0x62:
-    return DRC_ZEROOPERANDS;
-  case 0x63:
-    return DRC_ZEROOPERANDS;
-  case 0x64:
-    return DRC_ZEROOPERANDS;
-  case 0x65:
-    return DRC_ZEROOPERANDS;
-  case 0x66:
-    return DRC_ZEROOPERANDS;
-  case 0x67:
-    return DRC_ZEROOPERANDS;
-  case 0x68:
-    return DRC_ZEROOPERANDS;
-  case 0x69:
-    return DRC_ZEROOPERANDS;
-  case 0x6a:
-    return DRC_ZEROOPERANDS;
-  case 0x6b:
-    return DRC_ZEROOPERANDS;
-  case 0x6c:
-    return DRC_ZEROOPERANDS;
-  case 0x6d:
-    return DRC_ZEROOPERANDS;
-  case 0x6e:
-    return DRC_ZEROOPERANDS;
-  case 0x6f:
-    return DRC_ZEROOPERANDS;
-  case 0x70:
-    return DRC_ONEOPERAND;
-  case 0x71:
-    return DRC_ONEOPERAND;
-  case 0x72:
-    return DRC_ONEOPERAND;
-  case 0x73:
-    return DRC_ONEOPERAND;
-  case 0x74:
-    return DRC_ONEOPERAND;
-  case 0x75:
-    return DRC_ONEOPERAND;
-  case 0x76:
-    return DRC_ONEOPERAND;
-  case 0x77:
-    return DRC_ONEOPERAND;
-  case 0x78:
-    return DRC_ONEOPERAND;
-  case 0x79:
-    return DRC_ONEOPERAND;
-  case 0x7a:
-    return DRC_ONEOPERAND;
-  case 0x7b:
-    return DRC_ONEOPERAND;
-  case 0x7c:
-    return DRC_ONEOPERAND;
-  case 0x7d:
-    return DRC_ONEOPERAND;
-  case 0x7e:
-    return DRC_ONEOPERAND;
-  case 0x7f:
-    return DRC_ONEOPERAND;
-  case 0x80:
-    return DRC_ONEOPERAND;
-  case 0x81:
-    return DRC_ONEOPERAND;
-  case 0x82:
-    return DRC_ONEOPERAND;
-  case 0x83:
-    return DRC_ONEOPERAND;
-  case 0x84:
-    return DRC_ONEOPERAND;
-  case 0x85:
-    return DRC_ONEOPERAND;
-  case 0x86:
-    return DRC_ONEOPERAND;
-  case 0x87:
-    return DRC_ONEOPERAND;
-  case 0x88:
-    return DRC_ONEOPERAND;
-  case 0x89:
-    return DRC_ONEOPERAND;
-  case 0x8a:
-    return DRC_ONEOPERAND;
-  case 0x8b:
-    return DRC_ONEOPERAND;
-  case 0x8c:
-    return DRC_ONEOPERAND;
-  case 0x8d:
-    return DRC_ONEOPERAND;
-  case 0x8e:
-    return DRC_ONEOPERAND;
-  case 0x8f:
-    return DRC_ONEOPERAND;
-  case 0x90:
-    return DRC_ONEOPERAND;
-  case 0x91:
-    return DRC_ONEOPERAND;
-  case 0x92:
-    return DRC_TWOOPERANDS;
-  case 0x93:
-    return DRC_ONEOPERAND;
-  case 0x94:
-    return DRC_ONEOPERAND;
-  case 0x95:
-    return DRC_ONEOPERAND;
-  case 0x96:
-    return DRC_ZEROOPERANDS;
-  case 0x97:
-    return DRC_DWARFv3 | DRC_ZEROOPERANDS;
-  case 0x98:
-    return DRC_DWARFv3 | DRC_ONEOPERAND;
-  case 0x99:
-    return DRC_DWARFv3 | DRC_ONEOPERAND;
-  case 0x9a:
-    return DRC_DWARFv3 | DRC_ONEOPERAND;
-  case 0xa3: /* DW_OP_entry_value */
-    return DRC_TWOOPERANDS;
-  case 0xf0:
-    return DRC_ZEROOPERANDS; /* DW_OP_APPLE_uninit */
-  case 0xe0:
-    return 0;
-  case 0xff:
-    return 0;
-  default:
-    return 0;
-  }
-}
-
 const char *DW_ATE_value_to_name(uint32_t val) {
   static char invalid[100];
   llvm::StringRef llvmstr = llvm::dwarf::AttributeEncodingString(val);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h
index 670fde262e1e8..1b7102cd7e317 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDefines.h
@@ -26,52 +26,12 @@ const char *DW_FORM_value_to_name(uint32_t val);
 
 const char *DW_OP_value_to_name(uint32_t val);
 
-DRC_class DW_OP_value_to_class(uint32_t val);
-
 const char *DW_ATE_value_to_name(uint32_t val);
 
 const char *DW_LANG_value_to_name(uint32_t val);
 
 const char *DW_LNS_value_to_name(uint32_t val);
 
-/* These DRC are entirely our own construction,
-    although they are derived from various comments in the DWARF standard.
-    Most of these are not useful to the parser, but the DW_AT and DW_FORM
-    classes should prove to be usable in some fashion.  */
-
-#define DRC_0x65 0x1
-#define DRC_ADDRESS 0x2
-#define DRC_BLOCK 0x4
-#define DRC_CONSTANT 0x8
-#define DRC_DWARFv3 0x10
-#define DRC_FLAG 0x20
-#define DRC_INDIRECT_SPECIAL 0x40
-#define DRC_LINEPTR 0x80
-#define DRC_LOCEXPR 0x100
-#define DRC_LOCLISTPTR 0x200
-#define DRC_MACPTR 0x400
-#define DRC_ONEOPERAND 0x800
-#define DRC_OPERANDONE_1BYTE_DELTA 0x1000
-#define DRC_OPERANDONE_2BYTE_DELTA 0x2000
-#define DRC_OPERANDONE_4BYTE_DELTA 0x4000
-#define DRC_OPERANDONE_ADDRESS 0x8000
-#define DRC_OPERANDONE_BLOCK 0x10000
-#define DRC_OPERANDONE_SLEB128_OFFSET 0x20000
-#define DRC_OPERANDONE_ULEB128_OFFSET 0x40000
-#define DRC_OPERANDONE_ULEB128_REGISTER 0x80000
-#define DRC_OPERANDTWO_BLOCK 0x100000
-#define DRC_OPERANDTWO_SLEB128_OFFSET 0x200000
-#define DRC_OPERANDTWO_ULEB128_OFFSET 0x400000
-#define DRC_OPERANDTWO_ULEB128_REGISTER 0x800000
-#define DRC_OPERNADONE_ULEB128_REGISTER 0x1000000
-#define DRC_RANGELISTPTR 0x2000000
-#define DRC_REFERENCE 0x4000000
-#define DRC_STRING 0x8000000
-#define DRC_TWOOPERANDS 0x10000000
-#define DRC_VENDOR_GNU 0x20000000
-#define DRC_VENDOR_MIPS 0x40000000
-#define DRC_ZEROOPERANDS 0x80000000
-
 } // namespace lldb_private
 
 #endif // LLDB_SOURCE_PLUGINS_SYMBOLFILE_DWARF_DWARFDEFINES_H
diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp
index 438b329274839..e8906f38e2ffe 100644
--- a/lldb/source/Symbol/UnwindPlan.cpp
+++ b/lldb/source/Symbol/UnwindPlan.cpp
@@ -15,6 +15,7 @@
 #include "lldb/Target/Thread.h"
 #include "lldb/Utility/ConstString.h"
 #include "lldb/Utility/Log.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
 
 using namespace lldb;
 using namespace lldb_private;
@@ -79,13 +80,10 @@ GetByteOrderAndAddrSize(Thread *thread) {
 
 static void DumpDWARFExpr(Stream &s, llvm::ArrayRef<uint8_t> expr, Thread *thread) {
   if (auto order_and_width = GetByteOrderAndAddrSize(thread)) {
-    DataExtractor extractor(expr.data(), expr.size(), order_and_width->first,
-                            order_and_width->second);
-    if (!DWARFExpression::PrintDWARFExpression(s, extractor,
-                                               order_and_width->second,
-                                               /*dwarf_ref_size*/ 4,
-                                               /*location_expression*/ false))
-      s.PutCString("invalid-dwarf-expr");
+    llvm::DataExtractor data(expr, order_and_width->first == eByteOrderLittle,
+                             order_and_width->second);
+    llvm::DWARFExpression(data, order_and_width->second, llvm::dwarf::DWARF32)
+        .print(s.AsRawOstream(), nullptr, nullptr);
   } else
     s.PutCString("dwarf-expr");
 }
diff --git a/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-arm.yaml b/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-arm.yaml
index f7f46b7e2bff1..61a33e7ba7fc6 100644
--- a/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-arm.yaml
+++ b/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-arm.yaml
@@ -6,8 +6,7 @@
 
 # CHECK:      Symbol file UnwindPlan:
 # CHECK:      row[0]:    0: CFA=DW_OP_breg13 +0, DW_OP_consts +0, DW_OP_plus  => pc=DW_OP_breg14 +0 
-# CHECK-NEXT: row[1]:    2: CFA=DW_OP_breg13 +0, DW_OP_consts +8, DW_OP_plus  => r7=DW_OP_pick 0x00, DW_OP_consts -8, DW_OP_plus , DW_OP_deref  pc=DW_OP_pick 0x00, DW_OP_consts -4, DW_OP_plus , DW_OP_deref  
-# 
+# CHECK-NEXT: row[1]:    2: CFA=DW_OP_breg13 +0, DW_OP_consts +8, DW_OP_plus  => r7=DW_OP_pick 0x0, DW_OP_consts -8, DW_OP_plus, DW_OP_deref pc=DW_OP_pick 0x0, DW_OP_consts -4, DW_OP_plus, DW_OP_deref
 
 --- !minidump
 Streams:         
diff --git a/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test b/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test
index ffb64602c15e5..a6e29290854ad 100644
--- a/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test
+++ b/lldb/test/Shell/SymbolFile/Breakpad/stack-cfi-parsing.test
@@ -11,8 +11,8 @@ image show-unwind -n func0
 # CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes.
 # CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no.
 # CHECK-NEXT: Address range of this UnwindPlan: [stack-cfi-parsing.out..module_image + 0-0x0000000000000002)
-# CHECK-NEXT: row[0]:    0: CFA=DW_OP_breg7 +0 => rbp=DW_OP_breg7 +0 rip=DW_OP_pick 0x00 
-# CHECK-NEXT: row[1]:    1: CFA=DW_OP_breg7 +0 => rbx=DW_OP_breg2 +0 rbp=DW_OP_breg0 +0 rip=DW_OP_pick 0x00 
+# CHECK-NEXT: row[0]:    0: CFA=DW_OP_breg7 +0 => rbp=DW_OP_breg7 +0 rip=DW_OP_pick 0x0
+# CHECK-NEXT: row[1]:    1: CFA=DW_OP_breg7 +0 => rbx=DW_OP_breg2 +0 rbp=DW_OP_breg0 +0 rip=DW_OP_pick 0x0
 
 # The following plans are all (syntactically) invalid for various reasons.
 # Processing those should not cause a crash.
diff --git a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test
index 02404a86fa94e..93ea3ce029419 100644
--- a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test
+++ b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-raSearch.test
@@ -13,13 +13,13 @@ image show-unwind -n call_many
 # CHECK: This UnwindPlan is sourced from the compiler: yes.
 # CHECK: This UnwindPlan is valid at all instruction locations: no.
 # CHECK: Address range of this UnwindPlan: [unwind-via-stack-win.exe..module_image + 4112-0x0000107d)
-# CHECK: row[0]:    0: CFA=RaSearch@SP+0 => esp=DW_OP_pick 0x00, DW_OP_consts +4, DW_OP_plus  eip=DW_OP_pick 0x00, DW_OP_deref
+# CHECK: row[0]:    0: CFA=RaSearch@SP+0 => esp=DW_OP_pick 0x0, DW_OP_consts +4, DW_OP_plus  eip=DW_OP_pick 0x0, DW_OP_deref
 
 image show-unwind -n nonzero_frame_size
 # CHECK-LABEL: image show-unwind -n nonzero_frame_size
 # CHECK: UNWIND PLANS for unwind-via-stack-win.exe`nonzero_frame_size
 # CHECK: Symbol file UnwindPlan:
-# CHECK: row[0]:    0: CFA=RaSearch@SP+12 => esp=DW_OP_pick 0x00, DW_OP_consts +4, DW_OP_plus  eip=DW_OP_pick 0x00, DW_OP_deref
+# CHECK: row[0]:    0: CFA=RaSearch@SP+12 => esp=DW_OP_pick 0x0, DW_OP_consts +4, DW_OP_plus  eip=DW_OP_pick 0x0, DW_OP_deref
 
 # Then, some invalid rules.
 image show-unwind -n complex_rasearch
diff --git a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test
index 0a67cb3431102..29cf130694e6d 100644
--- a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test
+++ b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-cfi.test
@@ -13,7 +13,7 @@ image show-unwind -n bar
 # CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes.
 # CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no.
 # CHECK-NEXT: Address range of this UnwindPlan: [unwind-via-stack-cfi..module_image + 1056-0x0000000000000449)
-# CHECK-NEXT: row[0]: 0: CFA=DW_OP_breg6 +0, DW_OP_deref => rbp=DW_OP_pick 0x00, DW_OP_deref rsp=DW_OP_pick 0x00, DW_OP_consts +16, DW_OP_plus rip=DW_OP_pick 0x00, DW_OP_consts +8, DW_OP_plus , DW_OP_deref
+# CHECK-NEXT: row[0]: 0: CFA=DW_OP_breg6 +0, DW_OP_deref => rbp=DW_OP_pick 0x0, DW_OP_deref rsp=DW_OP_pick 0x0, DW_OP_consts +16, DW_OP_plus rip=DW_OP_pick 0x0, DW_OP_consts +8, DW_OP_plus, DW_OP_deref
 
 thread backtrace
 # CHECK-LABEL: thread backtrace
diff --git a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test
index 522637bf952fc..e95367a213f84 100644
--- a/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test
+++ b/lldb/test/Shell/SymbolFile/Breakpad/unwind-via-stack-win.test
@@ -15,7 +15,7 @@ image show-unwind -n call_many
 # CHECK: This UnwindPlan is sourced from the compiler: yes.
 # CHECK: This UnwindPlan is valid at all instruction locations: no.
 # CHECK: Address range of this UnwindPlan: [unwind-via-stack-win.exe..module_image + 4112-0x0000107d)
-# CHECK: row[0]:    0: CFA=DW_OP_breg7 +0, DW_OP_consts +80, DW_OP_plus  => esp=DW_OP_pick 0x00, DW_OP_consts +4, DW_OP_plus  eip=DW_OP_pick 0x00, DW_OP_deref
+# CHECK: row[0]:    0: CFA=DW_OP_breg7 +0, DW_OP_consts +80, DW_OP_plus  => esp=DW_OP_pick 0x0, DW_OP_consts +4, DW_OP_plus eip=DW_OP_pick 0x0, DW_OP_deref
 
 # Then, some invalid rules.
 image show-unwind -n bogus_rule
@@ -43,7 +43,7 @@ image show-unwind -n temporary_var
 # CHECK: This UnwindPlan is sourced from the compiler: yes.
 # CHECK: This UnwindPlan is valid at all instruction locations: no.
 # CHECK: Address range of this UnwindPlan: [unwind-via-stack-win.exe..module_image + 4400-0x00001134)
-# CHECK: row[0]:    0: CFA=DW_OP_breg7 +0 => esp=DW_OP_pick 0x00, DW_OP_consts +4, DW_OP_plus  eip=DW_OP_pick 0x00, DW_OP_deref
+# CHECK: row[0]:    0: CFA=DW_OP_breg7 +0 => esp=DW_OP_pick 0x0, DW_OP_consts +4, DW_OP_plus  eip=DW_OP_pick 0x0, DW_OP_deref
 
 # And finally, check that backtracing works as a whole by unwinding a simple
 # stack.
diff --git a/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind.test b/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind.test
index c0b6e5e50f86a..3df9906394f43 100644
--- a/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind.test
+++ b/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind.test
@@ -20,4 +20,4 @@ thread backtrace
 
 target modules show-unwind -n foo
 # CHECK: eh_frame UnwindPlan:
-# CHECK: row[0]: 0: CFA=rsp +8 => rip=DW_OP_lit8 , DW_OP_minus , DW_OP_deref , DW_OP_const1u 0x47, DW_OP_minus
+# CHECK: row[0]: 0: CFA=rsp +8 => rip=DW_OP_lit8, DW_OP_minus, DW_OP_deref, DW_OP_const1u 0x47, DW_OP_minus
diff --git a/lldb/test/Shell/Unwind/unwind-plan-dwarf-dump.test b/lldb/test/Shell/Unwind/unwind-plan-dwarf-dump.test
index 67b482b2df917..6dbb518a656b0 100644
--- a/lldb/test/Shell/Unwind/unwind-plan-dwarf-dump.test
+++ b/lldb/test/Shell/Unwind/unwind-plan-dwarf-dump.test
@@ -11,4 +11,4 @@ process launch
 
 target modules show-unwind -n main
 # CHECK: eh_frame UnwindPlan:
-# CHECK: row[0]:    0: CFA=DW_OP_breg7 +0, DW_OP_const1u 0x00, DW_OP_plus  => rip=DW_OP_const1s -8, DW_OP_plus , DW_OP_deref
+# CHECK: row[0]:    0: CFA=DW_OP_breg7 +0, DW_OP_const1u 0x0, DW_OP_plus => rip=DW_OP_const1s -8, DW_OP_plus, DW_OP_deref
diff --git a/lldb/unittests/Symbol/PostfixExpressionTest.cpp b/lldb/unittests/Symbol/PostfixExpressionTest.cpp
index 1dad83eabb4f5..7def709a60901 100644
--- a/lldb/unittests/Symbol/PostfixExpressionTest.cpp
+++ b/lldb/unittests/Symbol/PostfixExpressionTest.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Symbol/PostfixExpression.h"
-#include "lldb/Expression/DWARFExpression.h"
 #include "lldb/Utility/DataExtractor.h"
 #include "lldb/Utility/StreamString.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include "gmock/gmock.h"
@@ -152,17 +152,14 @@ static std::string ParseAndGenerateDWARF(llvm::StringRef expr) {
   ToDWARF(*ast, dwarf);
 
   // print dwarf expression to comparable textual representation
-  DataExtractor extractor(dwarf.GetData(), dwarf.GetSize(),
-                          lldb::eByteOrderLittle, addr_size);
-
-  StreamString result;
-  if (!DWARFExpression::PrintDWARFExpression(result, extractor, addr_size,
-                                             /*dwarf_ref_size*/ 4,
-                                             /*location_expression*/ false)) {
-    return "DWARF printing failed.";
-  }
-
-  return std::string(result.GetString());
+  llvm::DataExtractor extractor(dwarf.GetString(), /*IsLittleEndian=*/true,
+                                addr_size);
+
+  std::string result;
+  llvm::raw_string_ostream os(result);
+  llvm::DWARFExpression(extractor, addr_size, llvm::dwarf::DWARF32)
+      .print(os, nullptr, nullptr);
+  return std::move(os.str());
 }
 
 TEST(PostfixExpression, ToDWARF) {
@@ -170,28 +167,28 @@ TEST(PostfixExpression, ToDWARF) {
 
   EXPECT_EQ("DW_OP_breg1 +0", ParseAndGenerateDWARF("R1"));
 
-  EXPECT_EQ("DW_OP_bregx 65 0", ParseAndGenerateDWARF("R65"));
+  EXPECT_EQ("DW_OP_bregx 0x41 +0", ParseAndGenerateDWARF("R65"));
 
-  EXPECT_EQ("DW_OP_pick 0x00", ParseAndGenerateDWARF("INIT"));
+  EXPECT_EQ("DW_OP_pick 0x0", ParseAndGenerateDWARF("INIT"));
 
-  EXPECT_EQ("DW_OP_pick 0x00, DW_OP_pick 0x01, DW_OP_plus ",
+  EXPECT_EQ("DW_OP_pick 0x0, DW_OP_pick 0x1, DW_OP_plus",
             ParseAndGenerateDWARF("INIT INIT +"));
 
-  EXPECT_EQ("DW_OP_breg1 +0, DW_OP_pick 0x01, DW_OP_plus ",
+  EXPECT_EQ("DW_OP_breg1 +0, DW_OP_pick 0x1, DW_OP_plus",
             ParseAndGenerateDWARF("R1 INIT +"));
 
-  EXPECT_EQ("DW_OP_consts +1, DW_OP_pick 0x01, DW_OP_deref , DW_OP_plus ",
+  EXPECT_EQ("DW_OP_consts +1, DW_OP_pick 0x1, DW_OP_deref, DW_OP_plus",
             ParseAndGenerateDWARF("1 INIT ^ +"));
 
-  EXPECT_EQ("DW_OP_consts +4, DW_OP_consts +5, DW_OP_plus ",
+  EXPECT_EQ("DW_OP_consts +4, DW_OP_consts +5, DW_OP_plus",
             ParseAndGenerateDWARF("4 5 +"));
 
-  EXPECT_EQ("DW_OP_consts +4, DW_OP_consts +5, DW_OP_minus ",
+  EXPECT_EQ("DW_OP_consts +4, DW_OP_consts +5, DW_OP_minus",
             ParseAndGenerateDWARF("4 5 -"));
 
-  EXPECT_EQ("DW_OP_consts +4, DW_OP_deref ", ParseAndGenerateDWARF("4 ^"));
+  EXPECT_EQ("DW_OP_consts +4, DW_OP_deref", ParseAndGenerateDWARF("4 ^"));
 
-  EXPECT_EQ("DW_OP_breg6 +0, DW_OP_consts +128, DW_OP_lit1 "
-            ", DW_OP_minus , DW_OP_not , DW_OP_and ",
+  EXPECT_EQ("DW_OP_breg6 +0, DW_OP_consts +128, DW_OP_lit1, DW_OP_minus, "
+            "DW_OP_not, DW_OP_and",
             ParseAndGenerateDWARF("R6 128 @"));
 }
diff --git a/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp b/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp
index 49e8a1a9fb87f..334a08f92033b 100644
--- a/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp
+++ b/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp
@@ -11,11 +11,11 @@
 #include "Plugins/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpression.h"
 
 #include "lldb/Core/StreamBuffer.h"
-#include "lldb/Expression/DWARFExpression.h"
 #include "lldb/Utility/ArchSpec.h"
 #include "lldb/Utility/DataBufferHeap.h"
 #include "lldb/Utility/DataExtractor.h"
 #include "lldb/Utility/StreamString.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
 
 using namespace lldb;
 using namespace lldb_private;
@@ -27,30 +27,22 @@ static void
 CheckValidProgramTranslation(llvm::StringRef fpo_program,
                              llvm::StringRef target_register_name,
                              llvm::StringRef expected_dwarf_expression) {
-  // initial setup
-  ArchSpec arch_spec("i686-pc-windows");
-  llvm::Triple::ArchType arch_type = arch_spec.GetMachine();
-  ByteOrder byte_order = arch_spec.GetByteOrder();
-  uint32_t address_size = arch_spec.GetAddressByteSize();
-  uint32_t byte_size = arch_spec.GetDataByteSize();
-
   // program translation
-  StreamBuffer<32> stream(Stream::eBinary, address_size, byte_order);
+  StreamBuffer<32> stream(Stream::eBinary, 4, eByteOrderLittle);
   ASSERT_TRUE(TranslateFPOProgramToDWARFExpression(
-      fpo_program, target_register_name, arch_type, stream));
+      fpo_program, target_register_name, llvm::Triple::x86, stream));
 
   // print dwarf expression to comparable textual representation
-  DataBufferSP buffer =
-      std::make_shared<DataBufferHeap>(stream.GetData(), stream.GetSize());
-  DataExtractor extractor(buffer, byte_order, address_size, byte_size);
+  llvm::DataExtractor extractor({stream.GetData(), stream.GetSize()},
+                                /*IsLittleEndian=*/true, /*AddressSize=*/4);
 
-  StreamString result_dwarf_expression;
-  ASSERT_TRUE(DWARFExpression::PrintDWARFExpression(
-      result_dwarf_expression, extractor, address_size, 4, false));
+  std::string result;
+  llvm::raw_string_ostream os(result);
+  llvm::DWARFExpression(extractor, /*AddressSize=*/4, llvm::dwarf::DWARF32)
+      .print(os, nullptr, nullptr);
 
   // actual check
-  ASSERT_STREQ(expected_dwarf_expression.data(),
-               result_dwarf_expression.GetString().data());
+  ASSERT_EQ(expected_dwarf_expression, os.str());
 }
 
 TEST(PDBFPOProgramToDWARFExpressionTests, SingleAssignmentRegisterRef) {
@@ -64,9 +56,9 @@ TEST(PDBFPOProgramToDWARFExpressionTests, MultipleIndependentAssignments) {
 TEST(PDBFPOProgramToDWARFExpressionTests, MultipleDependentAssignments) {
   CheckValidProgramTranslation(
       "$T1 $ebp 4 + = $T0 $T1 8 - 128 @ = ", "$T0",
-      "DW_OP_breg6 +0, DW_OP_consts +4, DW_OP_plus , DW_OP_consts +8, "
-      "DW_OP_minus , DW_OP_consts +128, DW_OP_lit1 , DW_OP_minus , DW_OP_not , "
-      "DW_OP_and ");
+      "DW_OP_breg6 +0, DW_OP_consts +4, DW_OP_plus, DW_OP_consts +8, "
+      "DW_OP_minus, DW_OP_consts +128, DW_OP_lit1, DW_OP_minus, DW_OP_not, "
+      "DW_OP_and");
 }
 
 TEST(PDBFPOProgramToDWARFExpressionTests, DependencyChain) {

From c8b7c73c57f0c835f036aaa00a4970fc91d40020 Mon Sep 17 00:00:00 2001
From: Shuhong Liu <shuhong.liu@ibm.com>
Date: Mon, 25 May 2020 10:11:55 -0400
Subject: [PATCH 034/770] Add AIX to the test macro-same-context XFAIL list

Summary: Since the integrated assembly parser has not been implemented yet for AIX and macros are not part of the native assembly dialect on AIX, the test macro-same-context is expected to fail on AIX; hence AIX is added to the XFAIL list.

Reviewers: hubert.reinterpretcast, daltenty, jasonliu

Reviewed By: daltenty

Subscribers: jasonliu, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80232
---
 llvm/test/MC/AsmParser/macro-same-context.ll | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/test/MC/AsmParser/macro-same-context.ll b/llvm/test/MC/AsmParser/macro-same-context.ll
index fac30e44193fd..0acc195242083 100644
--- a/llvm/test/MC/AsmParser/macro-same-context.ll
+++ b/llvm/test/MC/AsmParser/macro-same-context.ll
@@ -2,6 +2,9 @@
 ;; thus a definition is available to the whole file. PR36110
 ; RUN: not llc < %s 2>&1 | FileCheck %s
 ; REQUIRES: default_triple
+;; This test is expected to fail on AIX,
+;; since the integrated assembly parser was not implemented yet for AIX.
+; XFAIL: aix
 
 define void @test() {
   call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1

From a6c4cd3bcb715c112607fcc4a1c806d511e2f947 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 15:33:44 +0100
Subject: [PATCH 035/770] [X86] Add PTEST tests showing failure to extract
 allsign cases

As discussed on PR42035, we can often use MOVMSK to avoid a cmpgt/ashr by just analysing the extracted signbits.
---
 llvm/test/CodeGen/X86/combine-ptest.ll | 99 +++++++++++++++++++++++++-
 1 file changed, 98 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
index d23277f627680..2928023c7fc2a 100644
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
 
 ;
 ; testz(~X,Y) -> testc(X,Y)
@@ -297,6 +298,102 @@ start:
   ret i1 %6
 }
 
+;
+; TODO: testz(ashr(X,bw-1),-1) -> movmsk(X)
+;
+
+define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {
+; CHECK-LABEL: ptestz_v2i64_signbits:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    retq
+  %t1 = ashr <2 x i64> %c, <i64 63, i64 63>
+  %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> <i64 -1, i64 -1>)
+  %t3 = icmp ne i32 %t2, 0
+  %t4 = select i1 %t3, i32 %a, i32 %b
+  ret i32 %t4
+}
+
+define i32 @ptestz_v8i32_signbits(<8 x i32> %c, i32 %a, i32 %b) {
+; AVX1-LABEL: ptestz_v8i32_signbits:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl %edi, %eax
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vptest %ymm0, %ymm0
+; AVX1-NEXT:    cmovnel %esi, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ptestz_v8i32_signbits:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl %edi, %eax
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %t1 = ashr <8 x i32> %c, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %t2 = bitcast <8 x i32> %t1 to <4 x i64>
+  %t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
+  %t4 = icmp ne i32 %t3, 0
+  %t5 = select i1 %t4, i32 %a, i32 %b
+  ret i32 %t5
+}
+
+define i32 @ptestz_v8i16_signbits(<8 x i16> %c, i32 %a, i32 %b) {
+; CHECK-LABEL: ptestz_v8i16_signbits:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    vpsraw $15, %xmm0, %xmm0
+; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    retq
+  %t1 = ashr <8 x i16> %c, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %t2 = bitcast <8 x i16> %t1 to <2 x i64>
+  %t3 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t2, <2 x i64> <i64 -1, i64 -1>)
+  %t4 = icmp ne i32 %t3, 0
+  %t5 = select i1 %t4, i32 %a, i32 %b
+  ret i32 %t5
+}
+
+define i32 @ptestz_v32i8_signbits(<32 x i8> %c, i32 %a, i32 %b) {
+; AVX1-LABEL: ptestz_v32i8_signbits:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl %edi, %eax
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vptest %ymm0, %ymm0
+; AVX1-NEXT:    cmovnel %esi, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: ptestz_v32i8_signbits:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl %edi, %eax
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %t1 = ashr <32 x i8> %c, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %t2 = bitcast <32 x i8> %t1 to <4 x i64>
+  %t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
+  %t4 = icmp ne i32 %t3, 0
+  %t5 = select i1 %t4, i32 %a, i32 %b
+  ret i32 %t5
+}
+
 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

From 7b1dc0015aec39ad27619872f5debbd86f8f9a2c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 25 May 2020 15:35:47 +0100
Subject: [PATCH 036/770] MustExecute.h - remove unnecessary includes. NFC.

Reduce to forward declarations and fix implicit LoopInfo.h dependency in Attributor.h
---
 llvm/include/llvm/Analysis/MustExecute.h      | 14 +++++++-------
 llvm/include/llvm/Transforms/IPO/Attributor.h |  1 +
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/Analysis/MustExecute.h b/llvm/include/llvm/Analysis/MustExecute.h
index 181fdacad2334..093e561690402 100644
--- a/llvm/include/llvm/Analysis/MustExecute.h
+++ b/llvm/include/llvm/Analysis/MustExecute.h
@@ -24,11 +24,9 @@
 #define LLVM_ANALYSIS_MUSTEXECUTE_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/InstructionPrecedenceTracking.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instruction.h"
 
 namespace llvm {
@@ -37,15 +35,17 @@ namespace {
 template <typename T> using GetterTy = std::function<T *(const Function &F)>;
 }
 
-class Instruction;
+class BasicBlock;
 class DominatorTree;
-class PostDominatorTree;
+class Instruction;
 class Loop;
+class LoopInfo;
+class PostDominatorTree;
 
 /// Captures loop safety information.
 /// It keep information for loop blocks may throw exception or otherwise
-/// exit abnormaly on any iteration of the loop which might actually execute
-/// at runtime.  The primary way to consume this infromation is via
+/// exit abnormally on any iteration of the loop which might actually execute
+/// at runtime.  The primary way to consume this information is via
 /// isGuaranteedToExecute below, but some callers bailout or fallback to
 /// alternate reasoning if a loop contains any implicit control flow.
 /// NOTE: LoopSafetyInfo contains cached information regarding loops and their
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 6ec4c3feb28da..a5d5043347450 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -107,6 +107,7 @@
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"

From 3c6c2ecd6efa393e7a8422d88e5d4ada0970e47e Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky <dmitry.preobrazhensky@amd.com>
Date: Mon, 25 May 2020 17:45:18 +0300
Subject: [PATCH 037/770] [AMDGPU] Added 'A' constraint for inline assembler

Summary: 'A' constraint requires an immediate int or fp constant that can be inlined in an instruction encoding.
This is the second part of the change. The llvm part has been committed as b087b91c9170.
See https://reviews.llvm.org/D78494

Reviewers: arsenm, rampitec

Differential Revision: https://reviews.llvm.org/D79493
---
 clang/lib/Basic/Targets/AMDGPU.h              | 5 +++++
 clang/test/Sema/inline-asm-validate-amdgpu.cl | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h
index d0e88e223e955..6c9060aa3f7bf 100644
--- a/clang/lib/Basic/Targets/AMDGPU.h
+++ b/clang/lib/Basic/Targets/AMDGPU.h
@@ -131,6 +131,11 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUTargetInfo final : public TargetInfo {
     });
 
     StringRef S(Name);
+    if (S == "A") {
+      Info.setRequiresImmediate();
+      return true;
+    }
+
     bool HasLeftParen = false;
     if (S.front() == '{') {
       HasLeftParen = true;
diff --git a/clang/test/Sema/inline-asm-validate-amdgpu.cl b/clang/test/Sema/inline-asm-validate-amdgpu.cl
index 51009ecb3f1e0..3d6488227ef29 100644
--- a/clang/test/Sema/inline-asm-validate-amdgpu.cl
+++ b/clang/test/Sema/inline-asm-validate-amdgpu.cl
@@ -17,6 +17,10 @@ kernel void test () {
 
   // vgpr constraints
   __asm__ ("v_mov_b32 %0, %1" : "=v" (vgpr) : "v" (imm) : );
+
+  // 'A' constraint
+  __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "A" (imm) : );
+
 }
 
 __kernel void

From 7c298c104bfe725d4315926a656263e8a5ac3054 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Thu, 14 May 2020 07:23:10 +0200
Subject: [PATCH 038/770] [PGO] Fix computation of function Hash

The previous implementation was incorrectly passing a uint64_t, which got converted
to a uint8_t, to finalize the hash computation. This led to different functions
having the same hash if they only differed by the remaining statements, which is
incorrect.

Added a new test case that trivially tests that a small function change is
reflected in the hash value.

Note that, as this patch fixes the hash computation, it invalidates all hashes
computed before the patch was applied, which could be an issue for large build
systems that pre-compute the profile data and let clients download it as part of
the build process.

Differential Revision: https://reviews.llvm.org/D79961
---
 clang/lib/CodeGen/CodeGenPGO.cpp |  8 +++++---
 clang/test/Profile/c-collision.c | 22 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Profile/c-collision.c

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 3c91a04d54642..98827bc3eec5e 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -747,13 +747,15 @@ uint64_t PGOHash::finalize() {
     return Working;
 
   // Check for remaining work in Working.
-  if (Working)
-    MD5.update(Working);
+  if (Working) {
+    using namespace llvm::support;
+    uint64_t Swapped = endian::byte_swap<uint64_t, little>(Working);
+    MD5.update(llvm::makeArrayRef((uint8_t *)&Swapped, sizeof(Swapped)));
+  }
 
   // Finalize the MD5 and return the hash.
   llvm::MD5::MD5Result Result;
   MD5.final(Result);
-  using namespace llvm::support;
   return Result.low();
 }
 
diff --git a/clang/test/Profile/c-collision.c b/clang/test/Profile/c-collision.c
new file mode 100644
index 0000000000000..fabecd752b4ef
--- /dev/null
+++ b/clang/test/Profile/c-collision.c
@@ -0,0 +1,22 @@
+// Test that a slight change in the code leads to a different hash.
+// RUN: %clang_cc1 -UEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-NOEXTRA
+// RUN: %clang_cc1 -DEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-EXTRA
+
+// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 7156072912471487002,
+// CHECK-EXTRA:   @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 -4383447408116050035,
+
+extern int bar;
+void foo() {
+  if (bar) {
+  }
+  if (bar) {
+  }
+  if (bar) {
+    if (bar) {
+#ifdef EXTRA
+      if (bar) {
+      }
+#endif
+    }
+  }
+}

From ba92b274225fc78dc15e8dc0076f71e7a8b5d084 Mon Sep 17 00:00:00 2001
From: Denys Petrov <dpetrov@accesssoftek.com>
Date: Thu, 9 Apr 2020 16:20:07 +0300
Subject: [PATCH 039/770] [analyzer] Improved RangeSet::Negate support of
 unsigned ranges

Summary:
This fixes https://bugs.llvm.org/show_bug.cgi?id=41588
RangeSet Negate function shall handle unsigned ranges as well as signed ones.
RangeSet getRangeForMinusSymbol function shall use a wider variety of ranges, not only concrete value ranges.
RangeSet Intersect functions shall not produce assertions.

Changes:
Improved safety of RangeSet::Intersect function. Added isEmpty() check to prevent an assertion.
Added support of handling unsigned ranges to RangeSet::Negate and RangeSet::getRangeForMinusSymbol.
Extended RangeSet::getRangeForMinusSymbol to return not only range sets with single value [n,n], but with wide ranges [n,m].
Added unit test for Negate function.
Added regression tests for unsigned values.

Differential Revision: https://reviews.llvm.org/D77802
---
 .../Core/RangeConstraintManager.cpp           | 101 ++++++++++----
 .../constraint_manager_negate_difference.c    |  49 ++++++-
 clang/unittests/StaticAnalyzer/CMakeLists.txt |   1 +
 .../unittests/StaticAnalyzer/RangeSetTest.cpp | 130 ++++++++++++++++++
 4 files changed, 251 insertions(+), 30 deletions(-)
 create mode 100644 clang/unittests/StaticAnalyzer/RangeSetTest.cpp

diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 9752a0e22832c..137e2cefe5a04 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -155,11 +155,11 @@ bool RangeSet::pin(llvm::APSInt &Lower, llvm::APSInt &Upper) const {
 // or, alternatively, /removing/ all integers between Upper and Lower.
 RangeSet RangeSet::Intersect(BasicValueFactory &BV, Factory &F,
                              llvm::APSInt Lower, llvm::APSInt Upper) const {
-  if (!pin(Lower, Upper))
-    return F.getEmptySet();
-
   PrimRangeSet newRanges = F.getEmptySet();
 
+  if (isEmpty() || !pin(Lower, Upper))
+    return newRanges;
+
   PrimRangeSet::iterator i = begin(), e = end();
   if (Lower <= Upper)
     IntersectInRange(BV, F, Lower, Upper, newRanges, i, e);
@@ -190,33 +190,78 @@ RangeSet RangeSet::Intersect(BasicValueFactory &BV, Factory &F,
   return newRanges;
 }
 
-// Turn all [A, B] ranges to [-B, -A]. Ranges [MIN, B] are turned to range set
-// [MIN, MIN] U [-B, MAX], when MIN and MAX are the minimal and the maximal
-// signed values of the type.
+// Turn all [A, B] ranges to [-B, -A], when "-" is a C-like unary minus
+// operation under the values of the type.
+//
+// We also handle MIN because applying unary minus to MIN does not change it.
+// Example 1:
+// char x = -128;        // -128 is a MIN value in a range of 'char'
+// char y = -x;          // y: -128
+// Example 2:
+// unsigned char x = 0;  // 0 is a MIN value in a range of 'unsigned char'
+// unsigned char y = -x; // y: 0
+//
+// And it makes us to separate the range
+// like [MIN, N] to [MIN, MIN] U [-N,MAX].
+// For instance, whole range is {-128..127} and subrange is [-128,-126],
+// thus [-128,-127,-126,.....] negates to [-128,.....,126,127].
+//
+// Negate restores disrupted ranges on bounds,
+// e.g. [MIN, B] => [MIN, MIN] U [-B, MAX] => [MIN, B].
 RangeSet RangeSet::Negate(BasicValueFactory &BV, Factory &F) const {
   PrimRangeSet newRanges = F.getEmptySet();
 
-  for (iterator i = begin(), e = end(); i != e; ++i) {
-    const llvm::APSInt &from = i->From(), &to = i->To();
-    const llvm::APSInt &newTo = (from.isMinSignedValue() ?
-                                 BV.getMaxValue(from) :
-                                 BV.getValue(- from));
-    if (to.isMaxSignedValue() && !newRanges.isEmpty() &&
-        newRanges.begin()->From().isMinSignedValue()) {
-      assert(newRanges.begin()->To().isMinSignedValue() &&
-             "Ranges should not overlap");
-      assert(!from.isMinSignedValue() && "Ranges should not overlap");
-      const llvm::APSInt &newFrom = newRanges.begin()->From();
-      newRanges =
-        F.add(F.remove(newRanges, *newRanges.begin()), Range(newFrom, newTo));
-    } else if (!to.isMinSignedValue()) {
-      const llvm::APSInt &newFrom = BV.getValue(- to);
-      newRanges = F.add(newRanges, Range(newFrom, newTo));
-    }
-    if (from.isMinSignedValue()) {
-      newRanges = F.add(newRanges, Range(BV.getMinValue(from),
-                                         BV.getMinValue(from)));
+  if (isEmpty())
+    return newRanges;
+
+  const llvm::APSInt sampleValue = getMinValue();
+  const llvm::APSInt &MIN = BV.getMinValue(sampleValue);
+  const llvm::APSInt &MAX = BV.getMaxValue(sampleValue);
+
+  // Handle a special case for MIN value.
+  iterator i = begin();
+  const llvm::APSInt &from = i->From();
+  const llvm::APSInt &to = i->To();
+  if (from == MIN) {
+    // If [from, to] are [MIN, MAX], then just return the same [MIN, MAX].
+    if (to == MAX) {
+      newRanges = ranges;
+    } else {
+      // Add separate range for the lowest value.
+      newRanges = F.add(newRanges, Range(MIN, MIN));
+      // Skip adding the second range in case when [from, to] are [MIN, MIN].
+      if (to != MIN) {
+        newRanges = F.add(newRanges, Range(BV.getValue(-to), MAX));
+      }
     }
+    // Skip the first range in the loop.
+    ++i;
+  }
+
+  // Negate all other ranges.
+  for (iterator e = end(); i != e; ++i) {
+    // Negate int values.
+    const llvm::APSInt &newFrom = BV.getValue(-i->To());
+    const llvm::APSInt &newTo = BV.getValue(-i->From());
+    // Add a negated range.
+    newRanges = F.add(newRanges, Range(newFrom, newTo));
+  }
+
+  if (newRanges.isSingleton())
+    return newRanges;
+
+  // Try to find and unite next ranges:
+  // [MIN, MIN] & [MIN + 1, N] => [MIN, N].
+  iterator iter1 = newRanges.begin();
+  iterator iter2 = std::next(iter1);
+
+  if (iter1->To() == MIN && (iter2->From() - 1) == MIN) {
+    const llvm::APSInt &to = iter2->To();
+    // remove adjacent ranges
+    newRanges = F.remove(newRanges, *iter1);
+    newRanges = F.remove(newRanges, *newRanges.begin());
+    // add united range
+    newRanges = F.add(newRanges, Range(MIN, to));
   }
 
   return newRanges;
@@ -527,9 +572,7 @@ RangeConstraintManager::getRangeForMinusSymbol(ProgramStateRef State,
       SymbolRef negSym = SymMgr.getSymSymExpr(SSE->getRHS(), BO_Sub,
                                               SSE->getLHS(), T);
       if (const RangeSet *negV = State->get<ConstraintRange>(negSym)) {
-        // Unsigned range set cannot be negated, unless it is [0, 0].
-        if ((negV->getConcreteValue() &&
-             (*negV->getConcreteValue() == 0)) ||
+        if (T->isUnsignedIntegerOrEnumerationType() ||
             T->isSignedIntegerOrEnumerationType())
           return negV;
       }
diff --git a/clang/test/Analysis/constraint_manager_negate_difference.c b/clang/test/Analysis/constraint_manager_negate_difference.c
index 4412ae0e9733b..a33c5ca81c26a 100644
--- a/clang/test/Analysis/constraint_manager_negate_difference.c
+++ b/clang/test/Analysis/constraint_manager_negate_difference.c
@@ -4,7 +4,9 @@ void clang_analyzer_eval(int);
 
 void exit(int);
 
-#define UINT_MAX (~0U)
+#define UINT_MIN (0U)
+#define UINT_MAX (~UINT_MIN)
+#define UINT_MID (UINT_MAX / 2 + 1)
 #define INT_MAX (UINT_MAX & (UINT_MAX >> 1))
 #define INT_MIN (UINT_MAX & ~(UINT_MAX >> 1))
 
@@ -110,3 +112,48 @@ void effective_range_2(int m, int n) {
   clang_analyzer_eval(m - n == 0); // expected-warning{{TRUE}} expected-warning{{FALSE}}
   clang_analyzer_eval(n - m == 0); // expected-warning{{TRUE}} expected-warning{{FALSE}}
 }
+
+void negate_unsigned_min(unsigned m, unsigned n) {
+  if (m - n == UINT_MIN) {
+    clang_analyzer_eval(n - m == UINT_MIN); // expected-warning{{TRUE}}
+    clang_analyzer_eval(n - m != UINT_MIN); // expected-warning{{FALSE}}
+    clang_analyzer_eval(n - m > UINT_MIN);  // expected-warning{{FALSE}}
+    clang_analyzer_eval(n - m < UINT_MIN);  // expected-warning{{FALSE}}
+  }
+}
+
+void negate_unsigned_mid(unsigned m, unsigned n) {
+  if (m - n == UINT_MID) {
+    clang_analyzer_eval(n - m == UINT_MID); // expected-warning{{TRUE}}
+    clang_analyzer_eval(n - m != UINT_MID); // expected-warning{{FALSE}}
+  }
+}
+
+void negate_unsigned_mid2(unsigned m, unsigned n) {
+  if (m - n < UINT_MID && m - n > UINT_MIN) {
+    clang_analyzer_eval(n - m > UINT_MID); // expected-warning{{TRUE}}
+    clang_analyzer_eval(n - m < UINT_MID); // expected-warning{{FALSE}}
+  }
+}
+
+void negate_unsigned_max(unsigned m, unsigned n) {
+  if (m - n == UINT_MAX) {
+    clang_analyzer_eval(n - m == 1); // expected-warning{{TRUE}}
+    clang_analyzer_eval(n - m != 1); // expected-warning{{FALSE}}
+  }
+}
+
+void negate_unsigned_one(unsigned m, unsigned n) {
+  if (m - n == 1) {
+    clang_analyzer_eval(n - m == UINT_MAX); // expected-warning{{TRUE}}
+    clang_analyzer_eval(n - m < UINT_MAX);  // expected-warning{{FALSE}}
+  }
+}
+
+// The next code is a repro for the bug PR41588
+void negated_unsigned_range(unsigned x, unsigned y) {
+  clang_analyzer_eval(x - y != 0); // expected-warning{{FALSE}} expected-warning{{TRUE}}
+  clang_analyzer_eval(y - x != 0); // expected-warning{{FALSE}} expected-warning{{TRUE}}
+  // expected no assertion on the next line
+  clang_analyzer_eval(x - y != 0); // expected-warning{{FALSE}} expected-warning{{TRUE}}
+}
diff --git a/clang/unittests/StaticAnalyzer/CMakeLists.txt b/clang/unittests/StaticAnalyzer/CMakeLists.txt
index 1070f124921da..e1f86af18b2b5 100644
--- a/clang/unittests/StaticAnalyzer/CMakeLists.txt
+++ b/clang/unittests/StaticAnalyzer/CMakeLists.txt
@@ -10,6 +10,7 @@ add_clang_unittest(StaticAnalysisTests
   StoreTest.cpp
   RegisterCustomCheckersTest.cpp
   SymbolReaperTest.cpp
+  RangeSetTest.cpp
   )
 
 clang_target_link_libraries(StaticAnalysisTests
diff --git a/clang/unittests/StaticAnalyzer/RangeSetTest.cpp b/clang/unittests/StaticAnalyzer/RangeSetTest.cpp
new file mode 100644
index 0000000000000..83b4fac15a198
--- /dev/null
+++ b/clang/unittests/StaticAnalyzer/RangeSetTest.cpp
@@ -0,0 +1,130 @@
+//===- unittests/StaticAnalyzer/RangeSetTest.cpp ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/Builtins.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace ento {
+namespace {
+
+// TestCase contains to lists of ranges.
+// Original one has to be negated.
+// Expected one has to be compared to negated original range.
+template <typename T> struct TestCase {
+  RangeSet original;
+  RangeSet expected;
+
+  TestCase(BasicValueFactory &BVF, RangeSet::Factory &F,
+           const std::initializer_list<T> &originalList,
+           const std::initializer_list<T> &expectedList)
+      : original(createRangeSetFromList(BVF, F, originalList)),
+        expected(createRangeSetFromList(BVF, F, expectedList)) {}
+
+private:
+  RangeSet createRangeSetFromList(BasicValueFactory &BVF, RangeSet::Factory &F,
+                                  const std::initializer_list<T> rangeList) {
+    llvm::APSInt from(sizeof(T) * 8, std::is_unsigned<T>::value);
+    llvm::APSInt to = from;
+    RangeSet rangeSet = F.getEmptySet();
+    for (auto it = rangeList.begin(); it != rangeList.end(); it += 2) {
+      from = *it;
+      to = *(it + 1);
+      rangeSet = rangeSet.addRange(
+          F, RangeSet(F, BVF.getValue(from), BVF.getValue(to)));
+    }
+    return rangeSet;
+  }
+
+  void printNegate(const TestCase &TestCase) {
+    TestCase.original.print(llvm::dbgs());
+    llvm::dbgs() << " => ";
+    TestCase.expected.print(llvm::dbgs());
+  }
+};
+
+class RangeSetTest : public testing::Test {
+protected:
+  // Init block
+  std::unique_ptr<ASTUnit> AST = tooling::buildASTFromCode("struct foo;");
+  ASTContext &context = AST->getASTContext();
+  llvm::BumpPtrAllocator alloc;
+  BasicValueFactory BVF{context, alloc};
+  RangeSet::Factory F;
+  // End init block
+
+  template <typename T> void checkNegate() {
+    using type = T;
+
+    // Use next values of the range {MIN, A, B, MID, C, D, MAX}.
+
+    // MID is a value in the middle of the range
+    // which unary minus does not affect on,
+    // e.g. int8/int32(0), uint8(128), uint32(2147483648).
+
+    constexpr type MIN = std::numeric_limits<type>::min();
+    constexpr type MAX = std::numeric_limits<type>::max();
+    constexpr type MID = std::is_signed<type>::value
+                             ? 0
+                             : ~(static_cast<type>(-1) / static_cast<type>(2));
+    constexpr type A = MID - static_cast<type>(42 + 42);
+    constexpr type B = MID - static_cast<type>(42);
+    constexpr type C = -B;
+    constexpr type D = -A;
+
+    static_assert(MIN < A && A < B && B < MID && MID < C && C < D && D < MAX,
+                  "Values shall be in an ascending order");
+
+    // Left {[x, y], [x, y]} is what shall be negated.
+    // Right {[x, y], [x, y]} is what shall be compared to a negation result.
+    TestCase<type> cases[] = {
+        {BVF, F, {MIN, A}, {MIN, MIN, D, MAX}},
+        {BVF, F, {MIN, C}, {MIN, MIN, B, MAX}},
+        {BVF, F, {MIN, MID}, {MIN, MIN, MID, MAX}},
+        {BVF, F, {MIN, MAX}, {MIN, MAX}},
+        {BVF, F, {A, D}, {A, D}},
+        {BVF, F, {A, B}, {C, D}},
+        {BVF, F, {MIN, A, D, MAX}, {MIN, A, D, MAX}},
+        {BVF, F, {MIN, B, MID, D}, {MIN, MIN, A, MID, C, MAX}},
+        {BVF, F, {MIN, MID, C, D}, {MIN, MIN, A, B, MID, MAX}},
+        {BVF, F, {MIN, MID, C, MAX}, {MIN, B, MID, MAX}},
+        {BVF, F, {A, MID, D, MAX}, {MIN + 1, A, MID, D}},
+        {BVF, F, {A, A}, {D, D}},
+        {BVF, F, {MID, MID}, {MID, MID}},
+        {BVF, F, {MAX, MAX}, {MIN + 1, MIN + 1}},
+    };
+
+    for (const auto &c : cases) {
+      // Negate original and check with expected.
+      RangeSet negatedFromOriginal = c.original.Negate(BVF, F);
+      EXPECT_EQ(negatedFromOriginal, c.expected);
+      // Negate negated back and check with original.
+      RangeSet negatedBackward = negatedFromOriginal.Negate(BVF, F);
+      EXPECT_EQ(negatedBackward, c.original);
+    }
+  }
+};
+
+TEST_F(RangeSetTest, RangeSetNegateTest) {
+  checkNegate<int8_t>();
+  checkNegate<uint8_t>();
+  checkNegate<int16_t>();
+  checkNegate<uint16_t>();
+  checkNegate<int32_t>();
+  checkNegate<uint32_t>();
+  checkNegate<int64_t>();
+  checkNegate<uint64_t>();
+}
+
+} // namespace
+} // namespace ento
+} // namespace clang

From b62ce9e05d9ec95532fa131a3e47ff1d4e7ed5de Mon Sep 17 00:00:00 2001
From: Sergej Jaskiewicz <jaskiewiczs@icloud.com>
Date: Mon, 25 May 2020 19:08:49 +0300
Subject: [PATCH 040/770] Re-commit "[libc++] [test] Generate static_test_env
 on the fly"

Don't use std::filesystem APIs for CWDGuard, use POSIX functions
instead. This way the tests don't rely on the correctness of
the functionality they're testing.

Differential Revision: https://reviews.llvm.org/D78200
---
 .../Inputs/static_test_env/bad_symlink        |   1 -
 .../Inputs/static_test_env/dir1/dir2/afile3   |   0
 .../static_test_env/dir1/dir2/dir3/file5      |   0
 .../Inputs/static_test_env/dir1/dir2/file4    |   0
 .../static_test_env/dir1/dir2/symlink_to_dir3 |   1 -
 .../Inputs/static_test_env/dir1/file1         |   0
 .../Inputs/static_test_env/dir1/file2         |   1 -
 .../Inputs/static_test_env/empty_file         |   0
 .../Inputs/static_test_env/non_empty_file     |   1 -
 .../Inputs/static_test_env/symlink_to_dir     |   1 -
 .../static_test_env/symlink_to_empty_file     |   1 -
 .../directory_entry.cons/path.pass.cpp        |  24 +-
 .../replace_filename.pass.cpp                 |   8 +-
 .../directory_entry.obs/file_size.pass.cpp    |  16 +-
 .../file_type_obs.pass.cpp                    |   4 +-
 .../hard_link_count.pass.cpp                  |  16 +-
 .../last_write_time.pass.cpp                  |  16 +-
 .../directory_entry.obs/status.pass.cpp       |   5 +-
 .../symlink_status.pass.cpp                   |   5 +-
 .../directory_iterator.members/copy.pass.cpp  |   4 +-
 .../copy_assign.pass.cpp                      |  10 +-
 .../directory_iterator.members/ctor.pass.cpp  |  19 +-
 .../increment.pass.cpp                        |  22 +-
 .../directory_iterator.members/move.pass.cpp  |   4 +-
 .../move_assign.pass.cpp                      |  15 +-
 .../begin_end.pass.cpp                        |   8 +-
 .../rec.dir.itr.members/copy.pass.cpp         |   4 +-
 .../rec.dir.itr.members/copy_assign.pass.cpp  |  25 +-
 .../rec.dir.itr.members/ctor.pass.cpp         |  19 +-
 .../rec.dir.itr.members/depth.pass.cpp        |   8 +-
 .../disable_recursion_pending.pass.cpp        |   4 +-
 .../rec.dir.itr.members/increment.pass.cpp    |  29 ++-
 .../rec.dir.itr.members/move.pass.cpp         |   4 +-
 .../rec.dir.itr.members/move_assign.pass.cpp  |  25 +-
 .../rec.dir.itr.members/pop.pass.cpp          |  12 +-
 .../recursion_pending.pass.cpp                |  36 +--
 .../rec.dir.itr.nonmembers/begin_end.pass.cpp |   8 +-
 .../fs.op.canonical/canonical.pass.cpp        |  49 ++--
 .../fs.op.funcs/fs.op.copy/copy.pass.cpp      |   3 +-
 .../fs.op.current_path/current_path.pass.cpp  |  17 +-
 .../fs.op.equivalent/equivalent.pass.cpp      |  19 +-
 .../fs.op.funcs/fs.op.exists/exists.pass.cpp  |   3 +-
 .../fs.op.file_size/file_size.pass.cpp        |  18 +-
 .../fs.op.hard_lk_ct/hard_link_count.pass.cpp |  36 +--
 .../is_block_file.pass.cpp                    |   3 +-
 .../is_character_file.pass.cpp                |   3 +-
 .../fs.op.is_directory/is_directory.pass.cpp  |  11 +-
 .../fs.op.is_empty/is_empty.pass.cpp          |  14 +-
 .../fs.op.is_fifo/is_fifo.pass.cpp            |   3 +-
 .../fs.op.is_other/is_other.pass.cpp          |   3 +-
 .../is_regular_file.pass.cpp                  |   3 +-
 .../fs.op.is_socket/is_socket.pass.cpp        |   3 +-
 .../fs.op.is_symlink/is_symlink.pass.cpp      |  15 +-
 .../last_write_time.pass.cpp                  |  21 +-
 .../fs.op.relative/relative.pass.cpp          |  61 +++--
 .../fs.op.funcs/fs.op.space/space.pass.cpp    |  20 +-
 .../fs.op.funcs/fs.op.status/status.pass.cpp  |  17 +-
 .../symlink_status.pass.cpp                   |  17 +-
 .../weakly_canonical.pass.cpp                 |  28 ++-
 libcxx/test/support/filesystem_test_helper.h  | 230 +++++++++++-------
 60 files changed, 527 insertions(+), 426 deletions(-)
 delete mode 120000 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/bad_symlink
 delete mode 100644 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/afile3
 delete mode 100644 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/dir3/file5
 delete mode 100644 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/file4
 delete mode 120000 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/symlink_to_dir3
 delete mode 100644 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/file1
 delete mode 100644 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/file2
 delete mode 100644 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/empty_file
 delete mode 100644 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/non_empty_file
 delete mode 120000 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_dir
 delete mode 120000 libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_empty_file

diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/bad_symlink b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/bad_symlink
deleted file mode 120000
index 76646beed5ed3..0000000000000
--- a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/bad_symlink
+++ /dev/null
@@ -1 +0,0 @@
-dne
\ No newline at end of file
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/afile3 b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/afile3
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/dir3/file5 b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/dir3/file5
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/file4 b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/file4
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/symlink_to_dir3 b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/symlink_to_dir3
deleted file mode 120000
index 3979139526219..0000000000000
--- a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/dir2/symlink_to_dir3
+++ /dev/null
@@ -1 +0,0 @@
-dir3
\ No newline at end of file
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/file1 b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/file1
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/file2 b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/file2
deleted file mode 100644
index 44834e586734f..0000000000000
--- a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/dir1/file2
+++ /dev/null
@@ -1 +0,0 @@
-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
\ No newline at end of file
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/empty_file b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/empty_file
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/non_empty_file b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/non_empty_file
deleted file mode 100644
index 44834e586734f..0000000000000
--- a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/non_empty_file
+++ /dev/null
@@ -1 +0,0 @@
-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
\ No newline at end of file
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_dir b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_dir
deleted file mode 120000
index df490f837a85c..0000000000000
--- a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_dir
+++ /dev/null
@@ -1 +0,0 @@
-dir1
\ No newline at end of file
diff --git a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_empty_file b/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_empty_file
deleted file mode 120000
index b79b689fc85ac..0000000000000
--- a/libcxx/test/std/input.output/filesystems/Inputs/static_test_env/symlink_to_empty_file
+++ /dev/null
@@ -1 +0,0 @@
-empty_file
\ No newline at end of file
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
index 850be12e839c0..f7c3e442707d7 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -46,6 +45,7 @@ TEST_CASE(path_ctor) {
 }
 
 TEST_CASE(path_ec_ctor) {
+  static_test_env static_env;
   using namespace fs;
   {
     static_assert(
@@ -61,8 +61,8 @@ TEST_CASE(path_ec_ctor) {
   }
   {
     std::error_code ec = GetTestEC();
-    const directory_entry e(StaticEnv::File, ec);
-    TEST_CHECK(e.path() == StaticEnv::File);
+    const directory_entry e(static_env.File, ec);
+    TEST_CHECK(e.path() == static_env.File);
     TEST_CHECK(!ec);
   }
   {
@@ -121,26 +121,28 @@ TEST_CASE(path_ctor_calls_refresh) {
 TEST_CASE(path_ctor_dne) {
   using namespace fs;
 
+  static_test_env static_env;
+
   {
     std::error_code ec = GetTestEC();
-    directory_entry ent(StaticEnv::DNE, ec);
+    directory_entry ent(static_env.DNE, ec);
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
-    TEST_CHECK(ent.path() == StaticEnv::DNE);
+    TEST_CHECK(ent.path() == static_env.DNE);
   }
   // don't report dead symlinks as an error.
   {
     std::error_code ec = GetTestEC();
-    directory_entry ent(StaticEnv::BadSymlink, ec);
+    directory_entry ent(static_env.BadSymlink, ec);
     TEST_CHECK(!ec);
-    TEST_CHECK(ent.path() == StaticEnv::BadSymlink);
+    TEST_CHECK(ent.path() == static_env.BadSymlink);
   }
   // DNE does not cause the constructor to throw
   {
-    directory_entry ent(StaticEnv::DNE);
-    TEST_CHECK(ent.path() == StaticEnv::DNE);
+    directory_entry ent(static_env.DNE);
+    TEST_CHECK(ent.path() == static_env.DNE);
 
-    directory_entry ent_two(StaticEnv::BadSymlink);
-    TEST_CHECK(ent_two.path() == StaticEnv::BadSymlink);
+    directory_entry ent_two(static_env.BadSymlink);
+    TEST_CHECK(ent_two.path() == static_env.BadSymlink);
   }
 }
 
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
index 2a2146be1b849..0f3624bd65a5b 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -54,6 +53,7 @@ TEST_CASE(test_replace_filename_method) {
 TEST_CASE(test_replace_filename_ec_method) {
   using namespace fs;
 
+  static_test_env static_env;
   {
     directory_entry e;
     path replace;
@@ -76,9 +76,9 @@ TEST_CASE(test_replace_filename_ec_method) {
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
   }
   {
-    const path p = StaticEnv::EmptyFile;
-    const path expect = StaticEnv::NonEmptyFile;
-    const path replace = StaticEnv::NonEmptyFile.filename();
+    const path p = static_env.EmptyFile;
+    const path expect = static_env.NonEmptyFile;
+    const path replace = static_env.NonEmptyFile.filename();
     TEST_REQUIRE(expect.parent_path() == p.parent_path());
     directory_entry e(p);
     TEST_CHECK(e.path() == p);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
index 8ff1b46b3b992..ad4cc43c95d97 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -112,6 +111,7 @@ TEST_CASE(not_regular_file) {
 TEST_CASE(error_reporting) {
   using namespace fs;
 
+  static_test_env static_env;
   scoped_test_env env;
 
   const path dir = env.create_dir("dir");
@@ -127,15 +127,15 @@ TEST_CASE(error_reporting) {
     directory_entry ent;
 
     std::error_code ec = GetTestEC();
-    ent.assign(StaticEnv::DNE, ec);
-    TEST_REQUIRE(ent.path() == StaticEnv::DNE);
+    ent.assign(static_env.DNE, ec);
+    TEST_REQUIRE(ent.path() == static_env.DNE);
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
     ec = GetTestEC();
     TEST_CHECK(ent.file_size(ec) == uintmax_t(-1));
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
-    ExceptionChecker Checker(StaticEnv::DNE,
+    ExceptionChecker Checker(static_env.DNE,
                              std::errc::no_such_file_or_directory,
                              "directory_entry::file_size");
     TEST_CHECK_THROW_RESULT(filesystem_error, Checker, ent.file_size());
@@ -145,20 +145,20 @@ TEST_CASE(error_reporting) {
     directory_entry ent;
 
     std::error_code ec = GetTestEC();
-    uintmax_t expect_bad = file_size(StaticEnv::BadSymlink, ec);
+    uintmax_t expect_bad = file_size(static_env.BadSymlink, ec);
     TEST_CHECK(expect_bad == uintmax_t(-1));
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
     ec = GetTestEC();
-    ent.assign(StaticEnv::BadSymlink, ec);
-    TEST_REQUIRE(ent.path() == StaticEnv::BadSymlink);
+    ent.assign(static_env.BadSymlink, ec);
+    TEST_REQUIRE(ent.path() == static_env.BadSymlink);
     TEST_CHECK(!ec);
 
     ec = GetTestEC();
     TEST_CHECK(ent.file_size(ec) == expect_bad);
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
-    ExceptionChecker Checker(StaticEnv::BadSymlink,
+    ExceptionChecker Checker(static_env.BadSymlink,
                              std::errc::no_such_file_or_directory,
                              "directory_entry::file_size");
     TEST_CHECK_THROW_RESULT(filesystem_error, Checker, ent.file_size());
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
index f5a44b76a8b03..b3bfa5e7eeacb 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
@@ -149,8 +149,8 @@ TEST_CASE(test_with_ec_dne) {
   using fs::directory_entry;
   using fs::file_status;
   using fs::path;
-
-  for (auto p : {StaticEnv::DNE, StaticEnv::BadSymlink}) {
+  static_test_env static_env;
+  for (auto p : {static_env.DNE, static_env.BadSymlink}) {
 
     directory_entry e(p);
     std::error_code status_ec = GetTestEC();
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
index d9958a6c5f030..8090bd77da720 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -110,6 +109,7 @@ TEST_CASE(not_regular_file) {
 TEST_CASE(error_reporting) {
   using namespace fs;
 
+  static_test_env static_env;
   scoped_test_env env;
 
   const path dir = env.create_dir("dir");
@@ -125,16 +125,16 @@ TEST_CASE(error_reporting) {
     directory_entry ent;
 
     std::error_code ec = GetTestEC();
-    ent.assign(StaticEnv::DNE, ec);
+    ent.assign(static_env.DNE, ec);
     TEST_CHECK(ec);
-    TEST_REQUIRE(ent.path() == StaticEnv::DNE);
+    TEST_REQUIRE(ent.path() == static_env.DNE);
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
     ec = GetTestEC();
     TEST_CHECK(ent.hard_link_count(ec) == uintmax_t(-1));
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
-    ExceptionChecker Checker(StaticEnv::DNE,
+    ExceptionChecker Checker(static_env.DNE,
                              std::errc::no_such_file_or_directory,
                              "directory_entry::hard_link_count");
     TEST_CHECK_THROW_RESULT(filesystem_error, Checker, ent.hard_link_count());
@@ -144,20 +144,20 @@ TEST_CASE(error_reporting) {
     directory_entry ent;
 
     std::error_code ec = GetTestEC();
-    uintmax_t expect_bad = hard_link_count(StaticEnv::BadSymlink, ec);
+    uintmax_t expect_bad = hard_link_count(static_env.BadSymlink, ec);
     TEST_CHECK(expect_bad == uintmax_t(-1));
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
     ec = GetTestEC();
-    ent.assign(StaticEnv::BadSymlink, ec);
-    TEST_REQUIRE(ent.path() == StaticEnv::BadSymlink);
+    ent.assign(static_env.BadSymlink, ec);
+    TEST_REQUIRE(ent.path() == static_env.BadSymlink);
     TEST_CHECK(!ec);
 
     ec = GetTestEC();
     TEST_CHECK(ent.hard_link_count(ec) == expect_bad);
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
-    ExceptionChecker Checker(StaticEnv::BadSymlink,
+    ExceptionChecker Checker(static_env.BadSymlink,
                              std::errc::no_such_file_or_directory,
                              "directory_entry::hard_link_count");
     TEST_CHECK_THROW_RESULT(filesystem_error, Checker, ent.hard_link_count());
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
index de6c36c218f2b..98949ea06e0cb 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -84,6 +83,7 @@ TEST_CASE(basic) {
 TEST_CASE(error_reporting) {
   using namespace fs;
 
+  static_test_env static_env;
   scoped_test_env env;
 
   const path dir = env.create_dir("dir");
@@ -99,15 +99,15 @@ TEST_CASE(error_reporting) {
     directory_entry ent;
 
     std::error_code ec = GetTestEC();
-    ent.assign(StaticEnv::DNE, ec);
-    TEST_REQUIRE(ent.path() == StaticEnv::DNE);
+    ent.assign(static_env.DNE, ec);
+    TEST_REQUIRE(ent.path() == static_env.DNE);
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
     ec = GetTestEC();
     TEST_CHECK(ent.last_write_time(ec) == file_time_type::min());
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
-    ExceptionChecker Checker(StaticEnv::DNE,
+    ExceptionChecker Checker(static_env.DNE,
                              std::errc::no_such_file_or_directory,
                              "directory_entry::last_write_time");
     TEST_CHECK_THROW_RESULT(filesystem_error, Checker, ent.last_write_time());
@@ -117,20 +117,20 @@ TEST_CASE(error_reporting) {
     directory_entry ent;
 
     std::error_code ec = GetTestEC();
-    file_time_type expect_bad = last_write_time(StaticEnv::BadSymlink, ec);
+    file_time_type expect_bad = last_write_time(static_env.BadSymlink, ec);
     TEST_CHECK(expect_bad == file_time_type::min());
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
     ec = GetTestEC();
-    ent.assign(StaticEnv::BadSymlink, ec);
-    TEST_REQUIRE(ent.path() == StaticEnv::BadSymlink);
+    ent.assign(static_env.BadSymlink, ec);
+    TEST_REQUIRE(ent.path() == static_env.BadSymlink);
     TEST_CHECK(!ec);
 
     ec = GetTestEC();
     TEST_CHECK(ent.last_write_time(ec) == expect_bad);
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
 
-    ExceptionChecker Checker(StaticEnv::BadSymlink,
+    ExceptionChecker Checker(static_env.BadSymlink,
                              std::errc::no_such_file_or_directory,
                              "directory_entry::last_write_time");
     TEST_CHECK_THROW_RESULT(filesystem_error, Checker, ent.last_write_time());
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
index 8ea1fdbcbe6ec..796d47bfb5259 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
@@ -28,6 +28,7 @@ TEST_SUITE(directory_entry_status_testsuite)
 
 TEST_CASE(test_basic) {
   using namespace fs;
+  static_test_env static_env;
   {
     const fs::directory_entry e("foo");
     std::error_code ec;
@@ -36,8 +37,8 @@ TEST_CASE(test_basic) {
     static_assert(noexcept(e.status()) == false, "");
     static_assert(noexcept(e.status(ec)) == true, "");
   }
-  path TestCases[] = {StaticEnv::File, StaticEnv::Dir, StaticEnv::SymlinkToFile,
-                      StaticEnv::DNE};
+  path TestCases[] = {static_env.File, static_env.Dir, static_env.SymlinkToFile,
+                      static_env.DNE};
   for (const auto& p : TestCases) {
     const directory_entry e(p);
     std::error_code pec = GetTestEC(), eec = GetTestEC(1);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
index cff19bc583ff3..c8dd6c1335392 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
@@ -28,6 +28,7 @@ TEST_SUITE(directory_entry_obs_suite)
 
 TEST_CASE(test_signature) {
   using namespace fs;
+  static_test_env static_env;
   {
     const directory_entry e("foo");
     std::error_code ec;
@@ -36,8 +37,8 @@ TEST_CASE(test_signature) {
     static_assert(noexcept(e.symlink_status()) == false, "");
     static_assert(noexcept(e.symlink_status(ec)) == true, "");
   }
-  path TestCases[] = {StaticEnv::File, StaticEnv::Dir, StaticEnv::SymlinkToFile,
-                      StaticEnv::DNE};
+  path TestCases[] = {static_env.File, static_env.Dir, static_env.SymlinkToFile,
+                      static_env.DNE};
   for (const auto& p : TestCases) {
     const directory_entry e(p);
     std::error_code pec = GetTestEC(), eec = GetTestEC(1);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
index d01854c1e4037..5b7d709d9aece 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -43,7 +42,8 @@ TEST_CASE(test_copy_end_iterator)
 
 TEST_CASE(test_copy_valid_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
     const directory_iterator endIt{};
 
     const directory_iterator it(testDir);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
index 8d33240791052..920ded9523da0 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -36,7 +35,8 @@ TEST_CASE(test_assignment_signature)
 
 TEST_CASE(test_copy_to_end_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
 
     const directory_iterator from(testDir);
     TEST_REQUIRE(from != directory_iterator{});
@@ -52,7 +52,8 @@ TEST_CASE(test_copy_to_end_iterator)
 
 TEST_CASE(test_copy_from_end_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
 
     const directory_iterator from{};
 
@@ -66,7 +67,8 @@ TEST_CASE(test_copy_from_end_iterator)
 
 TEST_CASE(test_copy_valid_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
     const directory_iterator endIt{};
 
     directory_iterator it_obj(testDir);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
index a06c4a64c7ffa..206e7860f4877 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -60,11 +59,12 @@ TEST_CASE(test_constructor_signatures)
 
 TEST_CASE(test_construction_from_bad_path)
 {
+    static_test_env static_env;
     std::error_code ec;
     directory_options opts = directory_options::none;
     const directory_iterator endIt;
 
-    const path testPaths[] = { StaticEnv::DNE, StaticEnv::BadSymlink };
+    const path testPaths[] = { static_env.DNE, static_env.BadSymlink };
     for (path const& testPath : testPaths)
     {
         {
@@ -169,9 +169,10 @@ TEST_CASE(test_open_on_empty_directory_equals_end)
 
 TEST_CASE(test_open_on_directory_succeeds)
 {
-    const path testDir = StaticEnv::Dir;
-    std::set<path> dir_contents(std::begin(StaticEnv::DirIterationList),
-                                std::end(  StaticEnv::DirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    std::set<path> dir_contents(static_env.DirIterationList.begin(),
+                                static_env.DirIterationList.end());
     const directory_iterator endIt{};
 
     {
@@ -190,7 +191,8 @@ TEST_CASE(test_open_on_directory_succeeds)
 
 TEST_CASE(test_open_on_file_fails)
 {
-    const path testFile = StaticEnv::File;
+    static_test_env static_env;
+    const path testFile = static_env.File;
     const directory_iterator endIt{};
     {
         std::error_code ec;
@@ -225,9 +227,10 @@ TEST_CASE(test_open_on_dot_dir)
 
 TEST_CASE(test_open_on_symlink)
 {
-    const path symlinkToDir = StaticEnv::SymlinkToDir;
+    static_test_env static_env;
+    const path symlinkToDir = static_env.SymlinkToDir;
     std::set<path> dir_contents;
-    for (path const& p : StaticEnv::DirIterationList) {
+    for (path const& p : static_env.DirIterationList) {
         dir_contents.insert(p.filename());
     }
     const directory_iterator endIt{};
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
index c527013237ac6..f1f9a27a45f08 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -44,9 +43,10 @@ TEST_CASE(test_increment_signatures)
 
 TEST_CASE(test_prefix_increment)
 {
-    const path testDir = StaticEnv::Dir;
-    const std::set<path> dir_contents(std::begin(StaticEnv::DirIterationList),
-                                      std::end(  StaticEnv::DirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    const std::set<path> dir_contents(static_env.DirIterationList.begin(),
+                                      static_env.DirIterationList.end());
     const directory_iterator endIt{};
 
     std::error_code ec;
@@ -67,9 +67,10 @@ TEST_CASE(test_prefix_increment)
 
 TEST_CASE(test_postfix_increment)
 {
-    const path testDir = StaticEnv::Dir;
-    const std::set<path> dir_contents(std::begin(StaticEnv::DirIterationList),
-                                      std::end(  StaticEnv::DirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    const std::set<path> dir_contents(static_env.DirIterationList.begin(),
+                                      static_env.DirIterationList.end());
     const directory_iterator endIt{};
 
     std::error_code ec;
@@ -91,9 +92,10 @@ TEST_CASE(test_postfix_increment)
 
 TEST_CASE(test_increment_method)
 {
-    const path testDir = StaticEnv::Dir;
-    const std::set<path> dir_contents(std::begin(StaticEnv::DirIterationList),
-                                      std::end(  StaticEnv::DirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    const std::set<path> dir_contents(static_env.DirIterationList.begin(),
+                                      static_env.DirIterationList.end());
     const directory_iterator endIt{};
 
     std::error_code ec;
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
index c4d0e23b46f8d..8b17383fe6c1e 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -46,7 +45,8 @@ TEST_CASE(test_move_end_iterator)
 
 TEST_CASE(test_move_valid_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
     const directory_iterator endIt{};
 
     directory_iterator it(testDir);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
index bf737146d585c..93fa67b4230e3 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -42,7 +41,8 @@ TEST_CASE(test_assignment_signature)
 
 TEST_CASE(test_move_to_end_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
 
     directory_iterator from(testDir);
     TEST_REQUIRE(from != directory_iterator{});
@@ -57,7 +57,8 @@ TEST_CASE(test_move_to_end_iterator)
 
 TEST_CASE(test_move_from_end_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
 
     directory_iterator from{};
 
@@ -71,7 +72,8 @@ TEST_CASE(test_move_from_end_iterator)
 
 TEST_CASE(test_move_valid_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
     const directory_iterator endIt{};
 
     directory_iterator it(testDir);
@@ -101,9 +103,10 @@ TEST_CASE(test_returns_reference_to_self)
 
 TEST_CASE(test_self_move)
 {
+    static_test_env static_env;
     // Create two non-equal iterators that have exactly the same state.
-    directory_iterator it(StaticEnv::Dir);
-    directory_iterator it2(StaticEnv::Dir);
+    directory_iterator it(static_env.Dir);
+    directory_iterator it2(static_env.Dir);
     ++it; ++it2;
     TEST_CHECK(it != it2);
     TEST_CHECK(*it2 == *it);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
index 5b1205edc6dbc..c80458eeab8b4 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -43,9 +42,10 @@ TEST_CASE(test_function_signatures)
 
 TEST_CASE(test_ranged_for_loop)
 {
-    const path testDir = StaticEnv::Dir;
-    std::set<path> dir_contents(std::begin(StaticEnv::DirIterationList),
-                                      std::end(  StaticEnv::DirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    std::set<path> dir_contents(static_env.DirIterationList.begin(),
+                                static_env.DirIterationList.end());
 
     std::error_code ec;
     directory_iterator it(testDir, ec);
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
index bff2a18dd70a5..09a53e46e263a 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -44,7 +43,8 @@ TEST_CASE(test_copy_end_iterator)
 
 TEST_CASE(test_copy_valid_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
     const recursive_directory_iterator endIt{};
 
     // build 'it' up with "interesting" non-default state so we can test
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
index d7468a21c4ed9..e03e9f3880b83 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -28,7 +27,7 @@ using namespace fs;
 
 TEST_SUITE(recursive_directory_iterator_copy_assign_tests)
 
-recursive_directory_iterator createInterestingIterator()
+recursive_directory_iterator createInterestingIterator(const static_test_env &static_env)
     // Create an "interesting" iterator where all fields are
     // in a non-default state. The returned 'it' is in a
     // state such that:
@@ -36,7 +35,7 @@ recursive_directory_iterator createInterestingIterator()
     //   it.depth() == 1
     //   it.recursion_pending() == true
 {
-    const path testDir = StaticEnv::Dir;
+    const path testDir = static_env.Dir;
     const recursive_directory_iterator endIt;
     recursive_directory_iterator it(testDir,
                                     directory_options::skip_permission_denied);
@@ -51,7 +50,7 @@ recursive_directory_iterator createInterestingIterator()
 }
 
 
-recursive_directory_iterator createDifferentInterestingIterator()
+recursive_directory_iterator createDifferentInterestingIterator(const static_test_env &static_env)
     // Create an "interesting" iterator where all fields are
     // in a non-default state. The returned 'it' is in a
     // state such that:
@@ -59,7 +58,7 @@ recursive_directory_iterator createDifferentInterestingIterator()
     //   it.depth() == 2
     //   it.recursion_pending() == false
 {
-    const path testDir = StaticEnv::Dir;
+    const path testDir = static_env.Dir;
     const recursive_directory_iterator endIt;
     recursive_directory_iterator it(testDir,
                                     directory_options::follow_directory_symlink);
@@ -79,9 +78,10 @@ TEST_CASE(test_assignment_signature) {
 
 TEST_CASE(test_copy_to_end_iterator)
 {
+    static_test_env static_env;
     const recursive_directory_iterator endIt;
 
-    const recursive_directory_iterator from = createInterestingIterator();
+    const recursive_directory_iterator from = createInterestingIterator(static_env);
     const path entry = *from;
 
     recursive_directory_iterator to;
@@ -96,8 +96,9 @@ TEST_CASE(test_copy_to_end_iterator)
 
 TEST_CASE(test_copy_from_end_iterator)
 {
+    static_test_env static_env;
     const recursive_directory_iterator from;
-    recursive_directory_iterator to = createInterestingIterator();
+    recursive_directory_iterator to = createInterestingIterator(static_env);
 
     to = from;
     TEST_REQUIRE(to == from);
@@ -106,12 +107,13 @@ TEST_CASE(test_copy_from_end_iterator)
 
 TEST_CASE(test_copy_valid_iterator)
 {
+    static_test_env static_env;
     const recursive_directory_iterator endIt;
 
-    const recursive_directory_iterator it = createInterestingIterator();
+    const recursive_directory_iterator it = createInterestingIterator(static_env);
     const path entry = *it;
 
-    recursive_directory_iterator it2 = createDifferentInterestingIterator();
+    recursive_directory_iterator it2 = createDifferentInterestingIterator(static_env);
     TEST_REQUIRE(it2                   != it);
     TEST_CHECK(it2.options()           != it.options());
     TEST_CHECK(it2.depth()             != it.depth());
@@ -136,9 +138,10 @@ TEST_CASE(test_returns_reference_to_self)
 
 TEST_CASE(test_self_copy)
 {
+    static_test_env static_env;
     // Create two non-equal iterators that have exactly the same state.
-    recursive_directory_iterator it = createInterestingIterator();
-    recursive_directory_iterator it2 = createInterestingIterator();
+    recursive_directory_iterator it = createInterestingIterator(static_env);
+    recursive_directory_iterator it2 = createInterestingIterator(static_env);
     TEST_CHECK(it != it2);
     TEST_CHECK(it2.options()           == it.options());
     TEST_CHECK(it2.depth()             == it.depth());
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
index 177edb46e058d..40f89df4e81a8 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -61,11 +60,12 @@ TEST_CASE(test_constructor_signatures)
 
 TEST_CASE(test_construction_from_bad_path)
 {
+    static_test_env static_env;
     std::error_code ec;
     directory_options opts = directory_options::none;
     const RDI endIt;
 
-    const path testPaths[] = { StaticEnv::DNE, StaticEnv::BadSymlink };
+    const path testPaths[] = { static_env.DNE, static_env.BadSymlink };
     for (path const& testPath : testPaths)
     {
         {
@@ -171,9 +171,10 @@ TEST_CASE(test_open_on_empty_directory_equals_end)
 
 TEST_CASE(test_open_on_directory_succeeds)
 {
-    const path testDir = StaticEnv::Dir;
-    std::set<path> dir_contents(std::begin(StaticEnv::DirIterationList),
-                                std::end(  StaticEnv::DirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    std::set<path> dir_contents(static_env.DirIterationList.begin(),
+                                static_env.DirIterationList.end());
     const RDI endIt{};
 
     {
@@ -192,7 +193,8 @@ TEST_CASE(test_open_on_directory_succeeds)
 
 TEST_CASE(test_open_on_file_fails)
 {
-    const path testFile = StaticEnv::File;
+    static_test_env static_env;
+    const path testFile = static_env.File;
     const RDI endIt{};
     {
         std::error_code ec;
@@ -207,8 +209,9 @@ TEST_CASE(test_open_on_file_fails)
 
 TEST_CASE(test_options_post_conditions)
 {
-    const path goodDir = StaticEnv::Dir;
-    const path badDir = StaticEnv::DNE;
+    static_test_env static_env;
+    const path goodDir = static_env.Dir;
+    const path badDir = static_env.DNE;
 
     {
         std::error_code ec;
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
index da6c10a88cc99..bd12ec8885668 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -30,9 +29,10 @@ TEST_SUITE(recursive_directory_iterator_depth_tests)
 
 TEST_CASE(test_depth)
 {
-    const path testDir = StaticEnv::Dir;
-    const path DirDepth1 = StaticEnv::Dir2;
-    const path DirDepth2 = StaticEnv::Dir3;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    const path DirDepth1 = static_env.Dir2;
+    const path DirDepth2 = static_env.Dir3;
     const recursive_directory_iterator endIt{};
 
     std::error_code ec;
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
index 5d29b7e32f60f..568563984dd49 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -32,7 +31,8 @@ TEST_SUITE(recursive_directory_iterator_disable_recursion_pending_tests)
 // in the 'recursion_pending()' tests.
 TEST_CASE(basic_test)
 {
-    recursive_directory_iterator it(StaticEnv::Dir);
+    static_test_env static_env;
+    recursive_directory_iterator it(static_env.Dir);
     TEST_REQUIRE(it.recursion_pending() == true);
     it.disable_recursion_pending();
     TEST_CHECK(it.recursion_pending() == false);
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
index 1c7650543396b..3c0127e2a3189 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -43,9 +42,10 @@ TEST_CASE(test_increment_signatures)
 
 TEST_CASE(test_prefix_increment)
 {
-    const path testDir = StaticEnv::Dir;
-    const std::set<path> dir_contents(std::begin(StaticEnv::RecDirIterationList),
-                                      std::end(  StaticEnv::RecDirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    const std::set<path> dir_contents(static_env.RecDirIterationList.begin(),
+                                      static_env.RecDirIterationList.end());
     const recursive_directory_iterator endIt{};
 
     std::error_code ec;
@@ -66,9 +66,10 @@ TEST_CASE(test_prefix_increment)
 
 TEST_CASE(test_postfix_increment)
 {
-    const path testDir = StaticEnv::Dir;
-    const std::set<path> dir_contents(std::begin(StaticEnv::RecDirIterationList),
-                                      std::end(  StaticEnv::RecDirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    const std::set<path> dir_contents(static_env.RecDirIterationList.begin(),
+                                      static_env.RecDirIterationList.end());
     const recursive_directory_iterator endIt{};
 
     std::error_code ec;
@@ -89,9 +90,10 @@ TEST_CASE(test_postfix_increment)
 
 TEST_CASE(test_increment_method)
 {
-    const path testDir = StaticEnv::Dir;
-    const std::set<path> dir_contents(std::begin(StaticEnv::RecDirIterationList),
-                                      std::end(  StaticEnv::RecDirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    const std::set<path> dir_contents(static_env.RecDirIterationList.begin(),
+                                      static_env.RecDirIterationList.end());
     const recursive_directory_iterator endIt{};
 
     std::error_code ec;
@@ -113,10 +115,11 @@ TEST_CASE(test_increment_method)
 
 TEST_CASE(test_follow_symlinks)
 {
-    const path testDir = StaticEnv::Dir;
-    auto const& IterList = StaticEnv::RecDirFollowSymlinksIterationList;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    auto const& IterList = static_env.RecDirFollowSymlinksIterationList;
 
-    const std::set<path> dir_contents(std::begin(IterList), std::end(IterList));
+    const std::set<path> dir_contents(IterList.begin(), IterList.end());
     const recursive_directory_iterator endIt{};
 
     std::error_code ec;
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
index d709012bef8cc..a944478284e62 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -46,7 +45,8 @@ TEST_CASE(test_move_end_iterator)
 
 TEST_CASE(test_move_valid_iterator)
 {
-    const path testDir = StaticEnv::Dir;
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
     const recursive_directory_iterator endIt{};
 
     // build 'it' up with "interesting" non-default state so we can test
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
index 2dee77c71731e..3dc2bb758b899 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -34,7 +33,7 @@ using namespace fs;
 
 TEST_SUITE(recursive_directory_iterator_move_assign_tests)
 
-recursive_directory_iterator createInterestingIterator()
+recursive_directory_iterator createInterestingIterator(const static_test_env &static_env)
     // Create an "interesting" iterator where all fields are
     // in a non-default state. The returned 'it' is in a
     // state such that:
@@ -42,7 +41,7 @@ recursive_directory_iterator createInterestingIterator()
     //   it.depth() == 1
     //   it.recursion_pending() == true
 {
-    const path testDir = StaticEnv::Dir;
+    const path testDir = static_env.Dir;
     const recursive_directory_iterator endIt;
     recursive_directory_iterator it(testDir,
                                     directory_options::skip_permission_denied);
@@ -56,7 +55,7 @@ recursive_directory_iterator createInterestingIterator()
     return it;
 }
 
-recursive_directory_iterator createDifferentInterestingIterator()
+recursive_directory_iterator createDifferentInterestingIterator(const static_test_env &static_env)
     // Create an "interesting" iterator where all fields are
     // in a non-default state. The returned 'it' is in a
     // state such that:
@@ -64,7 +63,7 @@ recursive_directory_iterator createDifferentInterestingIterator()
     //   it.depth() == 2
     //   it.recursion_pending() == false
 {
-    const path testDir = StaticEnv::Dir;
+    const path testDir = static_env.Dir;
     const recursive_directory_iterator endIt;
     recursive_directory_iterator it(testDir,
                                     directory_options::follow_directory_symlink);
@@ -87,9 +86,10 @@ TEST_CASE(test_assignment_signature)
 
 TEST_CASE(test_move_to_end_iterator)
 {
+    static_test_env static_env;
     const recursive_directory_iterator endIt;
 
-    recursive_directory_iterator from = createInterestingIterator();
+    recursive_directory_iterator from = createInterestingIterator(static_env);
     const recursive_directory_iterator from_copy(from);
     const path entry = *from;
 
@@ -106,8 +106,9 @@ TEST_CASE(test_move_to_end_iterator)
 
 TEST_CASE(test_move_from_end_iterator)
 {
+    static_test_env static_env;
     recursive_directory_iterator from;
-    recursive_directory_iterator to = createInterestingIterator();
+    recursive_directory_iterator to = createInterestingIterator(static_env);
 
     to = std::move(from);
     TEST_REQUIRE(to == from);
@@ -116,13 +117,14 @@ TEST_CASE(test_move_from_end_iterator)
 
 TEST_CASE(test_move_valid_iterator)
 {
+    static_test_env static_env;
     const recursive_directory_iterator endIt;
 
-    recursive_directory_iterator it = createInterestingIterator();
+    recursive_directory_iterator it = createInterestingIterator(static_env);
     const recursive_directory_iterator it_copy(it);
     const path entry = *it;
 
-    recursive_directory_iterator it2 = createDifferentInterestingIterator();
+    recursive_directory_iterator it2 = createDifferentInterestingIterator(static_env);
     const recursive_directory_iterator it2_copy(it2);
     TEST_REQUIRE(it2 != it);
     TEST_CHECK(it2.options() != it.options());
@@ -149,9 +151,10 @@ TEST_CASE(test_returns_reference_to_self)
 
 TEST_CASE(test_self_move)
 {
+    static_test_env static_env;
     // Create two non-equal iterators that have exactly the same state.
-    recursive_directory_iterator it = createInterestingIterator();
-    recursive_directory_iterator it2 = createInterestingIterator();
+    recursive_directory_iterator it = createInterestingIterator(static_env);
+    recursive_directory_iterator it2 = createInterestingIterator(static_env);
     TEST_CHECK(it != it2);
     TEST_CHECK(it2.options()           == it.options());
     TEST_CHECK(it2.depth()             == it.depth());
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
index c6ecdae56db3c..2d4edc97c549d 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -40,16 +39,17 @@ TEST_CASE(signature_tests)
 // seen files at each depth to determine the new depth after a 'pop()' operation.
 TEST_CASE(test_depth)
 {
+    static_test_env static_env;
     const recursive_directory_iterator endIt{};
 
-    auto& DE0 = StaticEnv::DirIterationList;
-    std::set<path> notSeenDepth0(std::begin(DE0), std::end(DE0));
+    auto& DE0 = static_env.DirIterationList;
+    std::set<path> notSeenDepth0(DE0.begin(), DE0.end());
 
-    auto& DE1 = StaticEnv::DirIterationListDepth1;
-    std::set<path> notSeenDepth1(std::begin(DE1), std::end(DE1));
+    auto& DE1 = static_env.DirIterationListDepth1;
+    std::set<path> notSeenDepth1(DE1.begin(), DE1.end());
 
     std::error_code ec;
-    recursive_directory_iterator it(StaticEnv::Dir, ec);
+    recursive_directory_iterator it(static_env.Dir, ec);
     TEST_REQUIRE(it != endIt);
     TEST_CHECK(it.depth() == 0);
 
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
index 6bb7b939554a1..731766e262b38 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -30,14 +29,16 @@ TEST_SUITE(recursive_directory_iterator_recursion_pending_tests)
 
 TEST_CASE(initial_value_test)
 {
-    recursive_directory_iterator it(StaticEnv::Dir);
+    static_test_env static_env;
+    recursive_directory_iterator it(static_env.Dir);
     TEST_REQUIRE(it.recursion_pending() == true);
 }
 
 TEST_CASE(value_after_copy_construction_and_assignment_test)
 {
-    recursive_directory_iterator rec_pending_it(StaticEnv::Dir);
-    recursive_directory_iterator no_rec_pending_it(StaticEnv::Dir);
+    static_test_env static_env;
+    recursive_directory_iterator rec_pending_it(static_env.Dir);
+    recursive_directory_iterator no_rec_pending_it(static_env.Dir);
     no_rec_pending_it.disable_recursion_pending();
 
     { // copy construction
@@ -50,14 +51,14 @@ TEST_CASE(value_after_copy_construction_and_assignment_test)
         TEST_CHECK(it2.recursion_pending() == false);
     }
     { // copy assignment
-        recursive_directory_iterator it(StaticEnv::Dir);
+        recursive_directory_iterator it(static_env.Dir);
         it.disable_recursion_pending();
         it = rec_pending_it;
         TEST_CHECK(it.recursion_pending() == true);
         it.disable_recursion_pending();
         TEST_REQUIRE(rec_pending_it.recursion_pending() == true);
 
-        recursive_directory_iterator it2(StaticEnv::Dir);
+        recursive_directory_iterator it2(static_env.Dir);
         it2 = no_rec_pending_it;
         TEST_CHECK(it2.recursion_pending() == false);
     }
@@ -68,8 +69,9 @@ TEST_CASE(value_after_copy_construction_and_assignment_test)
 
 TEST_CASE(value_after_move_construction_and_assignment_test)
 {
-    recursive_directory_iterator rec_pending_it(StaticEnv::Dir);
-    recursive_directory_iterator no_rec_pending_it(StaticEnv::Dir);
+    static_test_env static_env;
+    recursive_directory_iterator rec_pending_it(static_env.Dir);
+    recursive_directory_iterator no_rec_pending_it(static_env.Dir);
     no_rec_pending_it.disable_recursion_pending();
 
     { // move construction
@@ -82,13 +84,13 @@ TEST_CASE(value_after_move_construction_and_assignment_test)
         TEST_CHECK(it2.recursion_pending() == false);
     }
     { // copy assignment
-        recursive_directory_iterator it(StaticEnv::Dir);
+        recursive_directory_iterator it(static_env.Dir);
         it.disable_recursion_pending();
         recursive_directory_iterator it_cp(rec_pending_it);
         it = std::move(it_cp);
         TEST_CHECK(it.recursion_pending() == true);
 
-        recursive_directory_iterator it2(StaticEnv::Dir);
+        recursive_directory_iterator it2(static_env.Dir);
         recursive_directory_iterator it_cp2(no_rec_pending_it);
         it2 = std::move(it_cp2);
         TEST_CHECK(it2.recursion_pending() == false);
@@ -99,9 +101,10 @@ TEST_CASE(value_after_move_construction_and_assignment_test)
 
 TEST_CASE(increment_resets_value)
 {
+    static_test_env static_env;
     const recursive_directory_iterator endIt;
     {
-        recursive_directory_iterator it(StaticEnv::Dir);
+        recursive_directory_iterator it(static_env.Dir);
         it.disable_recursion_pending();
         TEST_CHECK(it.recursion_pending() == false);
         ++it;
@@ -109,7 +112,7 @@ TEST_CASE(increment_resets_value)
         TEST_CHECK(it.depth() == 0);
     }
     {
-        recursive_directory_iterator it(StaticEnv::Dir);
+        recursive_directory_iterator it(static_env.Dir);
         it.disable_recursion_pending();
         TEST_CHECK(it.recursion_pending() == false);
         it++;
@@ -117,7 +120,7 @@ TEST_CASE(increment_resets_value)
         TEST_CHECK(it.depth() == 0);
     }
     {
-        recursive_directory_iterator it(StaticEnv::Dir);
+        recursive_directory_iterator it(static_env.Dir);
         it.disable_recursion_pending();
         TEST_CHECK(it.recursion_pending() == false);
         std::error_code ec;
@@ -129,12 +132,13 @@ TEST_CASE(increment_resets_value)
 
 TEST_CASE(pop_does_not_reset_value)
 {
+    static_test_env static_env;
     const recursive_directory_iterator endIt;
 
-    auto& DE0 = StaticEnv::DirIterationList;
-    std::set<path> notSeenDepth0(std::begin(DE0), std::end(DE0));
+    auto& DE0 = static_env.DirIterationList;
+    std::set<path> notSeenDepth0(DE0.begin(), DE0.end());
 
-    recursive_directory_iterator it(StaticEnv::Dir);
+    recursive_directory_iterator it(static_env.Dir);
     TEST_REQUIRE(it != endIt);
 
     while (it.depth() == 0) {
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
index 24eaf84c2931a..1a076f3a3ed58 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -43,9 +42,10 @@ TEST_CASE(test_function_signatures)
 
 TEST_CASE(test_ranged_for_loop)
 {
-    const path testDir = StaticEnv::Dir;
-    std::set<path> dir_contents(std::begin(StaticEnv::RecDirIterationList),
-                                std::end(  StaticEnv::RecDirIterationList));
+    static_test_env static_env;
+    const path testDir = static_env.Dir;
+    std::set<path> dir_contents(static_env.RecDirIterationList.begin(),
+                                static_env.RecDirIterationList.end());
 
     std::error_code ec;
     recursive_directory_iterator it(testDir, ec);
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
index de2fa54aca482..bc3f581b828ec 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -24,15 +23,6 @@
 
 using namespace fs;
 
-struct CWDGuard {
-  path OldCWD;
-  CWDGuard() : OldCWD(fs::current_path()) { }
-  ~CWDGuard() { fs::current_path(OldCWD); }
-
-  CWDGuard(CWDGuard const&) = delete;
-  CWDGuard& operator=(CWDGuard const&) = delete;
-};
-
 TEST_SUITE(filesystem_canonical_path_test_suite)
 
 TEST_CASE(signature_test)
@@ -47,29 +37,32 @@ TEST_CASE(signature_test)
 // Each scope tests one of the cases.
 TEST_CASE(test_canonical)
 {
+    static_test_env static_env;
     CWDGuard guard;
     // has_root_name() && has_root_directory()
-    const path Root = StaticEnv::Root;
+    const path Root = static_env.Root;
     const path RootName = Root.filename();
-    const path DirName = StaticEnv::Dir.filename();
-    const path SymlinkName = StaticEnv::SymlinkToFile.filename();
+    const path DirName = static_env.Dir.filename();
+    const path SymlinkName = static_env.SymlinkToFile.filename();
     struct TestCase {
         path p;
         path expect;
         path base;
-        TestCase(path p1, path e, path b = StaticEnv::Root)
+        TestCase(path p1, path e, path b)
             : p(p1), expect(e), base(b) {}
     };
     const TestCase testCases[] = {
-        { ".", Root, Root},
-        { DirName / ".." / "." / DirName, StaticEnv::Dir, Root},
-        { StaticEnv::Dir2 / "..",    StaticEnv::Dir },
-        { StaticEnv::Dir3 / "../..", StaticEnv::Dir },
-        { StaticEnv::Dir / ".",      StaticEnv::Dir },
-        { Root / "." / DirName / ".." / DirName, StaticEnv::Dir},
-        { path("..") / "." / RootName / DirName / ".." / DirName, StaticEnv::Dir, Root},
-        { StaticEnv::SymlinkToFile,  StaticEnv::File },
-        { SymlinkName, StaticEnv::File, StaticEnv::Root}
+        { ".", Root, Root },
+        { DirName / ".." / "." / DirName, static_env.Dir, Root },
+        { static_env.Dir2 / "..",    static_env.Dir, Root },
+        { static_env.Dir3 / "../..", static_env.Dir, Root },
+        { static_env.Dir / ".",      static_env.Dir, Root },
+        { Root / "." / DirName / ".." / DirName, static_env.Dir, Root },
+        { path("..") / "." / RootName / DirName / ".." / DirName,
+          static_env.Dir,
+          Root },
+        { static_env.SymlinkToFile,  static_env.File, Root },
+        { SymlinkName, static_env.File, Root}
     };
     for (auto& TC : testCases) {
         std::error_code ec = GetTestEC();
@@ -85,21 +78,23 @@ TEST_CASE(test_canonical)
 
 TEST_CASE(test_dne_path)
 {
+    static_test_env static_env;
     std::error_code ec = GetTestEC();
     {
-        const path ret = canonical(StaticEnv::DNE, ec);
+        const path ret = canonical(static_env.DNE, ec);
         TEST_CHECK(ec != GetTestEC());
         TEST_REQUIRE(ec);
         TEST_CHECK(ret == path{});
     }
     {
-        TEST_CHECK_THROW(filesystem_error, canonical(StaticEnv::DNE));
+        TEST_CHECK_THROW(filesystem_error, canonical(static_env.DNE));
     }
 }
 
 TEST_CASE(test_exception_contains_paths)
 {
 #ifndef TEST_HAS_NO_EXCEPTIONS
+    static_test_env static_env;
     CWDGuard guard;
     const path p = "blabla/dne";
     try {
@@ -110,13 +105,13 @@ TEST_CASE(test_exception_contains_paths)
         // libc++ provides the current path as the second path in the exception
         LIBCPP_ONLY(TEST_CHECK(err.path2() == current_path()));
     }
-    fs::current_path(StaticEnv::Dir);
+    fs::current_path(static_env.Dir);
     try {
         canonical(p);
         TEST_REQUIRE(false);
     } catch (filesystem_error const& err) {
         TEST_CHECK(err.path1() == p);
-        LIBCPP_ONLY(TEST_CHECK(err.path2() == StaticEnv::Dir));
+        LIBCPP_ONLY(TEST_CHECK(err.path2() == static_env.Dir));
     }
 #endif
 }
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
index 82565433e3105..5ffbe1a51c15c 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
@@ -63,6 +63,7 @@ TEST_CASE(test_error_reporting)
 #endif
     };
 
+    static_test_env static_env;
     scoped_test_env env;
     const path file = env.create_file("file1", 42);
     const path dir = env.create_dir("dir");
@@ -74,7 +75,7 @@ TEST_CASE(test_error_reporting)
     // !exists(f)
     {
         std::error_code ec = test_ec;
-        const path f = StaticEnv::DNE;
+        const path f = static_env.DNE;
         const path t = env.test_root;
         fs::copy(f, t, ec);
         TEST_REQUIRE(ec);
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
index cae78c2f56140..f5ca9023d96b8 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -52,14 +51,18 @@ TEST_CASE(current_path_test)
 
 TEST_CASE(current_path_after_change_test)
 {
-    const path new_path = StaticEnv::Dir;
+    CWDGuard guard;
+    static_test_env static_env;
+    const path new_path = static_env.Dir;
     current_path(new_path);
     TEST_CHECK(current_path() == new_path);
 }
 
 TEST_CASE(current_path_is_file_test)
 {
-    const path p = StaticEnv::File;
+    CWDGuard guard;
+    static_test_env static_env;
+    const path p = static_env.File;
     std::error_code ec;
     const path old_p = current_path();
     current_path(p, ec);
@@ -69,14 +72,16 @@ TEST_CASE(current_path_is_file_test)
 
 TEST_CASE(set_to_non_absolute_path)
 {
-    const path base = StaticEnv::Dir;
+    CWDGuard guard;
+    static_test_env static_env;
+    const path base = static_env.Dir;
     current_path(base);
-    const path p = StaticEnv::Dir2.filename();
+    const path p = static_env.Dir2.filename();
     std::error_code ec;
     current_path(p, ec);
     TEST_CHECK(!ec);
     const path new_cwd = current_path();
-    TEST_CHECK(new_cwd == StaticEnv::Dir2);
+    TEST_CHECK(new_cwd == static_env.Dir2);
     TEST_CHECK(new_cwd.is_absolute());
 }
 
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
index 71fce47266612..35f337d0b0a76 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -36,18 +35,19 @@ TEST_CASE(signature_test) {
 }
 
 TEST_CASE(equivalent_test) {
+  static_test_env static_env;
   struct TestCase {
     path lhs;
     path rhs;
     bool expect;
   };
   const TestCase testCases[] = {
-      {StaticEnv::Dir, StaticEnv::Dir, true},
-      {StaticEnv::File, StaticEnv::Dir, false},
-      {StaticEnv::Dir, StaticEnv::SymlinkToDir, true},
-      {StaticEnv::Dir, StaticEnv::SymlinkToFile, false},
-      {StaticEnv::File, StaticEnv::File, true},
-      {StaticEnv::File, StaticEnv::SymlinkToFile, true},
+      {static_env.Dir, static_env.Dir, true},
+      {static_env.File, static_env.Dir, false},
+      {static_env.Dir, static_env.SymlinkToDir, true},
+      {static_env.Dir, static_env.SymlinkToFile, false},
+      {static_env.File, static_env.File, true},
+      {static_env.File, static_env.SymlinkToFile, true},
   };
   for (auto& TC : testCases) {
     std::error_code ec;
@@ -57,8 +57,9 @@ TEST_CASE(equivalent_test) {
 }
 
 TEST_CASE(equivalent_reports_error_if_input_dne) {
-  const path E = StaticEnv::File;
-  const path DNE = StaticEnv::DNE;
+  static_test_env static_env;
+  const path E = static_env.File;
+  const path DNE = static_env.DNE;
   { // Test that an error is reported when either of the paths don't exist
     std::error_code ec = GetTestEC();
     TEST_CHECK(equivalent(E, DNE, ec) == false);
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
index 45270d4cd0ec8..b51182b64b4f8 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
@@ -62,7 +62,8 @@ TEST_CASE(exists_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(exists(p) == false);
 
     std::error_code ec = GetTestEC();
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
index 6f27a4cb57124..0573aa5470d25 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -38,7 +37,8 @@ TEST_CASE(signature_test)
 
 TEST_CASE(file_size_empty_test)
 {
-    const path p = StaticEnv::EmptyFile;
+    static_test_env static_env;
+    const path p = static_env.EmptyFile;
     TEST_CHECK(file_size(p) == 0);
     std::error_code ec;
     TEST_CHECK(file_size(p, ec) == 0);
@@ -55,21 +55,23 @@ TEST_CASE(file_size_non_empty)
 
 TEST_CASE(symlink_test_case)
 {
-    const path p = StaticEnv::File;
-    const path p2 = StaticEnv::SymlinkToFile;
+    static_test_env static_env;
+    const path p = static_env.File;
+    const path p2 = static_env.SymlinkToFile;
     TEST_CHECK(file_size(p) == file_size(p2));
 }
 
 TEST_CASE(file_size_error_cases)
 {
+  static_test_env static_env;
   struct {
     path p;
     std::errc expected_err;
   } TestCases[] = {
-      {StaticEnv::Dir, std::errc::is_a_directory},
-      {StaticEnv::SymlinkToDir, std::errc::is_a_directory},
-      {StaticEnv::BadSymlink, std::errc::no_such_file_or_directory},
-      {StaticEnv::DNE, std::errc::no_such_file_or_directory},
+      {static_env.Dir, std::errc::is_a_directory},
+      {static_env.SymlinkToDir, std::errc::is_a_directory},
+      {static_env.BadSymlink, std::errc::no_such_file_or_directory},
+      {static_env.DNE, std::errc::no_such_file_or_directory},
       {"", std::errc::no_such_file_or_directory}};
     const uintmax_t expect = static_cast<uintmax_t>(-1);
     for (auto& TC : TestCases) {
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
index 96da42be0bc76..82ac572071719 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -38,13 +37,15 @@ TEST_CASE(signature_test)
 
 TEST_CASE(hard_link_count_for_file)
 {
-    TEST_CHECK(hard_link_count(StaticEnv::File) == 1);
+    static_test_env static_env;
+    TEST_CHECK(hard_link_count(static_env.File) == 1);
     std::error_code ec;
-    TEST_CHECK(hard_link_count(StaticEnv::File, ec) == 1);
+    TEST_CHECK(hard_link_count(static_env.File, ec) == 1);
 }
 
 TEST_CASE(hard_link_count_for_directory)
 {
+    static_test_env static_env;
     uintmax_t DirExpect = 3; // hard link from . .. and Dir2
     uintmax_t Dir3Expect = 2; // hard link from . ..
     uintmax_t DirExpectAlt = DirExpect;
@@ -56,20 +57,20 @@ TEST_CASE(hard_link_count_for_directory)
     DirExpectAlt = 5; // .  ..  Dir2  file1  file2
     Dir3Expect = 3; // .  ..  file5
 #endif
-    TEST_CHECK(hard_link_count(StaticEnv::Dir) == DirExpect ||
-               hard_link_count(StaticEnv::Dir) == DirExpectAlt ||
-               hard_link_count(StaticEnv::Dir) == 1);
-    TEST_CHECK(hard_link_count(StaticEnv::Dir3) == Dir3Expect ||
-               hard_link_count(StaticEnv::Dir3) == Dir3ExpectAlt ||
-               hard_link_count(StaticEnv::Dir3) == 1);
+    TEST_CHECK(hard_link_count(static_env.Dir) == DirExpect ||
+               hard_link_count(static_env.Dir) == DirExpectAlt ||
+               hard_link_count(static_env.Dir) == 1);
+    TEST_CHECK(hard_link_count(static_env.Dir3) == Dir3Expect ||
+               hard_link_count(static_env.Dir3) == Dir3ExpectAlt ||
+               hard_link_count(static_env.Dir3) == 1);
 
     std::error_code ec;
-    TEST_CHECK(hard_link_count(StaticEnv::Dir, ec) == DirExpect ||
-               hard_link_count(StaticEnv::Dir, ec) == DirExpectAlt ||
-               hard_link_count(StaticEnv::Dir) == 1);
-    TEST_CHECK(hard_link_count(StaticEnv::Dir3, ec) == Dir3Expect ||
-               hard_link_count(StaticEnv::Dir3, ec) == Dir3ExpectAlt ||
-               hard_link_count(StaticEnv::Dir3) == 1);
+    TEST_CHECK(hard_link_count(static_env.Dir, ec) == DirExpect ||
+               hard_link_count(static_env.Dir, ec) == DirExpectAlt ||
+               hard_link_count(static_env.Dir) == 1);
+    TEST_CHECK(hard_link_count(static_env.Dir3, ec) == Dir3Expect ||
+               hard_link_count(static_env.Dir3, ec) == Dir3ExpectAlt ||
+               hard_link_count(static_env.Dir3) == 1);
 }
 TEST_CASE(hard_link_count_increments_test)
 {
@@ -84,9 +85,10 @@ TEST_CASE(hard_link_count_increments_test)
 
 TEST_CASE(hard_link_count_error_cases)
 {
+    static_test_env static_env;
     const path testCases[] = {
-        StaticEnv::BadSymlink,
-        StaticEnv::DNE
+        static_env.BadSymlink,
+        static_env.DNE
     };
     const uintmax_t expect = static_cast<uintmax_t>(-1);
     for (auto& TC : testCases) {
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
index 6cfbc340c44f4..c584605d6381d 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
@@ -62,7 +62,8 @@ TEST_CASE(is_block_file_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_block_file(p) == false);
 }
 
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
index 12e66019b65c3..90690a211e6b1 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
@@ -62,7 +62,8 @@ TEST_CASE(is_character_file_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_character_file(p) == false);
 }
 
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
index 35dd5d40f7b0d..1059a648641e8 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -63,15 +62,17 @@ TEST_CASE(is_directory_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_directory(p) == false);
 }
 
 TEST_CASE(static_env_test)
 {
-    TEST_CHECK(is_directory(StaticEnv::Dir));
-    TEST_CHECK(is_directory(StaticEnv::SymlinkToDir));
-    TEST_CHECK(!is_directory(StaticEnv::File));
+    static_test_env static_env;
+    TEST_CHECK(is_directory(static_env.Dir));
+    TEST_CHECK(is_directory(static_env.SymlinkToDir));
+    TEST_CHECK(!is_directory(static_env.File));
 }
 
 TEST_CASE(test_is_directory_fails)
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
index e3393240aa406..606cebbf437e9 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -36,7 +35,8 @@ TEST_CASE(signature_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     std::error_code ec;
     TEST_CHECK(is_empty(p, ec) == false);
     TEST_CHECK(ec);
@@ -45,8 +45,9 @@ TEST_CASE(test_exist_not_found)
 
 TEST_CASE(test_is_empty_directory)
 {
-    TEST_CHECK(!is_empty(StaticEnv::Dir));
-    TEST_CHECK(!is_empty(StaticEnv::SymlinkToDir));
+    static_test_env static_env;
+    TEST_CHECK(!is_empty(static_env.Dir));
+    TEST_CHECK(!is_empty(static_env.SymlinkToDir));
 }
 
 TEST_CASE(test_is_empty_directory_dynamic)
@@ -59,8 +60,9 @@ TEST_CASE(test_is_empty_directory_dynamic)
 
 TEST_CASE(test_is_empty_file)
 {
-    TEST_CHECK(is_empty(StaticEnv::EmptyFile));
-    TEST_CHECK(!is_empty(StaticEnv::NonEmptyFile));
+    static_test_env static_env;
+    TEST_CHECK(is_empty(static_env.EmptyFile));
+    TEST_CHECK(!is_empty(static_env.NonEmptyFile));
 }
 
 TEST_CASE(test_is_empty_fails)
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
index 4dcf043903641..1488a89165308 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
@@ -62,7 +62,8 @@ TEST_CASE(is_fifo_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_fifo(p) == false);
 }
 
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
index 894cc57aeba26..1381eb5ae6e47 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
@@ -62,7 +62,8 @@ TEST_CASE(is_other_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_other(p) == false);
 }
 
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
index 3fcf779b3213d..5c5e1ea07c750 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
@@ -62,7 +62,8 @@ TEST_CASE(is_regular_file_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_regular_file(p) == false);
     std::error_code ec;
     TEST_CHECK(is_regular_file(p, ec) == false);
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
index ba727d6384c43..3f35c50f60117 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
@@ -62,7 +62,8 @@ TEST_CASE(is_socket_status_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_socket(p) == false);
 }
 
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
index 75ab6059f4d89..14da9cb7fa958 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -63,16 +62,17 @@ TEST_CASE(is_symlink_status_test)
 
 TEST_CASE(static_env_test)
 {
+    static_test_env static_env;
     struct TestCase {
         path p;
         bool expect;
     };
     const TestCase testCases[] = {
-        {StaticEnv::File, false},
-        {StaticEnv::Dir, false},
-        {StaticEnv::SymlinkToFile, true},
-        {StaticEnv::SymlinkToDir, true},
-        {StaticEnv::BadSymlink, true}
+        {static_env.File, false},
+        {static_env.Dir, false},
+        {static_env.SymlinkToFile, true},
+        {static_env.SymlinkToDir, true},
+        {static_env.BadSymlink, true}
     };
     for (auto& TC : testCases) {
         TEST_CHECK(is_symlink(TC.p) == TC.expect);
@@ -81,7 +81,8 @@ TEST_CASE(static_env_test)
 
 TEST_CASE(test_exist_not_found)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     TEST_CHECK(is_symlink(p) == false);
     std::error_code ec;
     TEST_CHECK(is_symlink(p, ec) == false);
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
index 3ef4c53303ad8..e71da60a65e92 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -350,27 +349,28 @@ TEST_CASE(signature_test)
 
 TEST_CASE(read_last_write_time_static_env_test)
 {
+    static_test_env static_env;
     using C = file_time_type::clock;
     file_time_type min = file_time_type::min();
     {
-        file_time_type ret = last_write_time(StaticEnv::File);
+        file_time_type ret = last_write_time(static_env.File);
         TEST_CHECK(ret != min);
         TEST_CHECK(ret < C::now());
-        TEST_CHECK(CompareTime(ret, LastWriteTime(StaticEnv::File)));
+        TEST_CHECK(CompareTime(ret, LastWriteTime(static_env.File)));
 
-        file_time_type ret2 = last_write_time(StaticEnv::SymlinkToFile);
+        file_time_type ret2 = last_write_time(static_env.SymlinkToFile);
         TEST_CHECK(CompareTime(ret, ret2));
-        TEST_CHECK(CompareTime(ret2, LastWriteTime(StaticEnv::SymlinkToFile)));
+        TEST_CHECK(CompareTime(ret2, LastWriteTime(static_env.SymlinkToFile)));
     }
     {
-        file_time_type ret = last_write_time(StaticEnv::Dir);
+        file_time_type ret = last_write_time(static_env.Dir);
         TEST_CHECK(ret != min);
         TEST_CHECK(ret < C::now());
-        TEST_CHECK(CompareTime(ret, LastWriteTime(StaticEnv::Dir)));
+        TEST_CHECK(CompareTime(ret, LastWriteTime(static_env.Dir)));
 
-        file_time_type ret2 = last_write_time(StaticEnv::SymlinkToDir);
+        file_time_type ret2 = last_write_time(static_env.SymlinkToDir);
         TEST_CHECK(CompareTime(ret, ret2));
-        TEST_CHECK(CompareTime(ret2, LastWriteTime(StaticEnv::SymlinkToDir)));
+        TEST_CHECK(CompareTime(ret2, LastWriteTime(static_env.SymlinkToDir)));
     }
 }
 
@@ -564,7 +564,8 @@ TEST_CASE(test_write_max_time) {
 
 TEST_CASE(test_value_on_failure)
 {
-    const path p = StaticEnv::DNE;
+    static_test_env static_env;
+    const path p = static_env.DNE;
     std::error_code ec = GetTestEC();
     TEST_CHECK(last_write_time(p, ec) == file_time_type::min());
     TEST_CHECK(ErrorIs(ec, std::errc::no_such_file_or_directory));
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
index 2a8d8296c861e..ffe2b3890f0f2 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -42,77 +41,89 @@ TEST_CASE(test_signature_1) {
 }
 
 TEST_CASE(test_signature_2) {
-  fs::path p(StaticEnv::File);
+  static_test_env static_env;
+  fs::path p(static_env.File);
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::File));
+  TEST_CHECK(output == std::string(static_env.File));
 }
 
 TEST_CASE(test_signature_3) {
-  fs::path p(StaticEnv::Dir);
+  static_test_env static_env;
+  fs::path p(static_env.Dir);
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir));
+  TEST_CHECK(output == std::string(static_env.Dir));
 }
 
 TEST_CASE(test_signature_4) {
-  fs::path p(StaticEnv::SymlinkToDir);
+  static_test_env static_env;
+  fs::path p(static_env.SymlinkToDir);
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir));
+  TEST_CHECK(output == std::string(static_env.Dir));
 }
 
 TEST_CASE(test_signature_5) {
-  fs::path p(StaticEnv::SymlinkToDir / "dir2/.");
+  static_test_env static_env;
+  fs::path p(static_env.SymlinkToDir / "dir2/.");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir / "dir2"));
+  TEST_CHECK(output == std::string(static_env.Dir / "dir2"));
 }
 
 TEST_CASE(test_signature_6) {
+  static_test_env static_env;
   // FIXME? If the trailing separator occurs in a part of the path that exists,
   // it is omitted. Otherwise it is added to the end of the result.
-  fs::path p(StaticEnv::SymlinkToDir / "dir2/./");
+  fs::path p(static_env.SymlinkToDir / "dir2/./");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir / "dir2"));
+  TEST_CHECK(output == std::string(static_env.Dir / "dir2"));
 }
 
 TEST_CASE(test_signature_7) {
-  fs::path p(StaticEnv::SymlinkToDir / "dir2/DNE/./");
+  static_test_env static_env;
+  fs::path p(static_env.SymlinkToDir / "dir2/DNE/./");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir / "dir2/DNE/"));
+  TEST_CHECK(output == std::string(static_env.Dir / "dir2/DNE/"));
 }
 
 TEST_CASE(test_signature_8) {
-  fs::path p(StaticEnv::SymlinkToDir / "dir2");
+  static_test_env static_env;
+  fs::path p(static_env.SymlinkToDir / "dir2");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir2));
+  TEST_CHECK(output == std::string(static_env.Dir2));
 }
 
 TEST_CASE(test_signature_9) {
-  fs::path p(StaticEnv::SymlinkToDir / "dir2/../dir2/DNE/..");
+  static_test_env static_env;
+  fs::path p(static_env.SymlinkToDir / "dir2/../dir2/DNE/..");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir2 / ""));
+  TEST_CHECK(output == std::string(static_env.Dir2 / ""));
 }
 
 TEST_CASE(test_signature_10) {
-  fs::path p(StaticEnv::SymlinkToDir / "dir2/dir3/../DNE/DNE2");
+  static_test_env static_env;
+  fs::path p(static_env.SymlinkToDir / "dir2/dir3/../DNE/DNE2");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir2 / "DNE/DNE2"));
+  TEST_CHECK(output == std::string(static_env.Dir2 / "DNE/DNE2"));
 }
 
 TEST_CASE(test_signature_11) {
-  fs::path p(StaticEnv::Dir / "../dir1");
+  static_test_env static_env;
+  fs::path p(static_env.Dir / "../dir1");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir));
+  TEST_CHECK(output == std::string(static_env.Dir));
 }
 
 TEST_CASE(test_signature_12) {
-  fs::path p(StaticEnv::Dir / "./.");
+  static_test_env static_env;
+  fs::path p(static_env.Dir / "./.");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir));
+  TEST_CHECK(output == std::string(static_env.Dir));
 }
 
 TEST_CASE(test_signature_13) {
-  fs::path p(StaticEnv::Dir / "DNE/../foo");
+  static_test_env static_env;
+  fs::path p(static_env.Dir / "DNE/../foo");
   const fs::path output = fs::weakly_canonical(p);
-  TEST_CHECK(output == std::string(StaticEnv::Dir / "foo"));
+  TEST_CHECK(output == std::string(static_env.Dir / "foo"));
 }
 
 TEST_SUITE_END()
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
index 4a0936b920db4..6de0b10c6e549 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -45,6 +44,7 @@ TEST_CASE(signature_test)
 
 TEST_CASE(test_error_reporting)
 {
+    static_test_env static_env;
     auto checkThrow = [](path const& f, const std::error_code& ec)
     {
 #ifndef TEST_HAS_NO_EXCEPTIONS
@@ -63,8 +63,8 @@ TEST_CASE(test_error_reporting)
     };
     const path cases[] = {
         "",
-        StaticEnv::DNE,
-        StaticEnv::BadSymlink
+        static_env.DNE,
+        static_env.BadSymlink
     };
     for (auto& p : cases) {
         const auto expect = static_cast<std::uintmax_t>(-1);
@@ -80,11 +80,13 @@ TEST_CASE(test_error_reporting)
 
 TEST_CASE(basic_space_test)
 {
+    static_test_env static_env;
+
     // All the test cases should reside on the same filesystem and therefore
     // should have the same expected result. Compute this expected result
     // one and check that it looks semi-sane.
     struct statvfs expect;
-    TEST_REQUIRE(::statvfs(StaticEnv::Dir.c_str(), &expect) != -1);
+    TEST_REQUIRE(::statvfs(static_env.Dir.c_str(), &expect) != -1);
     TEST_CHECK(expect.f_bavail > 0);
     TEST_CHECK(expect.f_bfree > 0);
     TEST_CHECK(expect.f_bsize > 0);
@@ -106,11 +108,11 @@ TEST_CASE(basic_space_test)
     // Currently 5% of capacity
     const std::uintmax_t delta = expect_capacity / 20;
     const path cases[] = {
-        StaticEnv::File,
-        StaticEnv::Dir,
-        StaticEnv::Dir2,
-        StaticEnv::SymlinkToFile,
-        StaticEnv::SymlinkToDir
+        static_env.File,
+        static_env.Dir,
+        static_env.Dir2,
+        static_env.SymlinkToFile,
+        static_env.SymlinkToDir
     };
     for (auto& p : cases) {
         std::error_code ec = GetTestEC();
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
index 60c99ee923d5a..af4ddb9c89061 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -34,11 +33,12 @@ TEST_CASE(signature_test)
 
 TEST_CASE(test_status_not_found)
 {
+    static_test_env static_env;
     const std::error_code expect_ec =
         std::make_error_code(std::errc::no_such_file_or_directory);
     const path cases[] {
-        StaticEnv::DNE,
-        StaticEnv::BadSymlink
+        static_env.DNE,
+        static_env.BadSymlink
     };
     for (auto& p : cases) {
         std::error_code ec = std::make_error_code(std::errc::address_in_use);
@@ -103,17 +103,18 @@ TEST_CASE(test_status_cannot_resolve)
 
 TEST_CASE(status_file_types_test)
 {
+    static_test_env static_env;
     scoped_test_env env;
     struct TestCase {
       path p;
       file_type expect_type;
     } cases[] = {
-        {StaticEnv::File, file_type::regular},
-        {StaticEnv::SymlinkToFile, file_type::regular},
-        {StaticEnv::Dir, file_type::directory},
-        {StaticEnv::SymlinkToDir, file_type::directory},
+        {static_env.File, file_type::regular},
+        {static_env.SymlinkToFile, file_type::regular},
+        {static_env.Dir, file_type::directory},
+        {static_env.SymlinkToDir, file_type::directory},
         // Block files tested elsewhere
-        {StaticEnv::CharFile, file_type::character},
+        {static_env.CharFile, file_type::character},
 #if !defined(__APPLE__) && !defined(__FreeBSD__) // No support for domain sockets
         {env.create_socket("socket"), file_type::socket},
 #endif
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
index 350076fcf3709..e883739b7d5ba 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -34,10 +33,11 @@ TEST_CASE(signature_test)
 
 TEST_CASE(test_symlink_status_not_found)
 {
+    static_test_env static_env;
     const std::error_code expect_ec =
         std::make_error_code(std::errc::no_such_file_or_directory);
     const path cases[] {
-        StaticEnv::DNE
+        static_env.DNE
     };
     for (auto& p : cases) {
         std::error_code ec = std::make_error_code(std::errc::address_in_use);
@@ -110,18 +110,19 @@ TEST_CASE(test_symlink_status_cannot_resolve)
 
 TEST_CASE(symlink_status_file_types_test)
 {
+    static_test_env static_env;
     scoped_test_env env;
     struct TestCase {
       path p;
       file_type expect_type;
     } cases[] = {
-        {StaticEnv::BadSymlink, file_type::symlink},
-        {StaticEnv::File, file_type::regular},
-        {StaticEnv::SymlinkToFile, file_type::symlink},
-        {StaticEnv::Dir, file_type::directory},
-        {StaticEnv::SymlinkToDir, file_type::symlink},
+        {static_env.BadSymlink, file_type::symlink},
+        {static_env.File, file_type::regular},
+        {static_env.SymlinkToFile, file_type::symlink},
+        {static_env.Dir, file_type::directory},
+        {static_env.SymlinkToDir, file_type::symlink},
         // Block files tested elsewhere
-        {StaticEnv::CharFile, file_type::character},
+        {static_env.CharFile, file_type::character},
 #if !defined(__APPLE__) && !defined(__FreeBSD__) // No support for domain sockets
         {env.create_socket("socket"), file_type::socket},
 #endif
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
index 35d97b8703d59..c2c86bc35a6b4 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FILE_DEPENDENCIES: ../../Inputs/static_test_env
 // UNSUPPORTED: c++98, c++03
 
 // <filesystem>
@@ -27,6 +26,9 @@
 
 
 int main(int, char**) {
+
+  static_test_env static_env;
+
   // clang-format off
   struct {
     std::string input;
@@ -41,20 +43,20 @@ int main(int, char**) {
       {"a/b", fs::current_path() / "a/b"},
       {"a", fs::current_path() / "a"},
       {"a/b/", fs::current_path() / "a/b/"},
-      {StaticEnv::File, StaticEnv::File},
-      {StaticEnv::Dir, StaticEnv::Dir},
-      {StaticEnv::SymlinkToDir, StaticEnv::Dir},
-      {StaticEnv::SymlinkToDir / "dir2/.", StaticEnv::Dir / "dir2"},
+      {static_env.File, static_env.File},
+      {static_env.Dir, static_env.Dir},
+      {static_env.SymlinkToDir, static_env.Dir},
+      {static_env.SymlinkToDir / "dir2/.", static_env.Dir / "dir2"},
       // FIXME? If the trailing separator occurs in a part of the path that exists,
       // it is omitted. Otherwise it is added to the end of the result.
-      {StaticEnv::SymlinkToDir / "dir2/./", StaticEnv::Dir / "dir2"},
-      {StaticEnv::SymlinkToDir / "dir2/DNE/./", StaticEnv::Dir / "dir2/DNE/"},
-      {StaticEnv::SymlinkToDir / "dir2", StaticEnv::Dir2},
-      {StaticEnv::SymlinkToDir / "dir2/../dir2/DNE/..", StaticEnv::Dir2 / ""},
-      {StaticEnv::SymlinkToDir / "dir2/dir3/../DNE/DNE2", StaticEnv::Dir2 / "DNE/DNE2"},
-      {StaticEnv::Dir / "../dir1", StaticEnv::Dir},
-      {StaticEnv::Dir / "./.", StaticEnv::Dir},
-      {StaticEnv::Dir / "DNE/../foo", StaticEnv::Dir / "foo"}
+      {static_env.SymlinkToDir / "dir2/./", static_env.Dir / "dir2"},
+      {static_env.SymlinkToDir / "dir2/DNE/./", static_env.Dir / "dir2/DNE/"},
+      {static_env.SymlinkToDir / "dir2", static_env.Dir2},
+      {static_env.SymlinkToDir / "dir2/../dir2/DNE/..", static_env.Dir2 / ""},
+      {static_env.SymlinkToDir / "dir2/dir3/../DNE/DNE2", static_env.Dir2 / "DNE/DNE2"},
+      {static_env.Dir / "../dir1", static_env.Dir},
+      {static_env.Dir / "./.", static_env.Dir},
+      {static_env.Dir / "DNE/../foo", static_env.Dir / "foo"}
   };
   // clang-format on
   int ID = 0;
diff --git a/libcxx/test/support/filesystem_test_helper.h b/libcxx/test/support/filesystem_test_helper.h
index 8553317de76f6..5cccca9f0493e 100644
--- a/libcxx/test/support/filesystem_test_helper.h
+++ b/libcxx/test/support/filesystem_test_helper.h
@@ -3,7 +3,8 @@
 
 #include "filesystem_include.h"
 
-#include <unistd.h> // for ftruncate
+#include <sys/stat.h> // for mkdir, mkfifo
+#include <unistd.h> // for ftruncate, link, symlink, getcwd, chdir
 
 #include <cassert>
 #include <cstdio> // for printf
@@ -24,89 +25,6 @@
 # include <sys/un.h>
 #endif
 
-// static test helpers
-
-namespace StaticEnv {
-
-// Tests that use these utilities should add '<...>/Inputs/static_test_env'
-// to their FILE_DEPENDENCIES, to make sure the directory is made available
-// to the test. Assuming that, the 'static_test_env' will be available in the
-// directory where the test is run.
-static const fs::path Root = fs::current_path() / "static_test_env";
-
-inline fs::path makePath(fs::path const& p) {
-    // env_path is expected not to contain symlinks.
-    fs::path const& env_path = Root;
-    return env_path / p;
-}
-
-static const fs::path TestFileList[] = {
-        makePath("empty_file"),
-        makePath("non_empty_file"),
-        makePath("dir1/file1"),
-        makePath("dir1/file2")
-};
-const std::size_t TestFileListSize = sizeof(TestFileList) / sizeof(fs::path);
-
-static const fs::path TestDirList[] = {
-        makePath("dir1"),
-        makePath("dir1/dir2"),
-        makePath("dir1/dir2/dir3")
-};
-const std::size_t TestDirListSize = sizeof(TestDirList) / sizeof(fs::path);
-
-static const fs::path File          = TestFileList[0];
-static const fs::path Dir           = TestDirList[0];
-static const fs::path Dir2          = TestDirList[1];
-static const fs::path Dir3          = TestDirList[2];
-static const fs::path SymlinkToFile = makePath("symlink_to_empty_file");
-static const fs::path SymlinkToDir  = makePath("symlink_to_dir");
-static const fs::path BadSymlink    = makePath("bad_symlink");
-static const fs::path DNE           = makePath("DNE");
-static const fs::path EmptyFile     = TestFileList[0];
-static const fs::path NonEmptyFile  = TestFileList[1];
-static const fs::path CharFile      = "/dev/null"; // Hopefully this exists
-
-static const fs::path DirIterationList[] = {
-    makePath("dir1/dir2"),
-    makePath("dir1/file1"),
-    makePath("dir1/file2")
-};
-const std::size_t DirIterationListSize = sizeof(DirIterationList)
-                                        / sizeof(fs::path);
-
-static const fs::path DirIterationListDepth1[] = {
-    makePath("dir1/dir2/afile3"),
-    makePath("dir1/dir2/dir3"),
-    makePath("dir1/dir2/symlink_to_dir3"),
-    makePath("dir1/dir2/file4"),
-};
-
-static const fs::path RecDirIterationList[] = {
-    makePath("dir1/dir2"),
-    makePath("dir1/file1"),
-    makePath("dir1/file2"),
-    makePath("dir1/dir2/afile3"),
-    makePath("dir1/dir2/dir3"),
-    makePath("dir1/dir2/symlink_to_dir3"),
-    makePath("dir1/dir2/file4"),
-    makePath("dir1/dir2/dir3/file5")
-};
-
-static const fs::path RecDirFollowSymlinksIterationList[] = {
-    makePath("dir1/dir2"),
-    makePath("dir1/file1"),
-    makePath("dir1/file2"),
-    makePath("dir1/dir2/afile3"),
-    makePath("dir1/dir2/dir3"),
-    makePath("dir1/dir2/file4"),
-    makePath("dir1/dir2/dir3/file5"),
-    makePath("dir1/dir2/symlink_to_dir3"),
-    makePath("dir1/dir2/symlink_to_dir3/file5"),
-};
-
-} // namespace StaticEnv
-
 namespace random_utils {
 inline char to_hex(int ch) {
   return ch < 10 ? static_cast<char>('0' + ch)
@@ -207,17 +125,18 @@ struct scoped_test_env
 
     std::string create_dir(std::string filename) {
         filename = sanitize_path(std::move(filename));
-        std::string cmd = "mkdir " + filename;
-        int ret = std::system(cmd.c_str());
+        int ret = ::mkdir(filename.c_str(), 0777); // rwxrwxrwx mode
         assert(ret == 0);
         return filename;
     }
 
-    std::string create_symlink(std::string source, std::string to) {
-        source = sanitize_path(std::move(source));
+    std::string create_symlink(std::string source,
+                               std::string to,
+                               bool sanitize_source = true) {
+        if (sanitize_source)
+            source = sanitize_path(std::move(source));
         to = sanitize_path(std::move(to));
-        std::string cmd = "ln -s " + source + ' ' + to;
-        int ret = std::system(cmd.c_str());
+        int ret = ::symlink(source.c_str(), to.c_str());
         assert(ret == 0);
         return to;
     }
@@ -225,16 +144,14 @@ struct scoped_test_env
     std::string create_hardlink(std::string source, std::string to) {
         source = sanitize_path(std::move(source));
         to = sanitize_path(std::move(to));
-        std::string cmd = "ln " + source + ' ' + to;
-        int ret = std::system(cmd.c_str());
+        int ret = ::link(source.c_str(), to.c_str());
         assert(ret == 0);
         return to;
     }
 
     std::string create_fifo(std::string file) {
         file = sanitize_path(std::move(file));
-        std::string cmd = "mkfifo " + file;
-        int ret = std::system(cmd.c_str());
+        int ret = ::mkfifo(file.c_str(), 0666); // rw-rw-rw- mode
         assert(ret == 0);
         return file;
     }
@@ -276,6 +193,131 @@ struct scoped_test_env
     }
 };
 
+/// Builds the following tree inside its own scoped_test_env (see Root):
+///
+///     static_test_env
+///     ├── bad_symlink -> dne
+///     ├── dir1
+///     │   ├── dir2
+///     │   │   ├── afile3
+///     │   │   ├── dir3
+///     │   │   │   └── file5
+///     │   │   ├── file4
+///     │   │   └── symlink_to_dir3 -> dir3
+///     │   ├── file1
+///     │   └── file2
+///     ├── empty_file
+///     ├── non_empty_file
+///     ├── symlink_to_dir -> dir1
+///     └── symlink_to_empty_file -> empty_file
+///
+class static_test_env {
+    scoped_test_env env_;
+public:
+    static_test_env() {
+        env_.create_symlink("dne", "bad_symlink", false);
+        env_.create_dir("dir1");
+        env_.create_dir("dir1/dir2");
+        env_.create_file("dir1/dir2/afile3");
+        env_.create_dir("dir1/dir2/dir3");
+        env_.create_file("dir1/dir2/dir3/file5");
+        env_.create_file("dir1/dir2/file4");
+        env_.create_symlink("dir3", "dir1/dir2/symlink_to_dir3", false);
+        env_.create_file("dir1/file1");
+        env_.create_file("dir1/file2", 42);
+        env_.create_file("empty_file");
+        env_.create_file("non_empty_file", 42);
+        env_.create_symlink("dir1", "symlink_to_dir", false);
+        env_.create_symlink("empty_file", "symlink_to_empty_file", false);
+    }
+
+    const fs::path Root = env_.test_root;
+
+    fs::path makePath(fs::path const& p) const {
+        // env_path is expected not to contain symlinks.
+        fs::path const& env_path = Root;
+        return env_path / p;
+    }
+
+    const std::vector<fs::path> TestFileList = {
+        makePath("empty_file"),
+        makePath("non_empty_file"),
+        makePath("dir1/file1"),
+        makePath("dir1/file2")
+    };
+
+    const std::vector<fs::path> TestDirList = {
+        makePath("dir1"),
+        makePath("dir1/dir2"),
+        makePath("dir1/dir2/dir3")
+    };
+
+    const fs::path File          = TestFileList[0];
+    const fs::path Dir           = TestDirList[0];
+    const fs::path Dir2          = TestDirList[1];
+    const fs::path Dir3          = TestDirList[2];
+    const fs::path SymlinkToFile = makePath("symlink_to_empty_file");
+    const fs::path SymlinkToDir  = makePath("symlink_to_dir");
+    const fs::path BadSymlink    = makePath("bad_symlink");
+    const fs::path DNE           = makePath("DNE");
+    const fs::path EmptyFile     = TestFileList[0];
+    const fs::path NonEmptyFile  = TestFileList[1];
+    const fs::path CharFile      = "/dev/null"; // Hopefully this exists
+
+    const std::vector<fs::path> DirIterationList = {
+        makePath("dir1/dir2"),
+        makePath("dir1/file1"),
+        makePath("dir1/file2")
+    };
+
+    const std::vector<fs::path> DirIterationListDepth1 = {
+        makePath("dir1/dir2/afile3"),
+        makePath("dir1/dir2/dir3"),
+        makePath("dir1/dir2/symlink_to_dir3"),
+        makePath("dir1/dir2/file4"),
+    };
+
+    const std::vector<fs::path> RecDirIterationList = {
+        makePath("dir1/dir2"),
+        makePath("dir1/file1"),
+        makePath("dir1/file2"),
+        makePath("dir1/dir2/afile3"),
+        makePath("dir1/dir2/dir3"),
+        makePath("dir1/dir2/symlink_to_dir3"),
+        makePath("dir1/dir2/file4"),
+        makePath("dir1/dir2/dir3/file5")
+    };
+
+    const std::vector<fs::path> RecDirFollowSymlinksIterationList = {
+        makePath("dir1/dir2"),
+        makePath("dir1/file1"),
+        makePath("dir1/file2"),
+        makePath("dir1/dir2/afile3"),
+        makePath("dir1/dir2/dir3"),
+        makePath("dir1/dir2/file4"),
+        makePath("dir1/dir2/dir3/file5"),
+        makePath("dir1/dir2/symlink_to_dir3"),
+        makePath("dir1/dir2/symlink_to_dir3/file5"),
+    };
+};
+
+struct CWDGuard {
+  // Records the CWD on construction and chdir()s back to it on destruction.
+  // Paths longer than this buffer are assumed not to occur in the tests.
+  char OldCWD[4096];
+  CWDGuard() {
+    char* ret = ::getcwd(OldCWD, sizeof(OldCWD));
+    assert(ret && "getcwd failed");
+  }
+  ~CWDGuard() { 
+    int ret = ::chdir(OldCWD);
+    assert(ret == 0 && "chdir failed");
+  }
+
+  CWDGuard(CWDGuard const&) = delete;
+  CWDGuard& operator=(CWDGuard const&) = delete;
+};
+
 // Misc test types
 
 #define MKSTR(Str) {Str, TEST_CONCAT(L, Str), TEST_CONCAT(u, Str), TEST_CONCAT(U, Str)}

From 3873d0b3d899bb84a5983450dd2d98006c4527e2 Mon Sep 17 00:00:00 2001
From: Sergej Jaskiewicz <jaskiewiczs@icloud.com>
Date: Wed, 6 May 2020 01:10:57 +0300
Subject: [PATCH 041/770] Re-commit "[cmake] Allow std::filesystem tests in
 CrossWinToARMLinux.cmake"

https://reviews.llvm.org/D78200 has been re-committed, so we can now
enable building std::filesystem and running tests for it.
---
 clang/cmake/caches/CrossWinToARMLinux.cmake | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake
index c01c31ae5a722..3d1e961ada8d0 100644
--- a/clang/cmake/caches/CrossWinToARMLinux.cmake
+++ b/clang/cmake/caches/CrossWinToARMLinux.cmake
@@ -89,9 +89,6 @@ set(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX    OFF CACHE BOOL "")
 set(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI    OFF CACHE BOOL "")
 set(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX       OFF CACHE BOOL "")
 
-# FIXME: Remove this when https://reviews.llvm.org/D78200 is merged.
-set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") 
-
 set(LIBCXX_USE_COMPILER_RT                  ON CACHE BOOL "")
 set(LIBCXX_TARGET_TRIPLE                    "${CMAKE_C_COMPILER_TARGET}" CACHE STRING "")
 set(LIBCXX_SYSROOT                          "${DEFAULT_SYSROOT}" CACHE STRING "")

From a9b5edc5e2c4ec9d506b2c30465ee9f2dc21e5cc Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Sun, 24 May 2020 23:08:27 +0200
Subject: [PATCH 042/770] Make mlir::Value's bool conversion operator explicit

This still allows `if (value)` while requiring an explicit cast when not
in a boolean context. This means things like `std::set<Value>` will no
longer compile.

Differential Revision: https://reviews.llvm.org/D80497
---
 mlir/include/mlir/EDSC/Builders.h               | 2 +-
 mlir/include/mlir/IR/Value.h                    | 2 +-
 mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp | 3 ++-
 mlir/lib/Parser/Parser.cpp                      | 6 +++---
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/EDSC/Builders.h b/mlir/include/mlir/EDSC/Builders.h
index a6045db3d9985..70443608a2513 100644
--- a/mlir/include/mlir/EDSC/Builders.h
+++ b/mlir/include/mlir/EDSC/Builders.h
@@ -303,7 +303,7 @@ struct StructuredIndexed {
            "MemRef, RankedTensor or Vector expected");
   }
 
-  bool hasValue() const { return value; }
+  bool hasValue() const { return (bool)value; }
   Value getValue() const {
     assert(value && "StructuredIndexed Value not set.");
     return value;
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index 74f504c25156c..f5cb16f347ed3 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -92,7 +92,7 @@ class Value {
     return U(ownerAndKind);
   }
 
-  operator bool() const { return ownerAndKind.getPointer(); }
+  explicit operator bool() const { return ownerAndKind.getPointer(); }
   bool operator==(const Value &other) const {
     return ownerAndKind == other.ownerAndKind;
   }
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
index f703a8c621e69..9868a14c21651 100644
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -281,7 +281,8 @@ LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
     }
   });
 
-  assert((!options.unroll ^ result) && "Expected resulting Value iff unroll");
+  assert((!options.unroll ^ (bool)result) &&
+         "Expected resulting Value iff unroll");
   if (!result)
     result = std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
   rewriter.replaceOp(op, result);
diff --git a/mlir/lib/Parser/Parser.cpp b/mlir/lib/Parser/Parser.cpp
index f5ad1b65f1a1c..d5108a4ed29ee 100644
--- a/mlir/lib/Parser/Parser.cpp
+++ b/mlir/lib/Parser/Parser.cpp
@@ -3542,14 +3542,14 @@ ParseResult OperationParser::finalize() {
   // Check for any forward references that are left.  If we find any, error
   // out.
   if (!forwardRefPlaceholders.empty()) {
-    SmallVector<std::pair<const char *, Value>, 4> errors;
+    SmallVector<const char *, 4> errors;
     // Iteration over the map isn't deterministic, so sort by source location.
     for (auto entry : forwardRefPlaceholders)
-      errors.push_back({entry.second.getPointer(), entry.first});
+      errors.push_back(entry.second.getPointer());
     llvm::array_pod_sort(errors.begin(), errors.end());
 
     for (auto entry : errors) {
-      auto loc = SMLoc::getFromPointer(entry.first);
+      auto loc = SMLoc::getFromPointer(entry);
       emitError(loc, "use of undeclared SSA value name");
     }
     return failure();

From 77aec3b4c0e63b07d98dbb1aeb693d200e769a05 Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky <dmitry.preobrazhensky@amd.com>
Date: Mon, 25 May 2020 19:55:38 +0300
Subject: [PATCH 043/770] [AMDGPU][MC][GFX8+] Enabled clamp for v_add_u16,
 v_sub_u16 and v_subrev_u16

See https://bugs.llvm.org/show_bug.cgi?id=45926

Reviewers: arsenm, rampitec, vpykhtin

Differential Revision: https://reviews.llvm.org/D80430
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td            |  1 +
 llvm/lib/Target/AMDGPU/VOP2Instructions.td       |  6 +++---
 .../AMDGPU/GlobalISel/inst-select-add.s16.mir    | 16 ++++++++--------
 llvm/test/MC/AMDGPU/gfx10_asm_all.s              |  6 ++++++
 llvm/test/MC/AMDGPU/vop2.s                       | 15 +++++++++++++++
 .../MC/Disassembler/AMDGPU/gfx10_dasm_all.txt    |  6 ++++++
 llvm/test/MC/Disassembler/AMDGPU/vop2_vi.txt     |  9 +++++++++
 7 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 397791677eda7..b988de596c648 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2266,6 +2266,7 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
 def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
 def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
 def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
+def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], 0, /*EnableClamp=*/1>;
 
 def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
 def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 594e11ec4f54c..c1ce1b755322d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -626,9 +626,9 @@ defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub
 defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
 } // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
+defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
 defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
 defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
index cc48e9126c9b7..ad8f5df33ab07 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
@@ -19,14 +19,14 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec
+    ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
     ; GFX10-LABEL: name: add_s16
     ; GFX10: liveins: $vgpr0, $vgpr1
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec
+    ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -51,14 +51,14 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec
+    ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
     ; GFX10-LABEL: name: add_s16_zext_to_s32
     ; GFX10: liveins: $vgpr0, $vgpr1
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec
+    ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
     ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ADD_U16_e64_]], 0, 16, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]]
     %0:vgpr(s32) = COPY $vgpr0
@@ -84,13 +84,13 @@ body: |
     ; GFX6-LABEL: name: add_s16_neg_inline_const_64
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec
+    ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
     ; GFX10-LABEL: name: add_s16_neg_inline_const_64
     ; GFX10: liveins: $vgpr0
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec
+    ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -113,13 +113,13 @@ body: |
     ; GFX6-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec
+    ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
     ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
     ; GFX10: liveins: $vgpr0
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec
+    ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
     ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_SUB_U16_e64_]], 0, 16, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]]
     %0:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_all.s b/llvm/test/MC/AMDGPU/gfx10_asm_all.s
index 13072ad0c049e..aa09c30903ba2 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_all.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_all.s
@@ -58995,6 +58995,9 @@ v_add_nc_u16 v5, v1, 0.5
 v_add_nc_u16 v5, v1, -4.0
 // GFX10: encoding: [0x05,0x00,0x03,0xd7,0x01,0xef,0x01,0x00]
 
+v_add_nc_u16 v5, v1, -4.0 clamp
+// GFX10: encoding: [0x05,0x80,0x03,0xd7,0x01,0xef,0x01,0x00]
+
 v_sub_nc_u16 v5, v1, v2
 // GFX10: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
 
@@ -59073,6 +59076,9 @@ v_sub_nc_u16 v5, v1, 0.5
 v_sub_nc_u16 v5, v1, -4.0
 // GFX10: encoding: [0x05,0x00,0x04,0xd7,0x01,0xef,0x01,0x00]
 
+v_sub_nc_u16 v5, v1, -4.0 clamp
+// GFX10: encoding: [0x05,0x80,0x04,0xd7,0x01,0xef,0x01,0x00]
+
 v_mul_lo_u16 v5, v1, v2
 // GFX10: encoding: [0x05,0x00,0x05,0xd7,0x01,0x05,0x02,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/vop2.s b/llvm/test/MC/AMDGPU/vop2.s
index 552bc468b63d2..1505c8cfa44d9 100644
--- a/llvm/test/MC/AMDGPU/vop2.s
+++ b/llvm/test/MC/AMDGPU/vop2.s
@@ -435,16 +435,31 @@ v_madak_f16 v1, v2, v3, 64.0
 // VI:     v_add_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c]
 v_add_u16_e32 v1, v2, v3
 
+// NOSICI: error: invalid operand for instruction
+// NOSICI: v_add_u16 v1, v2, v3 clamp
+// VI:     v_add_u16_e64 v1, v2, v3 clamp  ; encoding: [0x01,0x80,0x26,0xd1,0x02,0x07,0x02,0x00]
+v_add_u16 v1, v2, v3 clamp
+
 // NOSICI: error: instruction not supported on this GPU
 // NOSICI: v_sub_u16_e32 v1, v2, v3
 // VI:     v_sub_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e]
 v_sub_u16_e32 v1, v2, v3
 
+// NOSICI: error: invalid operand for instruction
+// NOSICI: v_sub_u16 v1, v2, v3 clamp
+// VI:     v_sub_u16_e64 v1, v2, v3 clamp  ; encoding: [0x01,0x80,0x27,0xd1,0x02,0x07,0x02,0x00]
+v_sub_u16 v1, v2, v3 clamp
+
 // NOSICI: error: instruction not supported on this GPU
 // NOSICI: v_subrev_u16_e32 v1, v2, v3
 // VI:     v_subrev_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50]
 v_subrev_u16_e32 v1, v2, v3
 
+// NOSICI: error: invalid operand for instruction
+// NOSICI: v_subrev_u16 v1, v2, v3 clamp
+// VI:     v_subrev_u16_e64 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x28,0xd1,0x02,0x07,0x02,0x00]
+v_subrev_u16 v1, v2, v3 clamp
+
 // NOSICI: error: instruction not supported on this GPU
 // NOSICI: v_mul_lo_u16_e32 v1, v2, v3
 // VI:     v_mul_lo_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
index 6b920345d7b8a..89cbaa7f02998 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
@@ -21401,6 +21401,9 @@
 # GFX10: v_add_nc_u16_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0x04,0x02,0x00]
 0x05,0x00,0x03,0xd7,0x6a,0x04,0x02,0x00
 
+# GFX10: v_add_nc_u16_e64 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00]
+0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00
+
 # GFX10: v_add_nc_u32_e32 v255, v1, v2   ; encoding: [0x01,0x05,0xfe,0x4b]
 0x01,0x05,0xfe,0x4b
 
@@ -95808,6 +95811,9 @@
 # GFX10: v_sub_nc_u16_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0x04,0x02,0x00]
 0x05,0x00,0x04,0xd7,0x6a,0x04,0x02,0x00
 
+# GFX10: v_sub_nc_u16_e64 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00]
+0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00
+
 # GFX10: v_sub_nc_u32_e32 v255, v1, v2   ; encoding: [0x01,0x05,0xfe,0x4d]
 0x01,0x05,0xfe,0x4d
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/vop2_vi.txt b/llvm/test/MC/Disassembler/AMDGPU/vop2_vi.txt
index 4f44326a69bf7..eafbece4d96cf 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/vop2_vi.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/vop2_vi.txt
@@ -222,12 +222,21 @@
 # VI:     v_add_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c]
 0x02 0x07 0x02 0x4c
 
+# VI:     v_add_u16_e64 v1, v2, v3 clamp  ; encoding: [0x01,0x80,0x26,0xd1,0x02,0x07,0x02,0x00]
+0x01 0x80 0x26 0xd1 0x02 0x07 0x02 0x00
+
 # VI:     v_sub_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e]
 0x02 0x07 0x02 0x4e
 
+# VI:     v_sub_u16_e64 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x27,0xd1,0x02,0x07,0x02,0x00]
+0x01 0x80 0x27 0xd1 0x02 0x07 0x02 0x00
+
 # VI:     v_subrev_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50]
 0x02 0x07 0x02 0x50
 
+# VI:     v_subrev_u16_e64 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x28,0xd1,0x02,0x07,0x02,0x00]
+0x01 0x80 0x28 0xd1 0x02 0x07 0x02 0x00
+
 # VI:     v_mul_lo_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52]
 0x02 0x07 0x02 0x52
 

From 5bf2409a4e4d23018ecffe4eff39988a957e76f7 Mon Sep 17 00:00:00 2001
From: stevewan <wan.yu@ibm.com>
Date: Mon, 25 May 2020 13:43:22 -0400
Subject: [PATCH 044/770] [AIX] Add '-bcdtors:all:0:s' to linker to gather
 static init functions

Summary: On AIX, add '-bcdtors:all:0:s' to the linker implicitly through the driver so that we can collect all static constructor and destructor functions.

Reviewers: hubert.reinterpretcast, Xiangling_L, ZarkoCA, daltenty

Reviewed By: hubert.reinterpretcast

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80415
---
 clang/lib/Driver/ToolChains/AIX.cpp |  7 +++++++
 clang/test/Driver/aix-ld.c          | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 6fbff61f76565..df2e30da32a8c 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -81,6 +81,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                const InputInfoList &Inputs, const ArgList &Args,
                                const char *LinkingOutput) const {
   const AIX &ToolChain = static_cast<const AIX &>(getToolChain());
+  const Driver &D = ToolChain.getDriver();
   ArgStringList CmdArgs;
 
   const bool IsArch32Bit = ToolChain.getTriple().isArch32Bit();
@@ -129,6 +130,12 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
         Args.MakeArgString(ToolChain.GetFilePath(getCrt0Basename())));
   }
 
+  // Collect all static constructor and destructor functions in CXX mode. This
+  // has to come before AddLinkerInputs as the implied option needs to precede
+  // any other '-bcdtors' settings or '-bnocdtors' that '-Wl' might forward.
+  if (D.CCCIsCXX())
+    CmdArgs.push_back("-bcdtors:all:0:s");
+
   // Specify linker input file(s).
   AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA);
 
diff --git a/clang/test/Driver/aix-ld.c b/clang/test/Driver/aix-ld.c
index 95495718546ca..218fbd2bb3802 100644
--- a/clang/test/Driver/aix-ld.c
+++ b/clang/test/Driver/aix-ld.c
@@ -175,3 +175,21 @@
 // CHECK-LD64-NO-DEFAULT-LIBS: "-L[[SYSROOT]]/usr/lib" 
 // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lpthreads"
 // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lc"
+
+// Check powerpc-ibm-aix7.1.0.0, 32-bit. 'bcdtors' and argument order.
+// RUN: %clangxx -no-canonical-prefixes %s 2>&1 -### \
+// RUN:          -Wl,-bnocdtors \
+// RUN:          -target powerpc-ibm-aix7.1.0.0 \
+// RUN:          --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: | FileCheck --check-prefix=CHECK-LD32-CXX-ARG-ORDER %s
+
+// CHECK-LD32-CXX-ARG-ORDER:     {{.*}}clang{{.*}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32-CXX-ARG-ORDER:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD32-CXX-ARG-ORDER:     "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD32-CXX-ARG-ORDER-NOT: "-bnso"
+// CHECK-LD32-CXX-ARG-ORDER:     "-b32"
+// CHECK-LD32-CXX-ARG-ORDER:     "-bpT:0x10000000" "-bpD:0x20000000"
+// CHECK-LD32-CXX-ARG-ORDER:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
+// CHECK-LD32-CXX-ARG-ORDER:     "-bcdtors:all:0:s"
+// CHECK-LD32-CXX-ARG-ORDER:     "-bnocdtors"
+// CHECK-LD32-CXX-ARG-ORDER-NOT: "-bcdtors:all:0:s"

From b321b429416ec51691a3c5372cb59912bded5f08 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Mon, 25 May 2020 10:59:39 -0700
Subject: [PATCH 045/770] [lldb/Test] Add a trace method to replace print
 statements.

Many tests use (commented-out) print statements for debugging the test
itself. This patch adds a new trace method to lldbtest to reuse the
existing tracing infrastructure and replace these print statements.

Differential revision: https://reviews.llvm.org/D80448
---
 .../Python/lldbsuite/test/lldbtest.py         |   4 +
 .../benchmarks/stepping/TestSteppingSpeed.py  |   4 +-
 .../target/basic/TestTargetCommand.py         |   2 +-
 .../TestBreakpointConditions.py               |   4 +-
 .../serialize/TestBreakpointSerialization.py  |   2 +-
 .../TestDataFormatterSkipSummary.py           |   6 +-
 .../load_unload/TestLoadUnload.py             |   2 +-
 .../TestRegisterVariables.py                  | 164 +++++++++---------
 .../class_types/TestClassTypesDisassembly.py  |   4 +-
 .../lang/objc/blocks/TestObjCIvarsInBlocks.py |   2 +-
 .../lang/objc/foundation/TestSymbolTable.py   |  16 --
 .../breakpoint/TestBreakpointAPI.py           |   4 +-
 lldb/test/API/python_api/event/TestEvents.py  |  14 +-
 lldb/test/API/python_api/frame/TestFrames.py  |   6 +-
 .../frame/inlines/TestInlinedFrame.py         |   2 +-
 .../function_symbol/TestDisasmAPI.py          |  18 +-
 .../function_symbol/TestSymbolAPI.py          |   4 +-
 .../API/python_api/target/TestTargetAPI.py    |  16 +-
 .../API/python_api/thread/TestThreadAPI.py    |   2 +-
 .../lldb-server/TestGdbRemoteAuxvSupport.py   |   7 +-
 .../TestGdbRemoteExpeditedRegisters.py        |   5 +-
 .../lldb-server/TestGdbRemoteRegisterState.py |  15 +-
 .../tools/lldb-server/TestLldbGdbServer.py    |   5 +-
 23 files changed, 136 insertions(+), 172 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index b02181ae1ffc2..639f99463d927 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -505,6 +505,10 @@ def TraceOn(self):
         """Returns True if we are in trace mode (tracing detailed test execution)."""
         return traceAlways
 
+    def trace(self, *args,**kwargs):
+        with recording(self, self.TraceOn()) as sbuf:
+            print(*args, **kwargs, file=sbuf)
+
     @classmethod
     def setUpClass(cls):
         """
diff --git a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py
index e5a8f168b6468..60e4a42108ae3 100644
--- a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py
+++ b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py
@@ -22,8 +22,8 @@ def setUp(self):
         self.break_spec = '-n main'
         self.count = 50
 
-        #print("self.exe=%s" % self.exe)
-        #print("self.break_spec=%s" % self.break_spec)
+        self.trace("self.exe=%s" % self.exe)
+        self.trace("self.break_spec=%s" % self.break_spec)
 
     @benchmarks_test
     @no_debug_info_test
diff --git a/lldb/test/API/commands/target/basic/TestTargetCommand.py b/lldb/test/API/commands/target/basic/TestTargetCommand.py
index 9bc9396e19ed4..83e27e2724642 100644
--- a/lldb/test/API/commands/target/basic/TestTargetCommand.py
+++ b/lldb/test/API/commands/target/basic/TestTargetCommand.py
@@ -82,7 +82,7 @@ def do_target_command(self):
                 if match:
                     # We will start from (index + 1) ....
                     base = int(match.group(1), 10) + 1
-                    #print("base is:", base)
+                    self.trace("base is:", base)
                     break
 
         self.runCmd("target create " + exe_a, CURRENT_EXECUTABLE_SET)
diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py b/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py
index de9a47d8c2022..a5f9458c05a0e 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py
@@ -126,7 +126,7 @@ def breakpoint_conditions_python(self):
 
         # Now create a breakpoint on main.c by name 'c'.
         breakpoint = target.BreakpointCreateByName('c', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -194,7 +194,7 @@ def breakpoint_invalid_conditions_python(self):
 
         # Now create a breakpoint on main.c by name 'c'.
         breakpoint = target.BreakpointCreateByName('c', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
diff --git a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py
index afeccbef3bae7..6a3f40ff3a35b 100644
--- a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py
+++ b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py
@@ -118,7 +118,7 @@ def check_equivalence(self, source_bps, do_write = True):
             copy_text = copy_desc.GetData()
 
             # These two should be identical.
-            # print ("Source text for %d is %s."%(i, source_text))
+            self.trace("Source text for %d is %s."%(i, source_text))
             self.assertTrue (source_text == copy_text, "Source and dest breakpoints are not identical: \nsource: %s\ndest: %s"%(source_text, copy_text))
 
     def do_check_resolvers(self):
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-skip-summary/TestDataFormatterSkipSummary.py b/lldb/test/API/functionalities/data-formatter/data-formatter-skip-summary/TestDataFormatterSkipSummary.py
index fa13e922ce4ae..f5cf525427c8f 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-skip-summary/TestDataFormatterSkipSummary.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-skip-summary/TestDataFormatterSkipSummary.py
@@ -147,13 +147,13 @@ def cleanup():
             import re
             gcc_version_output = system(
                 [[lldbutil.which(self.getCompiler()), "-v"]])[1]
-            #print("my output:", gcc_version_output)
+            self.trace("my output:", gcc_version_output)
             for line in gcc_version_output.split(os.linesep):
                 m = re.search('\(Apple Inc\. build ([0-9]+)\)', line)
-                #print("line:", line)
+                self.trace("line:", line)
                 if m:
                     gcc_build = int(m.group(1))
-                    #print("gcc build:", gcc_build)
+                    self.trace("gcc build:", gcc_build)
                     if gcc_build >= 5666:
                         # rdar://problem/9804600"
                         self.skipTest(
diff --git a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py
index e0013ccd93fa6..853c0b2cea201 100644
--- a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py
+++ b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py
@@ -267,7 +267,7 @@ def run_lldb_process_load_and_unload_commands(self):
         output = self.res.GetOutput()
         pattern = re.compile("Image ([0-9]+) loaded")
         for l in output.split(os.linesep):
-            #print("l:", l)
+            self.trace("l:", l)
             match = pattern.search(l)
             if match:
                 break
diff --git a/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py b/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py
index af0ad2a08719d..51b728be2fe6a 100644
--- a/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py
+++ b/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py
@@ -1,91 +1,11 @@
 """Check that compiler-generated register values work correctly"""
 
-from __future__ import print_function
-
 import re
 import lldb
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 from lldbsuite.test import lldbutil
 
-# This method attempts to figure out if a given variable
-# is in a register.
-#
-# Return:
-#   True if the value has a readable value and is in a register
-#   False otherwise
-
-
-def is_variable_in_register(frame, var_name):
-    # Ensure we can lookup the variable.
-    var = frame.FindVariable(var_name)
-    # print("\nchecking {}...".format(var_name))
-    if var is None or not var.IsValid():
-        # print("{} cannot be found".format(var_name))
-        return False
-
-    # Check that we can get its value.  If not, this
-    # may be a variable that is just out of scope at this point.
-    value = var.GetValue()
-    # print("checking value...")
-    if value is None:
-        # print("value is invalid")
-        return False
-    # else:
-        # print("value is {}".format(value))
-
-    # We have a variable and we can get its value.  The variable is in
-    # a register if we cannot get an address for it, assuming it is
-    # not a struct pointer.  (This is an approximation - compilers can
-    # do other things with spitting up a value into multiple parts of
-    # multiple registers, but what we're verifying here is much more
-    # than it was doing before).
-    var_addr = var.GetAddress()
-    # print("checking address...")
-    if var_addr.IsValid():
-        # We have an address, it must not be in a register.
-        # print("var {} is not in a register: has a valid address {}".format(var_name, var_addr))
-        return False
-    else:
-        # We don't have an address but we can read the value.
-        # It is likely stored in a register.
-        # print("var {} is in a register (we don't have an address for it)".format(var_name))
-        return True
-
-
-def is_struct_pointer_in_register(frame, var_name, trace):
-    # Ensure we can lookup the variable.
-    var = frame.FindVariable(var_name)
-    if trace:
-        print("\nchecking {}...".format(var_name))
-
-    if var is None or not var.IsValid():
-        # print("{} cannot be found".format(var_name))
-        return False
-
-    # Check that we can get its value.  If not, this
-    # may be a variable that is just out of scope at this point.
-    value = var.GetValue()
-    # print("checking value...")
-    if value is None:
-        if trace:
-            print("value is invalid")
-        return False
-    else:
-        if trace:
-             print("value is {}".format(value))
-
-    var_loc = var.GetLocation()
-    if trace:
-        print("checking location: {}".format(var_loc))
-    if var_loc is None or var_loc.startswith("0x"):
-        # The frame var is not in a register but rather a memory location.
-        # print("frame var {} is not in a register".format(var_name))
-        return False
-    else:
-        # print("frame var {} is in a register".format(var_name))
-        return True
-
 
 def re_expr_equals(val_type, val):
     # Match ({val_type}) ${sum_digits} = {val}
@@ -136,12 +56,12 @@ def test_and_run_command(self):
         # Try some variables that should be visible
         frame = self.dbg.GetSelectedTarget().GetProcess(
         ).GetSelectedThread().GetSelectedFrame()
-        if is_variable_in_register(frame, 'a'):
+        if self.is_variable_in_register(frame, 'a'):
             register_variables_count += 1
             self.expect("expr a", VARIABLES_DISPLAYED_CORRECTLY,
                         patterns=[re_expr_equals('int', 2)])
 
-        if is_struct_pointer_in_register(frame, 'b', self.TraceOn()):
+        if self.is_struct_pointer_in_register(frame, 'b', self.TraceOn()):
             register_variables_count += 1
             self.expect("expr b->m1", VARIABLES_DISPLAYED_CORRECTLY,
                         patterns=[re_expr_equals('int', 3)])
@@ -163,12 +83,12 @@ def test_and_run_command(self):
         # Try some variables that should be visible
         frame = self.dbg.GetSelectedTarget().GetProcess(
         ).GetSelectedThread().GetSelectedFrame()
-        if is_struct_pointer_in_register(frame, 'b', self.TraceOn()):
+        if self.is_struct_pointer_in_register(frame, 'b', self.TraceOn()):
             register_variables_count += 1
             self.expect("expr b->m2", VARIABLES_DISPLAYED_CORRECTLY,
                         patterns=[re_expr_equals('int', 5)])
 
-        if is_variable_in_register(frame, 'c'):
+        if self.is_variable_in_register(frame, 'c'):
             register_variables_count += 1
             self.expect("expr c", VARIABLES_DISPLAYED_CORRECTLY,
                         patterns=[re_expr_equals('int', 5)])
@@ -190,7 +110,7 @@ def test_and_run_command(self):
         # Try some variables that should be visible
         frame = self.dbg.GetSelectedTarget().GetProcess(
         ).GetSelectedThread().GetSelectedFrame()
-        if is_variable_in_register(frame, 'f'):
+        if self.is_variable_in_register(frame, 'f'):
             register_variables_count += 1
             self.expect("expr f", VARIABLES_DISPLAYED_CORRECTLY,
                         patterns=[re_expr_equals('float', '3.1')])
@@ -199,6 +119,78 @@ def test_and_run_command(self):
         self.assertTrue(
             register_variables_count > 0,
             "expected to verify at least one variable in a register")
-        # print("executed {} expressions with values in registers".format(register_variables_count))
+        self.trace("executed {} expressions with values in registers".format(register_variables_count))
 
         self.runCmd("kill")
+
+
+    def is_variable_in_register(self, frame, var_name):
+        # Ensure we can lookup the variable.
+        var = frame.FindVariable(var_name)
+        self.trace("\nchecking {}...".format(var_name))
+        if var is None or not var.IsValid():
+            self.trace("{} cannot be found".format(var_name))
+            return False
+
+        # Check that we can get its value.  If not, this
+        # may be a variable that is just out of scope at this point.
+        value = var.GetValue()
+        self.trace("checking value...")
+        if value is None:
+            self.trace("value is invalid")
+            return False
+        else:
+            self.trace("value is {}".format(value))
+
+        # We have a variable and we can get its value.  The variable is in a
+        # register if we cannot get an address for it, assuming it is not a
+        # struct pointer.  (This is an approximation - compilers can do other
+        # things with splitting up a value into multiple parts of multiple
+        # registers, but what we're verifying here is much more than it was
+        # doing before).
+        var_addr = var.GetAddress()
+        self.trace("checking address...")
+        if var_addr.IsValid():
+            # We have an address, it must not be in a register.
+            self.trace("var {} is not in a register: has a valid address {}".format(var_name, var_addr))
+            return False
+        else:
+            # We don't have an address but we can read the value.
+            # It is likely stored in a register.
+            self.trace("var {} is in a register (we don't have an address for it)".format(var_name))
+            return True
+
+
+    def is_struct_pointer_in_register(self, frame, var_name, trace):
+        # Ensure we can lookup the variable.
+        var = frame.FindVariable(var_name)
+        if trace:
+            print("\nchecking {}...".format(var_name))
+
+        if var is None or not var.IsValid():
+            self.trace("{} cannot be found".format(var_name))
+            return False
+
+        # Check that we can get its value.  If not, this
+        # may be a variable that is just out of scope at this point.
+        value = var.GetValue()
+        self.trace("checking value...")
+        if value is None:
+            if trace:
+                print("value is invalid")
+            return False
+        else:
+            if trace:
+                print("value is {}".format(value))
+
+        var_loc = var.GetLocation()
+        if trace:
+            print("checking location: {}".format(var_loc))
+        if var_loc is None or var_loc.startswith("0x"):
+            # The frame var is not in a register but rather a memory location.
+            self.trace("frame var {} is not in a register".format(var_name))
+            return False
+        else:
+            self.trace("frame var {} is in a register".format(var_name))
+            return True
+
diff --git a/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py b/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py
index ad187d0394b6d..9f3d6806451e7 100644
--- a/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py
+++ b/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py
@@ -33,8 +33,8 @@ def test_and_run_command(self):
             match = frameRE.search(line)
             if match:
                 function = match.group(1)
-                #print("line:", line)
-                #print("function:", function)
+                self.trace("line:", line)
+                self.trace("function:", function)
                 self.runCmd("disassemble -n '%s'" % function)
 
     @add_test_categories(['pyapi'])
diff --git a/lldb/test/API/lang/objc/blocks/TestObjCIvarsInBlocks.py b/lldb/test/API/lang/objc/blocks/TestObjCIvarsInBlocks.py
index e790e6e9d96e4..9eb8931fb4a0e 100644
--- a/lldb/test/API/lang/objc/blocks/TestObjCIvarsInBlocks.py
+++ b/lldb/test/API/lang/objc/blocks/TestObjCIvarsInBlocks.py
@@ -125,7 +125,7 @@ def test_with_python_api(self):
             expr, "Successfully got a local variable in a block in a class method.")
 
         ret_value_signed = expr.GetValueAsSigned(error)
-        # print('ret_value_signed = %i' % (ret_value_signed))
+        self.trace('ret_value_signed = %i' % (ret_value_signed))
         self.assertTrue(
             ret_value_signed == 5,
             "The local variable in the block was what we expected.")
diff --git a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
index abfc7621e2e7d..b77a8dfc0ed90 100644
--- a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
+++ b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
@@ -50,19 +50,3 @@ def test_with_python_api(self):
         module = target.FindModule(filespec)
         self.assertTrue(module, VALID_MODULE)
 
-        # Create the set of known symbols.  As we iterate through the symbol
-        # table, remove the symbol from the set if it is a known symbol.
-        expected_symbols = set(self.symbols_list)
-        for symbol in module:
-            self.assertTrue(symbol, VALID_SYMBOL)
-            #print("symbol:", symbol)
-            name = symbol.GetName()
-            if name in expected_symbols:
-                #print("Removing %s from known_symbols %s" % (name, expected_symbols))
-                expected_symbols.remove(name)
-
-        # At this point, the known_symbols set should have become an empty set.
-        # If not, raise an error.
-        #print("symbols unaccounted for:", expected_symbols)
-        self.assertTrue(len(expected_symbols) == 0,
-                        "All the known symbols are accounted for")
diff --git a/lldb/test/API/python_api/breakpoint/TestBreakpointAPI.py b/lldb/test/API/python_api/breakpoint/TestBreakpointAPI.py
index dd57846963438..1c0c334fbeebf 100644
--- a/lldb/test/API/python_api/breakpoint/TestBreakpointAPI.py
+++ b/lldb/test/API/python_api/breakpoint/TestBreakpointAPI.py
@@ -25,7 +25,7 @@ def test_breakpoint_is_valid(self):
 
         # Now create a breakpoint on main.c by name 'AFunction'.
         breakpoint = target.BreakpointCreateByName('AFunction', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -59,7 +59,7 @@ def test_target_delete(self):
 
         # Now create a breakpoint on main.c by name 'AFunction'.
         breakpoint = target.BreakpointCreateByName('AFunction', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
diff --git a/lldb/test/API/python_api/event/TestEvents.py b/lldb/test/API/python_api/event/TestEvents.py
index 97ebe8ffc03d6..62ed195729f00 100644
--- a/lldb/test/API/python_api/event/TestEvents.py
+++ b/lldb/test/API/python_api/event/TestEvents.py
@@ -135,7 +135,7 @@ def test_wait_for_event(self):
 
         # Now create a breakpoint on main.c by name 'c'.
         breakpoint = target.BreakpointCreateByName('c', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -171,9 +171,9 @@ def run(self):
                 # Let's only try at most 3 times to retrieve any kind of event.
                 while not count > 3:
                     if listener.WaitForEvent(5, event):
-                        #print("Got a valid event:", event)
-                        #print("Event data flavor:", event.GetDataFlavor())
-                        #print("Event type:", lldbutil.state_type_to_str(event.GetType()))
+                        self.trace("Got a valid event:", event)
+                        self.trace("Event data flavor:", event.GetDataFlavor())
+                        self.trace("Event type:", lldbutil.state_type_to_str(event.GetType()))
                         listener.Clear()
                         return
                     count = count + 1
@@ -215,7 +215,7 @@ def test_add_listener_to_broadcaster(self):
 
         # Now create a breakpoint on main.c by name 'c'.
         breakpoint = target.BreakpointCreateByName('c', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -256,7 +256,7 @@ def test_add_listener_to_broadcaster(self):
         class MyListeningThread(threading.Thread):
 
             def run(self):
-                #print("Running MyListeningThread:", self)
+                self.trace("Running MyListeningThread:", self)
 
                 # Regular expression pattern for the event description.
                 pattern = re.compile("data = {.*, state = (.*)}$")
@@ -266,7 +266,7 @@ def run(self):
                 while True:
                     if listener.WaitForEvent(5, event):
                         desc = lldbutil.get_description(event)
-                        #print("Event description:", desc)
+                        self.trace("Event description:", desc)
                         match = pattern.search(desc)
                         if not match:
                             break
diff --git a/lldb/test/API/python_api/frame/TestFrames.py b/lldb/test/API/python_api/frame/TestFrames.py
index 6d4b7b51f4263..1ec66a3ddbeb1 100644
--- a/lldb/test/API/python_api/frame/TestFrames.py
+++ b/lldb/test/API/python_api/frame/TestFrames.py
@@ -28,7 +28,7 @@ def test_get_arg_vals_for_call_stack(self):
 
         # Now create a breakpoint on main.c by name 'c'.
         breakpoint = target.BreakpointCreateByName('c', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -131,7 +131,7 @@ def test_frame_api_boundary_condition(self):
 
         # Now create a breakpoint on main.c by name 'c'.
         breakpoint = target.BreakpointCreateByName('c', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -173,7 +173,7 @@ def test_frame_api_IsEqual(self):
 
         # Now create a breakpoint on main.c by name 'c'.
         breakpoint = target.BreakpointCreateByName('c', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
diff --git a/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py b/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py
index da4e9cb06e7b5..eb40b4c4993e7 100644
--- a/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py
+++ b/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py
@@ -37,7 +37,7 @@ def test_stop_at_outer_inline(self):
 
         # Now create a breakpoint on main.c by the name of 'inner_inline'.
         breakpoint = target.BreakpointCreateByName('inner_inline', 'a.out')
-        #print("breakpoint:", breakpoint)
+        self.trace("breakpoint:", breakpoint)
         self.assertTrue(breakpoint and
                         breakpoint.GetNumLocations() > 1,
                         VALID_BREAKPOINT)
diff --git a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py
index 2278d69fbbe3b..01d26da060d2b 100644
--- a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py
+++ b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py
@@ -38,8 +38,8 @@ def test(self):
         # Now create the two breakpoints inside function 'a'.
         breakpoint1 = target.BreakpointCreateByLocation('main.c', self.line1)
         breakpoint2 = target.BreakpointCreateByLocation('main.c', self.line2)
-        #print("breakpoint1:", breakpoint1)
-        #print("breakpoint2:", breakpoint2)
+        self.trace("breakpoint1:", breakpoint1)
+        self.trace("breakpoint2:", breakpoint2)
         self.assertTrue(breakpoint1 and
                         breakpoint1.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -64,7 +64,7 @@ def test(self):
         self.assertTrue(lineEntry.GetLine() == self.line1)
 
         address1 = lineEntry.GetStartAddress()
-        #print("address1:", address1)
+        self.trace("address1:", address1)
 
         # Now call SBTarget.ResolveSymbolContextForAddress() with address1.
         context1 = target.ResolveSymbolContextForAddress(
@@ -103,15 +103,11 @@ def test(self):
             print("disassembly=>\n", disasm_output)
 
         sa1 = symbol.GetStartAddress()
-        #print("sa1:", sa1)
-        #print("sa1.GetFileAddress():", hex(sa1.GetFileAddress()))
-        #ea1 = symbol.GetEndAddress()
-        #print("ea1:", ea1)
+        self.trace("sa1:", sa1)
+        self.trace("sa1.GetFileAddress():", hex(sa1.GetFileAddress()))
         sa2 = function.GetStartAddress()
-        #print("sa2:", sa2)
-        #print("sa2.GetFileAddress():", hex(sa2.GetFileAddress()))
-        #ea2 = function.GetEndAddress()
-        #print("ea2:", ea2)
+        self.trace("sa2:", sa2)
+        self.trace("sa2.GetFileAddress():", hex(sa2.GetFileAddress()))
         self.assertTrue(sa1 and sa2 and sa1 == sa2,
                         "The two starting addresses should be the same")
 
diff --git a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py
index 56fa73c84ad66..c5bcb152beb0c 100644
--- a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py
+++ b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py
@@ -38,8 +38,8 @@ def test(self):
         # Now create the two breakpoints inside function 'a'.
         breakpoint1 = target.BreakpointCreateByLocation('main.c', self.line1)
         breakpoint2 = target.BreakpointCreateByLocation('main.c', self.line2)
-        #print("breakpoint1:", breakpoint1)
-        #print("breakpoint2:", breakpoint2)
+        self.trace("breakpoint1:", breakpoint1)
+        self.trace("breakpoint2:", breakpoint2)
         self.assertTrue(breakpoint1 and
                         breakpoint1.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
diff --git a/lldb/test/API/python_api/target/TestTargetAPI.py b/lldb/test/API/python_api/target/TestTargetAPI.py
index c5b960528d4b5..016754720c8ce 100644
--- a/lldb/test/API/python_api/target/TestTargetAPI.py
+++ b/lldb/test/API/python_api/target/TestTargetAPI.py
@@ -359,8 +359,8 @@ def resolve_symbol_context_with_address(self):
         # Now create the two breakpoints inside function 'a'.
         breakpoint1 = target.BreakpointCreateByLocation('main.c', self.line1)
         breakpoint2 = target.BreakpointCreateByLocation('main.c', self.line2)
-        #print("breakpoint1:", breakpoint1)
-        #print("breakpoint2:", breakpoint2)
+        self.trace("breakpoint1:", breakpoint1)
+        self.trace("breakpoint2:", breakpoint2)
         self.assertTrue(breakpoint1 and
                         breakpoint1.GetNumLocations() == 1,
                         VALID_BREAKPOINT)
@@ -402,8 +402,8 @@ def resolve_symbol_context_with_address(self):
 
         address2 = lineEntry.GetStartAddress()
 
-        #print("address1:", address1)
-        #print("address2:", address2)
+        self.trace("address1:", address1)
+        self.trace("address2:", address2)
 
         # Now call SBTarget.ResolveSymbolContextForAddress() with the addresses
         # from our line entry.
@@ -413,15 +413,15 @@ def resolve_symbol_context_with_address(self):
             address2, lldb.eSymbolContextEverything)
 
         self.assertTrue(context1 and context2)
-        #print("context1:", context1)
-        #print("context2:", context2)
+        self.trace("context1:", context1)
+        self.trace("context2:", context2)
 
         # Verify that the context point to the same function 'a'.
         symbol1 = context1.GetSymbol()
         symbol2 = context2.GetSymbol()
         self.assertTrue(symbol1 and symbol2)
-        #print("symbol1:", symbol1)
-        #print("symbol2:", symbol2)
+        self.trace("symbol1:", symbol1)
+        self.trace("symbol2:", symbol2)
 
         from lldbsuite.test.lldbutil import get_description
         desc1 = get_description(symbol1)
diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py
index 144f062846aee..2101527dee6be 100644
--- a/lldb/test/API/python_api/thread/TestThreadAPI.py
+++ b/lldb/test/API/python_api/thread/TestThreadAPI.py
@@ -100,7 +100,7 @@ def get_process(self):
         self.runCmd("process status")
 
         proc_of_thread = thread.GetProcess()
-        #print("proc_of_thread:", proc_of_thread)
+        self.trace("proc_of_thread:", proc_of_thread)
         self.assertTrue(proc_of_thread.GetProcessID()
                         == process.GetProcessID())
 
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py
index 1a3a2b2936504..b89448fd5ba6a 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py
@@ -1,6 +1,3 @@
-from __future__ import print_function
-
-
 import gdbremote_testcase
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
@@ -119,7 +116,7 @@ def auxv_data_is_correct_size(self):
         # Ensure auxv data is a multiple of 2*word_size (there should be two
         # unsigned long fields per auxv entry).
         self.assertEqual(len(auxv_data) % (2 * word_size), 0)
-        # print("auxv contains {} entries".format(len(auxv_data) / (2*word_size)))
+        self.trace("auxv contains {} entries".format(len(auxv_data) / (2*word_size)))
 
     @debugserver_test
     def test_auxv_data_is_correct_size_debugserver(self):
@@ -159,7 +156,7 @@ def auxv_keys_look_valid(self):
         for auxv_key in auxv_dict:
             self.assertTrue(auxv_key >= 1)
             self.assertTrue(auxv_key <= 1000)
-        # print("auxv dict: {}".format(auxv_dict))
+        self.trace("auxv dict: {}".format(auxv_dict))
 
     @debugserver_test
     def test_auxv_keys_look_valid_debugserver(self):
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py
index 7d8e28c745c94..f74143a3cceec 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py
@@ -1,6 +1,3 @@
-from __future__ import print_function
-
-
 import gdbremote_testcase
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
@@ -59,7 +56,7 @@ def stop_notification_contains_generic_register(
 
         # Ensure the expedited registers contained it.
         self.assertTrue(reg_info["lldb_register_index"] in expedited_registers)
-        # print("{} reg_info:{}".format(generic_register_name, reg_info))
+        self.trace("{} reg_info:{}".format(generic_register_name, reg_info))
 
     def stop_notification_contains_any_registers(self):
         # Generate a stop reply, parse out expedited registers from stop
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py
index 2543ed6e90299..e20948ba38af6 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py
@@ -1,6 +1,3 @@
-from __future__ import print_function
-
-
 import gdbremote_testcase
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
@@ -50,7 +47,7 @@ def grp_register_save_restore_works(self, with_suffix):
             self.assertIsNotNone(threads)
             thread_id = threads[0]
             self.assertIsNotNone(thread_id)
-            # print("Running on thread: 0x{:x}".format(thread_id))
+            self.trace("Running on thread: 0x{:x}".format(thread_id))
         else:
             thread_id = None
 
@@ -64,22 +61,22 @@ def grp_register_save_restore_works(self, with_suffix):
         (success, state_id) = self.parse_QSaveRegisterState_response(context)
         self.assertTrue(success)
         self.assertIsNotNone(state_id)
-        # print("saved register state id: {}".format(state_id))
+        self.trace("saved register state id: {}".format(state_id))
 
         # Remember initial register values.
         initial_reg_values = self.read_register_values(
             gpr_reg_infos, endian, thread_id=thread_id)
-        # print("initial_reg_values: {}".format(initial_reg_values))
+        self.trace("initial_reg_values: {}".format(initial_reg_values))
 
         # Flip gpr register values.
         (successful_writes, failed_writes) = self.flip_all_bits_in_each_register_value(
             gpr_reg_infos, endian, thread_id=thread_id)
-        # print("successful writes: {}, failed writes: {}".format(successful_writes, failed_writes))
+        self.trace("successful writes: {}, failed writes: {}".format(successful_writes, failed_writes))
         self.assertTrue(successful_writes > 0)
 
         flipped_reg_values = self.read_register_values(
             gpr_reg_infos, endian, thread_id=thread_id)
-        # print("flipped_reg_values: {}".format(flipped_reg_values))
+        self.trace("flipped_reg_values: {}".format(flipped_reg_values))
 
         # Restore register values.
         self.reset_test_sequence()
@@ -91,7 +88,7 @@ def grp_register_save_restore_works(self, with_suffix):
         # Verify registers match initial register values.
         final_reg_values = self.read_register_values(
             gpr_reg_infos, endian, thread_id=thread_id)
-        # print("final_reg_values: {}".format(final_reg_values))
+        self.trace("final_reg_values: {}".format(final_reg_values))
         self.assertIsNotNone(final_reg_values)
         self.assertEqual(final_reg_values, initial_reg_values)
 
diff --git a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py
index 2b7f28a3aefbc..d46123e337c80 100644
--- a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py
+++ b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py
@@ -10,9 +10,6 @@
 the initial set of tests implemented.
 """
 
-from __future__ import division, print_function
-
-
 import unittest2
 import gdbremote_testcase
 import lldbgdbserverutils
@@ -1442,7 +1439,7 @@ def P_writes_all_gpr_registers(self):
         # Write flipped bit pattern of existing value to each register.
         (successful_writes, failed_writes) = self.flip_all_bits_in_each_register_value(
             gpr_reg_infos, endian)
-        # print("successful writes: {}, failed writes: {}".format(successful_writes, failed_writes))
+        self.trace("successful writes: {}, failed writes: {}".format(successful_writes, failed_writes))
         self.assertTrue(successful_writes > 0)
 
     # Note: as of this moment, a hefty number of the GPR writes are failing with E32 (everything except rax-rdx, rdi, rsi, rbp).

From 2b8d6fa0acacba4dee31ed618a5596414b2279d5 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 25 May 2020 20:13:03 +0200
Subject: [PATCH 046/770] Revert "[PGO] Fix computation of function Hash"

This reverts commit 7c298c104bfe725d4315926a656263e8a5ac3054.
Fails make check-clang.

Failing Tests (8):
	Clang :: Profile/c-counter-overflows.c
	Clang :: Profile/c-general.c
	Clang :: Profile/c-unprofiled-blocks.c
	Clang :: Profile/cxx-rangefor.cpp
	Clang :: Profile/cxx-throws.cpp
	Clang :: Profile/misexpect-switch-default.c
	Clang :: Profile/misexpect-switch-nonconst.c
	Clang :: Profile/misexpect-switch.c
---
 clang/lib/CodeGen/CodeGenPGO.cpp |  8 +++-----
 clang/test/Profile/c-collision.c | 22 ----------------------
 2 files changed, 3 insertions(+), 27 deletions(-)
 delete mode 100644 clang/test/Profile/c-collision.c

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 98827bc3eec5e..3c91a04d54642 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -747,15 +747,13 @@ uint64_t PGOHash::finalize() {
     return Working;
 
   // Check for remaining work in Working.
-  if (Working) {
-    using namespace llvm::support;
-    uint64_t Swapped = endian::byte_swap<uint64_t, little>(Working);
-    MD5.update(llvm::makeArrayRef((uint8_t *)&Swapped, sizeof(Swapped)));
-  }
+  if (Working)
+    MD5.update(Working);
 
   // Finalize the MD5 and return the hash.
   llvm::MD5::MD5Result Result;
   MD5.final(Result);
+  using namespace llvm::support;
   return Result.low();
 }
 
diff --git a/clang/test/Profile/c-collision.c b/clang/test/Profile/c-collision.c
deleted file mode 100644
index fabecd752b4ef..0000000000000
--- a/clang/test/Profile/c-collision.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// Test that a slight change in the code leads to a different hash.
-// RUN: %clang_cc1 -UEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-NOEXTRA
-// RUN: %clang_cc1 -DEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-EXTRA
-
-// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 7156072912471487002,
-// CHECK-EXTRA:   @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 -4383447408116050035,
-
-extern int bar;
-void foo() {
-  if (bar) {
-  }
-  if (bar) {
-  }
-  if (bar) {
-    if (bar) {
-#ifdef EXTRA
-      if (bar) {
-      }
-#endif
-    }
-  }
-}

From e0aefaedb617766f4667118911fccb4a14abfb94 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Mon, 25 May 2020 18:14:50 +0000
Subject: [PATCH 047/770] [gn build] Port ba92b274225

---
 llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
index fb171a4a48cb2..da6b850cd0d47 100644
--- a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
@@ -14,6 +14,7 @@ unittest("StaticAnalysisTests") {
     "AnalyzerOptionsTest.cpp",
     "CallDescriptionTest.cpp",
     "CallEventTest.cpp",
+    "RangeSetTest.cpp",
     "RegisterCustomCheckersTest.cpp",
     "StoreTest.cpp",
     "SymbolReaperTest.cpp",

From 37ef15143a5d77a0fba0ece4c26a72cfb9e050a0 Mon Sep 17 00:00:00 2001
From: zoecarver <z.zoelec2@gmail.com>
Date: Sat, 23 May 2020 14:43:12 -0700
Subject: [PATCH 048/770] [libcxx] Fix C++14 and up constexpr members in
 MoveOnly.

Summary: a4b8ee6 made all MoveOnly members constexpr, but some members and constructors contain expressions that are only valid in C++14 and later. This patch prefixes those methods and constructors with TEST_CONSTEXPR_CXX14.

Reviewers: ldionne, #libc!

Subscribers: dexonsmith, libcxx-commits

Tags: #libc

Differential Revision: https://reviews.llvm.org/D80482
---
 libcxx/test/support/MoveOnly.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/libcxx/test/support/MoveOnly.h b/libcxx/test/support/MoveOnly.h
index f9e9298097f12..adcae2542bd3c 100644
--- a/libcxx/test/support/MoveOnly.h
+++ b/libcxx/test/support/MoveOnly.h
@@ -24,17 +24,19 @@ class MoveOnly
     int data_;
 public:
     constexpr MoveOnly(int data = 1) : data_(data) {}
-    constexpr MoveOnly(MoveOnly&& x)
+    TEST_CONSTEXPR_CXX14 MoveOnly(MoveOnly&& x)
         : data_(x.data_) {x.data_ = 0;}
-    constexpr MoveOnly& operator=(MoveOnly&& x)
+    TEST_CONSTEXPR_CXX14 MoveOnly& operator=(MoveOnly&& x)
         {data_ = x.data_; x.data_ = 0; return *this;}
 
     constexpr int get() const {return data_;}
 
     constexpr bool operator==(const MoveOnly& x) const {return data_ == x.data_;}
     constexpr bool operator< (const MoveOnly& x) const {return data_ <  x.data_;}
-    constexpr MoveOnly operator+(const MoveOnly& x) const { return MoveOnly{data_ + x.data_}; }
-    constexpr MoveOnly operator*(const MoveOnly& x) const { return MoveOnly{data_ * x.data_}; }
+    TEST_CONSTEXPR_CXX14 MoveOnly operator+(const MoveOnly& x) const
+        { return MoveOnly{data_ + x.data_}; }
+    TEST_CONSTEXPR_CXX14 MoveOnly operator*(const MoveOnly& x) const
+        { return MoveOnly{data_ * x.data_}; }
 };
 
 namespace std {

From 51a276c759c90c844bbabf5066195aaf42fb0c6e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Mon, 25 May 2020 11:34:09 -0700
Subject: [PATCH 049/770] [X86] Teach combineTruncatedArithmetic to push
 truncate through subtracts where only one of the inputs is free to truncate.

Fix combineSubToSubus to handle the new DAG to avoid a regression.

There are still regressions in test14/test15/test16, where it
looks like we were trying to set up cases we could match to
umin+trunc+subus but the handling was never finished. The
regression here isn't unique to sub. It's a lost opportunity for
taking an AND with two truncated inputs and producing a larger
AND with a single truncate. The same thing could happen with
any other node we handle in combineTruncatedArithmetic since we
are moving the truncate up the DAG.

Differential Revision: https://reviews.llvm.org/D80483
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  45 +-
 llvm/test/CodeGen/X86/psubus.ll         | 886 ++++++++++++------------
 2 files changed, 470 insertions(+), 461 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5101977a68edc..54a80151eb69a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43588,21 +43588,12 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
   case ISD::AND:
   case ISD::XOR:
   case ISD::OR:
-  case ISD::ADD: {
-    SDValue Op0 = Src.getOperand(0);
-    SDValue Op1 = Src.getOperand(1);
-    if (TLI.isOperationLegal(SrcOpcode, VT) &&
-        (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
-      return TruncateArithmetic(Op0, Op1);
-    break;
-  }
+  case ISD::ADD:
   case ISD::SUB: {
-    // TODO: ISD::SUB We are conservative and require both sides to be freely
-    // truncatable to avoid interfering with combineSubToSubus.
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
     if (TLI.isOperationLegal(SrcOpcode, VT) &&
-        (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
+        (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
       return TruncateArithmetic(Op0, Op1);
     break;
   }
@@ -46698,6 +46689,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
       SubusRHS = MinLHS;
     else
       return SDValue();
+  } else if (Op1.getOpcode() == ISD::TRUNCATE &&
+             Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+             (EltVT == MVT::i8 || EltVT == MVT::i16)) {
+    // Special case where the UMIN has been truncated. Try to push the truncate
+    // further up. This is similar to the i32/i64 special processing.
+    SubusLHS = Op0;
+    SDValue MinLHS = Op1.getOperand(0).getOperand(0);
+    SDValue MinRHS = Op1.getOperand(0).getOperand(1);
+    EVT TruncVT = Op1.getOperand(0).getValueType();
+    if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
+                                   TruncVT == MVT::v8i64)) &&
+        !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
+      return SDValue();
+    SDValue OpToSaturate;
+    if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
+        MinLHS.getOperand(0) == Op0)
+      OpToSaturate = MinRHS;
+    else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
+             MinRHS.getOperand(0) == Op0)
+      OpToSaturate = MinLHS;
+    else
+      return SDValue();
+
+    // Saturate the non-extended input and then truncate it.
+    SDLoc DL(N);
+    SDValue SaturationConst =
+        DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
+                                             VT.getScalarSizeInBits()),
+                        DL, TruncVT);
+    SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
+                               SaturationConst);
+    SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
   } else
     return SDValue();
 
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 21c63da1d1eea..a51893ade0217 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -465,33 +465,33 @@ vector.ph:
   ret <32 x i8> %res
 }
 
+; FIXME: match this to UMIN+TRUNC+PSUBUS
 define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: test13:
 ; SSE2:       # %bb.0: # %vector.ph
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    packssdw %xmm6, %xmm2
-; SSE2-NEXT:    psubd %xmm1, %xmm3
-; SSE2-NEXT:    pslld $16, %xmm0
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    pslld $16, %xmm3
-; SSE2-NEXT:    psrad $16, %xmm3
-; SSE2-NEXT:    packssdw %xmm0, %xmm3
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm6, %xmm7
+; SSE2-NEXT:    por %xmm6, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    packssdw %xmm7, %xmm3
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    psubw %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test13:
@@ -499,98 +499,96 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSSE3-NEXT:    pxor %xmm3, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    psubd %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm3, %xmm6
-; SSSE3-NEXT:    por %xmm3, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    por %xmm4, %xmm3
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    packssdw %xmm6, %xmm2
-; SSSE3-NEXT:    psubd %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pshufb %xmm1, %xmm4
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; SSSE3-NEXT:    pandn %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm7
+; SSSE3-NEXT:    pxor %xmm6, %xmm7
+; SSSE3-NEXT:    por %xmm6, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    pxor %xmm6, %xmm3
+; SSSE3-NEXT:    por %xmm6, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    packssdw %xmm7, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm4, %xmm2
+; SSSE3-NEXT:    pshufb %xmm4, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    psubw %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: test13:
 ; SSE41:       # %bb.0: # %vector.ph
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pmaxud %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    movdqa %xmm5, %xmm3
+; SSE41-NEXT:    pmaxud %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm6
 ; SSE41-NEXT:    pmaxud %xmm2, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm6
 ; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    packssdw %xmm6, %xmm0
-; SSE41-NEXT:    psubd %xmm2, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm4
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
-; SSE41-NEXT:    packusdw %xmm3, %xmm4
-; SSE41-NEXT:    pandn %xmm4, %xmm0
+; SSE41-NEXT:    packssdw %xmm6, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT:    packusdw %xmm2, %xmm1
+; SSE41-NEXT:    psubw %xmm1, %xmm0
+; SSE41-NEXT:    pandn %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test13:
 ; AVX1:       # %bb.0: # %vector.ph
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpandn %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm3, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test13:
 ; AVX2:       # %bb.0: # %vector.ph
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
 ; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test13:
 ; AVX512:       # %bb.0: # %vector.ph
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vpcmpnltud %ymm1, %ymm0, %k1
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpcmpnltud %ymm1, %ymm2, %k1
+; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 vector.ph:
@@ -602,186 +600,172 @@ vector.ph:
   ret <8 x i16> %res
 }
 
+; FIXME: match this to UMIN+TRUNC+PSUBUS
 define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
 ; SSE2-LABEL: test14:
 ; SSE2:       # %bb.0: # %vector.ph
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm6, %xmm8
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT:    movdqa %xmm5, %xmm10
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
 ; SSE2-NEXT:    movdqa %xmm4, %xmm9
-; SSE2-NEXT:    pxor %xmm0, %xmm9
-; SSE2-NEXT:    psubd %xmm5, %xmm4
-; SSE2-NEXT:    por %xmm0, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm5
-; SSE2-NEXT:    movdqa %xmm3, %xmm7
-; SSE2-NEXT:    pxor %xmm0, %xmm7
-; SSE2-NEXT:    psubd %xmm10, %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm10
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT:    packssdw %xmm5, %xmm10
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm0, %xmm5
-; SSE2-NEXT:    psubd %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pxor %xmm0, %xmm5
-; SSE2-NEXT:    por %xmm8, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    packssdw %xmm6, %xmm0
-; SSE2-NEXT:    packsswb %xmm10, %xmm0
-; SSE2-NEXT:    psubd %xmm8, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm10
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE2-NEXT:    pand %xmm5, %xmm4
 ; SSE2-NEXT:    pand %xmm5, %xmm3
 ; SSE2-NEXT:    packuswb %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pand %xmm5, %xmm2
 ; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; SSE2-NEXT:    movdqa %xmm6, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm5, %xmm9
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm10
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSE2-NEXT:    packssdw %xmm6, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT:    packssdw %xmm2, %xmm0
+; SSE2-NEXT:    packsswb %xmm3, %xmm0
 ; SSE2-NEXT:    pandn %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test14:
 ; SSSE3:       # %bb.0: # %vector.ph
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSSE3-NEXT:    movdqa %xmm6, %xmm8
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSSE3-NEXT:    movdqa %xmm5, %xmm10
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm8, %xmm8
+; SSSE3-NEXT:    movdqa %xmm0, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm9
-; SSSE3-NEXT:    pxor %xmm0, %xmm9
-; SSSE3-NEXT:    psubd %xmm5, %xmm4
-; SSSE3-NEXT:    por %xmm0, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm5
-; SSSE3-NEXT:    movdqa %xmm3, %xmm7
-; SSSE3-NEXT:    pxor %xmm0, %xmm7
-; SSSE3-NEXT:    psubd %xmm10, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm10
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm10
-; SSSE3-NEXT:    packssdw %xmm5, %xmm10
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm0, %xmm5
-; SSSE3-NEXT:    psubd %xmm6, %xmm2
-; SSSE3-NEXT:    por %xmm0, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT:    movdqa %xmm1, %xmm5
-; SSSE3-NEXT:    pxor %xmm0, %xmm5
-; SSSE3-NEXT:    por %xmm8, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    packssdw %xmm6, %xmm0
-; SSSE3-NEXT:    packsswb %xmm10, %xmm0
-; SSSE3-NEXT:    psubd %xmm8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm10
+; SSSE3-NEXT:    movdqa %xmm2, %xmm7
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSSE3-NEXT:    pand %xmm5, %xmm4
 ; SSSE3-NEXT:    pand %xmm5, %xmm3
 ; SSSE3-NEXT:    packuswb %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pand %xmm5, %xmm2
 ; SSSE3-NEXT:    pand %xmm5, %xmm1
 ; SSSE3-NEXT:    packuswb %xmm2, %xmm1
 ; SSSE3-NEXT:    packuswb %xmm3, %xmm1
+; SSSE3-NEXT:    psubb %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm3
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm5, %xmm9
+; SSSE3-NEXT:    por %xmm5, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm10
+; SSSE3-NEXT:    por %xmm5, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm10, %xmm3
+; SSSE3-NEXT:    packssdw %xmm6, %xmm3
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    por %xmm5, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm4
+; SSSE3-NEXT:    por %xmm5, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
+; SSSE3-NEXT:    packssdw %xmm2, %xmm0
+; SSSE3-NEXT:    packsswb %xmm3, %xmm0
 ; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: test14:
 ; SSE41:       # %bb.0: # %vector.ph
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pmaxud %xmm10, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm7
-; SSE41-NEXT:    pmaxud %xmm9, %xmm7
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm8 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; SSE41-NEXT:    pmaxud %xmm4, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm9, %xmm9
+; SSE41-NEXT:    pxor %xmm9, %xmm6
+; SSE41-NEXT:    pmaxud %xmm3, %xmm7
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
-; SSE41-NEXT:    pxor %xmm6, %xmm7
-; SSE41-NEXT:    packssdw %xmm0, %xmm7
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pmaxud %xmm8, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    pmaxud %xmm11, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm6, %xmm5
-; SSE41-NEXT:    packssdw %xmm5, %xmm0
-; SSE41-NEXT:    packsswb %xmm7, %xmm0
-; SSE41-NEXT:    psubd %xmm11, %xmm2
-; SSE41-NEXT:    psubd %xmm8, %xmm1
-; SSE41-NEXT:    psubd %xmm9, %xmm3
-; SSE41-NEXT:    psubd %xmm10, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT:    pand %xmm5, %xmm4
-; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    pxor %xmm9, %xmm7
+; SSE41-NEXT:    packssdw %xmm6, %xmm7
+; SSE41-NEXT:    pmaxud %xmm1, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE41-NEXT:    pxor %xmm9, %xmm5
+; SSE41-NEXT:    pmaxud %xmm2, %xmm8
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm8
+; SSE41-NEXT:    pxor %xmm9, %xmm8
+; SSE41-NEXT:    packssdw %xmm8, %xmm5
+; SSE41-NEXT:    packsswb %xmm7, %xmm5
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    pand %xmm6, %xmm4
+; SSE41-NEXT:    pand %xmm6, %xmm3
 ; SSE41-NEXT:    packusdw %xmm4, %xmm3
-; SSE41-NEXT:    pand %xmm5, %xmm1
-; SSE41-NEXT:    pand %xmm5, %xmm2
+; SSE41-NEXT:    pand %xmm6, %xmm2
+; SSE41-NEXT:    pand %xmm6, %xmm1
 ; SSE41-NEXT:    packusdw %xmm2, %xmm1
 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
-; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    psubb %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test14:
 ; AVX1:       # %bb.0: # %vector.ph
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm6, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm6, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm7
-; AVX1-NEXT:    vpmaxud %xmm11, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm7, %xmm4, %xmm10
-; AVX1-NEXT:    vpmaxud %xmm9, %xmm1, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm1, %xmm7
-; AVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT:    vpmaxud %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpmaxud %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpmaxud %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpxor %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpacksswb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpmaxud %xmm8, %xmm4, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpacksswb %xmm10, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm8, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubd %xmm9, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm11, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255]
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -790,40 +774,39 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
 ; AVX2:       # %bb.0: # %vector.ph
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm4
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vpmaxud %ymm4, %ymm1, %ymm4
 ; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm1, %ymm4
 ; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
 ; AVX2-NEXT:    vpxor %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
 ; AVX2-NEXT:    vpackssdw %xmm6, %xmm4, %xmm4
-; AVX2-NEXT:    vpmaxud %ymm3, %ymm2, %ymm6
-; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm2, %ymm6
-; AVX2-NEXT:    vpxor %ymm5, %ymm6, %ymm5
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT:    vpsubd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmaxud %ymm3, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT:    vpxor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT:    vpackssdw %xmm5, %xmm3, %xmm3
+; AVX2-NEXT:    vpacksswb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm4, %xmm0
+; AVX2-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test14:
 ; AVX512:       # %bb.0: # %vector.ph
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vpcmpnltud %zmm0, %zmm1, %k1
-; AVX512-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vpcmpnltud %zmm2, %zmm1, %k1
+; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512-NEXT:    vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 vector.ph:
@@ -835,123 +818,127 @@ vector.ph:
   ret <16 x i8> %res
 }
 
+; FIXME: match this to UMIN+TRUNC+PSUBUS
 define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: test15:
 ; SSE2:       # %bb.0: # %vector.ph
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    psubd %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
-; SSE2-NEXT:    psubd %xmm1, %xmm0
-; SSE2-NEXT:    pslld $16, %xmm3
-; SSE2-NEXT:    psrad $16, %xmm3
-; SSE2-NEXT:    pslld $16, %xmm0
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm3, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    psubw %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test15:
 ; SSSE3:       # %bb.0: # %vector.ph
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    psubd %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    por %xmm3, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm1, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
 ; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm4
 ; SSSE3-NEXT:    packssdw %xmm5, %xmm4
-; SSSE3-NEXT:    psubd %xmm1, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm3, %xmm2
+; SSSE3-NEXT:    pshufb %xmm3, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    psubw %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: test15:
 ; SSE41:       # %bb.0: # %vector.ph
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pminud %xmm1, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    movdqa %xmm4, %xmm5
+; SSE41-NEXT:    pminud %xmm1, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT:    pxor %xmm4, %xmm5
 ; SSE41-NEXT:    movdqa %xmm3, %xmm6
 ; SSE41-NEXT:    pminud %xmm2, %xmm6
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    packssdw %xmm6, %xmm4
-; SSE41-NEXT:    psubd %xmm2, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
-; SSE41-NEXT:    packusdw %xmm3, %xmm0
-; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    pxor %xmm4, %xmm6
+; SSE41-NEXT:    packssdw %xmm6, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; SSE41-NEXT:    packusdw %xmm2, %xmm1
+; SSE41-NEXT:    psubw %xmm1, %xmm0
+; SSE41-NEXT:    pand %xmm5, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test15:
 ; AVX1:       # %bb.0: # %vector.ph
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm0, %xmm5
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vpandn %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpminud %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test15:
 ; AVX2:       # %bb.0: # %vector.ph
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpminud %ymm1, %ymm2, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test15:
 ; AVX512:       # %bb.0: # %vector.ph
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpcmpnleud %ymm1, %ymm2, %k1
+; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 vector.ph:
@@ -963,80 +950,77 @@ vector.ph:
   ret <8 x i16> %res
 }
 
+; FIXME: match this to UMIN+TRUNC+PSUBUS
 define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: test16:
 ; SSE2:       # %bb.0: # %vector.ph
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    psubd %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
-; SSE2-NEXT:    psubd %xmm1, %xmm0
-; SSE2-NEXT:    pslld $16, %xmm3
-; SSE2-NEXT:    psrad $16, %xmm3
-; SSE2-NEXT:    pslld $16, %xmm0
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm3, %xmm0
+; SSE2-NEXT:    pslld $16, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    psubw %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test16:
 ; SSSE3:       # %bb.0: # %vector.ph
-; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    psubd %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    por %xmm4, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    por %xmm3, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm1, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
 ; SSSE3-NEXT:    por %xmm3, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm4
 ; SSSE3-NEXT:    packssdw %xmm5, %xmm4
-; SSSE3-NEXT:    psubd %xmm1, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pshufb %xmm1, %xmm3
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm3, %xmm2
+; SSSE3-NEXT:    pshufb %xmm3, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    psubw %xmm1, %xmm0
+; SSSE3-NEXT:    pand %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: test16:
 ; SSE41:       # %bb.0: # %vector.ph
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    pminud %xmm1, %xmm4
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pmaxud %xmm1, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    movdqa %xmm3, %xmm6
-; SSE41-NEXT:    pminud %xmm2, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT:    pxor %xmm5, %xmm6
-; SSE41-NEXT:    packssdw %xmm6, %xmm4
-; SSE41-NEXT:    psubd %xmm2, %xmm3
-; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
-; SSE41-NEXT:    packusdw %xmm3, %xmm0
+; SSE41-NEXT:    pmaxud %xmm2, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT:    pxor %xmm5, %xmm3
+; SSE41-NEXT:    packssdw %xmm3, %xmm4
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; SSE41-NEXT:    packusdw %xmm2, %xmm1
+; SSE41-NEXT:    psubw %xmm1, %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1044,42 +1028,47 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; AVX1:       # %bb.0: # %vector.ph
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm0, %xmm5
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
-; AVX1-NEXT:    vpandn %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test16:
 ; AVX2:       # %bb.0: # %vector.ph
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test16:
 ; AVX512:       # %bb.0: # %vector.ph
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpcmpltud %ymm2, %ymm1, %k1
+; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 vector.ph:
@@ -2057,60 +2046,57 @@ vector.ph:
 define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: psubus_i16_i32_min:
 ; SSE2:       # %bb.0: # %vector.ph
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    por %xmm6, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm6, %xmm1
 ; SSE2-NEXT:    por %xmm4, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm6, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psubd %xmm2, %xmm3
-; SSE2-NEXT:    psubd %xmm5, %xmm0
-; SSE2-NEXT:    pslld $16, %xmm0
-; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
 ; SSE2-NEXT:    pslld $16, %xmm3
 ; SSE2-NEXT:    psrad $16, %xmm3
-; SSE2-NEXT:    packssdw %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm3
+; SSE2-NEXT:    psubw %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: psubus_i16_i32_min:
 ; SSSE3:       # %bb.0: # %vector.ph
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT:    movdqa %xmm6, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
-; SSSE3-NEXT:    pand %xmm7, %xmm2
-; SSSE3-NEXT:    pandn %xmm5, %xmm7
-; SSSE3-NEXT:    por %xmm2, %xmm7
-; SSSE3-NEXT:    pshufb %xmm3, %xmm7
-; SSSE3-NEXT:    pxor %xmm1, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT:    movdqa %xmm5, %xmm6
 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT:    pand %xmm6, %xmm1
-; SSSE3-NEXT:    pandn %xmm5, %xmm6
-; SSSE3-NEXT:    por %xmm1, %xmm6
-; SSSE3-NEXT:    pshufb %xmm3, %xmm6
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSSE3-NEXT:    psubusw %xmm6, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT:    pand %xmm6, %xmm2
+; SSSE3-NEXT:    pandn %xmm4, %xmm6
+; SSSE3-NEXT:    por %xmm2, %xmm6
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm2, %xmm6
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    pand %xmm5, %xmm1
+; SSSE3-NEXT:    pandn %xmm4, %xmm5
+; SSSE3-NEXT:    por %xmm1, %xmm5
+; SSSE3-NEXT:    pshufb %xmm2, %xmm5
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; SSSE3-NEXT:    psubusw %xmm5, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: psubus_i16_i32_min:

From d1dbda10cefeaa124e28eb289cdc92c049c3d973 Mon Sep 17 00:00:00 2001
From: Marek Kurdej <marek@quasardb.net>
Date: Mon, 25 May 2020 22:26:50 +0200
Subject: [PATCH 050/770] [libc++] [LWG3201] Update status page: lerp should be
 marked noexcept.

Summary: Update status page and test synopsis. Add synopsis in <cmath>.

Reviewed By: ldionne, #libc

Differential Revision: https://reviews.llvm.org/D80456
---
 libcxx/include/cmath                                       | 4 ++++
 .../std/numerics/c.math/c.math.lerp/c.math.lerp.pass.cpp   | 7 +++----
 libcxx/www/cxx2a_status.html                               | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/cmath b/libcxx/include/cmath
index 0f06486fb34f3..0901a23a2498d 100644
--- a/libcxx/include/cmath
+++ b/libcxx/include/cmath
@@ -296,6 +296,10 @@ floating_point trunc (arithmetic x);
 float          truncf(float x);
 long double    truncl(long double x);
 
+constexpr float       lerp(float a, float b, float t) noexcept;                   // C++20
+constexpr double      lerp(double a, double b, double t) noexcept;                // C++20
+constexpr long double lerp(long double a, long double b, long double t) noexcept; // C++20
+
 }  // std
 
 */
diff --git a/libcxx/test/std/numerics/c.math/c.math.lerp/c.math.lerp.pass.cpp b/libcxx/test/std/numerics/c.math/c.math.lerp/c.math.lerp.pass.cpp
index 7d9ceef8b48e8..6eeeec4898d64 100644
--- a/libcxx/test/std/numerics/c.math/c.math.lerp/c.math.lerp.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/c.math.lerp/c.math.lerp.pass.cpp
@@ -8,10 +8,9 @@
 // UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
 // <cmath>
 
-// constexpr float lerp(float a, float b, float t);
-// constexpr double lerp(double a, double b, double t);
-// constexpr long double lerp(long double a, long double b, long double t);
-
+// constexpr float lerp(float a, float b, float t) noexcept;
+// constexpr double lerp(double a, double b, double t) noexcept;
+// constexpr long double lerp(long double a, long double b, long double t) noexcept;
 
 #include <cmath>
 #include <limits>
diff --git a/libcxx/www/cxx2a_status.html b/libcxx/www/cxx2a_status.html
index 2a93d35feb50f..6214dd2cdd838 100644
--- a/libcxx/www/cxx2a_status.html
+++ b/libcxx/www/cxx2a_status.html
@@ -477,7 +477,7 @@ <h3>Library Working group Issues Status</h3>
 	<tr><td><a href="https://wg21.link/LWG3175">3175</a></td><td>The <tt>CommonReference</tt> requirement of concept <tt>SwappableWith</tt> is not satisfied in the example</td><td>Prague</td><td></td></tr>
 	<tr><td><a href="https://wg21.link/LWG3194">3194</a></td><td><tt>ConvertibleTo</tt> prose does not match code</td><td>Prague</td><td></td></tr>
 	<tr><td><a href="https://wg21.link/LWG3200">3200</a></td><td><tt>midpoint</tt> should not constrain <tt>T</tt> is complete</td><td>Prague</td><td></td></tr>
-	<tr><td><a href="https://wg21.link/LWG3201">3201</a></td><td><tt>lerp</tt> should be marked as <tt>noexcept</tt></td><td>Prague</td><td></td></tr>
+	<tr><td><a href="https://wg21.link/LWG3201">3201</a></td><td><tt>lerp</tt> should be marked as <tt>noexcept</tt></td><td>Prague</td><td>Complete</td></tr>
 	<tr><td><a href="https://wg21.link/LWG3226">3226</a></td><td><tt>zoned_time</tt> constructor from <tt>string_view</tt> should accept <tt>zoned_time&lt;Duration2, TimeZonePtr2&gt;</tt></td><td>Prague</td><td></td></tr>
 	<tr><td><a href="https://wg21.link/LWG3233">3233</a></td><td>Broken requirements for <tt>shared_ptr</tt> converting constructors</td><td>Prague</td><td></td></tr>
 	<tr><td><a href="https://wg21.link/LWG3237">3237</a></td><td>LWG 3038 and 3190 have inconsistent PRs</td><td>Prague</td><td></td></tr>

From bc93c2d72e84c38fc86e64c9c26aafcf2c61457a Mon Sep 17 00:00:00 2001
From: Marek Kurdej <marek@quasardb.net>
Date: Mon, 25 May 2020 22:34:08 +0200
Subject: [PATCH 051/770] [Transforms] Fix typos. NFC

---
 .../llvm/Transforms/Utils/CallGraphUpdater.h  |  2 +-
 .../Transforms/IPO/AttributorAttributes.cpp   |  2 +-
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp         |  4 +--
 .../Instrumentation/PoisonChecking.cpp        |  2 +-
 .../lib/Transforms/Scalar/LoopPredication.cpp | 26 +++++++++----------
 .../RewriteStatepointsForGC/preprocess.ll     |  2 +-
 6 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h b/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h
index 6103859ca959a..22954b469186c 100644
--- a/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h
+++ b/llvm/include/llvm/Transforms/Utils/CallGraphUpdater.h
@@ -28,7 +28,7 @@ namespace llvm {
 class CallGraphUpdater {
   /// Containers for functions which we did replace or want to delete when
   /// `finalize` is called. This can happen explicitly or as part of the
-  /// destructor. Dead functions in comdat sections are tracked seperatly
+  /// destructor. Dead functions in comdat sections are tracked separately
   /// because a function with discardable linakage in a COMDAT should only
   /// be dropped if the entire COMDAT is dropped, see git ac07703842cf.
   ///{
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 6b3876ed91aae..f641ad4498cd1 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -54,7 +54,7 @@ STATISTIC(NumAAs, "Number of abstract attributes created");
 //  }
 // If there is a single "increment" side one can use the macro
 // STATS_DECLTRACK with a custom message. If there are multiple increment
-// sides, STATS_DECL and STATS_TRACK can also be used separatly.
+// sides, STATS_DECL and STATS_TRACK can also be used separately.
 //
 #define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME)                                     \
   ("Number of " #TYPE " marked '" #NAME "'")
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index dd66c6703ba09..63eddbda94e73 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -224,7 +224,7 @@ struct OpenMPOpt {
         OMPRTL_omp_get_partition_num_places,
         OMPRTL_omp_get_partition_place_nums};
 
-    // Global-tid is handled separatly.
+    // Global-tid is handled separately.
     SmallSetVector<Value *, 16> GTIdArgs;
     collectGlobalThreadIdArguments(GTIdArgs);
     LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
@@ -556,7 +556,7 @@ struct OpenMPOpt {
     auto &ORE = OREGetter(F);
 
     ORE.emit([&]() {
-      return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); 
+      return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst));
     });
   }
 
diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
index e5c338fed9523..85e096112fca1 100644
--- a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
@@ -190,7 +190,7 @@ static void generateCreationChecks(Instruction &I,
   if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy())
     generateCreationChecksForBinOp(I, Checks);
 
-  // Handle non-binops seperately
+  // Handle non-binops separately
   switch (I.getOpcode()) {
   default:
     // Note there are a couple of missing cases here, once implemented, this
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 1d73a9f24453a..edde22d6708fe 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -268,7 +268,7 @@ class LoopPredication {
   /// Return an insertion point suitable for inserting a safe to speculate
   /// instruction whose only user will be 'User' which has operands 'Ops'.  A
   /// trivial result would be the at the User itself, but we try to return a
-  /// loop invariant location if possible.  
+  /// loop invariant location if possible.
   Instruction *findInsertPt(Instruction *User, ArrayRef<Value*> Ops);
   /// Same as above, *except* that this uses the SCEV definition of invariant
   /// which is that an expression *can be made* invariant via SCEVExpander.
@@ -278,7 +278,7 @@ class LoopPredication {
 
   /// Return true if the value is known to produce a single fixed value across
   /// all iterations on which it executes.  Note that this does not imply
-  /// speculation safety.  That must be established seperately.  
+  /// speculation safety.  That must be established separately.
   bool isLoopInvariantValue(const SCEV* S);
 
   Value *expandCheck(SCEVExpander &Expander, Instruction *Guard,
@@ -398,7 +398,7 @@ LoopPredication::parseLoopICmp(ICmpInst *ICI) {
 }
 
 Value *LoopPredication::expandCheck(SCEVExpander &Expander,
-                                    Instruction *Guard, 
+                                    Instruction *Guard,
                                     ICmpInst::Predicate Pred, const SCEV *LHS,
                                     const SCEV *RHS) {
   Type *Ty = LHS->getType();
@@ -522,7 +522,7 @@ Instruction *LoopPredication::findInsertPt(Instruction *Use,
   return Preheader->getTerminator();
 }
 
-bool LoopPredication::isLoopInvariantValue(const SCEV* S) { 
+bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
   // Handling expressions which produce invariant results, but *haven't* yet
   // been removed from the loop serves two important purposes.
   // 1) Most importantly, it resolves a pass ordering cycle which would
@@ -535,12 +535,12 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
   // much more obviously in the IR.  Otherwise, the cost modeling for other
   // transforms would end up needing to duplicate all of this logic to model a
   // check which becomes predictable based on a modeled peel or unswitch.
-  // 
+  //
   // The cost of doing so in the worst case is an extra fill from the stack  in
   // the loop to materialize the loop invariant test value instead of checking
   // against the original IV which is presumable in a register inside the loop.
   // Such cases are presumably rare, and hint at missing oppurtunities for
-  // other passes. 
+  // other passes.
 
   if (SE->isLoopInvariant(S, L))
     // Note: This the SCEV variant, so the original Value* may be within the
@@ -548,7 +548,7 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
     return true;
 
   // Handle a particular important case which SCEV doesn't yet know about which
-  // shows up in range checks on arrays with immutable lengths.  
+  // shows up in range checks on arrays with immutable lengths.
   // TODO: This should be sunk inside SCEV.
   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
     if (const auto *LI = dyn_cast<LoadInst>(U->getValue()))
@@ -575,7 +575,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
   const SCEV *LatchLimit = LatchCheck.Limit;
   // Subtlety: We need all the values to be *invariant* across all iterations,
   // but we only need to check expansion safety for those which *aren't*
-  // already guaranteed to dominate the guard.  
+  // already guaranteed to dominate the guard.
   if (!isLoopInvariantValue(GuardStart) ||
       !isLoopInvariantValue(GuardLimit) ||
       !isLoopInvariantValue(LatchStart) ||
@@ -599,7 +599,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
   LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
   LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
   LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
- 
+
   auto *LimitCheck =
       expandCheck(Expander, Guard, LimitCheckPred, LatchLimit, RHS);
   auto *FirstIterationCheck = expandCheck(Expander, Guard, RangeCheck.Pred,
@@ -618,7 +618,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
   const SCEV *LatchLimit = LatchCheck.Limit;
   // Subtlety: We need all the values to be *invariant* across all iterations,
   // but we only need to check expansion safety for those which *aren't*
-  // already guaranteed to dominate the guard.  
+  // already guaranteed to dominate the guard.
   if (!isLoopInvariantValue(GuardStart) ||
       !isLoopInvariantValue(GuardLimit) ||
       !isLoopInvariantValue(LatchStart) ||
@@ -659,7 +659,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
 static void normalizePredicate(ScalarEvolution *SE, Loop *L,
                                LoopICmp& RC) {
   // LFTR canonicalizes checks to the ICMP_NE/EQ form; normalize back to the
-  // ULT/UGE form for ease of handling by our caller. 
+  // ULT/UGE form for ease of handling by our caller.
   if (ICmpInst::isEquality(RC.Pred) &&
       RC.IV->getStepRecurrence(*SE)->isOne() &&
       SE->isKnownPredicate(ICmpInst::ICMP_ULE, RC.IV->getStart(), RC.Limit))
@@ -1044,7 +1044,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
   // inserting a branch on the value which can be either poison or undef.  In
   // this case, the branch can legally go either way; we just need to avoid
   // introducing UB.  This is achieved through the use of the freeze
-  // instruction.  
+  // instruction.
 
   SmallVector<BasicBlock *, 16> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
@@ -1072,7 +1072,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
   // analyzeable after dropping widenability.
   {
     bool Invalidate = false;
-    
+
     for (auto *ExitingBB : ExitingBlocks) {
       if (LI->getLoopFor(ExitingBB) != L)
         continue;
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/preprocess.ll b/llvm/test/Transforms/RewriteStatepointsForGC/preprocess.ll
index 105e0e88ac215..6e03798b1d2a6 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/preprocess.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/preprocess.ll
@@ -29,7 +29,7 @@ next:                                             ; preds = %entry
 define void @test7() gc "statepoint-example" {
 ; CHECK-LABEL: test7
 ; CHECK-NOT: gc.statepoint
-; Need to delete unreachable gc.statepoint invoke - tested seperately given
+; Need to delete unreachable gc.statepoint invoke - tested separately given
 ; a correct implementation could only remove the instructions, not the block
   ret void
 

From 179c80117c91fc3ba3079740a91de40d98b18916 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 25 May 2020 20:44:38 +0100
Subject: [PATCH 052/770] [LoopUnroll] Remove dead NextBlocks argument (NFC).

---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index c0177bdb3f6fa..23b61c40a7567 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -717,7 +717,6 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   }
 
   auto setDest = [LoopExit, ContinueOnTrue](BasicBlock *Src, BasicBlock *Dest,
-                                            ArrayRef<BasicBlock *> NextBlocks,
                                             BasicBlock *BlockInLoop,
                                             bool NeedConditional) {
     auto *Term = cast<BranchInst>(Src->getTerminator());
@@ -779,7 +778,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         NeedConditional = false;
       }
 
-      setDest(Latches[i], Dest, Headers, Headers[i], NeedConditional);
+      setDest(Latches[i], Dest, Headers[i], NeedConditional);
     }
   } else {
     // Setup headers to branch to their new successors in the unrolled
@@ -803,7 +802,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         // unconditional branch for some iterations.
         NeedConditional = false;
 
-      setDest(Headers[i], Dest, Headers, HeaderSucc[i], NeedConditional);
+      setDest(Headers[i], Dest, HeaderSucc[i], NeedConditional);
     }
 
     // Set up latches to branch to the new header in the unrolled iterations or

From cec20db588254289dc2953517310b9886f6dc243 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 25 May 2020 15:43:28 -0700
Subject: [PATCH 053/770] [Inlining] Set inline-deferral-scale to 2.

Summary:
This patch sets inline-deferral-scale to 2.

Both internal and SPEC benchmarking show that 2 is the best number
among -1, 2, 3, and 4.

inline-deferral-scale  SPECint2006
------------------------------------------------------------
                   -1  38.0 (the default without this patch)
                    2  38.5
                    3  38.1
                    4  38.1

With the new default number, shouldBeDeferred returns true if:

  TotalCost < IC.getCost() * 2

where

  TotalCost is TotalSecondaryCost + IC.getCost() * NumCallerUsers.

If TotalSecondaryCost >= 0 and NumCallerUsers >= 2, then
TotalCost >= IC.getCost() * 2, so shouldBeDeferred returns true only
when NumCallerUsers is 1.

Now, if TotalSecondaryCost < 0, which can happen if
InlineConstants::LastCallToStaticBonus, a huge number, has been
subtracted from TotalSecondaryCost, then TotalCost may be negative.
In this case, shouldBeDeferred may return true even when
NumCallerUsers >= 2.

Reviewers: davidxl, nikic

Reviewed By: davidxl

Subscribers: xbolva00, hiraditya, dexonsmith, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80229
---
 llvm/lib/Analysis/InlineAdvisor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
index dcaf9d0fea749..ac3ba451aa3f6 100644
--- a/llvm/lib/Analysis/InlineAdvisor.cpp
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -45,7 +45,7 @@ static cl::opt<bool>
 static cl::opt<int>
     InlineDeferralScale("inline-deferral-scale",
                         cl::desc("Scale to limit the cost of inline deferral"),
-                        cl::init(-1), cl::Hidden);
+                        cl::init(2), cl::Hidden);
 
 namespace {
 class DefaultInlineAdvice : public InlineAdvice {

From 3a2df3bad07f7e5fc22538ad782e08ee55f29e41 Mon Sep 17 00:00:00 2001
From: Yuanfang Chen <yuanfang.chen@sony.com>
Date: Fri, 22 May 2020 12:33:33 -0700
Subject: [PATCH 054/770] [Clang][test] fix tests when using external
 assembler.

Summary:
The tests assume the integrated assembler (integrated-as) is used, so make it explicit.

Reviewed by: aganea

Differential Revision: https://reviews.llvm.org/D80454
---
 clang/test/Driver/cc1-spawnprocess.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/clang/test/Driver/cc1-spawnprocess.c b/clang/test/Driver/cc1-spawnprocess.c
index 8af8cc4c05553..36df7067487c9 100644
--- a/clang/test/Driver/cc1-spawnprocess.c
+++ b/clang/test/Driver/cc1-spawnprocess.c
@@ -1,18 +1,23 @@
-// RUN: %clang -fintegrated-cc1 -c -### %s 2>&1 | FileCheck %s --check-prefix=YES
+// If a toolchain uses an external assembler, the test would fail because using
+// an external assembler would increase job counts. Most toolchains in tree
+// use integrated assembler, but we still support external assembler.
+// So -fintegrated-as is specified explicitly when applicable.
+
+// RUN: %clang -fintegrated-cc1 -fintegrated-as -c -### %s 2>&1 | FileCheck %s --check-prefix=YES
 // RUN: %clang -fno-integrated-cc1 -c -### %s 2>&1 | FileCheck %s --check-prefix=NO
 
 // RUN: %clang -fintegrated-cc1 -fno-integrated-cc1 -c -### %s 2>&1 \
 // RUN:     | FileCheck %s --check-prefix=NO
-// RUN: %clang -fno-integrated-cc1 -fintegrated-cc1 -c -### %s 2>&1 \
+// RUN: %clang -fno-integrated-cc1 -fintegrated-cc1 -fintegrated-as -c -### %s 2>&1 \
 // RUN:     | FileCheck %s --check-prefix=YES
 
-// RUN: %clang_cl -fintegrated-cc1 -c -### -- %s 2>&1 \
+// RUN: %clang_cl -fintegrated-cc1 -fintegrated-as -c -### -- %s 2>&1 \
 // RUN:     | FileCheck %s --check-prefix=YES
 // RUN: %clang_cl -fno-integrated-cc1 -c -### -- %s 2>&1 \
 // RUN:     | FileCheck %s --check-prefix=NO
 
 // RUN: env CCC_OVERRIDE_OPTIONS=+-fintegrated-cc1 \
-// RUN:     %clang -fintegrated-cc1 -c -### %s 2>&1 \
+// RUN:     %clang -fintegrated-cc1 -fintegrated-as -c -### %s 2>&1 \
 // RUN:     | FileCheck %s --check-prefix=YES
 // RUN: env CCC_OVERRIDE_OPTIONS=+-fno-integrated-cc1 \
 // RUN:     %clang -fintegrated-cc1 -c -### %s 2>&1 \
@@ -24,7 +29,7 @@
 // The following tests ensure that only one integrated-cc1 is executed.
 
 // Only one TU, one job, thus integrated-cc1 is enabled.
-// RUN: %clang -fintegrated-cc1 -c %s -### 2>&1 | FileCheck %s --check-prefix=YES
+// RUN: %clang -fintegrated-cc1 -fintegrated-as -c %s -### 2>&1 | FileCheck %s --check-prefix=YES
 
 // Only one TU, but we're linking, two jobs, thus integrated-cc1 is disabled.
 // RUN: %clang -fintegrated-cc1 %s -### 2>&1 | FileCheck %s --check-prefix=NO

From 9a8d7bd77040a6497233ea10fd866ad9de8bf98c Mon Sep 17 00:00:00 2001
From: Yuanfang Chen <yuanfang.chen@sony.com>
Date: Mon, 25 May 2020 17:36:28 -0700
Subject: [PATCH 055/770] [clang][test] fix tests for external assemblers

These three tests depend on using the integrated assembler. Make it
explicit by specifying -fintegrated-as.
---
 clang/test/Driver/debug-prefix-map.S    | 4 ++--
 clang/test/Driver/flang/flang.f90       | 2 +-
 clang/test/Driver/flang/flang_ucase.F90 | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/test/Driver/debug-prefix-map.S b/clang/test/Driver/debug-prefix-map.S
index 7d12a17479726..6dd1ded9bfdfa 100644
--- a/clang/test/Driver/debug-prefix-map.S
+++ b/clang/test/Driver/debug-prefix-map.S
@@ -1,5 +1,5 @@
-// RUN: %clang -### -g -fdebug-prefix-map=old=new %s 2>&1 | FileCheck %s
-// RUN: %clang -### -g -ffile-prefix-map=old=new %s 2>&1 | FileCheck %s
+// RUN: %clang -### -g -fintegrated-as -fdebug-prefix-map=old=new %s 2>&1 | FileCheck %s
+// RUN: %clang -### -g -fintegrated-as -ffile-prefix-map=old=new %s 2>&1 | FileCheck %s
 
 // CHECK: cc1as
 // CHECK-SAME: -fdebug-prefix-map=old=new
diff --git a/clang/test/Driver/flang/flang.f90 b/clang/test/Driver/flang/flang.f90
index 9d47c7c90225c..a68be31343f9c 100644
--- a/clang/test/Driver/flang/flang.f90
+++ b/clang/test/Driver/flang/flang.f90
@@ -43,7 +43,7 @@
 ! CHECK-S-DAG: "-S"
 ! CHECK-S-DAG: "-o" "{{[^"]*}}.s"
 
-! RUN: %clang --driver-mode=flang -###                     %s 2>&1 | FileCheck --check-prefixes=ALL,CHECK-EMIT-OBJ %s
+! RUN: %clang --driver-mode=flang -### -fintegrated-as     %s 2>&1 | FileCheck --check-prefixes=ALL,CHECK-EMIT-OBJ %s
 ! CHECK-EMIT-OBJ-DAG: "-emit-obj"
 ! CHECK-EMIT-OBJ-DAG: "-o" "{{[^"]*}}.o"
 
diff --git a/clang/test/Driver/flang/flang_ucase.F90 b/clang/test/Driver/flang/flang_ucase.F90
index 323afb21dccf5..dd1e20088191f 100644
--- a/clang/test/Driver/flang/flang_ucase.F90
+++ b/clang/test/Driver/flang/flang_ucase.F90
@@ -43,7 +43,7 @@
 ! CHECK-S-DAG: "-S"
 ! CHECK-S-DAG: "-o" "{{[^"]*}}.s"
 
-! RUN: %clang --driver-mode=flang -###                     %s 2>&1 | FileCheck --check-prefixes=ALL,CHECK-EMIT-OBJ %s
+! RUN: %clang --driver-mode=flang -### -fintegrated-as     %s 2>&1 | FileCheck --check-prefixes=ALL,CHECK-EMIT-OBJ %s
 ! CHECK-EMIT-OBJ-DAG: "-emit-obj"
 ! CHECK-EMIT-OBJ-DAG: "-o" "{{[^"]*}}.o"
 

From 793cc518b9428a0b7a40c59d4ecd5939a7bc84f7 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanjai@ca.ibm.com>
Date: Mon, 25 May 2020 20:07:22 -0500
Subject: [PATCH 056/770] [PowerPC] Prevent legalization loop from promoting
 SELECT_CC from v4i32 to v4i32

As reported in https://bugs.llvm.org/show_bug.cgi?id=45709 we can hit an
infinite loop in legalization since we set the legalization action for
ISD::SELECT_CC for all fixed length vector types to Promote. Without some
different legalization action for the type being promoted to, the legalizer
simply loops. Since we don't have patterns to match the node, the right
legalization action should be Expand.

Differential revision: https://reviews.llvm.org/D79854
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  1 +
 llvm/test/CodeGen/PowerPC/pr45709.ll        | 58 +++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr45709.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d42eaa7b77062..2f9ff293c2775 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -718,6 +718,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
       }
     }
+    setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
     if (!Subtarget.hasP8Vector()) {
       setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
       setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
diff --git a/llvm/test/CodeGen/PowerPC/pr45709.ll b/llvm/test/CodeGen/PowerPC/pr45709.ll
new file mode 100644
index 0000000000000..bc295fafd2105
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr45709.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
+; RUN:     -mcpu=pwr6 -ppc-asm-full-reg-names -mattr=-vsx \
+; RUN:     -ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; There is code in the SDAG to expand FMAX/FMIN with fast flags to SELECT_CC.
+; On PPC, we had SELECT_CC legalized using Promote for all vector types
+; (including the type that they are all promoted to - which caused an infinite
+; loop in legalization). This test just ensures that we terminate on such input.
+define dso_local void @_ZN1a1bEv(<4 x float> %in) local_unnamed_addr #0 align 2 {
+; CHECK-LABEL: _ZN1a1bEv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bclr 12, 4*cr5+lt, 0
+; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-NEXT:    vxor v3, v3, v3
+; CHECK-NEXT:    addi r3, r3, .LCPI0_0@toc@l
+; CHECK-NEXT:    lvx v4, 0, r3
+; CHECK-NEXT:    addi r3, r1, -48
+; CHECK-NEXT:    stvx v3, 0, r3
+; CHECK-NEXT:    addi r3, r1, -32
+; CHECK-NEXT:    vperm v2, v2, v2, v4
+; CHECK-NEXT:    stvx v2, 0, r3
+; CHECK-NEXT:    lwz r3, -48(r1)
+; CHECK-NEXT:    lwz r4, -32(r1)
+; CHECK-NEXT:    cmpw r4, r3
+; CHECK-NEXT:    bc 12, gt, .LBB0_2
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_2: # %.preheader
+; CHECK-NEXT:    addi r3, r4, 0
+; CHECK-NEXT:  .LBB0_3: # %.preheader
+; CHECK-NEXT:    stw r3, -64(r1)
+; CHECK-NEXT:    addi r3, r1, -64
+; CHECK-NEXT:    lvx v2, 0, r3
+; CHECK-NEXT:    addi r3, r1, -16
+; CHECK-NEXT:    stvx v2, 0, r3
+; CHECK-NEXT:    blr
+  br i1 undef, label %7, label %1
+
+1:                                                ; preds = %1, %0
+  br i1 undef, label %2, label %1
+
+2:                                                ; preds = %1
+  %3 = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %4 = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %3, <4 x float> zeroinitializer)
+  %5 = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %4, <4 x float> undef)
+  %6 = extractelement <4 x float> %5, i32 0
+  br label %7
+
+7:                                                ; preds = %2, %0
+  %8 = phi float [ %6, %2 ], [ undef, %0 ]
+  %9 = fcmp fast une float %8, 0.000000e+00
+  ret void
+}
+
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
+
+attributes #0 = { nounwind }

From 9d55e4ee1367b440bb8402ce3a33d5a8b99aee06 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Mon, 25 May 2020 15:05:35 -0700
Subject: [PATCH 057/770] Make explicit -fno-semantic-interposition (in -fpic
 mode) infer dso_local

-fno-semantic-interposition is currently the CC1 default. (The opposite
disables some interprocedural optimizations.) However, it does not infer
dso_local: on most targets accesses to ExternalLinkage functions/variables
defined in the current module still need PLT/GOT.

This patch makes explicit -fno-semantic-interposition infer dso_local,
so that PLT/GOT can be eliminated if targets implement local aliases
for AsmPrinter::getSymbolPreferLocal (currently only x86).

Currently we check whether the module flag "SemanticInterposition" is 0.
If yes, infer dso_local. In the future, we can infer dso_local unless
"SemanticInterposition" is 1: frontends other than clang will also
benefit from the optimization if they don't bother setting the flag.
(There will be risks if they do want ELF interposition: they need to set
"SemanticInterposition" to 1.)
---
 clang/include/clang/Basic/LangOptions.def     |  1 +
 clang/include/clang/Driver/Options.td         |  2 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |  3 ++
 clang/lib/Driver/ToolChains/Clang.cpp         | 10 ++--
 clang/lib/Frontend/CompilerInvocation.cpp     |  3 ++
 clang/test/CodeGen/semantic-interposition.c   |  4 ++
 clang/test/Driver/fsemantic-interposition.c   |  6 ++-
 llvm/include/llvm/IR/GlobalValue.h            |  1 +
 llvm/include/llvm/IR/Module.h                 |  1 +
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |  8 ++--
 llvm/lib/IR/Globals.cpp                       |  6 +++
 llvm/lib/IR/Module.cpp                        |  7 +++
 llvm/lib/Target/TargetMachine.cpp             |  8 ++++
 .../semantic-interposition-infer-dsolocal.ll  | 46 +++++++++++++++++++
 14 files changed, 96 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/semantic-interposition-infer-dsolocal.ll

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index e94305da46ba5..6e72b47f489b5 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -301,6 +301,7 @@ ENUM_LANGOPT(TypeVisibilityMode, Visibility, 3, DefaultVisibility,
 LANGOPT(SetVisibilityForExternDecls, 1, 0,
         "apply global symbol visibility to external declarations without an explicit visibility")
 BENIGN_LANGOPT(SemanticInterposition        , 1, 0, "semantic interposition")
+BENIGN_LANGOPT(ExplicitNoSemanticInterposition, 1, 0, "explicitly no semantic interposition")
 ENUM_LANGOPT(StackProtector, StackProtectorMode, 2, SSPOff,
              "stack protector mode")
 ENUM_LANGOPT(TrivialAutoVarInit, TrivialAutoVarInitKind, 2, TrivialAutoVarInitKind::Uninitialized,
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 7213af1731c17..e88e6cf8a1301 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3371,7 +3371,7 @@ defm ipa_cp : BooleanFFlag<"ipa-cp">,
     Group<clang_ignored_gcc_optimization_f_Group>;
 defm ivopts : BooleanFFlag<"ivopts">, Group<clang_ignored_gcc_optimization_f_Group>;
 def fsemantic_interposition : Flag<["-"], "fsemantic-interposition">, Group<f_Group>, Flags<[CC1Option]>;
-def fno_semantic_interposition: Flag<["-"], "fno-semantic-interposition">, Group<f_Group>;
+def fno_semantic_interposition: Flag<["-"], "fno-semantic-interposition">, Group<f_Group>, Flags<[CC1Option]>;
 defm non_call_exceptions : BooleanFFlag<"non-call-exceptions">, Group<clang_ignored_f_Group>;
 defm peel_loops : BooleanFFlag<"peel-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
 defm permissive : BooleanFFlag<"permissive">, Group<clang_ignored_f_Group>;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 8ba7fb756ada8..f43bc6434dafd 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -493,6 +493,9 @@ void CodeGenModule::Release() {
   if (Context.getLangOpts().SemanticInterposition)
     // Require various optimization to respect semantic interposition.
     getModule().setSemanticInterposition(1);
+  else if (Context.getLangOpts().ExplicitNoSemanticInterposition)
+    // Allow dso_local on applicable targets.
+    getModule().setSemanticInterposition(0);
 
   if (CodeGenOpts.EmitCodeView) {
     // Indicate that we want CodeView in the metadata.
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index deb60ed68cfca..f33983db3e1eb 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4471,10 +4471,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(A->getValue());
   }
 
-  if (Args.hasFlag(options::OPT_fsemantic_interposition,
-                   options::OPT_fno_semantic_interposition, false) &&
-      RelocationModel != llvm::Reloc::Static && !IsPIE)
-    CmdArgs.push_back("-fsemantic-interposition");
+  // The default is -fno-semantic-interposition. We render it just because we
+  // require explicit -fno-semantic-interposition to infer dso_local.
+  if (Arg *A = Args.getLastArg(options::OPT_fsemantic_interposition,
+                               options::OPT_fno_semantic_interposition))
+    if (RelocationModel != llvm::Reloc::Static && !IsPIE)
+      A->render(Args, CmdArgs);
 
   CmdArgs.push_back("-mthread-model");
   if (Arg *A = Args.getLastArg(options::OPT_mthread_model)) {
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index b4bc027e832b1..f98490cd9a114 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3049,6 +3049,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
   }
 
   Opts.SemanticInterposition = Args.hasArg(OPT_fsemantic_interposition);
+  // An explicit -fno-semantic-interposition infers dso_local.
+  Opts.ExplicitNoSemanticInterposition =
+      Args.hasArg(OPT_fno_semantic_interposition);
 
   // -mrtd option
   if (Arg *A = Args.getLastArg(OPT_mrtd)) {
diff --git a/clang/test/CodeGen/semantic-interposition.c b/clang/test/CodeGen/semantic-interposition.c
index 43656e36021ff..3d6c5f2872b57 100644
--- a/clang/test/CodeGen/semantic-interposition.c
+++ b/clang/test/CodeGen/semantic-interposition.c
@@ -1,5 +1,9 @@
 // RUN: %clang_cc1 -emit-llvm -fsemantic-interposition %s -o - | FileCheck --check-prefix=INTERPOSITION %s
 // RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck --check-prefix=NO %s
+/// With explicit -fno-semantic-interposition, add a module flag to inform the
+/// backend that dso_local can be inferred.
+// RUN: %clang_cc1 -emit-llvm -fno-semantic-interposition %s -o - | FileCheck --check-prefix=EXPLICIT_NO %s
 
 // INTERPOSITION: !{{[0-9]+}} = !{i32 1, !"SemanticInterposition", i32 1}
 // NO-NOT: "SemanticInterposition"
+// EXPLICIT_NO: !{{[0-9]+}} = !{i32 1, !"SemanticInterposition", i32 0}
diff --git a/clang/test/Driver/fsemantic-interposition.c b/clang/test/Driver/fsemantic-interposition.c
index 20bc2c6f72703..af3e7575a7997 100644
--- a/clang/test/Driver/fsemantic-interposition.c
+++ b/clang/test/Driver/fsemantic-interposition.c
@@ -2,8 +2,12 @@
 // RUN: %clang -target x86_64 %s -Werror -fPIC -fsemantic-interposition -c -### 2>&1 | FileCheck %s
 // CHECK: "-fsemantic-interposition"
 
-// RUN: %clang -target x86_64 %s -Werror -fPIC -fsemantic-interposition -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+/// Require explicit -fno-semantic-interposition to infer dso_local.
+// RUN: %clang -target x86_64 %s -Werror -fPIC -fsemantic-interposition -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=EXPLICIT_NO %s
+// EXPLICIT_NO: "-fno-semantic-interposition"
+
 // RUN: %clang -target x86_64 %s -Werror -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
 // RUN: %clang -target x86_64 %s -Werror -fPIC -c -### 2>&1 | FileCheck --check-prefix=NO %s
 // RUN: %clang -target x86_64 %s -Werror -fPIE -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
 // NO-NOT: "-fsemantic-interposition"
+// NO-NOT: "-fno-semantic-interposition"
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index 398eca2d9b2e9..1c19011c9131c 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -427,6 +427,7 @@ class GlobalValue : public Constant {
   /// inlining across interposable call edges, since the callee can be
   /// replaced with something arbitrary.
   bool isInterposable() const;
+  bool canBenefitFromLocalAlias() const;
 
   bool hasExternalLinkage() const { return isExternalLinkage(getLinkage()); }
   bool hasAvailableExternallyLinkage() const {
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index 3052651a37226..ead003007904c 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -857,6 +857,7 @@ class Module {
 
   /// Returns whether semantic interposition is to be respected.
   bool getSemanticInterposition() const;
+  bool noSemanticInterposition() const;
 
   /// Set whether semantic interposition is to be respected.
   void setSemanticInterposition(bool);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index db458e2b8a92c..5fba0f01ba524 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -462,10 +462,10 @@ MCSymbol *AsmPrinter::getSymbolPreferLocal(const GlobalValue &GV) const {
   // assembler would otherwise be conservative and assume a global default
   // visibility symbol can be interposable, even if the code generator already
   // assumed it.
-  if (TM.getTargetTriple().isOSBinFormatELF() &&
-      GlobalObject::isExternalLinkage(GV.getLinkage()) && GV.isDSOLocal() &&
-      !GV.isDeclaration() && !isa<GlobalIFunc>(GV) && !GV.hasComdat())
-    return getSymbolWithGlobalValueBase(&GV, "$local");
+  if (TM.getTargetTriple().isOSBinFormatELF() && GV.canBenefitFromLocalAlias())
+    if (GV.isDSOLocal() || (TM.getTargetTriple().isX86() &&
+                            GV.getParent()->noSemanticInterposition()))
+      return getSymbolWithGlobalValueBase(&GV, "$local");
   return TM.getSymbol(&GV);
 }
 
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 7c1c682d0262e..eefd221ec389d 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -101,6 +101,12 @@ bool GlobalValue::isInterposable() const {
          !isDSOLocal();
 }
 
+bool GlobalValue::canBenefitFromLocalAlias() const {
+  // See AsmPrinter::getSymbolPreferLocal().
+  return GlobalObject::isExternalLinkage(getLinkage()) && !isDeclaration() &&
+         !isa<GlobalIFunc>(this) && !hasComdat();
+}
+
 unsigned GlobalValue::getAlignment() const {
   if (auto *GA = dyn_cast<GlobalAlias>(this)) {
     // In general we cannot compute this at the IR level, but we try.
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 9ac1edb2519d3..1416cdce99749 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -600,6 +600,13 @@ void Module::setSemanticInterposition(bool SI) {
   addModuleFlag(ModFlagBehavior::Error, "SemanticInterposition", SI);
 }
 
+bool Module::noSemanticInterposition() const {
+  // Conservatively require an explicit zero value for now.
+  Metadata *MF = getModuleFlag("SemanticInterposition");
+  auto *Val = cast_or_null<ConstantAsMetadata>(MF);
+  return Val && cast<ConstantInt>(Val->getValue())->getZExtValue() == 0;
+}
+
 void Module::setOwnedMemoryBuffer(std::unique_ptr<MemoryBuffer> MB) {
   OwnedMemoryBuffer = std::move(MB);
 }
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 1de6e871569cd..074e9fde79e6b 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -193,6 +193,14 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
     // Check if we can use copy relocations.
     if (!(GV && GV->isThreadLocal()) && RM == Reloc::Static)
       return true;
+  } else if (TT.isOSBinFormatELF()) {
+    // If dso_local allows AsmPrinter::getSymbolPreferLocal to use a local
+    // alias, set the flag. We cannot set dso_local for other global values,
+    // because otherwise direct accesses to a probably interposable symbol (even
+    // if the codegen assumes not) will be rejected by the linker.
+    if (!GV || !GV->canBenefitFromLocalAlias())
+      return false;
+    return TT.isX86() && M.noSemanticInterposition();
   }
 
   // ELF & wasm support preemption of other symbols.
diff --git a/llvm/test/CodeGen/X86/semantic-interposition-infer-dsolocal.ll b/llvm/test/CodeGen/X86/semantic-interposition-infer-dsolocal.ll
new file mode 100644
index 0000000000000..a0391d0364681
--- /dev/null
+++ b/llvm/test/CodeGen/X86/semantic-interposition-infer-dsolocal.ll
@@ -0,0 +1,46 @@
+; RUN: llc -mtriple=x86_64 -relocation-model=pic < %s | FileCheck %s
+
+;; With a module flag SemanticInterposition=0, infer dso_local flags even if PIC.
+;; Local aliases will be generated for applicable variables and functions.
+
+@var = global i32 0, align 4
+
+@ifunc = ifunc i32 (), bitcast (i32 ()* ()* @ifunc_resolver to i32 ()*)
+
+define i32 @ifunc_impl() {
+entry:
+  ret i32 0
+}
+
+define i32 ()* @ifunc_resolver() {
+entry:
+  ret i32 ()* @ifunc_impl
+}
+
+declare i32 @external()
+
+define i32 @func() {
+  ret i32 0
+}
+
+;; Don't set dso_local on declarations or ifuncs.
+define i32 @foo() {
+; CHECK: movl .Lvar$local(%rip), %ebp
+; CHECK: callq external@PLT
+; CHECK: callq ifunc@PLT
+; CHECK: callq .Lfunc$local{{$}}
+entry:
+  %0 = load i32, i32* @var, align 4
+  %call = tail call i32 @external()
+  %add = add nsw i32 %call, %0
+  %call1 = tail call i32 @ifunc()
+  %add2 = add nsw i32 %add, %call1
+  %call2 = tail call i32 @func()
+  %add3 = add nsw i32 %add, %call2
+  ret i32 %add3
+}
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"SemanticInterposition", i32 0}
+!1 = !{i32 7, !"PIC Level", i32 2}

From d8e0ad9620c6e626d753a3ae0da6c712e4d400d3 Mon Sep 17 00:00:00 2001
From: Yuanfang Chen <yuanfang.chen@sony.com>
Date: Mon, 25 May 2020 22:14:05 -0700
Subject: [PATCH 058/770] [clang][test] fix tests for external assemblers

The test depends on using the integrated assembler. Make it
explicit by specifying -fintegrated-as.
---
 clang/test/Driver/modules-ts.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/test/Driver/modules-ts.cpp b/clang/test/Driver/modules-ts.cpp
index 3847b71f7b742..80eef081371fa 100644
--- a/clang/test/Driver/modules-ts.cpp
+++ b/clang/test/Driver/modules-ts.cpp
@@ -9,7 +9,7 @@
 
 // Check compiling a .pcm file to a .o file.
 //
-// RUN: %clang -fmodules-ts %t.pcm -c -o %t.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-COMPILE
+// RUN: %clang -fmodules-ts -fintegrated-as %t.pcm -c -o %t.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-COMPILE
 //
 // CHECK-COMPILE: -cc1 {{.*}} -emit-obj
 // CHECK-COMPILE-SAME: -o {{.*}}.pcm.o
@@ -18,7 +18,7 @@
 
 // Check use of a .pcm file in another compilation.
 //
-// RUN: %clang -fmodules-ts -fmodule-file=%t.pcm -Dexport= %s -c -o %t.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
+// RUN: %clang -fmodules-ts -fmodule-file=%t.pcm -fintegrated-as -Dexport= %s -c -o %t.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
 //
 // CHECK-USE: -cc1
 // CHECK-USE-SAME: -emit-obj
@@ -28,7 +28,7 @@
 
 // Check combining precompile and compile steps works.
 //
-// RUN: %clang -fmodules-ts -x c++-module %s -c -o %t.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE --check-prefix=CHECK-COMPILE
+// RUN: %clang -fmodules-ts -fintegrated-as -x c++-module %s -c -o %t.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE --check-prefix=CHECK-COMPILE
 
 // Check that .cppm is treated as a module implicitly.
 // RUN: cp %s %t.cppm

From eeedbd033612e105755156023bdeec2fba4eca21 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Mon, 25 May 2020 17:25:55 +0200
Subject: [PATCH 059/770] [clangd] Make use of SourceOrder to find first
 initializer in DefineOutline

Summary:
Constructors can have implicit initializers, which was crashing define
outline. Make sure we find the first "written" ctor initializer to figure out
the `:` location.

Fixes https://github.com/clangd/clangd/issues/400

Reviewers: sammccall

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80521
---
 .../clangd/refactor/tweaks/DefineOutline.cpp  | 10 ++--
 .../clangd/unittests/TweakTests.cpp           | 46 +++++++++++++++++--
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
index 405ff90a5945c..63a5ba6cb9988 100644
--- a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
+++ b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
@@ -317,18 +317,16 @@ SourceRange getDeletionRange(const FunctionDecl *FD,
                              const syntax::TokenBuffer &TokBuf) {
   auto DeletionRange = FD->getBody()->getSourceRange();
   if (auto *CD = llvm::dyn_cast<CXXConstructorDecl>(FD)) {
-    const auto &SM = TokBuf.sourceManager();
     // AST doesn't contain the location for ":" in ctor initializers. Therefore
     // we find it by finding the first ":" before the first ctor initializer.
     SourceLocation InitStart;
     // Find the first initializer.
     for (const auto *CInit : CD->inits()) {
-      // We don't care about in-class initializers.
-      if (CInit->isInClassMemberInitializer())
+      // SourceOrder is -1 for implicit initializers.
+      if (CInit->getSourceOrder() != 0)
         continue;
-      if (InitStart.isInvalid() ||
-          SM.isBeforeInTranslationUnit(CInit->getSourceLocation(), InitStart))
-        InitStart = CInit->getSourceLocation();
+      InitStart = CInit->getSourceLocation();
+      break;
     }
     if (InitStart.isValid()) {
       auto Toks = TokBuf.expandedTokens(CD->getSourceRange());
diff --git a/clang-tools-extra/clangd/unittests/TweakTests.cpp b/clang-tools-extra/clangd/unittests/TweakTests.cpp
index b0a941dae5d2c..319d9e088c2d8 100644
--- a/clang-tools-extra/clangd/unittests/TweakTests.cpp
+++ b/clang-tools-extra/clangd/unittests/TweakTests.cpp
@@ -2059,21 +2059,57 @@ TEST_F(DefineOutlineTest, ApplyTest) {
           "void foo(int x, int y = 5, int = 2, int (*foo)(int) = nullptr) ;",
           "void foo(int x, int y , int , int (*foo)(int) ) {}",
       },
-      // Ctor initializers.
+      // Constructors
+      {
+          R"cpp(
+            class Foo {public: Foo(); Foo(int);};
+            class Bar {
+              Ba^r() {}
+              Bar(int x) : f1(x) {}
+              Foo f1;
+              Foo f2 = 2;
+            };)cpp",
+          R"cpp(
+            class Foo {public: Foo(); Foo(int);};
+            class Bar {
+              Bar() ;
+              Bar(int x) : f1(x) {}
+              Foo f1;
+              Foo f2 = 2;
+            };)cpp",
+          "Bar::Bar() {}\n",
+      },
+      // Ctor with initializer.
+      {
+          R"cpp(
+            class Foo {public: Foo(); Foo(int);};
+            class Bar {
+              Bar() {}
+              B^ar(int x) : f1(x), f2(3) {}
+              Foo f1;
+              Foo f2 = 2;
+            };)cpp",
+          R"cpp(
+            class Foo {public: Foo(); Foo(int);};
+            class Bar {
+              Bar() {}
+              Bar(int x) ;
+              Foo f1;
+              Foo f2 = 2;
+            };)cpp",
+          "Bar::Bar(int x) : f1(x), f2(3) {}\n",
+      },
+      // Ctor initializer with attribute.
       {
           R"cpp(
               class Foo {
-                int y = 2;
                 F^oo(int z) __attribute__((weak)) : bar(2){}
                 int bar;
-                int z = 2;
               };)cpp",
           R"cpp(
               class Foo {
-                int y = 2;
                 Foo(int z) __attribute__((weak)) ;
                 int bar;
-                int z = 2;
               };)cpp",
           "Foo::Foo(int z) __attribute__((weak)) : bar(2){}\n",
       },

From 34e39eb2adc2b3f16c2c2c0607a904ee55705c01 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Tue, 5 May 2020 17:55:11 +0200
Subject: [PATCH 060/770] [clangd] Change PreambleOnlyAction with content
 truncation

Summary:
Lexing until the token location is past preamble bound could be wrong
in some cases as preprocessor lexer can lex multiple tokens in a single call.

Reviewers: sammccall

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D79426
---
 clang-tools-extra/clangd/Preamble.cpp         | 29 ++++++-------------
 .../clangd/unittests/PreambleTests.cpp        |  5 ++++
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp
index 9d9c5eff8c682..d3eaa92d4c1ac 100644
--- a/clang-tools-extra/clangd/Preamble.cpp
+++ b/clang-tools-extra/clangd/Preamble.cpp
@@ -104,24 +104,6 @@ class CppFilePreambleCallbacks : public PreambleCallbacks {
   const SourceManager *SourceMgr = nullptr;
 };
 
-// Runs preprocessor over preamble section.
-class PreambleOnlyAction : public PreprocessorFrontendAction {
-protected:
-  void ExecuteAction() override {
-    Preprocessor &PP = getCompilerInstance().getPreprocessor();
-    auto &SM = PP.getSourceManager();
-    PP.EnterMainSourceFile();
-    auto Bounds = ComputePreambleBounds(getCompilerInstance().getLangOpts(),
-                                        SM.getBuffer(SM.getMainFileID()), 0);
-    Token Tok;
-    do {
-      PP.Lex(Tok);
-      assert(SM.isInMainFile(Tok.getLocation()));
-    } while (Tok.isNot(tok::eof) &&
-             SM.getDecomposedLoc(Tok.getLocation()).second < Bounds.Size);
-  }
-};
-
 /// Gets the includes in the preamble section of the file by running
 /// preprocessor over \p Contents. Returned includes do not contain resolved
 /// paths. \p VFS and \p Cmd is used to build the compiler invocation, which
@@ -142,8 +124,15 @@ scanPreambleIncludes(llvm::StringRef Contents,
                                    "failed to create compiler invocation");
   CI->getDiagnosticOpts().IgnoreWarnings = true;
   auto ContentsBuffer = llvm::MemoryBuffer::getMemBuffer(Contents);
+  // This means we're scanning (though not preprocessing) the preamble section
+  // twice. However, it's important to precisely follow the preamble bounds used
+  // elsewhere.
+  auto Bounds =
+      ComputePreambleBounds(*CI->getLangOpts(), ContentsBuffer.get(), 0);
+  auto PreambleContents =
+      llvm::MemoryBuffer::getMemBufferCopy(Contents.substr(0, Bounds.Size));
   auto Clang = prepareCompilerInstance(
-      std::move(CI), nullptr, std::move(ContentsBuffer),
+      std::move(CI), nullptr, std::move(PreambleContents),
       // Provide an empty FS to prevent preprocessor from performing IO. This
       // also implies missing resolved paths for includes.
       new llvm::vfs::InMemoryFileSystem, IgnoreDiags);
@@ -152,7 +141,7 @@ scanPreambleIncludes(llvm::StringRef Contents,
                                    "compiler instance had no inputs");
   // We are only interested in main file includes.
   Clang->getPreprocessorOpts().SingleFileParseMode = true;
-  PreambleOnlyAction Action;
+  PreprocessOnlyAction Action;
   if (!Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0]))
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "failed BeginSourceFile");
diff --git a/clang-tools-extra/clangd/unittests/PreambleTests.cpp b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
index c1801980b1d50..db615e6e66e13 100644
--- a/clang-tools-extra/clangd/unittests/PreambleTests.cpp
+++ b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
@@ -118,6 +118,11 @@ TEST(PreamblePatchTest, IncludeParsing) {
         ^#include "a.h"
         #include <b
         ^#include <b.h>)cpp",
+      // Directive is not part of the preamble if it is not the token that
+      // immediately follows the hash (#).
+      R"cpp(
+        ^#include "a.h"
+        #/**/include <b.h>)cpp",
   };
 
   for (const auto Case : Cases) {

From e6e89875b04ea521a9dbf3e6a82d81b23f9f77d7 Mon Sep 17 00:00:00 2001
From: Kang Zhang <shkzhang@cn.ibm.com>
Date: Tue, 26 May 2020 06:14:08 +0000
Subject: [PATCH 061/770] [NFC][PowerPC] Add a new case to test two-address
 verification

---
 .../CodeGen/PowerPC/two-address-crash.mir     | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/two-address-crash.mir

diff --git a/llvm/test/CodeGen/PowerPC/two-address-crash.mir b/llvm/test/CodeGen/PowerPC/two-address-crash.mir
new file mode 100644
index 0000000000000..6e98d3d8d398b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/two-address-crash.mir
@@ -0,0 +1,45 @@
+# RUN: not --crash llc -mtriple=ppc32-- %s -run-pass=phi-node-elimination \
+# RUN:   -verify-machineinstrs -o /dev/null 2>&1 | FileCheck %s
+
+--- |
+  define void @VerifyTwoAddressCrash(i16 %div.0.i.i.i.i, i32 %L_num.0.i.i.i.i, i32 %tmp1.i.i206.i.i, i16* %P) {
+    %X = shl i16 %div.0.i.i.i.i, 1
+    %tmp28.i.i.i.i = shl i32 %L_num.0.i.i.i.i, 1
+    %tmp31.i.i.i.i = icmp slt i32 %tmp28.i.i.i.i, %tmp1.i.i206.i.i
+    %tmp31.i.i.i.i.upgrd.1 = zext i1 %tmp31.i.i.i.i to i16
+    %tmp371.i.i.i.i1 = or i16 %tmp31.i.i.i.i.upgrd.1, %X
+    %div.0.be.i.i.i.i = xor i16 %tmp371.i.i.i.i1, 1
+    store i16 %div.0.be.i.i.i.i, i16* %P, align 2
+    ret void
+  }
+
+...
+---
+name:            VerifyTwoAddressCrash
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $r3, $r4, $r5, $r6
+  
+    %3:gprc_and_gprc_nor0 = COPY killed $r6
+    %2:gprc = COPY killed $r5
+    %1:gprc = COPY killed $r4
+    %0:gprc = COPY killed $r3
+    %4:gprc = RLWINM killed %1, 1, 0, 30
+    %5:crrc = CMPW killed %4, killed %2
+    %6:crbitrc = COPY killed %5.sub_lt
+    %7:gprc_and_gprc_nor0 = LI 0
+    %8:gprc_and_gprc_nor0 = LI 1
+    %9:gprc = ISEL killed %8, killed %7, killed %6
+    %10:gprc = RLWIMI killed %9, killed %0, 1, 0, 30
+    %11:gprc = XORI killed %10, 1
+    STH killed %11, 0, killed %3 :: (store 2 into %ir.P)
+    BLR implicit $lr, implicit $rm
+
+...
+
+# CHECK-LABEL: Bad machine code: Two-address instruction operands must be identical
+# CHECK-NEXT:  - function:    VerifyTwoAddressCrash
+# CHECK-NEXT:  - basic block: %bb.0
+# CHECK-NEXT:  - instruction: %10:gprc = RLWIMI killed %9:gprc(tied-def 0), killed %3:gprc, 1, 0, 30
+# CHECK-NEXT:  - operand 1:   killed %9:gprc(tied-def 0)
+# CHECK-NEXT:  LLVM ERROR: Found 1 machine code errors.

From 61f72dd8ace7c4bea1ae74d9734d2b02946b4898 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Tue, 26 May 2020 13:19:01 +0700
Subject: [PATCH 062/770] [FPEnv] Small fixes to implementation of flt.rounds

This change makes minor corrections to the implementation of the intrinsic
`llvm.flt.rounds`:
- Added a documentation entry in LangRef,
- Changed the attributes of the intrinsic to be in line with other functions
  that depend on the floating-point environment.

Differential Revision: https://reviews.llvm.org/D79322
---
 clang/include/clang/Basic/Builtins.def |  4 ++-
 llvm/docs/LangRef.rst                  | 40 ++++++++++++++++++++++++++
 llvm/include/llvm/IR/Intrinsics.td     |  9 ++++--
 3 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index 4f1a7f24c4329..4c43d63ffec40 100644
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -323,6 +323,9 @@ BUILTIN(__builtin_truncf, "ff", "Fnc")
 BUILTIN(__builtin_truncl, "LdLd", "Fnc")
 BUILTIN(__builtin_truncf16, "hh", "Fnc")
 
+// Access to floating point environment
+BUILTIN(__builtin_flt_rounds, "i", "n")
+
 // C99 complex builtins
 BUILTIN(__builtin_cabs, "dXd", "Fne")
 BUILTIN(__builtin_cabsf, "fXf", "Fne")
@@ -517,7 +520,6 @@ BUILTIN(__builtin_return_address, "v*IUi", "n")
 BUILTIN(__builtin_extract_return_addr, "v*v*", "n")
 BUILTIN(__builtin_frame_address, "v*IUi", "n")
 BUILTIN(__builtin___clear_cache, "vc*c*", "n")
-BUILTIN(__builtin_flt_rounds, "i", "nc")
 BUILTIN(__builtin_setjmp, "iv**", "j")
 BUILTIN(__builtin_longjmp, "vv**i", "r")
 BUILTIN(__builtin_unwind_init, "v", "")
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index bf0627e441960..8bcad09964e20 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18284,6 +18284,46 @@ This function returns the same values as the libm ``trunc`` functions
 would and handles error conditions in the same way.
 
 
+Floating Point Environment Manipulation intrinsics
+--------------------------------------------------
+
+These functions read or write the floating point environment, such as the
+rounding mode or the state of floating point exceptions. Altering the floating
+point environment requires special care. See :ref:`Floating Point Environment <floatenv>`.
+
+'``llvm.flt.rounds``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.flt.rounds()
+
+Overview:
+"""""""""
+
+The '``llvm.flt.rounds``' intrinsic reads the current rounding mode.
+
+Semantics:
+""""""""""
+
+The '``llvm.flt.rounds``' intrinsic returns the current rounding mode.
+The encoding of the returned values is the same as the result of ``FLT_ROUNDS``,
+as specified by the C standard:
+
+::
+
+    0  - toward zero
+    1  - to nearest, ties to even
+    2  - toward positive infinity
+    3  - toward negative infinity
+    4  - to nearest, ties away from zero
+
+Other values may be used to represent additional rounding modes supported by a
+target. These values are target-specific.
+
 General Intrinsics
 ------------------
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index dafa17959e826..51df06cee3587 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -612,6 +612,13 @@ def int_objectsize : Intrinsic<[llvm_anyint_ty],
                                [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<1>, ImmArg<2>, ImmArg<3>]>,
                                GCCBuiltin<"__builtin_object_size">;
 
+//===--------------- Access to Floating Point Environment -----------------===//
+//
+
+let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in {
+  def int_flt_rounds    : Intrinsic<[llvm_i32_ty], []>;
+}
+
 //===--------------- Constrained Floating Point Intrinsics ----------------===//
 //
 
@@ -1115,8 +1122,6 @@ def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty],
 
 ///===-------------------------- Other Intrinsics --------------------------===//
 //
-def int_flt_rounds : Intrinsic<[llvm_i32_ty]>,
-                     GCCBuiltin<"__builtin_flt_rounds">;
 def int_trap : Intrinsic<[], [], [IntrNoReturn, IntrCold]>,
                GCCBuiltin<"__builtin_trap">;
 def int_debugtrap : Intrinsic<[]>,

From 872c5fb1432493c0a09b6f210765c0d94ce9b5d0 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Mon, 25 May 2020 23:00:50 -0700
Subject: [PATCH 063/770] [AsmPrinter] Don't generate .Lfoo$local for -fno-PIC
 and -fPIE

-fno-PIC and -fPIE code generally cannot be linked in -shared mode, so there is no benefit in accessing symbols via local aliases.

Actually, a .Lfoo$local reference will be converted to an STT_SECTION reference (if there is no section relaxation), which will cause the section symbol (sizeof(Elf64_Sym)=24) to be generated.
---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |  12 +-
 .../CodeGen/AArch64/fp16_intrinsic_lane.ll    |  34 ----
 .../machine-outliner-retaddr-sign-sp-mod.ll   |   2 -
 llvm/test/CodeGen/X86/code-model-elf.ll       |  36 ++--
 llvm/test/CodeGen/X86/emutls.ll               |   8 +-
 .../X86/indirect-branch-tracking-eh2.ll       |   1 -
 llvm/test/CodeGen/X86/lifetime-alias.ll       |   4 +-
 llvm/test/CodeGen/X86/linux-preemption.ll     |  15 +-
 llvm/test/CodeGen/X86/oddsubvector.ll         | 176 +++++++++---------
 llvm/test/CodeGen/X86/pr38795.ll              |   2 +-
 .../X86/semantic-interposition-comdat.ll      |   7 +-
 llvm/test/CodeGen/X86/tls.ll                  |   8 +-
 12 files changed, 138 insertions(+), 167 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 5fba0f01ba524..1a2b3761b3a79 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -462,10 +462,14 @@ MCSymbol *AsmPrinter::getSymbolPreferLocal(const GlobalValue &GV) const {
   // assembler would otherwise be conservative and assume a global default
   // visibility symbol can be interposable, even if the code generator already
   // assumed it.
-  if (TM.getTargetTriple().isOSBinFormatELF() && GV.canBenefitFromLocalAlias())
-    if (GV.isDSOLocal() || (TM.getTargetTriple().isX86() &&
-                            GV.getParent()->noSemanticInterposition()))
-      return getSymbolWithGlobalValueBase(&GV, "$local");
+  if (TM.getTargetTriple().isOSBinFormatELF() && GV.canBenefitFromLocalAlias()) {
+    const Module &M = *GV.getParent();
+    if (TM.getRelocationModel() != Reloc::Static &&
+        M.getPIELevel() == PIELevel::Default)
+      if (GV.isDSOLocal() || (TM.getTargetTriple().isX86() &&
+                              GV.getParent()->noSemanticInterposition()))
+        return getSymbolWithGlobalValueBase(&GV, "$local");
+  }
   return TM.getSymbol(&GV);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index 1b0c7c3468870..90a5e2453a776 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -10,7 +10,6 @@ declare half @llvm.fma.f16(half, half, half) #1
 
 define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfma_lane_f16:
-; CHECK:       .Lt_vfma_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -24,7 +23,6 @@ entry:
 
 define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmaq_lane_f16:
-; CHECK:       .Lt_vfmaq_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -38,7 +36,6 @@ entry:
 
 define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfma_laneq_f16:
-; CHECK:       .Lt_vfma_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmla v0.4h, v1.4h, v2.h[0]
@@ -51,7 +48,6 @@ entry:
 
 define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmaq_laneq_f16:
-; CHECK:       .Lt_vfmaq_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmla v0.8h, v1.8h, v2.h[0]
@@ -64,7 +60,6 @@ entry:
 
 define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfma_n_f16:
-; CHECK:       .Lt_vfma_n_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -79,7 +74,6 @@ entry:
 
 define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfmaq_n_f16:
-; CHECK:       .Lt_vfmaq_n_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -94,7 +88,6 @@ entry:
 
 define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_lane_f16:
-; CHECK:       .Lt_vfmah_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -108,7 +101,6 @@ entry:
 
 define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmah_laneq_f16:
-; CHECK:       .Lt_vfmah_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmla h0, h1, v2.h[0]
@@ -121,7 +113,6 @@ entry:
 
 define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfms_lane_f16:
-; CHECK:       .Lt_vfms_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -136,7 +127,6 @@ entry:
 
 define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsq_lane_f16:
-; CHECK:       .Lt_vfmsq_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -151,7 +141,6 @@ entry:
 
 define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfms_laneq_f16:
-; CHECK:       .Lt_vfms_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmls v0.4h, v1.4h, v2.h[0]
@@ -165,7 +154,6 @@ entry:
 
 define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsq_laneq_f16:
-; CHECK:       .Lt_vfmsq_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmls v0.8h, v1.8h, v2.h[0]
@@ -179,7 +167,6 @@ entry:
 
 define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfms_n_f16:
-; CHECK:       .Lt_vfms_n_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -195,7 +182,6 @@ entry:
 
 define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
 ; CHECK-LABEL: t_vfmsq_n_f16:
-; CHECK:       .Lt_vfmsq_n_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
@@ -211,7 +197,6 @@ entry:
 
 define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_lane_f16:
-; CHECK:       .Lt_vfmsh_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -226,7 +211,6 @@ entry:
 
 define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfmsh_laneq_f16:
-; CHECK:       .Lt_vfmsh_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmls h0, h1, v2.h[0]
@@ -240,7 +224,6 @@ entry:
 
 define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmul_laneq_f16:
-; CHECK:       .Lt_vmul_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.h[0]
@@ -253,7 +236,6 @@ entry:
 
 define dso_local <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulq_laneq_f16:
-; CHECK:       .Lt_vmulq_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmul v0.8h, v0.8h, v1.h[0]
@@ -266,7 +248,6 @@ entry:
 
 define dso_local half @t_vmulh_lane_f16(half %a, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vmulh_lane_f16:
-; CHECK:       .Lt_vmulh_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -280,7 +261,6 @@ entry:
 
 define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vmulh_laneq_f16:
-; CHECK:       .Lt_vmulh_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmul h0, h0, v1.h[0]
@@ -293,7 +273,6 @@ entry:
 
 define dso_local half @t_vmulx_f16(half %a, half %b) {
 ; CHECK-LABEL: t_vmulx_f16:
-; CHECK:       .Lt_vmulx_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx h0, h0, h1
@@ -305,7 +284,6 @@ entry:
 
 define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxh_lane_f16:
-; CHECK:       .Lt_vmulxh_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -319,7 +297,6 @@ entry:
 
 define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulx_lane_f16:
-; CHECK:       .Lt_vmulx_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -333,7 +310,6 @@ entry:
 
 define dso_local <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxq_lane_f16:
-; CHECK:       .Lt_vmulxq_lane_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
@@ -347,7 +323,6 @@ entry:
 
 define dso_local <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulx_laneq_f16:
-; CHECK:       .Lt_vmulx_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx v0.4h, v0.4h, v1.h[0]
@@ -360,7 +335,6 @@ entry:
 
 define dso_local <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxq_laneq_f16:
-; CHECK:       .Lt_vmulxq_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx v0.8h, v0.8h, v1.h[0]
@@ -373,7 +347,6 @@ entry:
 
 define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulxh_laneq_f16:
-; CHECK:       .Lt_vmulxh_laneq_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx h0, h0, v1.h[7]
@@ -386,7 +359,6 @@ entry:
 
 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulx_n_f16:
-; CHECK:       .Lt_vmulx_n_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
@@ -402,7 +374,6 @@ entry:
 
 define dso_local <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulxq_n_f16:
-; CHECK:       .Lt_vmulxq_n_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
@@ -418,7 +389,6 @@ entry:
 
 define dso_local half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
 ; CHECK-LABEL: t_vfmah_lane3_f16:
-; CHECK:       .Lt_vfmah_lane3_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -432,7 +402,6 @@ entry:
 
 define dso_local half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
 ; CHECK-LABEL: t_vfmah_laneq7_f16:
-; CHECK:       .Lt_vfmah_laneq7_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmla h0, h1, v2.h[7]
@@ -445,7 +414,6 @@ entry:
 
 define dso_local half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
 ; CHECK-LABEL: t_vfmsh_lane3_f16:
-; CHECK:       .Lt_vfmsh_lane3_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
@@ -460,7 +428,6 @@ entry:
 
 define dso_local half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
 ; CHECK-LABEL: t_vfmsh_laneq7_f16:
-; CHECK:       .Lt_vfmsh_laneq7_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fmls h0, h1, v2.h[7]
@@ -474,7 +441,6 @@ entry:
 
 define dso_local half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: t_fadd_vfmah_f16:
-; CHECK:       .Lt_fadd_vfmah_f16$local:
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    fadd v2.4h, v2.4h, v3.4h
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.ll
index 46355b35d0dec..8fd152869b23c 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.ll
@@ -4,7 +4,6 @@
 @v = common dso_local global i32* null, align 8
 
 ; CHECK-LABEL:  foo:                                    // @foo
-; CHECK-NEXT:   .Lfoo$local:
 ; CHECK-NEXT:   // %bb.0:                               // %entry
 ; CHECK-NEXT:       paciasp
 ; CHECK-NOT:        OUTLINED_FUNCTION_
@@ -23,7 +22,6 @@ entry:
 }
 
 ; CHECK-LABEL:  bar:                                    // @bar
-; CHECK-NEXT:   .Lbar$local:
 ; CHECK-NEXT:   // %bb.0:                               // %entry
 ; CHECK-NEXT:       paciasp
 ; CHECK-NOT:        OUTLINED_FUNCTION_
diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll
index 748e2b0267d8d..f7ffd6ea1eb7c 100644
--- a/llvm/test/CodeGen/X86/code-model-elf.ll
+++ b/llvm/test/CodeGen/X86/code-model-elf.ll
@@ -83,28 +83,28 @@ define dso_local i32* @lea_static_data() #0 {
 define dso_local i32* @lea_global_data() #0 {
 ; SMALL-STATIC-LABEL: lea_global_data:
 ; SMALL-STATIC:       # %bb.0:
-; SMALL-STATIC-NEXT:    movl $.Lglobal_data$local, %eax
+; SMALL-STATIC-NEXT:    movl $global_data, %eax
 ; SMALL-STATIC-NEXT:    retq
 ;
 ; MEDIUM-STATIC-LABEL: lea_global_data:
 ; MEDIUM-STATIC:       # %bb.0:
-; MEDIUM-STATIC-NEXT:    movabsq $.Lglobal_data$local, %rax
+; MEDIUM-STATIC-NEXT:    movabsq $global_data, %rax
 ; MEDIUM-STATIC-NEXT:    retq
 ;
 ; LARGE-STATIC-LABEL: lea_global_data:
 ; LARGE-STATIC:       # %bb.0:
-; LARGE-STATIC-NEXT:    movabsq $.Lglobal_data$local, %rax
+; LARGE-STATIC-NEXT:    movabsq $global_data, %rax
 ; LARGE-STATIC-NEXT:    retq
 ;
 ; SMALL-PIC-LABEL: lea_global_data:
 ; SMALL-PIC:       # %bb.0:
-; SMALL-PIC-NEXT:    leaq .Lglobal_data$local(%rip), %rax
+; SMALL-PIC-NEXT:    leaq global_data(%rip), %rax
 ; SMALL-PIC-NEXT:    retq
 ;
 ; MEDIUM-PIC-LABEL: lea_global_data:
 ; MEDIUM-PIC:       # %bb.0:
 ; MEDIUM-PIC-NEXT:    leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx
-; MEDIUM-PIC-NEXT:    movabsq $.Lglobal_data$local@GOTOFF, %rax
+; MEDIUM-PIC-NEXT:    movabsq $global_data@GOTOFF, %rax
 ; MEDIUM-PIC-NEXT:    addq %rcx, %rax
 ; MEDIUM-PIC-NEXT:    retq
 ;
@@ -114,7 +114,7 @@ define dso_local i32* @lea_global_data() #0 {
 ; LARGE-PIC-NEXT:    leaq .L1$pb(%rip), %rax
 ; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L1$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
-; LARGE-PIC-NEXT:    movabsq $.Lglobal_data$local@GOTOFF, %rax
+; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
 ; LARGE-PIC-NEXT:    retq
   ret i32* getelementptr inbounds ([10 x i32], [10 x i32]* @global_data, i64 0, i64 0)
@@ -161,30 +161,30 @@ define dso_local i32* @lea_extern_data() #0 {
 define dso_local i32 @load_global_data() #0 {
 ; SMALL-STATIC-LABEL: load_global_data:
 ; SMALL-STATIC:       # %bb.0:
-; SMALL-STATIC-NEXT:    movl .Lglobal_data$local+8(%rip), %eax
+; SMALL-STATIC-NEXT:    movl global_data+8(%rip), %eax
 ; SMALL-STATIC-NEXT:    retq
 ;
 ; MEDIUM-STATIC-LABEL: load_global_data:
 ; MEDIUM-STATIC:       # %bb.0:
-; MEDIUM-STATIC-NEXT:    movabsq $.Lglobal_data$local, %rax
+; MEDIUM-STATIC-NEXT:    movabsq $global_data, %rax
 ; MEDIUM-STATIC-NEXT:    movl 8(%rax), %eax
 ; MEDIUM-STATIC-NEXT:    retq
 ;
 ; LARGE-STATIC-LABEL: load_global_data:
 ; LARGE-STATIC:       # %bb.0:
-; LARGE-STATIC-NEXT:    movabsq $.Lglobal_data$local, %rax
+; LARGE-STATIC-NEXT:    movabsq $global_data, %rax
 ; LARGE-STATIC-NEXT:    movl 8(%rax), %eax
 ; LARGE-STATIC-NEXT:    retq
 ;
 ; SMALL-PIC-LABEL: load_global_data:
 ; SMALL-PIC:       # %bb.0:
-; SMALL-PIC-NEXT:    movl .Lglobal_data$local+8(%rip), %eax
+; SMALL-PIC-NEXT:    movl global_data+8(%rip), %eax
 ; SMALL-PIC-NEXT:    retq
 ;
 ; MEDIUM-PIC-LABEL: load_global_data:
 ; MEDIUM-PIC:       # %bb.0:
 ; MEDIUM-PIC-NEXT:    leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
-; MEDIUM-PIC-NEXT:    movabsq $.Lglobal_data$local@GOTOFF, %rcx
+; MEDIUM-PIC-NEXT:    movabsq $global_data@GOTOFF, %rcx
 ; MEDIUM-PIC-NEXT:    movl 8(%rax,%rcx), %eax
 ; MEDIUM-PIC-NEXT:    retq
 ;
@@ -194,7 +194,7 @@ define dso_local i32 @load_global_data() #0 {
 ; LARGE-PIC-NEXT:    leaq .L3$pb(%rip), %rax
 ; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L3$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
-; LARGE-PIC-NEXT:    movabsq $.Lglobal_data$local@GOTOFF, %rax
+; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    movl 8(%rcx,%rax), %eax
 ; LARGE-PIC-NEXT:    retq
   %rv = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @global_data, i64 0, i64 2)
@@ -302,27 +302,27 @@ define dso_local void ()* @lea_static_fn() #0 {
 define dso_local void ()* @lea_global_fn() #0 {
 ; SMALL-STATIC-LABEL: lea_global_fn:
 ; SMALL-STATIC:       # %bb.0:
-; SMALL-STATIC-NEXT:    movl $.Lglobal_fn$local, %eax
+; SMALL-STATIC-NEXT:    movl $global_fn, %eax
 ; SMALL-STATIC-NEXT:    retq
 ;
 ; MEDIUM-STATIC-LABEL: lea_global_fn:
 ; MEDIUM-STATIC:       # %bb.0:
-; MEDIUM-STATIC-NEXT:    movabsq $.Lglobal_fn$local, %rax
+; MEDIUM-STATIC-NEXT:    movabsq $global_fn, %rax
 ; MEDIUM-STATIC-NEXT:    retq
 ;
 ; LARGE-STATIC-LABEL: lea_global_fn:
 ; LARGE-STATIC:       # %bb.0:
-; LARGE-STATIC-NEXT:    movabsq $.Lglobal_fn$local, %rax
+; LARGE-STATIC-NEXT:    movabsq $global_fn, %rax
 ; LARGE-STATIC-NEXT:    retq
 ;
 ; SMALL-PIC-LABEL: lea_global_fn:
 ; SMALL-PIC:       # %bb.0:
-; SMALL-PIC-NEXT:    leaq .Lglobal_fn$local(%rip), %rax
+; SMALL-PIC-NEXT:    leaq global_fn(%rip), %rax
 ; SMALL-PIC-NEXT:    retq
 ;
 ; MEDIUM-PIC-LABEL: lea_global_fn:
 ; MEDIUM-PIC:       # %bb.0:
-; MEDIUM-PIC-NEXT:    movabsq $.Lglobal_fn$local, %rax
+; MEDIUM-PIC-NEXT:    movabsq $global_fn, %rax
 ; MEDIUM-PIC-NEXT:    retq
 ;
 ; LARGE-PIC-LABEL: lea_global_fn:
@@ -331,7 +331,7 @@ define dso_local void ()* @lea_global_fn() #0 {
 ; LARGE-PIC-NEXT:    leaq .L8$pb(%rip), %rax
 ; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L8$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
-; LARGE-PIC-NEXT:    movabsq $.Lglobal_fn$local@GOTOFF, %rax
+; LARGE-PIC-NEXT:    movabsq $global_fn@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
 ; LARGE-PIC-NEXT:    retq
   ret void ()* @global_fn
diff --git a/llvm/test/CodeGen/X86/emutls.ll b/llvm/test/CodeGen/X86/emutls.ll
index 8d836ef733b5e..1e706c1267d1b 100644
--- a/llvm/test/CodeGen/X86/emutls.ll
+++ b/llvm/test/CodeGen/X86/emutls.ll
@@ -135,7 +135,7 @@ entry:
 
 define i32 @f7() {
 ; X32-LABEL: f7:
-; X32:         movl $.L__emutls_v.i4$local, (%esp)
+; X32:         movl $__emutls_v.i4, (%esp)
 ; X32-NEXT:    calll __emutls_get_address
 ; X32-NEXT:    movl (%eax), %eax
 ; X32-NEXT:    addl $12, %esp
@@ -148,7 +148,7 @@ entry:
 
 define i32* @f8() {
 ; X32-LABEL: f8:
-; X32:         movl $.L__emutls_v.i4$local, (%esp)
+; X32:         movl $__emutls_v.i4, (%esp)
 ; X32-NEXT:    calll __emutls_get_address
 ; X32-NEXT:    addl $12, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 4
@@ -258,14 +258,12 @@ entry:
 ; X32-NEXT: .long 15
 
 ; X32-LABEL: __emutls_v.i4:
-; X32-NEXT: .L__emutls_v.i4$local:
 ; X32-NEXT: .long 4
 ; X32-NEXT: .long 4
 ; X32-NEXT: .long 0
 ; X32-NEXT: .long __emutls_t.i4
 
 ; X32-LABEL: __emutls_t.i4:
-; X32-NEXT: .L__emutls_t.i4$local:
 ; X32-NEXT: .long 15
 
 ; X32-NOT:   __emutls_v.i5:
@@ -312,14 +310,12 @@ entry:
 ; X64-NEXT: .long 15
 
 ; X64-LABEL: __emutls_v.i4:
-; X64-NEXT: .L__emutls_v.i4$local:
 ; X64-NEXT: .quad 4
 ; X64-NEXT: .quad 4
 ; X64-NEXT: .quad 0
 ; X64-NEXT: .quad __emutls_t.i4
 
 ; X64-LABEL: __emutls_t.i4:
-; X64-NEXT: .L__emutls_t.i4$local:
 ; X64-NEXT: .long 15
 
 ; X64-NOT:   __emutls_v.i5:
diff --git a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
index 312707a029cd9..6e41c94e979a1 100644
--- a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
+++ b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
@@ -4,7 +4,6 @@
 ; NUM-COUNT-3: endbr64
 
 ;SJLJ:       main:                                  # @main
-;SJLJ-NEXT: .Lmain$local:
 ;SJLJ-NEXT: .Lfunc_begin0:
 ;SJLJ-NEXT: # %bb.0:                                # %entry
 ;SJLJ-NEXT:         endbr64
diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll
index e57f1726a4ee2..010dc33b5051c 100644
--- a/llvm/test/CodeGen/X86/lifetime-alias.ll
+++ b/llvm/test/CodeGen/X86/lifetime-alias.ll
@@ -70,9 +70,9 @@ define i8 @main() local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__g
 ; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq %rax, .Ldo_not_optimize${{.*}}(%rip)
+; CHECK-NEXT:    movq %rax, do_not_optimize{{.*}}(%rip)
 ; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq %rax, .Ldo_not_optimize${{.*}}(%rip)
+; CHECK-NEXT:    movq %rax, do_not_optimize{{.*}}(%rip)
 ; CHECK-NEXT:    cmpb $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    jns .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %_ZNSt3__312basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED2Ev.exit50
diff --git a/llvm/test/CodeGen/X86/linux-preemption.ll b/llvm/test/CodeGen/X86/linux-preemption.ll
index 7d22b75132186..49a7becf13432 100644
--- a/llvm/test/CodeGen/X86/linux-preemption.ll
+++ b/llvm/test/CodeGen/X86/linux-preemption.ll
@@ -41,7 +41,7 @@ define i32* @get_strong_local_global() {
   ret i32* @strong_local_global
 }
 ; CHECK: leaq .Lstrong_local_global$local(%rip), %rax
-; STATIC: movl $.Lstrong_local_global$local, %eax
+; STATIC: movl $strong_local_global, %eax
 ; CHECK32: leal .Lstrong_local_global$local@GOTOFF(%eax), %eax
 
 @weak_local_global = weak dso_local global i32 42
@@ -109,7 +109,7 @@ define i32* @get_strong_local_alias() {
   ret i32* @strong_local_alias
 }
 ; CHECK: leaq .Lstrong_local_alias$local(%rip), %rax
-; STATIC: movl $.Lstrong_local_alias$local, %eax
+; STATIC: movl $strong_local_alias, %eax
 ; CHECK32: leal .Lstrong_local_alias$local@GOTOFF(%eax), %eax
 
 @weak_local_alias = weak dso_local alias i32, i32* @aliasee
@@ -174,9 +174,9 @@ define void()* @get_strong_local_function() {
   ret void()* @strong_local_function
 }
 ; COMMON:     {{^}}strong_local_function:
-; COMMON-NEXT: .Lstrong_local_function$local:
+; CHECK-NEXT: .Lstrong_local_function$local:
 ; CHECK: leaq .Lstrong_local_function$local(%rip), %rax
-; STATIC: movl $.Lstrong_local_function$local, %eax
+; STATIC: movl $strong_local_function, %eax
 ; CHECK32: leal .Lstrong_local_function$local@GOTOFF(%eax), %eax
 
 define weak dso_local void @weak_local_function() {
@@ -226,8 +226,11 @@ define void()* @get_external_preemptable_function() {
 ; STATIC: movl $external_preemptable_function, %eax
 ; CHECK32: movl external_preemptable_function@GOT(%eax), %eax
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 7, !"PIC Level", i32 2}
+
 ; COMMON:     {{^}}strong_local_global:
-; COMMON-NEXT: .Lstrong_local_global$local:
+; CHECK-NEXT: .Lstrong_local_global$local:
 
 ; COMMON:      .globl strong_default_alias
 ; COMMON-NEXT: .set strong_default_alias, aliasee
@@ -235,7 +238,7 @@ define void()* @get_external_preemptable_function() {
 ; COMMON-NEXT: .set weak_default_alias, aliasee
 ; COMMON-NEXT: .globl strong_local_alias
 ; COMMON-NEXT: .set strong_local_alias, aliasee
-; COMMON-NEXT: .set .Lstrong_local_alias$local, aliasee
+; CHECK-NEXT:  .set .Lstrong_local_alias$local, aliasee
 ; COMMON-NEXT: .weak weak_local_alias
 ; COMMON-NEXT: .set weak_local_alias, aliasee
 ; COMMON-NEXT: .globl strong_preemptable_alias
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index 8d3e01f86def6..46ff47b2a1001 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -187,189 +187,189 @@ define <16 x i32> @PR42819(<8 x i32>* %a0) {
 define void @PR42833() {
 ; SSE2-LABEL: PR42833:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    addl .Lb${{.*}}(%rip), %eax
+; SSE2-NEXT:    addl b(%rip), %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
-; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm4
+; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm4
 ; SSE2-NEXT:    psubd %xmm1, %xmm4
 ; SSE2-NEXT:    paddd %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    paddd %xmm0, %xmm5
 ; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
-; SSE2-NEXT:    movdqa %xmm1, .Lc$local+{{.*}}(%rip)
-; SSE2-NEXT:    movaps %xmm5, .Lc$local+{{.*}}(%rip)
-; SSE2-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm3
-; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm5
-; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm6
-; SSE2-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm7
+; SSE2-NEXT:    movdqa %xmm1, c+{{.*}}(%rip)
+; SSE2-NEXT:    movaps %xmm5, c+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm3
+; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm5
+; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm6
+; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm7
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE2-NEXT:    psubd %xmm0, %xmm7
 ; SSE2-NEXT:    psubd %xmm3, %xmm6
 ; SSE2-NEXT:    psubd %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, .Ld$local+{{.*}}(%rip)
-; SSE2-NEXT:    movdqa %xmm6, .Ld$local+{{.*}}(%rip)
-; SSE2-NEXT:    movdqa %xmm4, .Ld$local+{{.*}}(%rip)
-; SSE2-NEXT:    movdqa %xmm7, .Ld$local+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm5, d+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm6, d+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm4, d+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm7, d+{{.*}}(%rip)
 ; SSE2-NEXT:    paddd %xmm3, %xmm3
 ; SSE2-NEXT:    paddd %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, .Lc$local+{{.*}}(%rip)
-; SSE2-NEXT:    movdqa %xmm3, .Lc$local+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm1, c+{{.*}}(%rip)
+; SSE2-NEXT:    movdqa %xmm3, c+{{.*}}(%rip)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: PR42833:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm0
-; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm1
+; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
+; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
 ; SSE42-NEXT:    movd %xmm1, %eax
-; SSE42-NEXT:    addl .Lb${{.*}}(%rip), %eax
+; SSE42-NEXT:    addl b(%rip), %eax
 ; SSE42-NEXT:    movd %eax, %xmm2
 ; SSE42-NEXT:    paddd %xmm1, %xmm2
-; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm3
+; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm3
 ; SSE42-NEXT:    psubd %xmm0, %xmm3
 ; SSE42-NEXT:    paddd %xmm0, %xmm0
 ; SSE42-NEXT:    movdqa %xmm1, %xmm4
 ; SSE42-NEXT:    paddd %xmm1, %xmm4
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
-; SSE42-NEXT:    movdqa %xmm0, .Lc$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa %xmm4, .Lc$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm0
-; SSE42-NEXT:    movdqa .Lc$local+{{.*}}(%rip), %xmm2
-; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm4
-; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm5
-; SSE42-NEXT:    movdqa .Ld$local+{{.*}}(%rip), %xmm6
+; SSE42-NEXT:    movdqa %xmm0, c+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm4, c+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
+; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm2
+; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm4
+; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm5
+; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm6
 ; SSE42-NEXT:    pinsrd $0, %eax, %xmm1
 ; SSE42-NEXT:    psubd %xmm1, %xmm6
 ; SSE42-NEXT:    psubd %xmm2, %xmm5
 ; SSE42-NEXT:    psubd %xmm0, %xmm4
-; SSE42-NEXT:    movdqa %xmm4, .Ld$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa %xmm5, .Ld$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa %xmm3, .Ld$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa %xmm6, .Ld$local+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm4, d+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm5, d+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm3, d+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm6, d+{{.*}}(%rip)
 ; SSE42-NEXT:    paddd %xmm2, %xmm2
 ; SSE42-NEXT:    paddd %xmm0, %xmm0
-; SSE42-NEXT:    movdqa %xmm0, .Lc$local+{{.*}}(%rip)
-; SSE42-NEXT:    movdqa %xmm2, .Lc$local+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm0, c+{{.*}}(%rip)
+; SSE42-NEXT:    movdqa %xmm2, c+{{.*}}(%rip)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: PR42833:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm0
+; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    addl .Lb${{.*}}(%rip), %eax
+; AVX1-NEXT:    addl b(%rip), %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm3
+; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm2
-; AVX1-NEXT:    vpsubd .Lc$local+{{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vmovups %ymm1, .Lc$local+{{.*}}(%rip)
+; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm2
+; AVX1-NEXT:    vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vmovups %ymm1, c+{{.*}}(%rip)
 ; AVX1-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm1
+; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm1
-; AVX1-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm3
+; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
+; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm4
-; AVX1-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm5
+; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm4
+; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm5
 ; AVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vmovdqa %xmm2, .Ld$local+{{.*}}(%rip)
-; AVX1-NEXT:    vmovdqa %xmm4, .Ld$local+{{.*}}(%rip)
-; AVX1-NEXT:    vmovdqa %xmm1, .Ld$local+{{.*}}(%rip)
-; AVX1-NEXT:    vmovdqa %xmm0, .Ld$local+{{.*}}(%rip)
+; AVX1-NEXT:    vmovdqa %xmm2, d+{{.*}}(%rip)
+; AVX1-NEXT:    vmovdqa %xmm4, d+{{.*}}(%rip)
+; AVX1-NEXT:    vmovdqa %xmm1, d+{{.*}}(%rip)
+; AVX1-NEXT:    vmovdqa %xmm0, d+{{.*}}(%rip)
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
-; AVX1-NEXT:    vmovdqa %xmm1, .Lc$local+{{.*}}(%rip)
-; AVX1-NEXT:    vmovdqa %xmm0, .Lc$local+{{.*}}(%rip)
+; AVX1-NEXT:    vmovdqa %xmm1, c+{{.*}}(%rip)
+; AVX1-NEXT:    vmovdqa %xmm0, c+{{.*}}(%rip)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR42833:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl .Lb${{.*}}(%rip), %eax
-; AVX2-NEXT:    vmovdqu .Lc$local+{{.*}}(%rip), %ymm0
-; AVX2-NEXT:    addl .Lc$local+{{.*}}(%rip), %eax
+; AVX2-NEXT:    movl b(%rip), %eax
+; AVX2-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
+; AVX2-NEXT:    addl c+{{.*}}(%rip), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vmovdqu %ymm2, .Lc$local+{{.*}}(%rip)
-; AVX2-NEXT:    vmovdqu .Lc$local+{{.*}}(%rip), %ymm2
-; AVX2-NEXT:    vmovdqu .Ld$local+{{.*}}(%rip), %ymm3
-; AVX2-NEXT:    vmovdqu .Ld$local+{{.*}}(%rip), %ymm4
+; AVX2-NEXT:    vmovdqu %ymm2, c+{{.*}}(%rip)
+; AVX2-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm2
+; AVX2-NEXT:    vmovdqu d+{{.*}}(%rip), %ymm3
+; AVX2-NEXT:    vmovdqu d+{{.*}}(%rip), %ymm4
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpsubd %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm1
-; AVX2-NEXT:    vmovdqu %ymm1, .Ld$local+{{.*}}(%rip)
-; AVX2-NEXT:    vmovdqu %ymm0, .Ld$local+{{.*}}(%rip)
+; AVX2-NEXT:    vmovdqu %ymm1, d+{{.*}}(%rip)
+; AVX2-NEXT:    vmovdqu %ymm0, d+{{.*}}(%rip)
 ; AVX2-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
-; AVX2-NEXT:    vmovdqu %ymm0, .Lc$local+{{.*}}(%rip)
+; AVX2-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: PR42833:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    movl .Lb${{.*}}(%rip), %eax
-; AVX512-NEXT:    vmovdqu .Lc$local+{{.*}}(%rip), %ymm0
-; AVX512-NEXT:    vmovdqu64 .Lc$local+{{.*}}(%rip), %zmm1
-; AVX512-NEXT:    addl .Lc$local+{{.*}}(%rip), %eax
+; AVX512-NEXT:    movl b(%rip), %eax
+; AVX512-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
+; AVX512-NEXT:    vmovdqu64 c+{{.*}}(%rip), %zmm1
+; AVX512-NEXT:    addl c+{{.*}}(%rip), %eax
 ; AVX512-NEXT:    vmovd %eax, %xmm2
 ; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm2
-; AVX512-NEXT:    vmovdqu %ymm0, .Lc$local+{{.*}}(%rip)
-; AVX512-NEXT:    vmovdqu .Lc$local+{{.*}}(%rip), %ymm0
-; AVX512-NEXT:    vmovdqu64 .Ld$local+{{.*}}(%rip), %zmm3
+; AVX512-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm2
+; AVX512-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
+; AVX512-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
+; AVX512-NEXT:    vmovdqu64 d+{{.*}}(%rip), %zmm3
 ; AVX512-NEXT:    vpinsrd $0, %eax, %xmm2, %xmm2
 ; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm1
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm3, %zmm1
-; AVX512-NEXT:    vmovdqu64 %zmm1, .Ld$local+{{.*}}(%rip)
+; AVX512-NEXT:    vmovdqu64 %zmm1, d+{{.*}}(%rip)
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqu %ymm0, .Lc$local+{{.*}}(%rip)
+; AVX512-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; XOP-LABEL: PR42833:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm0
+; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm0
 ; XOP-NEXT:    vmovd %xmm0, %eax
-; XOP-NEXT:    addl .Lb${{.*}}(%rip), %eax
+; XOP-NEXT:    addl b(%rip), %eax
 ; XOP-NEXT:    vmovd %eax, %xmm1
 ; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
-; XOP-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm3
+; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
 ; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
 ; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
-; XOP-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm2
-; XOP-NEXT:    vpsubd .Lc$local+{{.*}}(%rip), %xmm2, %xmm2
-; XOP-NEXT:    vmovups %ymm1, .Lc$local+{{.*}}(%rip)
+; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm2
+; XOP-NEXT:    vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
+; XOP-NEXT:    vmovups %ymm1, c+{{.*}}(%rip)
 ; XOP-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
-; XOP-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm1
+; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
 ; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
-; XOP-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm1
-; XOP-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm3
+; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
+; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
 ; XOP-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
-; XOP-NEXT:    vmovdqa .Ld$local+{{.*}}(%rip), %xmm4
-; XOP-NEXT:    vmovdqa .Lc$local+{{.*}}(%rip), %xmm5
+; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm4
+; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm5
 ; XOP-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
-; XOP-NEXT:    vmovdqa %xmm2, .Ld$local+{{.*}}(%rip)
-; XOP-NEXT:    vmovdqa %xmm4, .Ld$local+{{.*}}(%rip)
-; XOP-NEXT:    vmovdqa %xmm1, .Ld$local+{{.*}}(%rip)
-; XOP-NEXT:    vmovdqa %xmm0, .Ld$local+{{.*}}(%rip)
+; XOP-NEXT:    vmovdqa %xmm2, d+{{.*}}(%rip)
+; XOP-NEXT:    vmovdqa %xmm4, d+{{.*}}(%rip)
+; XOP-NEXT:    vmovdqa %xmm1, d+{{.*}}(%rip)
+; XOP-NEXT:    vmovdqa %xmm0, d+{{.*}}(%rip)
 ; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
 ; XOP-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
-; XOP-NEXT:    vmovdqa %xmm1, .Lc$local+{{.*}}(%rip)
-; XOP-NEXT:    vmovdqa %xmm0, .Lc$local+{{.*}}(%rip)
+; XOP-NEXT:    vmovdqa %xmm1, c+{{.*}}(%rip)
+; XOP-NEXT:    vmovdqa %xmm0, c+{{.*}}(%rip)
 ; XOP-NEXT:    vzeroupper
 ; XOP-NEXT:    retq
   %1 = load i32, i32* @b, align 4
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index 3c44798a805f0..d805dcad8b6e6 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -93,7 +93,7 @@ define dso_local void @fn() {
 ; CHECK-NEXT:  # %bb.18: # %if.then41
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $.Lfn$local, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $fn, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $.str, (%esp)
 ; CHECK-NEXT:    calll printf
 ; CHECK-NEXT:  .LBB0_19: # %for.end46
diff --git a/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll b/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll
index 06574056298dd..d0efd4d11c958 100644
--- a/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll
+++ b/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple x86_64-unknown-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64 -relocation-model=pic < %s | FileCheck %s
 
 $comdat_func = comdat any
 
@@ -21,3 +21,8 @@ entry:
   call void @func()
   ret void
 }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"SemanticInterposition", i32 0}
+!1 = !{i32 7, !"PIC Level", i32 2}
diff --git a/llvm/test/CodeGen/X86/tls.ll b/llvm/test/CodeGen/X86/tls.ll
index b1d29b34a9584..759f3d7c85500 100644
--- a/llvm/test/CodeGen/X86/tls.ll
+++ b/llvm/test/CodeGen/X86/tls.ll
@@ -210,10 +210,10 @@ entry:
 
 define i32 @f7() {
 ; X86_LINUX-LABEL: f7:
-; X86_LINUX:      movl %gs:.Li4$local@NTPOFF, %eax
+; X86_LINUX:      movl %gs:i4@NTPOFF, %eax
 ; X86_LINUX-NEXT: ret
 ; X64_LINUX-LABEL: f7:
-; X64_LINUX:      movl %fs:.Li4$local@TPOFF, %eax
+; X64_LINUX:      movl %fs:i4@TPOFF, %eax
 ; X64_LINUX-NEXT: ret
 ; MINGW32-LABEL: _f7:
 ; MINGW32: movl __tls_index, %eax
@@ -230,11 +230,11 @@ entry:
 define i32* @f8() {
 ; X86_LINUX-LABEL: f8:
 ; X86_LINUX:      movl %gs:0, %eax
-; X86_LINUX-NEXT: leal .Li4$local@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: leal i4@NTPOFF(%eax), %eax
 ; X86_LINUX-NEXT: ret
 ; X64_LINUX-LABEL: f8:
 ; X64_LINUX:      movq %fs:0, %rax
-; X64_LINUX-NEXT: leaq .Li4$local@TPOFF(%rax), %rax
+; X64_LINUX-NEXT: leaq i4@TPOFF(%rax), %rax
 ; X64_LINUX-NEXT: ret
 ; MINGW32-LABEL: _f8:
 ; MINGW32: movl __tls_index, %eax

From c34936dae734085c4bc01703da0f5b7456e1bf51 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Tue, 26 May 2020 09:21:54 +0200
Subject: [PATCH 064/770] [lldb] s/dyn_cast/isa

The cast result is unused and produces a warning with gcc.
---
 lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index c687251ed5dcb..9ff8bdb7537fa 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -344,7 +344,7 @@ static void SetMemberOwningModule(clang::Decl *member,
   member->setFromASTFile();
   member->setOwningModuleID(id.GetValue());
   member->setModuleOwnershipKind(clang::Decl::ModuleOwnershipKind::Visible);
-  if (auto *nd = llvm::dyn_cast<clang::NamedDecl>(member))
+  if (llvm::isa<clang::NamedDecl>(member))
     if (auto *dc = llvm::dyn_cast<clang::DeclContext>(parent)) {
       dc->setHasExternalVisibleStorage(true);
       // This triggers ExternalASTSource::FindExternalVisibleDeclsByName() to be

From 1f72d5880e332dfbd36c22388d2b72bd2bd70411 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 08:31:38 +0100
Subject: [PATCH 065/770] [CostModel] Check for free intrinsics in BasicTTI

Recommitting part of "[CostModel] Unify Intrinsic Costs."
de71def3f59dc9f12f67141b5040d8e15c84d08a

Now that the 'free' intrinsic information has been sunk to the lowest
level, query the base implementation in BasicTTI before doing
anything else. I suspect this is the change that accounted for most of
the observed differences, particularly the large effects on debug builds.

Differential Revision: https://reviews.llvm.org/D80012
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  3 +
 .../Analysis/CostModel/X86/free-intrinsics.ll | 78 ++++++++++++++++++
 .../CostModel/free-intrinsics-datalayout.ll   | 80 +++++++++++++++++++
 .../CostModel/free-intrinsics-no_info.ll      | 78 ++++++++++++++++++
 4 files changed, 239 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/X86/free-intrinsics.ll
 create mode 100644 llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll
 create mode 100644 llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 5a891779e1857..dbbcc795ea00f 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1091,6 +1091,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
 
+    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
+      return 0;
+
     // TODO: Combine these two logic paths.
     if (ICA.isTypeBasedOnly())
       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll
new file mode 100644
index 0000000000000..f85e267637141
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -mtriple=x86_64-- -analyze -cost-model -cost-kind=code-size %s -S -o - | FileCheck %s --check-prefix=CHECK-SIZE
+; RUN: opt -mtriple=x86_64-- -analyze -cost-model -cost-kind=throughput %s -S -o - | FileCheck %s --check-prefix=CHECK-THROUGHPUT
+
+define i32 @trivially_free() {
+; CHECK-SIZE-LABEL: 'trivially_free'
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect()
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 true, i1 true, i1 true)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; CHECK-THROUGHPUT-LABEL: 'trivially_free'
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect()
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 true, i1 true, i1 true)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+  call void @llvm.assume(i1 undef)
+  call void @llvm.sideeffect()
+  call void @llvm.dbg.declare(metadata i8** undef, metadata !0, metadata !DIExpression())
+  call void @llvm.dbg.value(metadata i64 undef, i64 undef, metadata !DIExpression(), metadata !DIExpression())
+  call void @llvm.dbg.label(metadata !2)
+  %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+  call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+  %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+  %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+  %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+  %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 1, i1 1, i1 1)
+  %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+  call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+  ret i32 undef
+}
+
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32)
+declare void @llvm.assume(i1)
+declare void @llvm.sideeffect()
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+declare void @llvm.dbg.label(metadata)
+declare {}* @llvm.invariant.start.p0i8(i64, i8*)
+declare void @llvm.invariant.end.p0i8({}*, i64, i8*)
+declare i8* @llvm.launder.invariant.group.p0i8(i8*)
+declare i8* @llvm.strip.invariant.group.p0i8(i8*)
+declare i1 @llvm.is.constant.i32(i32)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1, i1)
+declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32)
+declare void @llvm.var.annotation(i8*, i8*, i8*, i32)
+
+
+!0 = !DILocalVariable(scope: !1)
+!1 = distinct !DISubprogram(name: "dummy", line: 79, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true)
+!2 = !DILabel(scope: !1, name: "label", file: !3, line: 7)
+!3 = !DIFile(filename: "debug-label.c", directory: "./")
diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll
new file mode 100644
index 0000000000000..232265a5cdfdb
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -analyze -cost-model -cost-kind=code-size %s -S -o - | FileCheck %s --check-prefix=CHECK-SIZE
+; RUN: opt -analyze -cost-model -cost-kind=throughput %s -S -o - | FileCheck %s --check-prefix=CHECK-THROUGHPUT
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define i32 @trivially_free() {
+; CHECK-SIZE-LABEL: 'trivially_free'
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect()
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 true, i1 true, i1 true)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; CHECK-THROUGHPUT-LABEL: 'trivially_free'
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect()
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 true, i1 true, i1 true)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+  call void @llvm.assume(i1 undef)
+  call void @llvm.sideeffect()
+  call void @llvm.dbg.declare(metadata i8** undef, metadata !0, metadata !DIExpression())
+  call void @llvm.dbg.value(metadata i64 undef, i64 undef, metadata !DIExpression(), metadata !DIExpression())
+  call void @llvm.dbg.label(metadata !2)
+  %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+  call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+  %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+  %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+  %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+  %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 1, i1 1, i1 1)
+  %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+  call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+  ret i32 undef
+}
+
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32)
+declare void @llvm.assume(i1)
+declare void @llvm.sideeffect()
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+declare void @llvm.dbg.label(metadata)
+declare {}* @llvm.invariant.start.p0i8(i64, i8*)
+declare void @llvm.invariant.end.p0i8({}*, i64, i8*)
+declare i8* @llvm.launder.invariant.group.p0i8(i8*)
+declare i8* @llvm.strip.invariant.group.p0i8(i8*)
+declare i1 @llvm.is.constant.i32(i32)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1, i1)
+declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32)
+declare void @llvm.var.annotation(i8*, i8*, i8*, i32)
+
+
+!0 = !DILocalVariable(scope: !1)
+!1 = distinct !DISubprogram(name: "dummy", line: 79, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true)
+!2 = !DILabel(scope: !1, name: "label", file: !3, line: 7)
+!3 = !DIFile(filename: "debug-label.c", directory: "./")
diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll
new file mode 100644
index 0000000000000..9622a4f0dd1db
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -analyze -cost-model -cost-kind=code-size %s -S -o - | FileCheck %s --check-prefix=CHECK-SIZE
+; RUN: opt -analyze -cost-model -cost-kind=throughput %s -S -o - | FileCheck %s --check-prefix=CHECK-THROUGHPUT
+
+define i32 @trivially_free() {
+; CHECK-SIZE-LABEL: 'trivially_free'
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect()
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 true, i1 true, i1 true)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; CHECK-THROUGHPUT-LABEL: 'trivially_free'
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect()
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 true, i1 true, i1 true)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %a0 = call i32 @llvm.annotation.i32(i32 undef, i8* undef, i8* undef, i32 undef)
+  call void @llvm.assume(i1 undef)
+  call void @llvm.sideeffect()
+  call void @llvm.dbg.declare(metadata i8** undef, metadata !0, metadata !DIExpression())
+  call void @llvm.dbg.value(metadata i64 undef, i64 undef, metadata !DIExpression(), metadata !DIExpression())
+  call void @llvm.dbg.label(metadata !2)
+  %a1 = call {}* @llvm.invariant.start.p0i8(i64 1, i8* undef)
+  call void @llvm.invariant.end.p0i8({}* undef, i64 1, i8* undef)
+  %a2 = call i8* @llvm.launder.invariant.group.p0i8(i8* undef)
+  %a3 = call i8* @llvm.strip.invariant.group.p0i8(i8* undef)
+  %a4 = call i1 @llvm.is.constant.i32(i32 undef)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+  %a5 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 1, i1 1, i1 1)
+  %a6 = call i8* @llvm.ptr.annotation.p0i8(i8* undef, i8* undef, i8* undef, i32 undef)
+  call void @llvm.var.annotation(i8* undef, i8* undef, i8* undef, i32 undef)
+  ret i32 undef
+}
+
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32)
+declare void @llvm.assume(i1)
+declare void @llvm.sideeffect()
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+declare void @llvm.dbg.label(metadata)
+declare {}* @llvm.invariant.start.p0i8(i64, i8*)
+declare void @llvm.invariant.end.p0i8({}*, i64, i8*)
+declare i8* @llvm.launder.invariant.group.p0i8(i8*)
+declare i8* @llvm.strip.invariant.group.p0i8(i8*)
+declare i1 @llvm.is.constant.i32(i32)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1, i1)
+declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32)
+declare void @llvm.var.annotation(i8*, i8*, i8*, i32)
+
+
+!0 = !DILocalVariable(scope: !1)
+!1 = distinct !DISubprogram(name: "dummy", line: 79, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true)
+!2 = !DILabel(scope: !1, name: "label", file: !3, line: 7)
+!3 = !DIFile(filename: "debug-label.c", directory: "./")

From 80cc43b420a8ab8648f44fbb554b483a2998712d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Mon, 25 May 2020 23:59:00 -0700
Subject: [PATCH 066/770] [AArch64] Set i32 ISD::MULHU/S to Expand instead of
 Legal.

Looks like there are no isel patterns for these. A DAG combine
turns it into i64 multiply and a shift which hides this.

Extracted from D80485
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 856a2e4d9d67d..5eb9b7463411f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -354,6 +354,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::ROTR, VT, Expand);
   }
 
+  // AArch64 doesn't have i32 MULH{S|U}.
+  setOperationAction(ISD::MULHU, MVT::i32, Expand);
+  setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
   // AArch64 doesn't have {U|S}MUL_LOHI.
   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

From 1abb883a048153c83a4e11070219d23f362e7377 Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Wed, 20 May 2020 16:03:42 +0200
Subject: [PATCH 067/770] [clangd] Don't traverse the AST within uninteresting
 files during indexing

Summary:
We already skip function bodies from these files while parsing, and drop symbols
found in them. However, traversing their ASTs still takes a substantial amount
of time.

Non-scientific benchmark on my machine:
  background-indexing llvm-project (llvm+clang+clang-tools-extra), wall time
  before: 7:46
  after: 5:13
  change: -33%

Indexer.cpp in libclang should be updated too, but I'm less familiar with that
code and it's doing tricky things with the ShouldSkipFunctionBody callback, so
it needs to be done separately.

Reviewers: kadircet

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80296
---
 .../clangd/index/IndexAction.cpp              | 25 +++++++------
 .../clangd/unittests/IndexActionTests.cpp     | 37 +++++++++++++++++--
 clang/include/clang/Index/IndexingAction.h    | 17 ++++-----
 clang/include/clang/Index/IndexingOptions.h   |  6 +++
 clang/lib/Index/IndexDecl.cpp                 |  3 ++
 clang/lib/Index/IndexingAction.cpp            | 15 ++++++++
 6 files changed, 79 insertions(+), 24 deletions(-)

diff --git a/clang-tools-extra/clangd/index/IndexAction.cpp b/clang-tools-extra/clangd/index/IndexAction.cpp
index 9f294d4ab9252..aa65008b51c00 100644
--- a/clang-tools-extra/clangd/index/IndexAction.cpp
+++ b/clang-tools-extra/clangd/index/IndexAction.cpp
@@ -132,11 +132,19 @@ class IndexAction : public ASTFrontendAction {
               std::function<void(RefSlab)> RefsCallback,
               std::function<void(RelationSlab)> RelationsCallback,
               std::function<void(IncludeGraph)> IncludeGraphCallback)
-      : SymbolsCallback(SymbolsCallback),
-        RefsCallback(RefsCallback), RelationsCallback(RelationsCallback),
+      : SymbolsCallback(SymbolsCallback), RefsCallback(RefsCallback),
+        RelationsCallback(RelationsCallback),
         IncludeGraphCallback(IncludeGraphCallback), Collector(C),
         Includes(std::move(Includes)), Opts(Opts),
-        PragmaHandler(collectIWYUHeaderMaps(this->Includes.get())) {}
+        PragmaHandler(collectIWYUHeaderMaps(this->Includes.get())) {
+    this->Opts.ShouldTraverseDecl = [this](const Decl *D) {
+      auto &SM = D->getASTContext().getSourceManager();
+      auto FID = SM.getFileID(SM.getExpansionLoc(D->getLocation()));
+      if (!FID.isValid())
+        return true;
+      return Collector->shouldIndexFile(FID);
+    };
+  }
 
   std::unique_ptr<ASTConsumer>
   CreateASTConsumer(CompilerInstance &CI, llvm::StringRef InFile) override {
@@ -146,15 +154,8 @@ class IndexAction : public ASTFrontendAction {
       CI.getPreprocessor().addPPCallbacks(
           std::make_unique<IncludeGraphCollector>(CI.getSourceManager(), IG));
 
-    return index::createIndexingASTConsumer(
-        Collector, Opts, CI.getPreprocessorPtr(),
-        /*ShouldSkipFunctionBody=*/[this](const Decl *D) {
-          auto &SM = D->getASTContext().getSourceManager();
-          auto FID = SM.getFileID(SM.getExpansionLoc(D->getLocation()));
-          if (!FID.isValid())
-            return false;
-          return !Collector->shouldIndexFile(FID);
-        });
+    return index::createIndexingASTConsumer(Collector, Opts,
+                                            CI.getPreprocessorPtr());
   }
 
   bool BeginInvocation(CompilerInstance &CI) override {
diff --git a/clang-tools-extra/clangd/unittests/IndexActionTests.cpp b/clang-tools-extra/clangd/unittests/IndexActionTests.cpp
index 6441d019c7e18..31e1bc573290f 100644
--- a/clang-tools-extra/clangd/unittests/IndexActionTests.cpp
+++ b/clang-tools-extra/clangd/unittests/IndexActionTests.cpp
@@ -19,6 +19,7 @@ namespace {
 
 using ::testing::AllOf;
 using ::testing::ElementsAre;
+using ::testing::EndsWith;
 using ::testing::Not;
 using ::testing::Pair;
 using ::testing::UnorderedElementsAre;
@@ -75,8 +76,7 @@ class IndexActionTest : public ::testing::Test {
         new FileManager(FileSystemOptions(), InMemoryFileSystem));
 
     auto Action = createStaticIndexingAction(
-        SymbolCollector::Options(),
-        [&](SymbolSlab S) { IndexFile.Symbols = std::move(S); },
+        Opts, [&](SymbolSlab S) { IndexFile.Symbols = std::move(S); },
         [&](RefSlab R) { IndexFile.Refs = std::move(R); },
         [&](RelationSlab R) { IndexFile.Relations = std::move(R); },
         [&](IncludeGraph IG) { IndexFile.Sources = std::move(IG); });
@@ -99,11 +99,12 @@ class IndexActionTest : public ::testing::Test {
 
   void addFile(llvm::StringRef Path, llvm::StringRef Content) {
     InMemoryFileSystem->addFile(Path, 0,
-                                llvm::MemoryBuffer::getMemBuffer(Content));
+                                llvm::MemoryBuffer::getMemBufferCopy(Content));
     FilePaths.push_back(std::string(Path));
   }
 
 protected:
+  SymbolCollector::Options Opts;
   std::vector<std::string> FilePaths;
   llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem;
 };
@@ -250,6 +251,36 @@ TEST_F(IndexActionTest, NoWarnings) {
   EXPECT_THAT(*IndexFile.Symbols, ElementsAre(HasName("foo"), HasName("bar")));
 }
 
+TEST_F(IndexActionTest, SkipFiles) {
+  std::string MainFilePath = testPath("main.cpp");
+  addFile(MainFilePath, R"cpp(
+    // clang-format off
+    #include "good.h"
+    #include "bad.h"
+    // clang-format on
+  )cpp");
+  addFile(testPath("good.h"), R"cpp(
+    struct S { int s; };
+    void f1() { S f; }
+    auto unskippable1() { return S(); }
+  )cpp");
+  addFile(testPath("bad.h"), R"cpp(
+    struct T { S t; };
+    void f2() { S f; }
+    auto unskippable2() { return S(); }
+  )cpp");
+  Opts.FileFilter = [](const SourceManager &SM, FileID F) {
+    return !SM.getFileEntryForID(F)->getName().endswith("bad.h");
+  };
+  IndexFileIn IndexFile = runIndexingAction(MainFilePath, {"-std=c++14"});
+  EXPECT_THAT(*IndexFile.Symbols,
+              UnorderedElementsAre(HasName("S"), HasName("s"), HasName("f1"),
+                                   HasName("unskippable1")));
+  for (const auto &Pair : *IndexFile.Refs)
+    for (const auto &Ref : Pair.second)
+      EXPECT_THAT(Ref.Location.FileURI, EndsWith("good.h"));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang/include/clang/Index/IndexingAction.h b/clang/include/clang/Index/IndexingAction.h
index 9ed2a018f1617..4baa2d5e72603 100644
--- a/clang/include/clang/Index/IndexingAction.h
+++ b/clang/include/clang/Index/IndexingAction.h
@@ -30,22 +30,21 @@ namespace serialization {
 }
 
 namespace index {
-  class IndexDataConsumer;
+class IndexDataConsumer;
 
 /// Creates an ASTConsumer that indexes all symbols (macros and AST decls).
+std::unique_ptr<ASTConsumer>
+createIndexingASTConsumer(std::shared_ptr<IndexDataConsumer> DataConsumer,
+                          const IndexingOptions &Opts,
+                          std::shared_ptr<Preprocessor> PP);
+
 std::unique_ptr<ASTConsumer> createIndexingASTConsumer(
     std::shared_ptr<IndexDataConsumer> DataConsumer,
     const IndexingOptions &Opts, std::shared_ptr<Preprocessor> PP,
+    // Prefer to set Opts.ShouldTraverseDecl and use the above overload.
+    // This version is only needed if used to *track* function body parsing.
     std::function<bool(const Decl *)> ShouldSkipFunctionBody);
 
-inline std::unique_ptr<ASTConsumer> createIndexingASTConsumer(
-    std::shared_ptr<IndexDataConsumer> DataConsumer,
-    const IndexingOptions &Opts, std::shared_ptr<Preprocessor> PP) {
-  return createIndexingASTConsumer(
-      std::move(DataConsumer), Opts, std::move(PP),
-      /*ShouldSkipFunctionBody=*/[](const Decl *) { return false; });
-}
-
 /// Creates a frontend action that indexes all symbols (macros and AST decls).
 std::unique_ptr<FrontendAction>
 createIndexingAction(std::shared_ptr<IndexDataConsumer> DataConsumer,
diff --git a/clang/include/clang/Index/IndexingOptions.h b/clang/include/clang/Index/IndexingOptions.h
index bbfd6e4a72c62..2dd276998abf7 100644
--- a/clang/include/clang/Index/IndexingOptions.h
+++ b/clang/include/clang/Index/IndexingOptions.h
@@ -34,6 +34,12 @@ struct IndexingOptions {
   // Has no effect if IndexFunctionLocals are false.
   bool IndexParametersInDeclarations = false;
   bool IndexTemplateParameters = false;
+
+  // If set, skip indexing inside some declarations for performance.
+  // This prevents traversal, so skipping a struct means its declaration and
+  // members won't be indexed, but references elsewhere to that struct will be.
+  // Currently this is only checked for top-level declarations.
+  std::function<bool(const Decl *)> ShouldTraverseDecl;
 };
 
 } // namespace index
diff --git a/clang/lib/Index/IndexDecl.cpp b/clang/lib/Index/IndexDecl.cpp
index 68160bc59eb6a..2ba323e635753 100644
--- a/clang/lib/Index/IndexDecl.cpp
+++ b/clang/lib/Index/IndexDecl.cpp
@@ -765,6 +765,9 @@ bool IndexingContext::indexTopLevelDecl(const Decl *D) {
   if (isa<ObjCMethodDecl>(D))
     return true; // Wait for the objc container.
 
+  if (IndexOpts.ShouldTraverseDecl && !IndexOpts.ShouldTraverseDecl(D))
+    return true; // skip
+
   return indexDecl(D);
 }
 
diff --git a/clang/lib/Index/IndexingAction.cpp b/clang/lib/Index/IndexingAction.cpp
index 4f402135672c3..e698c07133a9c 100644
--- a/clang/lib/Index/IndexingAction.cpp
+++ b/clang/lib/Index/IndexingAction.cpp
@@ -131,6 +131,21 @@ std::unique_ptr<ASTConsumer> index::createIndexingASTConsumer(
                                             ShouldSkipFunctionBody);
 }
 
+std::unique_ptr<ASTConsumer> clang::index::createIndexingASTConsumer(
+    std::shared_ptr<IndexDataConsumer> DataConsumer,
+    const IndexingOptions &Opts, std::shared_ptr<Preprocessor> PP) {
+  std::function<bool(const Decl *)> ShouldSkipFunctionBody = [](const Decl *) {
+    return false;
+  };
+  if (Opts.ShouldTraverseDecl)
+    ShouldSkipFunctionBody =
+        [ShouldTraverseDecl(Opts.ShouldTraverseDecl)](const Decl *D) {
+          return !ShouldTraverseDecl(D);
+        };
+  return createIndexingASTConsumer(std::move(DataConsumer), Opts, std::move(PP),
+                                   std::move(ShouldSkipFunctionBody));
+}
+
 std::unique_ptr<FrontendAction>
 index::createIndexingAction(std::shared_ptr<IndexDataConsumer> DataConsumer,
                             const IndexingOptions &Opts) {

From 64cfb8a864cf98dcd762a26d03cba95145b9aa41 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 09:41:04 +0100
Subject: [PATCH 068/770] [NFC][ARM] Add intrinsic code size runs

Add code size analysis of arithmetic intrinsics.
---
 .../Analysis/CostModel/ARM/arith-overflow.ll  | 1023 +++++++++++------
 .../test/Analysis/CostModel/ARM/arith-ssat.ll |  339 ++++--
 .../test/Analysis/CostModel/ARM/arith-usat.ll |  339 ++++--
 .../test/Analysis/CostModel/ARM/reduce-add.ll |  223 ++--
 4 files changed, 1245 insertions(+), 679 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll
index 66a03888c0d45..b50aa97643e22 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll
@@ -1,7 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M
-; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON
-; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-RECIP
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-SIZE
 
 declare {i64, i1}              @llvm.sadd.with.overflow.i64(i64, i64)
 declare {<2 x i64>, <2 x i1>}  @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
@@ -24,62 +27,119 @@ declare {<32 x i8>, <32 x i1>}  @llvm.sadd.with.overflow.v32i8(<32 x i8>, <32 x
 declare {<64 x i8>, <64 x i1>}  @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sadd(i32 %arg) {
-; V8M-LABEL: 'sadd'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; NEON-LABEL: 'sadd'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-LABEL: 'sadd'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 582 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'sadd'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'sadd'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'sadd'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 582 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'sadd'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'sadd'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'sadd'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
   %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -125,62 +185,119 @@ declare {<32 x i8>, <32 x i1>}  @llvm.uadd.with.overflow.v32i8(<32 x i8>, <32 x
 declare {<64 x i8>, <64 x i1>}  @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @uadd(i32 %arg) {
-; V8M-LABEL: 'uadd'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; NEON-LABEL: 'uadd'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-LABEL: 'uadd'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'uadd'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'uadd'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'uadd'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'uadd'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'uadd'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'uadd'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
   %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -226,62 +343,119 @@ declare {<32 x i8>, <32 x i1>}  @llvm.ssub.with.overflow.v32i8(<32 x i8>, <32 x
 declare {<64 x i8>, <64 x i1>}  @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @ssub(i32 %arg) {
-; V8M-LABEL: 'ssub'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; NEON-LABEL: 'ssub'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-LABEL: 'ssub'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 582 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'ssub'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'ssub'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'ssub'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 582 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'ssub'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'ssub'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'ssub'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
   %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -327,62 +501,119 @@ declare {<32 x i8>, <32 x i1>}  @llvm.usub.with.overflow.v32i8(<32 x i8>, <32 x
 declare {<64 x i8>, <64 x i1>}  @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @usub(i32 %arg) {
-; V8M-LABEL: 'usub'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; NEON-LABEL: 'usub'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-LABEL: 'usub'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'usub'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'usub'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'usub'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'usub'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'usub'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'usub'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
   %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -428,62 +659,119 @@ declare {<32 x i8>, <32 x i1>}  @llvm.smul.with.overflow.v32i8(<32 x i8>, <32 x
 declare {<64 x i8>, <64 x i1>}  @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @smul(i32 %arg) {
-; V8M-LABEL: 'smul'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; NEON-LABEL: 'smul'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 154 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 300 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-LABEL: 'smul'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 316 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1208 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 424 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 424 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'smul'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'smul'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 154 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 300 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'smul'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 316 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1208 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 424 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 424 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'smul'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'smul'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'smul'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
   %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -529,62 +817,119 @@ declare {<32 x i8>, <32 x i1>}  @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x
 declare {<64 x i8>, <64 x i1>}  @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @umul(i32 %arg) {
-; V8M-LABEL: 'umul'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 304 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; NEON-LABEL: 'umul'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-LABEL: 'umul'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1200 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'umul'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 304 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'umul'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'umul'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1200 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 416 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'umul'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'umul'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'umul'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
   %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
index 59687ce72249c..bc2ac44215e01 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
@@ -1,7 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M
-; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON
-; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-RECIP
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-SIZE
 
 declare i64        @llvm.sadd.sat.i64(i64, i64)
 declare <2 x i64>  @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>)
@@ -24,62 +27,119 @@ declare <32 x i8>  @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
-; V8M-LABEL: 'add'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 1152 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'add'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 1152 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; NEON-LABEL: 'add'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; NEON-RECIP-LABEL: 'add'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; MVE-LABEL: 'add'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 302 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1046 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; MVE-RECIP-LABEL: 'add'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 302 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1046 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'add'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'add'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'add'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -125,62 +185,119 @@ declare <32 x i8>  @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
-; V8M-LABEL: 'sub'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 1152 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'sub'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 1152 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'sub'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'sub'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 302 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1046 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'sub'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; NEON-LABEL: 'sub'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; NEON-SIZE-LABEL: 'sub'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; MVE-LABEL: 'sub'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 302 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1046 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; MVE-SIZE-LABEL: 'sub'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
index 92c313f4ab1c5..c4f09937b47e5 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
@@ -1,7 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M
-; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON
-; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-RECIP
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-SIZE
 
 declare i64        @llvm.uadd.sat.i64(i64, i64)
 declare <2 x i64>  @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>)
@@ -24,62 +27,119 @@ declare <32 x i8>  @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @add(i32 %arg) {
-; V8M-LABEL: 'add'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'add'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; NEON-LABEL: 'add'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; NEON-RECIP-LABEL: 'add'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; MVE-LABEL: 'add'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; MVE-RECIP-LABEL: 'add'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'add'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'add'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'add'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
@@ -125,62 +185,119 @@ declare <32 x i8>  @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
 declare <64 x i8>  @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
 
 define i32 @sub(i32 %arg) {
-; V8M-LABEL: 'sub'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'sub'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; NEON-RECIP-LABEL: 'sub'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; MVE-RECIP-LABEL: 'sub'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'sub'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; NEON-LABEL: 'sub'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; NEON-SIZE-LABEL: 'sub'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; MVE-LABEL: 'sub'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; MVE-SIZE-LABEL: 'sub'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
   %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
index 73614ab5ece83..089061253d822 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
@@ -1,32 +1,59 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M
-; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON
-; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-RECIP
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-SIZE
 
 define i32 @reduce_i64(i32 %arg) {
-; V8M-LABEL: 'reduce_i64'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'reduce_i64'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; NEON-LABEL: 'reduce_i64'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; NEON-RECIP-LABEL: 'reduce_i64'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; MVE-RECIP-LABEL: 'reduce_i64'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; V8M-SIZE-LABEL: 'reduce_i64'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; NEON-SIZE-LABEL: 'reduce_i64'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; MVE-SIZE-LABEL: 'reduce_i64'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
   %V2  = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
@@ -37,105 +64,65 @@ define i32 @reduce_i64(i32 %arg) {
 }
 
 define i32 @reduce_i32(i32 %arg) {
-; V8M-LABEL: 'reduce_i32'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; NEON-LABEL: 'reduce_i32'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2166 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-  %V2  = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-  %V4  = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-  %V8  = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-  %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-  %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-  ret i32 undef
-}
-
-define i32 @reduce_i16(i32 %arg) {
-; V8M-LABEL: 'reduce_i16'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-RECIP-LABEL: 'reduce_i32'
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; V8M-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; NEON-LABEL: 'reduce_i16'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 297 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 493 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; NEON-RECIP-LABEL: 'reduce_i32'
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 2708 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 8860 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; MVE-RECIP-LABEL: 'reduce_i32'
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  %V2  = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-  %V4  = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-  %V8  = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-  %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-  %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-  %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-  ret i32 undef
-}
-
-define i32 @reduce_i8(i32 %arg) {
-; V8M-LABEL: 'reduce_i8'
-; V8M-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; V8M-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; V8M-SIZE-LABEL: 'reduce_i32'
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; NEON-LABEL: 'reduce_i8'
-; NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; NEON-SIZE-LABEL: 'reduce_i32'
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; MVE-SIZE-LABEL: 'reduce_i32'
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V2   = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
   %V4   = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)

From 871556a494552c0f503eec17055f075bcd859937 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 09:23:18 +0100
Subject: [PATCH 069/770] [CostModel] Unify Intrinsic Costs.

Recommitting most of the remaining changes from
259eb619ff6dcd5b6111d1686e18559b9ca004d4, but excluding the call to
getUserCost from getInstructionThroughput. Though there are still no
test changes, I doubt that this is an NFC...

With the two getIntrinsicInstrCosts folded into one, now fold in the
scalar/code-size orientated getIntrinsicCost. The remaining scalar
intrinsics were memcpy, cttz and ctlz which now have special handling
in the BasicTTI implementation.

This required a change in the AMDGPU backend for fabs as it
should always be 'free'. I've also changed the X86 backend to return
the BaseT implementation when the CostKind isn't RecipThroughput.

Differential Revision: https://reviews.llvm.org/D80012
---
 .../llvm/Analysis/TargetTransformInfo.h       | 40 ++---------
 .../llvm/Analysis/TargetTransformInfoImpl.h   | 37 ++--------
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 70 +++++++++----------
 llvm/lib/Analysis/TargetTransformInfo.cpp     | 27 ++++---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  3 +
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  6 ++
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  |  8 ++-
 7 files changed, 73 insertions(+), 118 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c2ba9a488dca2..c50c696741b17 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -38,6 +38,7 @@ class AssumptionCache;
 class BlockFrequencyInfo;
 class DominatorTree;
 class BranchInst;
+class CallBase;
 class Function;
 class GlobalValue;
 class IntrinsicInst;
@@ -120,10 +121,12 @@ class IntrinsicCostAttributes {
 public:
   IntrinsicCostAttributes(const IntrinsicInst &I);
 
-  IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
+  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
+
+  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
                           unsigned Factor);
 
-  IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
+  IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
                           unsigned Factor, unsigned ScalarCost);
 
   IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
@@ -141,7 +144,7 @@ class IntrinsicCostAttributes {
   IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                           ArrayRef<Type *> Tys);
 
-  IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
+  IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
                           ArrayRef<Value *> Args);
 
   Intrinsic::ID getID() const { return IID; }
@@ -288,18 +291,6 @@ class TargetTransformInfo {
   /// scientific. A target may has no bonus on vector instructions.
   int getInlinerVectorBonusPercent() const;
 
-  /// Estimate the cost of an intrinsic when lowered.
-  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                       ArrayRef<Type *> ParamTys,
-                       const User *U = nullptr,
-                       TTI::TargetCostKind CostKind = TCK_SizeAndLatency) const;
-
-  /// Estimate the cost of an intrinsic when lowered.
-  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                       ArrayRef<const Value *> Arguments,
-                       const User *U = nullptr,
-                       TTI::TargetCostKind CostKind = TCK_SizeAndLatency) const;
-
   /// \return the expected cost of a memcpy, which could e.g. depend on the
   /// source/destination type and alignment and the number of bytes copied.
   int getMemcpyCost(const Instruction *I) const;
@@ -1231,13 +1222,6 @@ class TargetTransformInfo::Concept {
                          TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getInliningThresholdMultiplier() = 0;
   virtual int getInlinerVectorBonusPercent() = 0;
-  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                               ArrayRef<Type *> ParamTys, const User *U,
-                               enum TargetCostKind CostKind) = 0;
-  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                               ArrayRef<const Value *> Arguments,
-                               const User *U,
-                               enum TargetCostKind CostKind) = 0;
   virtual int getMemcpyCost(const Instruction *I) = 0;
   virtual unsigned
   getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
@@ -1495,18 +1479,6 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   int getInlinerVectorBonusPercent() override {
     return Impl.getInlinerVectorBonusPercent();
   }
-  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                       ArrayRef<Type *> ParamTys,
-                       const User *U = nullptr,
-                       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) override {
-    return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U, CostKind);
-  }
-  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                       ArrayRef<const Value *> Arguments,
-                       const User *U = nullptr,
-                       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) override {
-    return Impl.getIntrinsicCost(IID, RetTy, Arguments, U, CostKind);
-  }
   int getMemcpyCost(const Instruction *I) override {
     return Impl.getMemcpyCost(I);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d353bf056df9d..60de70dcb16a0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -772,36 +772,8 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
     return TTI::TCC_Basic;
   }
 
-  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Type *> ParamTys, const User *U,
-                            TTI::TargetCostKind CostKind) {
-    switch (IID) {
-    default:
-      break;
-    // TODO: other libc intrinsics.
-    case Intrinsic::memcpy:
-      return static_cast<T *>(this)->getMemcpyCost(dyn_cast<Instruction>(U));
-    }
-    IntrinsicCostAttributes Attrs(IID, RetTy, ParamTys);
-    return getIntrinsicInstrCost(Attrs, CostKind);
-  }
-
-  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<const Value *> Arguments, const User *U,
-                            TTI::TargetCostKind CostKind) {
-    // Delegate to the generic intrinsic handling code. This mostly provides an
-    // opportunity for targets to (for example) special case the cost of
-    // certain intrinsics based on constants used as arguments.
-    SmallVector<Type *, 8> ParamTys;
-    ParamTys.reserve(Arguments.size());
-    for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx)
-      ParamTys.push_back(Arguments[Idx]->getType());
-    return static_cast<T *>(this)->getIntrinsicCost(IID, RetTy, ParamTys, U,
-                                                    CostKind);
-  }
-
-  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
-                       TTI::TargetCostKind CostKind) {
+  int getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                  TTI::TargetCostKind CostKind) {
     auto *TargetTTI = static_cast<T *>(this);
 
     // FIXME: Unlikely to be true for anything but CodeSize.
@@ -810,9 +782,8 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
       if (F) {
         FunctionType *FTy = F->getFunctionType();
         if (Intrinsic::ID IID = F->getIntrinsicID()) {
-          SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end());
-          return TargetTTI->getIntrinsicCost(IID, FTy->getReturnType(),
-                                             ParamTys, U, CostKind);
+          IntrinsicCostAttributes Attrs(IID, *CB);
+          return TargetTTI->getIntrinsicInstrCost(Attrs, CostKind);
         }
 
         if (!TargetTTI->isLoweredToCall(F))
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index dbbcc795ea00f..7866e71853cf3 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -296,30 +296,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::getGEPCost(PointeeType, Ptr, Operands);
   }
 
-  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<const Value *> Arguments, const User *U,
-                            TTI::TargetCostKind CostKind) {
-    return BaseT::getIntrinsicCost(IID, RetTy, Arguments, U, CostKind);
-  }
-
-  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Type *> ParamTys, const User *U,
-                            TTI::TargetCostKind CostKind) {
-    if (IID == Intrinsic::cttz) {
-      if (getTLI()->isCheapToSpeculateCttz())
-        return TargetTransformInfo::TCC_Basic;
-      return TargetTransformInfo::TCC_Expensive;
-    }
-
-    if (IID == Intrinsic::ctlz) {
-      if (getTLI()->isCheapToSpeculateCtlz())
-        return TargetTransformInfo::TCC_Basic;
-      return TargetTransformInfo::TCC_Expensive;
-    }
-
-    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys, U, CostKind);
-  }
-
   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                             unsigned &JumpTableSize,
                                             ProfileSummaryInfo *PSI,
@@ -1090,6 +1066,28 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   /// Get intrinsic cost based on arguments.
   unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
+    Intrinsic::ID IID = ICA.getID();
+    auto *ConcreteTTI = static_cast<T *>(this);
+
+    // Special case some scalar intrinsics.
+    if (CostKind != TTI::TCK_RecipThroughput) {
+      switch (IID) {
+      default:
+        break;
+      case Intrinsic::cttz:
+        if (getTLI()->isCheapToSpeculateCttz())
+          return TargetTransformInfo::TCC_Basic;
+        break;
+      case Intrinsic::ctlz:
+        if (getTLI()->isCheapToSpeculateCtlz())
+          return TargetTransformInfo::TCC_Basic;
+        break;
+      case Intrinsic::memcpy:
+        return ConcreteTTI->getMemcpyCost(ICA.getInst());
+      // TODO: other libc intrinsics.
+      }
+      return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+    }
 
     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
       return 0;
@@ -1098,17 +1096,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     if (ICA.isTypeBasedOnly())
       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
 
-    Intrinsic::ID IID = ICA.getID();
-    const IntrinsicInst *I = ICA.getInst();
     Type *RetTy = ICA.getReturnType();
-    const SmallVectorImpl<Value *> &Args = ICA.getArgs();
     unsigned VF = ICA.getVectorFactor();
-    FastMathFlags FMF = ICA.getFlags();
-
     unsigned RetVF =
         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getNumElements() : 1);
     assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
-    auto *ConcreteTTI = static_cast<T *>(this);
+    const IntrinsicInst *I = ICA.getInst();
+    const SmallVectorImpl<Value *> &Args = ICA.getArgs();
+    FastMathFlags FMF = ICA.getFlags();
 
     switch (IID) {
     default: {
@@ -1595,13 +1590,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                                  CostKind) +
              ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
                                                  CostKind);
-    if (IID == Intrinsic::experimental_constrained_fmuladd)
-      return ConcreteTTI->getIntrinsicCost(
-                 Intrinsic::experimental_constrained_fmul, RetTy, Tys, nullptr,
-                 CostKind) +
-             ConcreteTTI->getIntrinsicCost(
-                 Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr,
-                 CostKind);
+    if (IID == Intrinsic::experimental_constrained_fmuladd) {
+      IntrinsicCostAttributes FMulAttrs(
+        Intrinsic::experimental_constrained_fmul, RetTy, Tys);
+      IntrinsicCostAttributes FAddAttrs(
+        Intrinsic::experimental_constrained_fadd, RetTy, Tys);
+      return ConcreteTTI->getIntrinsicInstrCost(FMulAttrs, CostKind) +
+             ConcreteTTI->getIntrinsicInstrCost(FAddAttrs, CostKind);
+    }
 
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e05fcca1170e..86952a5ad6592 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -63,7 +63,20 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
    FMF = FPMO->getFastMathFlags();
 }
 
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+                                                 const CallBase &CI) :
+  II(dyn_cast<IntrinsicInst>(&CI)),  RetTy(CI.getType()), IID(Id) {
+
+  if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+    FMF = FPMO->getFastMathFlags();
+
+  FunctionType *FTy =
+    CI.getCalledFunction()->getFunctionType();
+  ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+                                                 const CallBase &CI,
                                                  unsigned Factor) :
     RetTy(CI.getType()), IID(Id), VF(Factor) {
 
@@ -76,7 +89,8 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
   ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
 }
 
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, CallInst &CI,
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+                                                 const CallBase &CI,
                                                  unsigned Factor,
                                                  unsigned ScalarCost) :
     RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
@@ -236,15 +250,6 @@ int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
   return TTIImpl->getGEPCost(PointeeType, Ptr, Operands, CostKind);
 }
 
-int TargetTransformInfo::getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                                          ArrayRef<const Value *> Arguments,
-                                          const User *U,
-                                          TTI::TargetCostKind CostKind) const {
-  int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments, U, CostKind);
-  assert(Cost >= 0 && "TTI should not produce negative costs!");
-  return Cost;
-}
-
 unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
     const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI,
     BlockFrequencyInfo *BFI) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ca75531be4a46..2405a24dd14f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -560,6 +560,9 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
 
 int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
+  if (ICA.getID() == Intrinsic::fabs)
+    return 0;
+
   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 09f99af8c8e82..4170b102f2b31 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2699,6 +2699,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
 
 int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
   if (ICA.isTypeBasedOnly())
     return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
 
@@ -3932,6 +3935,9 @@ int X86TTIImpl::getGatherScatterOpCost(
     unsigned Alignment, TTI::TargetCostKind CostKind,
     const Instruction *I = nullptr) {
 
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return 1;
+
   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
   unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 8bd1aa8514ca5..5b93aad11e143 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1535,7 +1535,7 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
   //  %inc = add nsw %i.0, 1
   //  br i1 %tobool
 
-  const Value *Args[] =
+  Value *Args[] =
       {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
                         : ConstantInt::getFalse(InitX->getContext())};
 
@@ -1544,9 +1544,11 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
   uint32_t HeaderSize =
       std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
 
+  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
+  int Cost =
+    TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
   if (HeaderSize != IdiomCanonicalSize &&
-      TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
-          TargetTransformInfo::TCC_Basic)
+      Cost > TargetTransformInfo::TCC_Basic)
     return false;
 
   transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,

From 98cad555e29187a03e2bc3db5780762981913902 Mon Sep 17 00:00:00 2001
From: Lucas Prates <lucas.prates@arm.com>
Date: Tue, 5 May 2020 11:52:09 +0100
Subject: [PATCH 070/770] [Clang][AArch64] Capturing proper pointer alignment
 for Neon vld1 intrinsics

Summary:
During CodeGen for AArch64 Neon intrinsics, Clang was incorrectly
assuming all the pointers from which loads were being generated for vld1
intrinsics were aligned according to the intrinsic's result type, causing
alignment faults on the code generated by the backend.

This patch updates vld1 intrinsics' CodeGen to properly capture the
correct load alignment based on the type of the pointer provided as
input for the intrinsic.

Reviewers: t.p.northover, ostannard, pcc

Reviewed By: ostannard

Subscribers: kristof.beyls, danielkiss, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D79721
---
 clang/lib/CodeGen/CGBuiltin.cpp              | 12 ++---
 clang/test/CodeGen/aarch64-neon-intrinsics.c | 52 ++++++++++----------
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 1adae1a7ea42a..ddd9a68a8edb7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10327,9 +10327,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v: {
+    auto Alignment = CGM.getNaturalPointeeTypeAlignment(
+        E->getArg(0)->IgnoreParenCasts()->getType());
     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
-    auto Alignment = CharUnits::fromQuantity(
-        BuiltinID == NEON::BI__builtin_neon_vld1_v ? 8 : 16);
     return Builder.CreateAlignedLoad(VTy, Ops[0], Alignment);
   }
   case NEON::BI__builtin_neon_vst1_v:
@@ -10342,8 +10342,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    auto Alignment = CharUnits::fromQuantity(
-        BuiltinID == NEON::BI__builtin_neon_vld1_lane_v ? 8 : 16);
+    auto Alignment = CGM.getNaturalPointeeTypeAlignment(
+        E->getArg(0)->IgnoreParenCasts()->getType());
     Ops[0] =
         Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
@@ -10353,8 +10353,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Value *V = UndefValue::get(Ty);
     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    auto Alignment = CharUnits::fromQuantity(
-        BuiltinID == NEON::BI__builtin_neon_vld1_dup_v ? 8 : 16);
+    auto Alignment = CGM.getNaturalPointeeTypeAlignment(
+        E->getArg(0)->IgnoreParenCasts()->getType());
     Ops[0] =
         Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CodeGen/aarch64-neon-intrinsics.c
index 7744b4f4a159d..1fb245f3d3429 100644
--- a/clang/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-neon-intrinsics.c
@@ -8956,7 +8956,7 @@ float64_t test_vrsqrted_f64(float64_t a) {
 
 // CHECK-LABEL: @test_vld1q_u8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
 // CHECK:   ret <16 x i8> [[TMP1]]
 uint8x16_t test_vld1q_u8(uint8_t const *a) {
   return vld1q_u8(a);
@@ -8965,7 +8965,7 @@ uint8x16_t test_vld1q_u8(uint8_t const *a) {
 // CHECK-LABEL: @test_vld1q_u16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 // CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vld1q_u16(uint16_t const *a) {
   return vld1q_u16(a);
@@ -8974,7 +8974,7 @@ uint16x8_t test_vld1q_u16(uint16_t const *a) {
 // CHECK-LABEL: @test_vld1q_u32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 // CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vld1q_u32(uint32_t const *a) {
   return vld1q_u32(a);
@@ -8983,7 +8983,7 @@ uint32x4_t test_vld1q_u32(uint32_t const *a) {
 // CHECK-LABEL: @test_vld1q_u64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
 // CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vld1q_u64(uint64_t const *a) {
   return vld1q_u64(a);
@@ -8991,7 +8991,7 @@ uint64x2_t test_vld1q_u64(uint64_t const *a) {
 
 // CHECK-LABEL: @test_vld1q_s8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
 // CHECK:   ret <16 x i8> [[TMP1]]
 int8x16_t test_vld1q_s8(int8_t const *a) {
   return vld1q_s8(a);
@@ -9000,7 +9000,7 @@ int8x16_t test_vld1q_s8(int8_t const *a) {
 // CHECK-LABEL: @test_vld1q_s16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 // CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vld1q_s16(int16_t const *a) {
   return vld1q_s16(a);
@@ -9009,7 +9009,7 @@ int16x8_t test_vld1q_s16(int16_t const *a) {
 // CHECK-LABEL: @test_vld1q_s32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 // CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vld1q_s32(int32_t const *a) {
   return vld1q_s32(a);
@@ -9018,7 +9018,7 @@ int32x4_t test_vld1q_s32(int32_t const *a) {
 // CHECK-LABEL: @test_vld1q_s64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
 // CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vld1q_s64(int64_t const *a) {
   return vld1q_s64(a);
@@ -9027,7 +9027,7 @@ int64x2_t test_vld1q_s64(int64_t const *a) {
 // CHECK-LABEL: @test_vld1q_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK:   [[TMP2:%.*]] = load <8 x half>, <8 x half>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <8 x half>, <8 x half>* [[TMP1]], align 2
 // CHECK:   ret <8 x half> [[TMP2]]
 float16x8_t test_vld1q_f16(float16_t const *a) {
   return vld1q_f16(a);
@@ -9036,7 +9036,7 @@ float16x8_t test_vld1q_f16(float16_t const *a) {
 // CHECK-LABEL: @test_vld1q_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
-// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 // CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vld1q_f32(float32_t const *a) {
   return vld1q_f32(a);
@@ -9045,7 +9045,7 @@ float32x4_t test_vld1q_f32(float32_t const *a) {
 // CHECK-LABEL: @test_vld1q_f64(
 // CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
-// CHECK:   [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
 // CHECK:   ret <2 x double> [[TMP2]]
 float64x2_t test_vld1q_f64(float64_t const *a) {
   return vld1q_f64(a);
@@ -9053,7 +9053,7 @@ float64x2_t test_vld1q_f64(float64_t const *a) {
 
 // CHECK-LABEL: @test_vld1q_p8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
 // CHECK:   ret <16 x i8> [[TMP1]]
 poly8x16_t test_vld1q_p8(poly8_t const *a) {
   return vld1q_p8(a);
@@ -9062,7 +9062,7 @@ poly8x16_t test_vld1q_p8(poly8_t const *a) {
 // CHECK-LABEL: @test_vld1q_p16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 // CHECK:   ret <8 x i16> [[TMP2]]
 poly16x8_t test_vld1q_p16(poly16_t const *a) {
   return vld1q_p16(a);
@@ -9070,7 +9070,7 @@ poly16x8_t test_vld1q_p16(poly16_t const *a) {
 
 // CHECK-LABEL: @test_vld1_u8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
 // CHECK:   ret <8 x i8> [[TMP1]]
 uint8x8_t test_vld1_u8(uint8_t const *a) {
   return vld1_u8(a);
@@ -9079,7 +9079,7 @@ uint8x8_t test_vld1_u8(uint8_t const *a) {
 // CHECK-LABEL: @test_vld1_u16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
 // CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vld1_u16(uint16_t const *a) {
   return vld1_u16(a);
@@ -9088,7 +9088,7 @@ uint16x4_t test_vld1_u16(uint16_t const *a) {
 // CHECK-LABEL: @test_vld1_u32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
 // CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vld1_u32(uint32_t const *a) {
   return vld1_u32(a);
@@ -9097,7 +9097,7 @@ uint32x2_t test_vld1_u32(uint32_t const *a) {
 // CHECK-LABEL: @test_vld1_u64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]], align 8
 // CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vld1_u64(uint64_t const *a) {
   return vld1_u64(a);
@@ -9105,7 +9105,7 @@ uint64x1_t test_vld1_u64(uint64_t const *a) {
 
 // CHECK-LABEL: @test_vld1_s8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
 // CHECK:   ret <8 x i8> [[TMP1]]
 int8x8_t test_vld1_s8(int8_t const *a) {
   return vld1_s8(a);
@@ -9114,7 +9114,7 @@ int8x8_t test_vld1_s8(int8_t const *a) {
 // CHECK-LABEL: @test_vld1_s16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
 // CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vld1_s16(int16_t const *a) {
   return vld1_s16(a);
@@ -9123,7 +9123,7 @@ int16x4_t test_vld1_s16(int16_t const *a) {
 // CHECK-LABEL: @test_vld1_s32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
 // CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vld1_s32(int32_t const *a) {
   return vld1_s32(a);
@@ -9132,7 +9132,7 @@ int32x2_t test_vld1_s32(int32_t const *a) {
 // CHECK-LABEL: @test_vld1_s64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]], align 8
 // CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vld1_s64(int64_t const *a) {
   return vld1_s64(a);
@@ -9141,7 +9141,7 @@ int64x1_t test_vld1_s64(int64_t const *a) {
 // CHECK-LABEL: @test_vld1_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK:   [[TMP2:%.*]] = load <4 x half>, <4 x half>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <4 x half>, <4 x half>* [[TMP1]], align 2
 // CHECK:   ret <4 x half> [[TMP2]]
 float16x4_t test_vld1_f16(float16_t const *a) {
   return vld1_f16(a);
@@ -9150,7 +9150,7 @@ float16x4_t test_vld1_f16(float16_t const *a) {
 // CHECK-LABEL: @test_vld1_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
-// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 // CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vld1_f32(float32_t const *a) {
   return vld1_f32(a);
@@ -9159,7 +9159,7 @@ float32x2_t test_vld1_f32(float32_t const *a) {
 // CHECK-LABEL: @test_vld1_f64(
 // CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
-// CHECK:   [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]], align 8
 // CHECK:   ret <1 x double> [[TMP2]]
 float64x1_t test_vld1_f64(float64_t const *a) {
   return vld1_f64(a);
@@ -9167,7 +9167,7 @@ float64x1_t test_vld1_f64(float64_t const *a) {
 
 // CHECK-LABEL: @test_vld1_p8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
 // CHECK:   ret <8 x i8> [[TMP1]]
 poly8x8_t test_vld1_p8(poly8_t const *a) {
   return vld1_p8(a);
@@ -9176,7 +9176,7 @@ poly8x8_t test_vld1_p8(poly8_t const *a) {
 // CHECK-LABEL: @test_vld1_p16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
 // CHECK:   ret <4 x i16> [[TMP2]]
 poly16x4_t test_vld1_p16(poly16_t const *a) {
   return vld1_p16(a);

From 2569787e44595d31942da2bb5558931351929e57 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Wed, 20 May 2020 17:01:35 +0300
Subject: [PATCH 071/770] [DebugInfo] - Fix multiple issues in
 DWARFDebugFrame::parse().

I've noticed an issue with the "Data.getRelocatedValue(...)" call.

It might silently ignore an error when the content is truncated.
That leads to an infinite loop in the code (e.g. llvm-readobj hangs).

After fixing the issue I found that we actually always tried
to read past the end of a section, even when the content was valid.
It happened because the terminator CIE (a CIE with the length == 0)
was never handled. At first I tried to just stop adding the terminator
entry (and return), but that does not seem to be correct, because tools like
llvm-objdump might want to print something for such entries
(see comments in the code and test cases).

This patch fixes the issues mentioned, provides new test cases for
both llvm-readobj and lib/DebugInfo, and adds FIXMEs to the related
existing test cases.

Differential revision: https://reviews.llvm.org/D80299
---
 llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp  | 22 ++++++++++++++++---
 llvm/test/DebugInfo/X86/eh-frame-truncated.s  | 10 +++++++++
 .../tools/llvm-objdump/eh_frame-mipsel.test   |  1 +
 .../tools/llvm-objdump/eh_frame_zero_cie.test |  1 +
 llvm/test/tools/llvm-readobj/ELF/unwind.test  | 19 ++++++++++++++++
 5 files changed, 50 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/DebugInfo/X86/eh-frame-truncated.s

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 269a45e57a8d0..0e8d521f94330 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -375,7 +375,19 @@ Error DWARFDebugFrame::parse(DWARFDataExtractor Data) {
     uint64_t Length;
     DwarfFormat Format;
     std::tie(Length, Format) = Data.getInitialLength(&Offset);
-    uint64_t Id;
+    bool IsDWARF64 = Format == DWARF64;
+
+    // If the Length is 0, then this CIE is a terminator. We add it because some
+    // dumper tools might need it to print something special for such entries
+    // (e.g. llvm-objdump --dwarf=frames prints "ZERO terminator").
+    if (Length == 0) {
+      auto Cie = std::make_unique<CIE>(
+          IsDWARF64, StartOffset, 0, 0, SmallString<8>(), 0, 0, 0, 0, 0,
+          SmallString<8>(), 0, 0, None, None, Arch);
+      CIEs[StartOffset] = Cie.get();
+      Entries.push_back(std::move(Cie));
+      break;
+    }
 
     // At this point, Offset points to the next field after Length.
     // Length is the structure size excluding itself. Compute an offset one
@@ -385,8 +397,12 @@ Error DWARFDebugFrame::parse(DWARFDataExtractor Data) {
     uint64_t EndStructureOffset = Offset + Length;
 
     // The Id field's size depends on the DWARF format
-    bool IsDWARF64 = Format == DWARF64;
-    Id = Data.getRelocatedValue((IsDWARF64 && !IsEH) ? 8 : 4, &Offset);
+    Error Err = Error::success();
+    uint64_t Id = Data.getRelocatedValue((IsDWARF64 && !IsEH) ? 8 : 4, &Offset,
+                                         /*SectionIndex=*/nullptr, &Err);
+    if (Err)
+      return Err;
+
     if (Id == getCIEId(IsDWARF64, IsEH)) {
       uint8_t Version = Data.getU8(&Offset);
       const char *Augmentation = Data.getCStr(&Offset);
diff --git a/llvm/test/DebugInfo/X86/eh-frame-truncated.s b/llvm/test/DebugInfo/X86/eh-frame-truncated.s
new file mode 100644
index 0000000000000..28107e13530a0
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/eh-frame-truncated.s
@@ -0,0 +1,10 @@
+## Check we report a proper error when the content
+## of the .eh_frame section is truncated.
+
+# RUN: llvm-mc -triple x86_64 %s -filetype=obj -o %t
+# RUN: not llvm-dwarfdump -debug-frame %t 2>&1 | FileCheck %s
+
+# CHECK: error: unexpected end of data at offset 0x4
+
+.section .eh_frame,"a",@unwind
+.long 0xFF ## Length
diff --git a/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test b/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test
index 67d2408269146..e89d9aeb53cb2 100644
--- a/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test
+++ b/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test
@@ -19,6 +19,7 @@
 # CHECK:   DW_CFA_offset: reg31 -4
 # CHECK:   DW_CFA_nop:
 
+## FIXME: GNU objdump prints "00000038 ZERO terminator" instead.
 # CHECK: 00000038 00000000 00000000 CIE
 # CHECK:   Version:               0
 # CHECK:   Augmentation:          ""
diff --git a/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test b/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test
index 4702162a749bd..510c944028cdf 100644
--- a/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test
+++ b/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test
@@ -2,6 +2,7 @@
 
 # CHECK: .eh_frame contents:
 
+## FIXME: GNU objdump prints "00000000 ZERO terminator" instead.
 # CHECK: 00000000 00000000 00000000 CIE
 # CHECK:   Version:               0
 # CHECK:   Augmentation:          ""
diff --git a/llvm/test/tools/llvm-readobj/ELF/unwind.test b/llvm/test/tools/llvm-readobj/ELF/unwind.test
index dbdc9617aae3d..466c6a6a75178 100644
--- a/llvm/test/tools/llvm-readobj/ELF/unwind.test
+++ b/llvm/test/tools/llvm-readobj/ELF/unwind.test
@@ -243,3 +243,22 @@ Sections:
 ##   .quad 0x00010000 # Address range
 ## .Lend:
     Content: 14000000FFFFFFFFCDAB1111000000000000010000000000
+
+## Check we report an error when the .eh_frame section contains truncated data.
+# RUN: yaml2obj --docnum=3 %s -o %t3.exe
+# RUN: not llvm-readobj --unwind %t3.exe 2>&1 | FileCheck %s -DFILE=%t3.exe --check-prefix=TRUNCATED-ERR
+
+# TRUNCATED-ERR:      .eh_frame section at offset 0x34 address 0x0:
+# TRUNCATED-ERR-NEXT: error: '[[FILE]]': unexpected end of data at offset 0x4
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS32
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_386
+Sections:
+  - Name:    .eh_frame
+    Type:    SHT_PROGBITS
+## Length is set to 0xFF, though the actual section length is 4.
+    Content: "FF000000"

From 92f3828dc5675f9917d909eb75c29ba1e14920ad Mon Sep 17 00:00:00 2001
From: vpykhtin <valery.pykhtin@gmail.com>
Date: Tue, 26 May 2020 12:09:46 +0300
Subject: [PATCH 072/770] [AMDGPU] Fix wait counts in the presence of 16bit
 subregisters

Differential Revision: https://reviews.llvm.org/D80033
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  2 +-
 llvm/test/CodeGen/AMDGPU/waitcnt.mir        | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c115d26fa6a34..67c7ff1fcda43 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -505,7 +505,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
 
   const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
   unsigned Size = TRI->getRegSizeInBits(*RC);
-  Result.second = Result.first + (Size / 32);
+  Result.second = Result.first + ((Size + 16) / 32);
 
   return Result;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt.mir
index fd81ca83a1a1c..c568b8d32a237 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt.mir
@@ -41,6 +41,9 @@
     ret void
   }
 
+  define amdgpu_kernel void @subregs16bit() {
+    ret void
+  }
 ...
 ---
 
@@ -284,3 +287,19 @@ body: |
       FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     }
 ...
+
+---
+# CHECK-LABEL: name: subregs16bit
+# CHECK: S_WAITCNT 112
+# CHECK-NEXT: V_NOP_e32 
+
+name: subregs16bit
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4
+      $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+      $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
+      V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16
+...

From 48cdbd081c9111e2ffe41ac3022bdfc65df46655 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 10:29:42 +0100
Subject: [PATCH 073/770] [NFC][ARM] Add code size analysis tests

Add code size runs for the cast costs.
---
 llvm/test/Analysis/CostModel/ARM/cast.ll | 4383 ++++++++++++++--------
 1 file changed, 2821 insertions(+), 1562 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/ARM/cast.ll b/llvm/test/Analysis/CostModel/ARM/cast.ll
index a7fd0a141a56b..0e509c1f57b4f 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast.ll
@@ -1,1347 +1,2419 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R
+; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK-NEON-RECIP
+; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
+; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
+; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
 
 define i32 @casts() {
-; CHECK-NEON-LABEL: 'casts'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r30 = fptoui float undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r31 = fptosi float undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r32 = fptoui float undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r33 = fptosi float undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r34 = fptoui float undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r35 = fptosi float undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r36 = fptoui float undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r37 = fptosi float undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r38 = fptoui float undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r39 = fptosi float undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r40 = fptoui double undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r41 = fptosi double undef to i1
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r42 = fptoui double undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r43 = fptosi double undef to i8
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r44 = fptoui double undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r45 = fptosi double undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r46 = fptoui double undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r47 = fptosi double undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r48 = fptoui double undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r49 = fptosi double undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r50 = sitofp i1 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r51 = uitofp i1 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r52 = sitofp i1 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r53 = uitofp i1 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r54 = sitofp i8 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r55 = uitofp i8 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r56 = sitofp i8 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r57 = uitofp i8 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r58 = sitofp i16 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r59 = uitofp i16 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r60 = sitofp i16 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r61 = uitofp i16 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r62 = sitofp i32 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r63 = uitofp i32 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r64 = sitofp i32 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r65 = uitofp i32 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r66 = sitofp i64 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r67 = uitofp i64 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r68 = sitofp i64 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+  ; -- scalars --
+; CHECK-NEON-RECIP-LABEL: 'casts'
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r30 = fptoui float undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r31 = fptosi float undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r32 = fptoui float undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r33 = fptosi float undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r34 = fptoui float undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r35 = fptosi float undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r36 = fptoui float undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r37 = fptosi float undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r38 = fptoui float undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r39 = fptosi float undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r40 = fptoui double undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r41 = fptosi double undef to i1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r42 = fptoui double undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r43 = fptosi double undef to i8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r44 = fptoui double undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r45 = fptosi double undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r46 = fptoui double undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r47 = fptosi double undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r48 = fptoui double undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r49 = fptosi double undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-MVE-LABEL: 'casts'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r52 = sitofp i1 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r53 = uitofp i1 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r56 = sitofp i8 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r57 = uitofp i8 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r60 = sitofp i16 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r61 = uitofp i16 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r64 = sitofp i32 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r65 = uitofp i32 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r68 = sitofp i64 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-MVE-RECIP-LABEL: 'casts'
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8M-MAIN-LABEL: 'casts'
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-MAIN-RECIP-LABEL: 'casts'
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8M-BASE-LABEL: 'casts'
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-BASE-RECIP-LABEL: 'casts'
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8R-LABEL: 'casts'
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r30 = fptoui float undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r31 = fptosi float undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r32 = fptoui float undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r33 = fptosi float undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r34 = fptoui float undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r35 = fptosi float undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r36 = fptoui float undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r37 = fptosi float undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r38 = fptoui float undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r39 = fptosi float undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r40 = fptoui double undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r41 = fptosi double undef to i1
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r42 = fptoui double undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r43 = fptosi double undef to i8
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r44 = fptoui double undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r45 = fptosi double undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r46 = fptoui double undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r47 = fptosi double undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r48 = fptoui double undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r49 = fptosi double undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r50 = sitofp i1 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r51 = uitofp i1 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r52 = sitofp i1 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r53 = uitofp i1 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r54 = sitofp i8 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r55 = uitofp i8 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r56 = sitofp i8 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r57 = uitofp i8 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r58 = sitofp i16 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r59 = uitofp i16 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r60 = sitofp i16 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r61 = uitofp i16 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r62 = sitofp i32 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r63 = uitofp i32 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r64 = sitofp i32 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r65 = uitofp i32 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r66 = sitofp i64 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r67 = uitofp i64 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r68 = sitofp i64 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8R-RECIP-LABEL: 'casts'
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r30 = fptoui float undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r31 = fptosi float undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r32 = fptoui float undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r33 = fptosi float undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r34 = fptoui float undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r35 = fptosi float undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r36 = fptoui float undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r37 = fptosi float undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r38 = fptoui float undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r39 = fptosi float undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r40 = fptoui double undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r41 = fptosi double undef to i1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r42 = fptoui double undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r43 = fptosi double undef to i8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r44 = fptoui double undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r45 = fptosi double undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r46 = fptoui double undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r47 = fptosi double undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r48 = fptoui double undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r49 = fptosi double undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-MVE-SIZE-LABEL: 'casts'
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-MAIN-SIZE-LABEL: 'casts'
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-BASE-SIZE-LABEL: 'casts'
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8R-SIZE-LABEL: 'casts'
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r1 = zext i1 undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i1 undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r3 = zext i1 undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i1 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i1 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r6 = sext i1 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i1 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = sext i8 undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = zext i8 undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = sext i8 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r12 = zext i8 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r13 = sext i8 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r14 = zext i8 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r24 = sext i32 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r25 = zext i32 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r30 = fptoui float undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r31 = fptosi float undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r32 = fptoui float undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r33 = fptosi float undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r34 = fptoui float undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r35 = fptosi float undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r36 = fptoui float undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r37 = fptosi float undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r38 = fptoui float undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r39 = fptosi float undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r40 = fptoui double undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r41 = fptosi double undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r42 = fptoui double undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r43 = fptosi double undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r44 = fptoui double undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r45 = fptosi double undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r46 = fptoui double undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r47 = fptosi double undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r48 = fptoui double undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r49 = fptosi double undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-  ; -- scalars --
   %r0 = sext i1 undef to i8
   %r1 = zext i1 undef to i8
   %r2 = sext i1 undef to i16
@@ -1644,187 +2716,330 @@ define i32 @casts() {
 
 
 define i32 @load_extends() {
-; CHECK-NEON-LABEL: 'load_extends'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEON-RECIP-LABEL: 'load_extends'
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-MVE-LABEL: 'load_extends'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-MVE-RECIP-LABEL: 'load_extends'
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8M-MAIN-LABEL: 'load_extends'
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-MAIN-RECIP-LABEL: 'load_extends'
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8M-BASE-LABEL: 'load_extends'
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-BASE-RECIP-LABEL: 'load_extends'
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8R-LABEL: 'load_extends'
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8R-RECIP-LABEL: 'load_extends'
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-MVE-SIZE-LABEL: 'load_extends'
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-MAIN-SIZE-LABEL: 'load_extends'
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-BASE-SIZE-LABEL: 'load_extends'
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8R-SIZE-LABEL: 'load_extends'
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i8 %loadi8 to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r5 = zext i8 %loadi8 to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r8 = sext i16 %loadi16 to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-
   %loadi8 = load i8, i8* undef
   %loadi16 = load i16, i16* undef
   %loadi32 = load i32, i32* undef
@@ -1865,60 +3080,104 @@ define i32 @load_extends() {
 }
 
 define i32 @bitcasts() {
-; CHECK-NEON-LABEL: 'bitcasts'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEON-RECIP-LABEL: 'bitcasts'
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-MVE-RECIP-LABEL: 'bitcasts'
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-MAIN-RECIP-LABEL: 'bitcasts'
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-BASE-RECIP-LABEL: 'bitcasts'
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8R-RECIP-LABEL: 'bitcasts'
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-MVE-LABEL: 'bitcasts'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-MVE-SIZE-LABEL: 'bitcasts'
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8M-MAIN-LABEL: 'bitcasts'
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-MAIN-SIZE-LABEL: 'bitcasts'
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8M-BASE-LABEL: 'bitcasts'
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-BASE-SIZE-LABEL: 'bitcasts'
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; CHECK-V8R-LABEL: 'bitcasts'
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8R-SIZE-LABEL: 'bitcasts'
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %a = bitcast i32 undef to i32
   %b = bitcast float undef to float

From 3d4c873a14fe2ffb5cd6ac329354857eef245196 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Mon, 25 May 2020 17:30:41 +0300
Subject: [PATCH 074/770] [yaml2obj] - Map section names to chunks for each
 ELFYAML::ProgramHeader early. NFCI.

Each `ELFYAML::ProgramHeader` currently contains a list of the names of the
sections it includes. These names are mapped to Fill/Section chunks very late,
though the mapping can be created early, in `initProgramHeaders`.

The benefit of this change is that the mapped chunks can be accessed
earlier (for example, while writing section content) and the code becomes
simpler.

Differential revision: https://reviews.llvm.org/D80520
---
 llvm/include/llvm/ObjectYAML/ELFYAML.h | 27 ++++++++------
 llvm/lib/ObjectYAML/ELFEmitter.cpp     | 51 ++++++++++++++++----------
 2 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index 2fd18fcd2957c..22ed82289ca8c 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -90,18 +90,6 @@ struct SectionName {
   StringRef Section;
 };
 
-struct ProgramHeader {
-  ELF_PT Type;
-  ELF_PF Flags;
-  llvm::yaml::Hex64 VAddr;
-  llvm::yaml::Hex64 PAddr;
-  Optional<llvm::yaml::Hex64> Align;
-  Optional<llvm::yaml::Hex64> FileSize;
-  Optional<llvm::yaml::Hex64> MemSize;
-  Optional<llvm::yaml::Hex64> Offset;
-  std::vector<SectionName> Sections;
-};
-
 struct Symbol {
   StringRef Name;
   ELF_STT Type;
@@ -503,6 +491,21 @@ struct MipsABIFlags : Section {
   }
 };
 
+struct ProgramHeader {
+  ELF_PT Type;
+  ELF_PF Flags;
+  llvm::yaml::Hex64 VAddr;
+  llvm::yaml::Hex64 PAddr;
+  Optional<llvm::yaml::Hex64> Align;
+  Optional<llvm::yaml::Hex64> FileSize;
+  Optional<llvm::yaml::Hex64> MemSize;
+  Optional<llvm::yaml::Hex64> Offset;
+
+  std::vector<SectionName> Sections;
+  // This vector is parallel to Sections and contains corresponding chunks.
+  std::vector<Chunk *> Chunks;
+};
+
 struct Object {
   FileHeader Header;
   std::vector<ProgramHeader> ProgramHeaders;
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index 95d74eeeb6e6e..78093491704be 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -330,7 +330,13 @@ void ELFState<ELFT>::writeELFHeader(ContiguousBlobAccumulator &CBA, raw_ostream
 
 template <class ELFT>
 void ELFState<ELFT>::initProgramHeaders(std::vector<Elf_Phdr> &PHeaders) {
-  for (const auto &YamlPhdr : Doc.ProgramHeaders) {
+  DenseMap<StringRef, ELFYAML::Fill *> NameToFill;
+  for (const std::unique_ptr<ELFYAML::Chunk> &D : Doc.Chunks)
+    if (auto S = dyn_cast<ELFYAML::Fill>(D.get()))
+      NameToFill[S->Name] = S;
+
+  std::vector<ELFYAML::Section *> Sections = Doc.getSections();
+  for (ELFYAML::ProgramHeader &YamlPhdr : Doc.ProgramHeaders) {
     Elf_Phdr Phdr;
     zero(Phdr);
     Phdr.p_type = YamlPhdr.Type;
@@ -338,6 +344,23 @@ void ELFState<ELFT>::initProgramHeaders(std::vector<Elf_Phdr> &PHeaders) {
     Phdr.p_vaddr = YamlPhdr.VAddr;
     Phdr.p_paddr = YamlPhdr.PAddr;
     PHeaders.push_back(Phdr);
+
+    // Map Sections list to corresponding chunks.
+    for (const ELFYAML::SectionName &SecName : YamlPhdr.Sections) {
+      if (ELFYAML::Fill *Fill = NameToFill.lookup(SecName.Section)) {
+        YamlPhdr.Chunks.push_back(Fill);
+        continue;
+      }
+
+      unsigned Index;
+      if (SN2I.lookup(SecName.Section, Index)) {
+        YamlPhdr.Chunks.push_back(Sections[Index]);
+        continue;
+      }
+
+      reportError("unknown section or fill referenced: '" + SecName.Section +
+                  "' by program header");
+    }
   }
 }
 
@@ -757,31 +780,19 @@ template <class ELFT> void ELFState<ELFT>::reportError(const Twine &Msg) {
 template <class ELFT>
 std::vector<Fragment>
 ELFState<ELFT>::getPhdrFragments(const ELFYAML::ProgramHeader &Phdr,
-                                 ArrayRef<typename ELFT::Shdr> SHeaders) {
-  DenseMap<StringRef, ELFYAML::Fill *> NameToFill;
-  for (const std::unique_ptr<ELFYAML::Chunk> &D : Doc.Chunks)
-    if (auto S = dyn_cast<ELFYAML::Fill>(D.get()))
-      NameToFill[S->Name] = S;
-
+                                 ArrayRef<Elf_Shdr> SHeaders) {
   std::vector<Fragment> Ret;
-  for (const ELFYAML::SectionName &SecName : Phdr.Sections) {
-    unsigned Index;
-    if (SN2I.lookup(SecName.Section, Index)) {
-      const typename ELFT::Shdr &H = SHeaders[Index];
-      Ret.push_back({H.sh_offset, H.sh_size, H.sh_type, H.sh_addralign});
-      continue;
-    }
-
-    if (ELFYAML::Fill *Fill = NameToFill.lookup(SecName.Section)) {
-      Ret.push_back({*Fill->Offset, Fill->Size, llvm::ELF::SHT_PROGBITS,
+  for (const ELFYAML::Chunk *C : Phdr.Chunks) {
+    if (const ELFYAML::Fill *F = dyn_cast<ELFYAML::Fill>(C)) {
+      Ret.push_back({*F->Offset, F->Size, llvm::ELF::SHT_PROGBITS,
                      /*ShAddrAlign=*/1});
       continue;
     }
 
-    reportError("unknown section or fill referenced: '" + SecName.Section +
-                "' by program header");
+    const ELFYAML::Section *S = cast<ELFYAML::Section>(C);
+    const Elf_Shdr &H = SHeaders[SN2I.get(S->Name)];
+    Ret.push_back({H.sh_offset, H.sh_size, H.sh_type, H.sh_addralign});
   }
-
   return Ret;
 }
 

From 590f3a72c243b888ab10c4f9e71bf7f8eca99717 Mon Sep 17 00:00:00 2001
From: Xing GUO <higuoxing@gmail.com>
Date: Tue, 26 May 2020 17:14:23 +0800
Subject: [PATCH 075/770] [ObjectYAML][DWARF] Use .empty() to indicate if the
 DWARF sections are empty.

---
 llvm/lib/ObjectYAML/DWARFYAML.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp
index 31c469f880042..63aea17324b66 100644
--- a/llvm/lib/ObjectYAML/DWARFYAML.cpp
+++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp
@@ -16,11 +16,11 @@
 namespace llvm {
 
 bool DWARFYAML::Data::isEmpty() const {
-  return 0 == DebugStrings.size() + AbbrevDecls.size() + ARanges.size() +
-                  DebugRanges.size() + PubNames.Entries.size() +
-                  PubTypes.Entries.size() + GNUPubNames.Entries.size() +
-                  GNUPubTypes.Entries.size() + CompileUnits.size() +
-                  DebugLines.size();
+  return DebugStrings.empty() && AbbrevDecls.empty() && ARanges.empty() &&
+         DebugRanges.empty() && PubNames.Entries.empty() &&
+         PubTypes.Entries.empty() && GNUPubNames.Entries.empty() &&
+         GNUPubTypes.Entries.empty() && CompileUnits.empty() &&
+         DebugLines.empty();
 }
 
 namespace yaml {

From 2c04b8aacd070e88e64f08998dc583319e994d18 Mon Sep 17 00:00:00 2001
From: Xing GUO <higuoxing@gmail.com>
Date: Tue, 26 May 2020 17:22:23 +0800
Subject: [PATCH 076/770] [ObjectYAML][DWARF] Make variable names consistent.

---
 llvm/include/llvm/ObjectYAML/DWARFYAML.h |  2 +-
 llvm/lib/ObjectYAML/DWARFYAML.cpp        | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
index 8049d4911b86e..11b41e13b8e24 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
@@ -210,7 +210,7 @@ template <> struct MappingTraits<DWARFYAML::ARangeDescriptor> {
 };
 
 template <> struct MappingTraits<DWARFYAML::ARange> {
-  static void mapping(IO &IO, DWARFYAML::ARange &Range);
+  static void mapping(IO &IO, DWARFYAML::ARange &ARange);
 };
 
 template <> struct MappingTraits<DWARFYAML::RangeEntry> {
diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp
index 63aea17324b66..7ba73783cf63b 100644
--- a/llvm/lib/ObjectYAML/DWARFYAML.cpp
+++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp
@@ -70,13 +70,13 @@ void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping(
 }
 
 void MappingTraits<DWARFYAML::ARange>::mapping(IO &IO,
-                                               DWARFYAML::ARange &Range) {
-  IO.mapRequired("Length", Range.Length);
-  IO.mapRequired("Version", Range.Version);
-  IO.mapRequired("CuOffset", Range.CuOffset);
-  IO.mapRequired("AddrSize", Range.AddrSize);
-  IO.mapRequired("SegSize", Range.SegSize);
-  IO.mapRequired("Descriptors", Range.Descriptors);
+                                               DWARFYAML::ARange &ARange) {
+  IO.mapRequired("Length", ARange.Length);
+  IO.mapRequired("Version", ARange.Version);
+  IO.mapRequired("CuOffset", ARange.CuOffset);
+  IO.mapRequired("AddrSize", ARange.AddrSize);
+  IO.mapRequired("SegSize", ARange.SegSize);
+  IO.mapRequired("Descriptors", ARange.Descriptors);
 }
 
 void MappingTraits<DWARFYAML::RangeEntry>::mapping(

From c5bbc8dd6d686175788e6c1a5fc0339814a5adfc Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 10:43:58 +0100
Subject: [PATCH 077/770] [NFC][ARM] Fix for previous commit

Actually analyse code-size for the size runs...
---
 llvm/test/Analysis/CostModel/ARM/cast.ll | 1748 +++++++++++-----------
 1 file changed, 874 insertions(+), 874 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/ARM/cast.ll b/llvm/test/Analysis/CostModel/ARM/cast.ll
index 0e509c1f57b4f..8d022f11e62c0 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast.ll
@@ -4,10 +4,10 @@
 ; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
 ; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
 ; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
-; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
-; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
 
 define i32 @casts() {
   ; -- scalars --
@@ -1366,7 +1366,7 @@ define i32 @casts() {
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r19 = sext i16 undef to i64
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
@@ -1399,219 +1399,219 @@ define i32 @casts() {
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r52 = sitofp i1 undef to double
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r56 = sitofp i8 undef to double
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r60 = sitofp i16 undef to double
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r64 = sitofp i32 undef to double
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r68 = sitofp i64 undef to double
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2090 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2088 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-SIZE-LABEL: 'casts'
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
@@ -1633,7 +1633,7 @@ define i32 @casts() {
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r19 = sext i16 undef to i64
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
@@ -1684,201 +1684,201 @@ define i32 @casts() {
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-BASE-SIZE-LABEL: 'casts'
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
@@ -1900,7 +1900,7 @@ define i32 @casts() {
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r19 = sext i16 undef to i64
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
@@ -1951,201 +1951,201 @@ define i32 @casts() {
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8R-SIZE-LABEL: 'casts'
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i1 undef to i8
@@ -2167,7 +2167,7 @@ define i32 @casts() {
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r17 = sext i16 undef to i32
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r18 = zext i16 undef to i32
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r19 = sext i16 undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r19 = sext i16 undef to i64
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r20 = zext i16 undef to i64
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
@@ -2178,80 +2178,80 @@ define i32 @casts() {
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r30 = fptoui float undef to i1
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r31 = fptosi float undef to i1
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r32 = fptoui float undef to i8
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r33 = fptosi float undef to i8
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r34 = fptoui float undef to i16
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r35 = fptosi float undef to i16
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r36 = fptoui float undef to i32
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r37 = fptosi float undef to i32
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r38 = fptoui float undef to i64
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r39 = fptosi float undef to i64
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r40 = fptoui double undef to i1
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r41 = fptosi double undef to i1
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r42 = fptoui double undef to i8
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r43 = fptosi double undef to i8
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r44 = fptoui double undef to i16
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r45 = fptosi double undef to i16
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r46 = fptoui double undef to i32
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r47 = fptosi double undef to i32
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r48 = fptoui double undef to i64
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r49 = fptosi double undef to i64
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r50 = sitofp i1 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r51 = uitofp i1 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r52 = sitofp i1 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r53 = uitofp i1 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r54 = sitofp i8 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r55 = uitofp i8 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r56 = sitofp i8 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r57 = uitofp i8 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r58 = sitofp i16 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r59 = uitofp i16 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r60 = sitofp i16 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r61 = uitofp i16 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r62 = sitofp i32 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r63 = uitofp i32 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r64 = sitofp i32 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r65 = uitofp i32 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r66 = sitofp i64 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r67 = uitofp i64 undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r68 = sitofp i64 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r69 = uitofp i64 undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q70 = sext <4 x i8> undef to <4 x i32>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q71 = sext <8 x i8> undef to <8 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s70 = sext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r70 = sext <8 x i8> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r71 = sext <16 x i8> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q72 = zext <4 x i8> undef to <4 x i32>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %q73 = zext <8 x i8> undef to <8 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
@@ -2260,159 +2260,159 @@ define i32 @casts() {
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %r0 = sext i1 undef to i8
   %r1 = zext i1 undef to i8
@@ -2900,12 +2900,12 @@ define i32 @load_extends() {
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
@@ -2922,26 +2922,26 @@ define i32 @load_extends() {
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-SIZE-LABEL: 'load_extends'
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
@@ -2954,30 +2954,30 @@ define i32 @load_extends() {
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-BASE-SIZE-LABEL: 'load_extends'
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
@@ -2990,19 +2990,19 @@ define i32 @load_extends() {
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %loadi16 to i64
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r10 = sext i32 %loadi32 to i64
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8R-SIZE-LABEL: 'load_extends'
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
@@ -3028,17 +3028,17 @@ define i32 @load_extends() {
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r11 = zext i32 %loadi32 to i64
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %loadi8 = load i8, i8* undef
   %loadi16 = load i16, i16* undef
@@ -3144,29 +3144,29 @@ define i32 @bitcasts() {
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-SIZE-LABEL: 'bitcasts'
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
 ; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-BASE-SIZE-LABEL: 'bitcasts'
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = bitcast i64 undef to double
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = bitcast double undef to i64
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
 ; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8R-SIZE-LABEL: 'bitcasts'
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32
@@ -3177,7 +3177,7 @@ define i32 @bitcasts() {
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
 ; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %a = bitcast i32 undef to i32
   %b = bitcast float undef to float

From 2e365ca2f7ce7a1f4a3938d79b894324b383ce5c Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Sat, 23 May 2020 18:34:08 +0300
Subject: [PATCH 078/770] [DebugInfo/llvm-objdump] - Print "ZERO terminator"
 for terminator entries when dumping .eh_frame.

A CIE with the Length == 0 is a terminator:
https://refspecs.linuxfoundation.org/LSB_5.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html

And GNU objdump recognizes them and prints the following for such entries:

"00000000 ZERO terminator"

This patch teaches llvm-objdump to do the same. I had to update tests to use
"CHECK-NEXT" too.

(Note: it looks perhaps not right that printing is done inside the DebugInfo library,
I'd expect to see the change in the llvm-objdump's code somewhere instead,
but that is how it is done atm).

Differential revision: https://reviews.llvm.org/D80476
---
 llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp  |  6 +++
 .../tools/llvm-objdump/eh_frame-mipsel.test   | 49 +++++++++----------
 .../tools/llvm-objdump/eh_frame_zero_cie.test | 13 ++---
 3 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 0e8d521f94330..51dc54e49fcc9 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -304,6 +304,12 @@ constexpr uint64_t getCIEId(bool IsDWARF64, bool IsEH) {
 }
 
 void CIE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
+  // A CIE with a zero length is a terminator entry in the .eh_frame section.
+  if (IsEH && Length == 0) {
+    OS << format("%08" PRIx64, Offset) << " ZERO terminator\n";
+    return;
+  }
+
   OS << format("%08" PRIx64, Offset)
      << format(" %0*" PRIx64, IsDWARF64 ? 16 : 8, Length)
      << format(" %0*" PRIx64, IsDWARF64 && !IsEH ? 16 : 8,
diff --git a/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test b/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test
index e89d9aeb53cb2..91058e28effc1 100644
--- a/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test
+++ b/llvm/test/tools/llvm-objdump/eh_frame-mipsel.test
@@ -1,28 +1,25 @@
 # RUN: llvm-objdump --dwarf=frames %p/Inputs/eh_frame.elf-mipsel | FileCheck %s
 
-# CHECK: .eh_frame contents:
-
-# CHECK: 00000000 00000018 00000000 CIE
-# CHECK:   Version:               1
-# CHECK:   Augmentation:          "zPLR"
-# CHECK:   Code alignment factor: 1
-# CHECK:   Data alignment factor: -4
-# CHECK:   Return address column: 31
-# CHECK:   Augmentation data:     80 90 0B 41 00 00 0B
-
-# CHECK:   DW_CFA_def_cfa: reg29 +0
-
-# CHECK: 0000001c 00000018 00000020 FDE cie=00000000 pc=00400890...004008dc
-# CHECK:   DW_CFA_advance_loc: 4
-# CHECK:   DW_CFA_def_cfa_offset: +24
-# CHECK:   DW_CFA_advance_loc: 4
-# CHECK:   DW_CFA_offset: reg31 -4
-# CHECK:   DW_CFA_nop:
-
-## FIXME: GNU objdump prints "00000038 ZERO terminator" instead.
-# CHECK: 00000038 00000000 00000000 CIE
-# CHECK:   Version:               0
-# CHECK:   Augmentation:          ""
-# CHECK:   Code alignment factor: 0
-# CHECK:   Data alignment factor: 0
-# CHECK:   Return address column: 0
+# CHECK:       .eh_frame contents:
+# CHECK-EMPTY:
+# CHECK-NEXT:  00000000 00000018 00000000 CIE
+# CHECK-NEXT:    Version:               1
+# CHECK-NEXT:    Augmentation:          "zPLR"
+# CHECK-NEXT:    Code alignment factor: 1
+# CHECK-NEXT:    Data alignment factor: -4
+# CHECK-NEXT:    Return address column: 31
+# CHECK-NEXT:    Personality Address: 0000000000410b90
+# CHECK-NEXT:    Augmentation data:     80 90 0B 41 00 00 0B
+# CHECK-EMPTY:
+# CHECK-NEXT:  DW_CFA_def_cfa: reg29 +0
+# CHECK-EMPTY:
+# CHECK-NEXT:  0000001c 00000018 00000020 FDE cie=00000000 pc=00400890...004008dc
+# CHECK-NEXT:    LSDA Address: 0000000000400a90
+# CHECK-NEXT:    DW_CFA_advance_loc: 4
+# CHECK-NEXT:    DW_CFA_def_cfa_offset: +24
+# CHECK-NEXT:    DW_CFA_advance_loc: 4
+# CHECK-NEXT:    DW_CFA_offset: reg31 -4
+# CHECK-NEXT:    DW_CFA_nop:
+# CHECK-EMPTY:
+# CHECK-NEXT:  00000038 ZERO terminator
+# CHECK-NOT: {{.}}
diff --git a/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test b/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test
index 510c944028cdf..30bbec9b97230 100644
--- a/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test
+++ b/llvm/test/tools/llvm-objdump/eh_frame_zero_cie.test
@@ -1,11 +1,6 @@
 # RUN: llvm-objdump --dwarf=frames %p/Inputs/eh_frame_zero_cie.o 2>/dev/null | FileCheck %s
 
-# CHECK: .eh_frame contents:
-
-## FIXME: GNU objdump prints "00000000 ZERO terminator" instead.
-# CHECK: 00000000 00000000 00000000 CIE
-# CHECK:   Version:               0
-# CHECK:   Augmentation:          ""
-# CHECK:   Code alignment factor: 0
-# CHECK:   Data alignment factor: 0
-# CHECK:   Return address column: 0
+# CHECK:       .eh_frame contents:
+# CHECK-EMPTY:
+# CHECK-NEXT:  00000000 ZERO terminator
+# CHECK-NOT:   {{.}}

From 8b4639d0a0e0e65f23e0315f7ade83b9126472af Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 26 May 2020 10:12:04 +0100
Subject: [PATCH 079/770] [X86][AVX] Add some initial movmsk combine tests

Show failure to reduce the signbit extraction for 256-bit integer vectors on AVX1 targets where the pcmpgt/ashr has to be done with split 128-bit vectors.
---
 llvm/test/CodeGen/X86/combine-movmsk-avx.ll | 98 +++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/combine-movmsk-avx.ll

diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
new file mode 100644
index 0000000000000..0de723e287c62
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>)
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
+
+;
+; TODO - Avoid sign extension ops when just extracting the sign bits.
+;
+
+define i32 @movmskpd_cmpgt_v4i64(<4 x i64> %a0) {
+; AVX1-LABEL: movmskpd_cmpgt_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT:    vmovmskpd %ymm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: movmskpd_cmpgt_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovmskpd %ymm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %1 = icmp sgt <4 x i64> zeroinitializer, %a0
+  %2 = sext <4 x i1> %1 to <4 x i64>
+  %3 = bitcast <4 x i64> %2 to <4 x double>
+  %4 = tail call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %3)
+  ret i32 %4
+}
+
+define i32 @movmskps_ashr_v8i32(<8 x i32> %a0)  {
+; AVX1-LABEL: movmskps_ashr_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: movmskps_ashr_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %2 = bitcast <8 x i32> %1 to <8 x float>
+  %3 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %2)
+  ret i32 %3
+}
+
+define i32 @movmskps_sext_v4i64(<4 x i32> %a0)  {
+; AVX1-LABEL: movmskps_sext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovmskpd %ymm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: movmskps_sext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vmovmskpd %ymm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %1 = sext <4 x i32> %a0 to <4 x i64>
+  %2 = bitcast <4 x i64> %1 to <4 x double>
+  %3 = tail call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %2)
+  ret i32 %3
+}
+
+define i32 @movmskps_sext_v8i32(<8 x i16> %a0)  {
+; AVX1-LABEL: movmskps_sext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: movmskps_sext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %1 = sext <8 x i16> %a0 to <8 x i32>
+  %2 = bitcast <8 x i32> %1 to <8 x float>
+  %3 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %2)
+  ret i32 %3
+}

From 6f802ec4333cc1227bb37e258a81e9a588f964dc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 26 May 2020 10:55:43 +0100
Subject: [PATCH 080/770] [X86] Fix fshr comment copy+paste typo. NFC.

Noticed by @foad on D80466.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 54a80151eb69a..6bf61af00590c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19073,7 +19073,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
   bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
 
   // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
-  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))) >> bw.
+  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
   if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
       !isa<ConstantSDNode>(Amt)) {
     unsigned EltSizeInBits = VT.getScalarSizeInBits();

From 09f7dcb64e1b2a3568ddb6ab327dd2f4a4d3d0fe Mon Sep 17 00:00:00 2001
From: hsmahesha <mahesha.comp@gmail.com>
Date: Tue, 26 May 2020 15:47:03 +0530
Subject: [PATCH 081/770] [AMDGPU/MemOpsCluster] Code clean-up around mem ops
 clustering logic

Summary:
Clean-up code around mem ops clustering logic. This patch cleans up code within
the function clusterNeighboringMemOps(). It is WIP, and this patch is a first cut.

Reviewers: foad, rampitec, arsenm, vpykhtin, javedabsar

Reviewed By: foad

Subscribers: MatzeB, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, javed.absar, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80119
---
 llvm/lib/CodeGen/MachineScheduler.cpp | 64 ++++++++++++++++-----------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 55b0075338619..92fd3edf92364 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1580,34 +1580,48 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
     return;
 
   llvm::sort(MemOpRecords);
+
+  // At this point, `MemOpRecords` array must hold at least two mem ops. Try to
+  // cluster mem ops collected within `MemOpRecords` array.
   unsigned ClusterLength = 1;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
-    SUnit *SUa = MemOpRecords[Idx].SU;
-    SUnit *SUb = MemOpRecords[Idx+1].SU;
-    if (TII->shouldClusterMemOps(MemOpRecords[Idx].BaseOps,
-                                 MemOpRecords[Idx + 1].BaseOps,
-                                 ClusterLength + 1)) {
-      if (SUa->NodeNum > SUb->NodeNum)
-        std::swap(SUa, SUb);
-      if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-        LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
-                          << SUb->NodeNum << ")\n");
-        // Copy successor edges from SUa to SUb. Interleaving computation
-        // dependent on SUa can prevent load combining due to register reuse.
-        // Predecessor edges do not need to be copied from SUb to SUa since
-        // nearby loads should have effectively the same inputs.
-        for (const SDep &Succ : SUa->Succs) {
-          if (Succ.getSUnit() == SUb)
-            continue;
-          LLVM_DEBUG(dbgs()
-                     << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
-          DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
-        }
-        ++ClusterLength;
-      } else
-        ClusterLength = 1;
-    } else
+    // Decision to cluster mem ops is taken based on target dependent logic
+    auto MemOpa = MemOpRecords[Idx];
+    auto MemOpb = MemOpRecords[Idx + 1];
+    ++ClusterLength;
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps,
+                                  ClusterLength)) {
+      // Current mem ops pair could not be clustered, reset cluster length, and
+      // go to next pair
+      ClusterLength = 1;
+      continue;
+    }
+
+    SUnit *SUa = MemOpa.SU;
+    SUnit *SUb = MemOpb.SU;
+    if (SUa->NodeNum > SUb->NodeNum)
+      std::swap(SUa, SUb);
+
+    // FIXME: Is this check really required?
+    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
       ClusterLength = 1;
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
+                      << SUb->NodeNum << ")\n");
+
+    // Copy successor edges from SUa to SUb. Interleaving computation
+    // dependent on SUa can prevent load combining due to register reuse.
+    // Predecessor edges do not need to be copied from SUb to SUa since
+    // nearby loads should have effectively the same inputs.
+    for (const SDep &Succ : SUa->Succs) {
+      if (Succ.getSUnit() == SUb)
+        continue;
+      LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                        << ")\n");
+      DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+    }
   }
 }
 

From 5229dd1366ab1423d66d3d16dddff6fbaee049d8 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Tue, 26 May 2020 06:23:57 -0400
Subject: [PATCH 082/770] [build] Add LLVM_LOCAL_RPATH which can set an rpath
 on just unit test binaries

After D80096, bots that build clang for distribution and that can't use
system gcc / libstdc++ need to pass a working rpath so that unit test
binaries can run. The method suggested in GettingStarted.rst works fine
for local development, but it results in an absolute local rpath ending
up even in distributed binaries like clang, which is both ugly and
unnecessary.

Add an explicit toggle that can be used to add an rpath only for the
non-distributed binaries that need it.

Differential Revision: https://reviews.llvm.org/D80534
---
 llvm/CMakeLists.txt              |  3 +++
 llvm/cmake/modules/AddLLVM.cmake |  4 ++++
 llvm/docs/GettingStarted.rst     | 10 ++++++++++
 3 files changed, 17 insertions(+)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 3fbe1e6cf9a4b..06b8646ca37ba 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -439,6 +439,9 @@ set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING
 option(LLVM_FORCE_USE_OLD_TOOLCHAIN
        "Set to ON to force using an old, unsupported host toolchain." OFF)
 
+set(LLVM_LOCAL_RPATH "" CACHE FILEPATH
+  "If set, an absolute path added as rpath on binaries that do not already contain an executable-relative rpath.")
+
 option(LLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN
        "Set to ON to only warn when using a toolchain which is about to be deprecated, instead of emitting an error." OFF)
 
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 5b6b7f56777b4..9f14561fe0a6f 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -821,6 +821,10 @@ macro(add_llvm_executable name)
 
   if(NOT ARG_NO_INSTALL_RPATH)
     llvm_setup_rpath(${name})
+  elseif (LLVM_LOCAL_RPATH)
+    set_target_properties(${name} PROPERTIES
+                          BUILD_WITH_INSTALL_RPATH On
+                          INSTALL_RPATH "${LLVM_LOCAL_RPATH}")
   endif()
 
   if(DEFINED windows_resource_file)
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 46e337d2cec95..5cce01b72c116 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -340,6 +340,16 @@ If you fail to set rpath, most LLVM binaries will fail on startup with a message
 from the loader similar to ``libstdc++.so.6: version `GLIBCXX_3.4.20' not
 found``. This means you need to tweak the -rpath linker flag.
 
+This method will add an absolute path to the rpath of all executables. That's
+fine for local development. If you want to distribute the binaries you build
+so that they can run on older systems, copy ``libstdc++.so.6`` into the
+``lib/`` directory.  All of LLVM's shipping binaries have an rpath pointing at
+``$ORIGIN/../lib``, so they will find ``libstdc++.so.6`` there.  Non-distributed
+binaries don't have an rpath set and won't find ``libstdc++.so.6``. Pass
+``-DLLVM_LOCAL_RPATH="$HOME/toolchains/lib64"`` to cmake to add an absolute
+path to ``libstdc++.so.6`` as above. Since these binaries are not distributed,
+having an absolute local path is fine for them.
+
 When you build Clang, you will need to give *it* access to modern C++
 standard library in order to use it as your new host in part of a bootstrap.
 There are two easy ways to do this, either build (and install) libc++ along

From 3785eb83af4161bd52ed993ef3a2184c998071e6 Mon Sep 17 00:00:00 2001
From: Eduardo Caldas <ecaldas@google.com>
Date: Tue, 26 May 2020 12:19:07 +0200
Subject: [PATCH 083/770] Add support for binary operators in Syntax Trees

Reviewers: gribozavr2

Reviewed By: gribozavr2

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80540
---
 clang/include/clang/Tooling/Syntax/Nodes.h  |  22 ++
 clang/lib/Tooling/Syntax/BuildTree.cpp      |  19 +-
 clang/lib/Tooling/Syntax/Nodes.cpp          |  23 ++
 clang/unittests/Tooling/Syntax/TreeTest.cpp | 269 +++++++++++++++++++-
 4 files changed, 315 insertions(+), 18 deletions(-)

diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h
index f4d482bb848c6..5db99d4b9e350 100644
--- a/clang/include/clang/Tooling/Syntax/Nodes.h
+++ b/clang/include/clang/Tooling/Syntax/Nodes.h
@@ -40,6 +40,7 @@ enum class NodeKind : uint16_t {
 
   // Expressions.
   UnknownExpression,
+  BinaryOperatorExpression,
 
   // Statements.
   UnknownStatement,
@@ -104,6 +105,9 @@ enum class NodeRole : uint8_t {
   BodyStatement,
 
   // Roles specific to particular node kinds.
+  BinaryOperatorExpression_leftHandSide,
+  BinaryOperatorExpression_operatorToken,
+  BinaryOperatorExpression_rightHandSide,
   CaseStatement_value,
   IfStatement_thenStatement,
   IfStatement_elseKeyword,
@@ -158,6 +162,24 @@ class UnknownExpression final : public Expression {
   }
 };
 
+/// <lhs> <operator> <rhs>
+///
+/// For example:
+///   a + b
+///   a bitor 1
+///   a |= b
+///   a and_eq b
+class BinaryOperatorExpression final : public Expression {
+public:
+  BinaryOperatorExpression() : Expression(NodeKind::BinaryOperatorExpression) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::BinaryOperatorExpression;
+  }
+  syntax::Expression *lhs();
+  syntax::Leaf *operatorToken();
+  syntax::Expression *rhs();
+};
+
 /// An abstract node for C++ statements, e.g. 'while', 'if', etc.
 /// FIXME: add accessors for semicolon of statements that have it.
 class Statement : public Tree {
diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp
index 11058edec615d..8fee44cdbf10d 100644
--- a/clang/lib/Tooling/Syntax/BuildTree.cpp
+++ b/clang/lib/Tooling/Syntax/BuildTree.cpp
@@ -11,6 +11,7 @@
 #include "clang/AST/DeclBase.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclarationName.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/AST/Stmt.h"
 #include "clang/AST/TypeLoc.h"
@@ -594,10 +595,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
       for (auto *D : DS->decls())
         Builder.noticeDeclWithoutSemicolon(D);
     } else if (auto *E = llvm::dyn_cast_or_null<Expr>(S)) {
-      // Do not recurse into subexpressions.
-      // We do not have syntax trees for expressions yet, so we only want to see
-      // the first top-level expression.
-      return WalkUpFromExpr(E->IgnoreImplicit());
+      return RecursiveASTVisitor::TraverseStmt(E->IgnoreImplicit());
     }
     return RecursiveASTVisitor::TraverseStmt(S);
   }
@@ -610,6 +608,19 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
     return true;
   }
 
+  bool WalkUpFromBinaryOperator(BinaryOperator *S) {
+    Builder.markExprChild(
+        S->getLHS(), syntax::NodeRole::BinaryOperatorExpression_leftHandSide);
+    Builder.markChildToken(
+        S->getOperatorLoc(),
+        syntax::NodeRole::BinaryOperatorExpression_operatorToken);
+    Builder.markExprChild(
+        S->getRHS(), syntax::NodeRole::BinaryOperatorExpression_rightHandSide);
+    Builder.foldNode(Builder.getExprRange(S),
+                     new (allocator()) syntax::BinaryOperatorExpression, S);
+    return true;
+  }
+
   bool WalkUpFromNamespaceDecl(NamespaceDecl *S) {
     auto Tokens = Builder.getDeclarationRange(S);
     if (Tokens.front().kind() == tok::coloncolon) {
diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp
index 75f025e5f8536..84c0143db81d3 100644
--- a/clang/lib/Tooling/Syntax/Nodes.cpp
+++ b/clang/lib/Tooling/Syntax/Nodes.cpp
@@ -18,6 +18,8 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) {
     return OS << "TranslationUnit";
   case NodeKind::UnknownExpression:
     return OS << "UnknownExpression";
+  case NodeKind::BinaryOperatorExpression:
+    return OS << "BinaryOperatorExpression";
   case NodeKind::UnknownStatement:
     return OS << "UnknownStatement";
   case NodeKind::DeclarationStatement:
@@ -110,6 +112,12 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeRole R) {
     return OS << "IfStatement_elseKeyword";
   case syntax::NodeRole::IfStatement_elseStatement:
     return OS << "IfStatement_elseStatement";
+  case syntax::NodeRole::BinaryOperatorExpression_leftHandSide:
+    return OS << "BinaryOperatorExpression_leftHandSide";
+  case syntax::NodeRole::BinaryOperatorExpression_operatorToken:
+    return OS << "BinaryOperatorExpression_operatorToken";
+  case syntax::NodeRole::BinaryOperatorExpression_rightHandSide:
+    return OS << "BinaryOperatorExpression_rightHandSide";
   case syntax::NodeRole::ReturnStatement_value:
     return OS << "ReturnStatement_value";
   case syntax::NodeRole::ExpressionStatement_expression:
@@ -142,6 +150,21 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeRole R) {
   llvm_unreachable("invalid role");
 }
 
+syntax::Expression *syntax::BinaryOperatorExpression::lhs() {
+  return llvm::cast_or_null<syntax::Expression>(
+      findChild(syntax::NodeRole::BinaryOperatorExpression_leftHandSide));
+}
+
+syntax::Leaf *syntax::BinaryOperatorExpression::operatorToken() {
+  return llvm::cast_or_null<syntax::Leaf>(
+      findChild(syntax::NodeRole::BinaryOperatorExpression_operatorToken));
+}
+
+syntax::Expression *syntax::BinaryOperatorExpression::rhs() {
+  return llvm::cast_or_null<syntax::Expression>(
+      findChild(syntax::NodeRole::BinaryOperatorExpression_rightHandSide));
+}
+
 syntax::Leaf *syntax::SwitchStatement::switchKeyword() {
   return llvm::cast_or_null<syntax::Leaf>(
       findChild(syntax::NodeRole::IntroducerKeyword));
diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp
index d11a3652c8e1f..634f99f7c395c 100644
--- a/clang/unittests/Tooling/Syntax/TreeTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp
@@ -564,7 +564,8 @@ void test() {
     |-{
     |-ExpressionStatement
     | |-UnknownExpression
-    | | |-test
+    | | |-UnknownExpression
+    | | | `-test
     | | |-(
     | | `-)
     | `-;
@@ -576,14 +577,16 @@ void test() {
     | |-)
     | |-ExpressionStatement
     | | |-UnknownExpression
-    | | | |-test
+    | | | |-UnknownExpression
+    | | | | `-test
     | | | |-(
     | | | `-)
     | | `-;
     | |-else
     | `-ExpressionStatement
     |   |-UnknownExpression
-    |   | |-test
+    |   | |-UnknownExpression
+    |   | | `-test
     |   | |-(
     |   | `-)
     |   `-;
@@ -591,6 +594,237 @@ void test() {
 )txt");
 }
 
+TEST_F(SyntaxTreeTest, BinaryOperator) {
+  expectTreeDumpEqual(
+      R"cpp(
+void test(int a) {
+  1 - 2;
+  1 == 2;
+  a = 1;
+  a <<= 1;
+
+  true || false;
+  true or false;
+
+  1 & 2;
+  1 bitand 2;
+
+  a ^= 3;
+  a xor_eq 3;
+}
+    )cpp",
+      R"txt(
+*: TranslationUnit
+`-SimpleDeclaration
+  |-void
+  |-SimpleDeclarator
+  | |-test
+  | `-ParametersAndQualifiers
+  |   |-(
+  |   |-SimpleDeclaration
+  |   | |-int
+  |   | `-SimpleDeclarator
+  |   |   `-a
+  |   `-)
+  `-CompoundStatement
+    |-{
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-1
+    | | |--
+    | | `-UnknownExpression
+    | |   `-2
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-1
+    | | |-==
+    | | `-UnknownExpression
+    | |   `-2
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-a
+    | | |-=
+    | | `-UnknownExpression
+    | |   `-1
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-a
+    | | |-<<=
+    | | `-UnknownExpression
+    | |   `-1
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-true
+    | | |-||
+    | | `-UnknownExpression
+    | |   `-false
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-true
+    | | |-or
+    | | `-UnknownExpression
+    | |   `-false
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-1
+    | | |-&
+    | | `-UnknownExpression
+    | |   `-2
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-1
+    | | |-bitand
+    | | `-UnknownExpression
+    | |   `-2
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-a
+    | | |-^=
+    | | `-UnknownExpression
+    | |   `-3
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-a
+    | | |-xor_eq
+    | | `-UnknownExpression
+    | |   `-3
+    | `-;
+    `-}
+)txt");
+}
+
+TEST_F(SyntaxTreeTest, NestedBinaryOperator) {
+  expectTreeDumpEqual(
+      R"cpp(
+void test(int a, int b) {
+  (1 + 2) * (4 / 2);
+  a + b + 42;
+  a = b = 42;
+  a + b * 4 + 2;
+  a % 2 + b * 42;
+}
+    )cpp",
+      R"txt(
+*: TranslationUnit
+`-SimpleDeclaration
+  |-void
+  |-SimpleDeclarator
+  | |-test
+  | `-ParametersAndQualifiers
+  |   |-(
+  |   |-SimpleDeclaration
+  |   | |-int
+  |   | `-SimpleDeclarator
+  |   |   `-a
+  |   |-,
+  |   |-SimpleDeclaration
+  |   | |-int
+  |   | `-SimpleDeclarator
+  |   |   `-b
+  |   `-)
+  `-CompoundStatement
+    |-{
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | |-(
+    | | | |-BinaryOperatorExpression
+    | | | | |-UnknownExpression
+    | | | | | `-1
+    | | | | |-+
+    | | | | `-UnknownExpression
+    | | | |   `-2
+    | | | `-)
+    | | |-*
+    | | `-UnknownExpression
+    | |   |-(
+    | |   |-BinaryOperatorExpression
+    | |   | |-UnknownExpression
+    | |   | | `-4
+    | |   | |-/
+    | |   | `-UnknownExpression
+    | |   |   `-2
+    | |   `-)
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-BinaryOperatorExpression
+    | | | |-UnknownExpression
+    | | | | `-a
+    | | | |-+
+    | | | `-UnknownExpression
+    | | |   `-b
+    | | |-+
+    | | `-UnknownExpression
+    | |   `-42
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-a
+    | | |-=
+    | | `-BinaryOperatorExpression
+    | |   |-UnknownExpression
+    | |   | `-b
+    | |   |-=
+    | |   `-UnknownExpression
+    | |     `-42
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-BinaryOperatorExpression
+    | | | |-UnknownExpression
+    | | | | `-a
+    | | | |-+
+    | | | `-BinaryOperatorExpression
+    | | |   |-UnknownExpression
+    | | |   | `-b
+    | | |   |-*
+    | | |   `-UnknownExpression
+    | | |     `-4
+    | | |-+
+    | | `-UnknownExpression
+    | |   `-2
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-BinaryOperatorExpression
+    | | | |-UnknownExpression
+    | | | | `-a
+    | | | |-%
+    | | | `-UnknownExpression
+    | | |   `-2
+    | | |-+
+    | | `-BinaryOperatorExpression
+    | |   |-UnknownExpression
+    | |   | `-b
+    | |   |-*
+    | |   `-UnknownExpression
+    | |     `-42
+    | `-;
+    `-}
+)txt");
+}
+
 TEST_F(SyntaxTreeTest, MultipleDeclaratorsGrouping) {
   expectTreeDumpEqual(
       R"cpp(
@@ -1201,10 +1435,12 @@ void test() {
     |-IfStatement
     | |-I: if
     | |-I: (
-    | |-I: UnknownExpression
-    | | |-I: 1
+    | |-I: BinaryOperatorExpression
+    | | |-I: UnknownExpression
+    | | | `-I: 1
     | | |-I: +
-    | | `-I: 1
+    | | `-I: UnknownExpression
+    | |   `-I: 1
     | |-I: )
     | |-I: CompoundStatement
     | | |-I: {
@@ -1312,13 +1548,17 @@ void f(int xs[static 10]);
 | | | `-]
 | | |-=
 | | `-UnknownExpression
-| |   |-{
-| |   |-1
-| |   |-,
-| |   |-2
-| |   |-,
-| |   |-3
-| |   `-}
+| |   `-UnknownExpression
+| |     |-{
+| |     |-UnknownExpression
+| |     | `-1
+| |     |-,
+| |     |-UnknownExpression
+| |     | `-2
+| |     |-,
+| |     |-UnknownExpression
+| |     | `-3
+| |     `-}
 | `-;
 `-SimpleDeclaration
   |-void
@@ -1628,7 +1868,8 @@ const int const *const *volatile b;
 | | |-=
 | | `-UnknownExpression
 | |   |--
-| |   `-1
+| |   `-UnknownExpression
+| |     `-1
 | `-;
 |-SimpleDeclaration
 | |-int

From 8aaabadeced32a1cd959a5b1524b9c927e82bcc0 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 11:27:57 +0100
Subject: [PATCH 084/770] [CostModel] Unify getCastInstrCost

Add the remaining cast instruction opcodes to the base implementation
of getUserCost and directly return the result. This allows
getInstructionThroughput to return getUserCost for the casts. This
has required changes to PPC and SystemZ because they implement
getUserCost and/or getCastInstrCost with adjustments for vector
operations. Adjustments have also been made in the remaining backends
that implement the method so that they still produce a cost of zero
or one for cost kinds other than throughput.

Differential Revision: https://reviews.llvm.org/D79848
---
 .../llvm/Analysis/TargetTransformInfoImpl.h   | 14 ++++----
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  6 ++--
 .../AArch64/AArch64TargetTransformInfo.cpp    | 13 ++++++--
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 30 ++++++++++-------
 .../Hexagon/HexagonTargetTransformInfo.cpp    |  6 +++-
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |  9 +++--
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  6 ++++
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 33 +++++++++++--------
 8 files changed, 76 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 60de70dcb16a0..bd8d29cb22a12 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -826,18 +826,18 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
       return TTI::TCC_Expensive;
     case Instruction::IntToPtr:
     case Instruction::PtrToInt:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
     case Instruction::Trunc:
+    case Instruction::FPTrunc:
     case Instruction::BitCast:
-      if (TargetTTI->getCastInstrCost(Opcode, Ty, OpTy, CostKind, I) ==
-          TTI::TCC_Free)
-        return TTI::TCC_Free;
-      break;
     case Instruction::FPExt:
     case Instruction::SExt:
     case Instruction::ZExt:
-      if (TargetTTI->getCastInstrCost(Opcode, Ty, OpTy, CostKind, I) == TTI::TCC_Free)
-        return TTI::TCC_Free;
-      break;
+    case Instruction::AddrSpaceCast:
+      return TargetTTI->getCastInstrCost(Opcode, Ty, OpTy, CostKind, I);
     }
     // By default, just classify everything as 'basic'.
     return TTI::TCC_Basic;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 86952a5ad6592..a14199515faf5 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1325,10 +1325,8 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast:
-  case Instruction::AddrSpaceCast: {
-    Type *SrcTy = I->getOperand(0)->getType();
-    return getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, CostKind, I);
-  }
+  case Instruction::AddrSpaceCast:
+    return getUserCost(I, CostKind);
   case Instruction::ExtractElement: {
     const ExtractElementInst *EEI = cast<ExtractElementInst>(I);
     ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1324945c4d4ea..f0961646c31ff 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -295,11 +295,18 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     }
   }
 
+  // TODO: Allow non-throughput costs that aren't binary.
+  auto AdjustCost = [&CostKind](int Cost) {
+    if (CostKind != TTI::TCK_RecipThroughput)
+      return Cost == 0 ? 0 : 1;
+    return Cost;
+  };
+
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
 
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
 
   static const TypeConversionCostTblEntry
   ConversionTbl[] = {
@@ -401,9 +408,9 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                  DstTy.getSimpleVT(),
                                                  SrcTy.getSimpleVT()))
-    return Entry->Cost;
+    return AdjustCost(Entry->Cost);
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+  return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
 }
 
 int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1ca74bfc3df08..c1af19727ba2b 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -173,6 +173,13 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  // TODO: Allow non-throughput costs that aren't binary.
+  auto AdjustCost = [&CostKind](int Cost) {
+    if (CostKind != TTI::TCK_RecipThroughput)
+      return Cost == 0 ? 0 : 1;
+    return Cost;
+  };
+
   // Single to/from double precision conversions.
   static const CostTblEntry NEONFltDblTbl[] = {
     // Vector fptrunc/fpext conversions.
@@ -185,14 +192,14 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                           ISD == ISD::FP_EXTEND)) {
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
     if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
-      return LT.first * Entry->Cost;
+      return AdjustCost(LT.first * Entry->Cost);
   }
 
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
 
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
 
   // The extend of a load is free
   if (I && isa<LoadInst>(I->getOperand(0))) {
@@ -212,7 +219,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     };
     if (const auto *Entry = ConvertCostTableLookup(
             LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
 
     static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
@@ -226,7 +233,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       if (const auto *Entry =
               ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
-        return Entry->Cost;
+        return AdjustCost(Entry->Cost);
     }
   }
 
@@ -253,7 +260,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                              DstTy.getSimpleVT(),
                                              SrcTy.getSimpleVT())) {
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
     }
   }
 
@@ -347,7 +354,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                    DstTy.getSimpleVT(),
                                                    SrcTy.getSimpleVT()))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
   // Scalar float to integer conversions.
@@ -377,7 +384,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                    DstTy.getSimpleVT(),
                                                    SrcTy.getSimpleVT()))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
   // Scalar integer to float conversions.
@@ -408,7 +415,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                    ISD, DstTy.getSimpleVT(),
                                                    SrcTy.getSimpleVT()))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
   // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
@@ -433,7 +440,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                    ISD, DstTy.getSimpleVT(),
                                                    SrcTy.getSimpleVT()))
-      return Entry->Cost * ST->getMVEVectorCostFactor();
+      return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
   }
 
   // Scalar integer conversion costs.
@@ -452,13 +459,14 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                    DstTy.getSimpleVT(),
                                                    SrcTy.getSimpleVT()))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                      ? ST->getMVEVectorCostFactor()
                      : 1;
-  return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+  return AdjustCost(
+    BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
 }
 
 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 92e32ca99090e..381941df2fb46 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -263,7 +263,11 @@ unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
 
     std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy);
     std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy);
-    return std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
+    unsigned Cost = std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
+    // TODO: Allow non-throughput costs that aren't binary.
+    if (CostKind != TTI::TCK_RecipThroughput)
+      return Cost == 0 ? 0 : 1;
+    return Cost;
   }
   return 1;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 002905febbc8b..a41c6b41a991b 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -212,7 +212,8 @@ int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
 unsigned
 PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
                         TTI::TargetCostKind CostKind) {
-  if (U->getType()->isVectorTy()) {
+  // We already implement getCastInstrCost and perform the vector adjustment there.
+  if (!isa<CastInst>(U) && U->getType()->isVectorTy()) {
     // Instructions that need to be split should cost more.
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
     return LT.first * BaseT::getUserCost(U, Operands, CostKind);
@@ -760,7 +761,11 @@ int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
   int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
-  return vectorCostAdjustment(Cost, Opcode, Dst, Src);
+  Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src);
+  // TODO: Allow non-throughput costs that aren't binary.
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return Cost == 0 ? 0 : 1;
+  return Cost;
 }
 
 int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index d9efb40f0ab65..bce02cc793bf6 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -691,6 +691,12 @@ getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
 int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I) {
+  // FIXME: Can the logic below also be used for these cost kinds?
+  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
+    int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+    return BaseCost == 0 ? BaseCost : 1;
+  }
+
   unsigned DstScalarBits = Dst->getScalarSizeInBits();
   unsigned SrcScalarBits = Src->getScalarSizeInBits();
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 4170b102f2b31..6bfcadeaf8b67 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1368,6 +1368,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  // TODO: Allow non-throughput costs that aren't binary.
+  auto AdjustCost = [&CostKind](int Cost) {
+    if (CostKind != TTI::TCK_RecipThroughput)
+      return Cost == 0 ? 0 : 1;
+    return Cost;
+  };
+
   // FIXME: Need a better design of the cost table to handle non-simple types of
   // potential massive combinations (elem_num x src_type x dst_type).
 
@@ -1969,7 +1976,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   if (ST->hasSSE2() && !ST->hasAVX()) {
     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                    LTDest.second, LTSrc.second))
-      return LTSrc.first * Entry->Cost;
+      return AdjustCost(LTSrc.first * Entry->Cost);
   }
 
   EVT SrcTy = TLI->getValueType(DL, Src);
@@ -1977,7 +1984,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 
   // The function getSimpleVT only handles simple value types.
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind);
+    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind));
 
   MVT SimpleSrcTy = SrcTy.getSimpleVT();
   MVT SimpleDstTy = DstTy.getSimpleVT();
@@ -1986,59 +1993,59 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     if (ST->hasBWI())
       if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return Entry->Cost;
+        return AdjustCost(Entry->Cost);
 
     if (ST->hasDQI())
       if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return Entry->Cost;
+        return AdjustCost(Entry->Cost);
 
     if (ST->hasAVX512())
       if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
                                                      SimpleDstTy, SimpleSrcTy))
-        return Entry->Cost;
+        return AdjustCost(Entry->Cost);
   }
 
   if (ST->hasBWI())
     if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
                                                    SimpleDstTy, SimpleSrcTy))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
 
   if (ST->hasDQI())
     if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
                                                    SimpleDstTy, SimpleSrcTy))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
 
   if (ST->hasAVX512())
     if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
                                                    SimpleDstTy, SimpleSrcTy))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
 
   if (ST->hasAVX2()) {
     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                                    SimpleDstTy, SimpleSrcTy))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
   if (ST->hasAVX()) {
     if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
                                                    SimpleDstTy, SimpleSrcTy))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
   if (ST->hasSSE41()) {
     if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                    SimpleDstTy, SimpleSrcTy))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
   if (ST->hasSSE2()) {
     if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                    SimpleDstTy, SimpleSrcTy))
-      return Entry->Cost;
+      return AdjustCost(Entry->Cost);
   }
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+  return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
 }
 
 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,

From 6f5431846bbf3270d8fc605324e8843c5aaf579b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kirst=C3=B3f=20Umann?= <dkszelethus@gmail.com>
Date: Mon, 13 Apr 2020 15:57:18 +0200
Subject: [PATCH 085/770] [analyzer][RetainCount] Remove the CheckOSObject
 option

As per http://lists.llvm.org/pipermail/cfe-dev/2019-August/063215.html, let's get rid of this option.

It presents 2 issues that have bugged me for years now:

* OSObject is NOT a boolean option. It in fact has 3 states:
  * osx.OSObjectRetainCount is enabled but OSObject is set to false: RetainCount
    regards the option as disabled.
  * osx.OSObjectRetainCount is enabled and OSObject is set to true: RetainCount
    regards the option as enabled.
  * osx.OSObjectRetainCount is disabled: RetainCount regards the option as
    disabled.
* The hack involves directly modifying AnalyzerOptions::ConfigTable, which
  shouldn't even be public in the first place.

This still isn't really ideal, because it would be better to preserve the option
and remove the checker (we want visible checkers to be associated with
diagnostics, and hidden options like this one to be associated with changing how
the modeling is done), but backwards compatibility is an issue.

Differential Revision: https://reviews.llvm.org/D78097
---
 .../clang/StaticAnalyzer/Checkers/Checkers.td |  9 -------
 .../RetainCountChecker/RetainCountChecker.cpp | 24 +++----------------
 clang/test/Analysis/analyzer-config.c         |  1 -
 .../Analysis/test-separate-retaincount.cpp    |  4 ----
 4 files changed, 3 insertions(+), 35 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index f0ad8326929e5..bc4b7d00e2d40 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1094,15 +1094,6 @@ def NSErrorChecker : Checker<"NSError">,
 def RetainCountChecker : Checker<"RetainCount">,
   HelpText<"Check for leaks and improper reference count management">,
   CheckerOptions<[
-    CmdLineOption<Boolean,
-                  "CheckOSObject",
-                  "Find violations of retain-release rules applied to XNU "
-                  "OSObject instances. By default, the checker only checks "
-                  "retain-release rules for Objective-C NSObject instances "
-                  "and CoreFoundation objects.",
-                  "true",
-                  InAlpha,
-                  Hide>,
     CmdLineOption<Boolean,
                   "TrackNSCFStartParam",
                   "Check not only that the code follows retain-release rules "
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
index 4bf9beb365f66..280d511e87c56 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
@@ -1481,26 +1481,11 @@ bool ento::shouldRegisterRetainCountBase(const CheckerManager &mgr) {
   return true;
 }
 
-// FIXME: remove this, hack for backwards compatibility:
-// it should be possible to enable the NS/CF retain count checker as
-// osx.cocoa.RetainCount, and it should be possible to disable
-// osx.OSObjectRetainCount using osx.cocoa.RetainCount:CheckOSObject=false.
-static bool getOption(const AnalyzerOptions &Options,
-                      StringRef Postfix,
-                      StringRef Value) {
-  auto I = Options.Config.find(
-    (StringRef("osx.cocoa.RetainCount:") + Postfix).str());
-  if (I != Options.Config.end())
-    return I->getValue() == Value;
-  return false;
-}
-
 void ento::registerRetainCountChecker(CheckerManager &Mgr) {
   auto *Chk = Mgr.getChecker<RetainCountChecker>();
   Chk->TrackObjCAndCFObjects = true;
-  Chk->TrackNSCFStartParam = getOption(Mgr.getAnalyzerOptions(),
-                                       "TrackNSCFStartParam",
-                                       "true");
+  Chk->TrackNSCFStartParam = Mgr.getAnalyzerOptions().getCheckerBooleanOption(
+      Mgr.getCurrentCheckerName(), "TrackNSCFStartParam");
 }
 
 bool ento::shouldRegisterRetainCountChecker(const CheckerManager &mgr) {
@@ -1509,10 +1494,7 @@ bool ento::shouldRegisterRetainCountChecker(const CheckerManager &mgr) {
 
 void ento::registerOSObjectRetainCountChecker(CheckerManager &Mgr) {
   auto *Chk = Mgr.getChecker<RetainCountChecker>();
-  if (!getOption(Mgr.getAnalyzerOptions(),
-                 "CheckOSObject",
-                 "false"))
-    Chk->TrackOSObjects = true;
+  Chk->TrackOSObjects = true;
 }
 
 bool ento::shouldRegisterOSObjectRetainCountChecker(const CheckerManager &mgr) {
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 7784673873821..cb3d40688e91a 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -99,7 +99,6 @@
 // CHECK-NEXT: optin.osx.cocoa.localizability.NonLocalizedStringChecker:AggressiveReport = false
 // CHECK-NEXT: optin.performance.Padding:AllowedPad = 24
 // CHECK-NEXT: osx.NumberObjectConversion:Pedantic = false
-// CHECK-NEXT: osx.cocoa.RetainCount:CheckOSObject = true
 // CHECK-NEXT: osx.cocoa.RetainCount:TrackNSCFStartParam = false
 // CHECK-NEXT: prune-paths = true
 // CHECK-NEXT: region-store-small-struct-limit = 2
diff --git a/clang/test/Analysis/test-separate-retaincount.cpp b/clang/test/Analysis/test-separate-retaincount.cpp
index 621e1d120bbb2..41efad452e5ac 100644
--- a/clang/test/Analysis/test-separate-retaincount.cpp
+++ b/clang/test/Analysis/test-separate-retaincount.cpp
@@ -5,10 +5,6 @@
 // RUN: %clang_analyze_cc1 -std=c++14 -DNO_OS_OBJECT -verify %s \
 // RUN:   -analyzer-checker=core,osx \
 // RUN:   -analyzer-disable-checker osx.OSObjectRetainCount
-//
-// RUN: %clang_analyze_cc1 -std=c++14 -DNO_OS_OBJECT -verify %s \
-// RUN:   -analyzer-checker=core,osx \
-// RUN:   -analyzer-config "osx.cocoa.RetainCount:CheckOSObject=false"
 
 #include "os_object_base.h"
 

From bd9dce8f9acd710ed62bab44ad3563209503cd72 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 12:17:26 +0100
Subject: [PATCH 086/770] [CostModel] getUserCost for intrinsic throughput

Last part of recommitting 'Unify Intrinsic Costs'
259eb619ff6dcd5b6111d1686e18559b9ca004d4. This patch now uses
getUserCost from getInstructionThroughput.

Differential Revision: https://reviews.llvm.org/D80012
---
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 8 ++++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp            | 9 +++------
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index bd8d29cb22a12..f98b8bf7da2c9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -776,6 +776,14 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
                   TTI::TargetCostKind CostKind) {
     auto *TargetTTI = static_cast<T *>(this);
 
+    // FIXME: We shouldn't have to special-case intrinsics here.
+    if (CostKind == TTI::TCK_RecipThroughput) {
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+        IntrinsicCostAttributes CostAttrs(*II);
+        return TargetTTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+      }
+    }
+
     // FIXME: Unlikely to be true for anything but CodeSize.
     if (const auto *CB = dyn_cast<CallBase>(U)) {
       const Function *F = CB->getCalledFunction();
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a14199515faf5..9f319c40ae6a7 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -260,7 +260,8 @@ int TargetTransformInfo::getUserCost(const User *U,
                                      ArrayRef<const Value *> Operands,
                                      enum TargetCostKind CostKind) const {
   int Cost = TTIImpl->getUserCost(U, Operands, CostKind);
-  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  assert((CostKind == TTI::TCK_RecipThroughput || Cost >= 0) &&
+         "TTI should not produce negative costs!");
   return Cost;
 }
 
@@ -1419,11 +1420,7 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
     return TTIImpl->getShuffleCost(SK_PermuteTwoSrc, Ty, 0, nullptr);
   }
   case Instruction::Call:
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      IntrinsicCostAttributes CostAttrs(*II);
-      return getIntrinsicInstrCost(CostAttrs, CostKind);
-    }
-    return -1;
+    return getUserCost(I, CostKind);
   default:
     // We don't have any information on this instruction.
     return -1;

From 4b7812116d513a66fb5fb3c83e7d8be08c1efc65 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 26 May 2020 12:32:29 +0100
Subject: [PATCH 087/770] MachineInstr.h - remove unnecessary MachineMemOperand
 forward declaration. NFC.

We already have to include MachineMemOperand.h
---
 llvm/include/llvm/CodeGen/MachineInstr.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 48db14e6cd695..1c841155e6434 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -42,7 +42,6 @@ class DIExpression;
 class DILocalVariable;
 class MachineBasicBlock;
 class MachineFunction;
-class MachineMemOperand;
 class MachineRegisterInfo;
 class ModuleSlotTracker;
 class raw_ostream;

From 0d52a7d038e189770984594a6ca71bea50fee4d9 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 20 May 2020 11:55:28 +0000
Subject: [PATCH 088/770] [libc][NFC] Simplify memcpy implementation

Summary: This is an NFC change; it aims at simplifying both the code and the build files.

Reviewers: abrachet, sivachandra

Subscribers: mgorny, tschuett, ecnelises, libc-commits, courbet

Tags: #libc-project

Differential Revision: https://reviews.llvm.org/D80291
---
 libc/src/string/CMakeLists.txt                | 21 +----
 libc/src/string/memcpy.cpp                    | 22 -----
 libc/src/string/memcpy_arch_specific.h.def    | 65 -------------
 libc/src/string/memory_utils/memcpy_utils.h   | 18 ++--
 libc/src/string/x86/memcpy.cpp                | 94 +++++++++++++++++++
 .../src/string/x86/memcpy_arch_specific.h.inc | 35 -------
 .../string/memory_utils/memcpy_utils_test.cpp | 36 +++----
 7 files changed, 125 insertions(+), 166 deletions(-)
 delete mode 100644 libc/src/string/memcpy.cpp
 delete mode 100644 libc/src/string/memcpy_arch_specific.h.def
 create mode 100644 libc/src/string/x86/memcpy.cpp
 delete mode 100644 libc/src/string/x86/memcpy_arch_specific.h.inc

diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 3fe0d861aea36..cd3a9b5f77b3f 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -68,30 +68,17 @@ endfunction()
 
 # include the relevant architecture specific implementations
 if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
-  set(LIBC_STRING_TARGET_FOLDER "x86")
+  set(LIBC_STRING_TARGET_ARCH "x86")
 else()
-  set(LIBC_STRING_TARGET_FOLDER ${LIBC_TARGET_MACHINE})
+  set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
 endif()
 
-add_gen_header(
-  memcpy_arch_specific
-  DEF_FILE
-    memcpy_arch_specific.h.def
-  GEN_HDR
-    memcpy_arch_specific.h
-  PARAMS
-    memcpy_arch_specific=${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
-  DATA_FILES
-    ${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
-)
-
 function(add_memcpy memcpy_name)
   add_implementation(memcpy ${memcpy_name}
-    SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
+    SRCS ${LIBC_SOURCE_DIR}/src/string/${LIBC_STRING_TARGET_ARCH}/memcpy.cpp
     HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
     DEPENDS
       .memory_utils.memory_utils
-      .memcpy_arch_specific
       libc.include.string
     COMPILE_OPTIONS
       -fno-builtin-memcpy
@@ -144,4 +131,4 @@ add_bzero(bzero MARCH native)
 # Add all other relevant implementations for the native target.
 # ------------------------------------------------------------------------------
 
-include(${LIBC_STRING_TARGET_FOLDER}/CMakeLists.txt)
+include(${LIBC_STRING_TARGET_ARCH}/CMakeLists.txt)
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
deleted file mode 100644
index 2dee707bdc4e9..0000000000000
--- a/libc/src/string/memcpy.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===-- Implementation of memcpy ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memcpy.h"
-#include "src/__support/common.h"
-#include "src/string/memcpy_arch_specific.h"
-
-namespace __llvm_libc {
-
-void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
-                                   const void *__restrict src, size_t size) {
-  memcpy_no_return(reinterpret_cast<char *>(dst),
-                   reinterpret_cast<const char *>(src), size);
-  return dst;
-}
-
-} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
deleted file mode 100644
index 8b991e8040007..0000000000000
--- a/libc/src/string/memcpy_arch_specific.h.def
+++ /dev/null
@@ -1,65 +0,0 @@
-//===-- Implementation of arch specific memcpy ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
-
-%%include_file(${memcpy_arch_specific})
-
-namespace __llvm_libc {
-
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
-                             size_t count) {
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<1>(dst, src);
-  if (count == 2)
-    return Copy<2>(dst, src);
-  if (count == 3)
-    return Copy<3>(dst, src);
-  if (count == 4)
-    return Copy<4>(dst, src);
-  if (count < 8)
-    return CopyOverlap<4>(dst, src, count);
-  if (count == 8)
-    return Copy<8>(dst, src);
-  if (count < 16)
-    return CopyOverlap<8>(dst, src, count);
-  if (count == 16)
-    return Copy<16>(dst, src);
-  if (count < 32)
-    return CopyOverlap<16>(dst, src, count);
-  if (count < 64)
-    return CopyOverlap<32>(dst, src, count);
-  if (count < 128)
-    return CopyOverlap<64>(dst, src, count);
-  CopyGE128(dst, src, count);
-}
-
-} // namespace __llvm_libc
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
index 09e379393cf24..a0e5ccc81c9e1 100644
--- a/libc/src/string/memory_utils/memcpy_utils.h
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -32,7 +32,7 @@ extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
 
 // Copies `kBlockSize` bytes from `src` to `dst`.
 template <size_t kBlockSize>
-static void Copy(char *__restrict dst, const char *__restrict src) {
+static void CopyBlock(char *__restrict dst, const char *__restrict src) {
 #if defined(LLVM_LIBC_MEMCPY_MONITOR)
   LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
 #elif defined(USE_BUILTIN_MEMCPY_INLINE)
@@ -52,7 +52,7 @@ template <size_t kBlockSize>
 static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
                           size_t count) {
   const size_t offset = count - kBlockSize;
-  Copy<kBlockSize>(dst + offset, src + offset);
+  CopyBlock<kBlockSize>(dst + offset, src + offset);
 }
 
 // Copies `kBlockSize` bytes twice with an overlap between the two.
@@ -64,9 +64,9 @@ static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
 //
 // Precondition: `count >= kBlockSize && count <= kBlockSize`.
 template <size_t kBlockSize>
-static void CopyOverlap(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  Copy<kBlockSize>(dst, src);
+static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  CopyBlock<kBlockSize>(dst, src);
   CopyLastBlock<kBlockSize>(dst, src, count);
 }
 
@@ -85,14 +85,14 @@ static void CopyOverlap(char *__restrict dst, const char *__restrict src,
 // Precondition: `count > 2 * kBlockSize` for efficiency.
 //               `count >= kBlockSize` for correctness.
 template <size_t kBlockSize>
-static void CopyAligned(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  Copy<kBlockSize>(dst, src); // Copy first block
+static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
+                              size_t count) {
+  CopyBlock<kBlockSize>(dst, src); // Copy first block
 
   // Copy aligned blocks
   size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
   for (; offset + kBlockSize < count; offset += kBlockSize)
-    Copy<kBlockSize>(dst + offset, src + offset);
+    CopyBlock<kBlockSize>(dst + offset, src + offset);
 
   CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
 }
diff --git a/libc/src/string/x86/memcpy.cpp b/libc/src/string/x86/memcpy.cpp
new file mode 100644
index 0000000000000..811ce5183fe4e
--- /dev/null
+++ b/libc/src/string/x86/memcpy.cpp
@@ -0,0 +1,94 @@
+//===-- Implementation of memcpy ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC support with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+#if defined(__AVX__)
+#define BEST_SIZE 64
+#else
+#define BEST_SIZE 32
+#endif
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+static void memcpy_x86(char *__restrict dst, const char *__restrict src,
+                       size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return CopyBlock<1>(dst, src);
+  if (count == 2)
+    return CopyBlock<2>(dst, src);
+  if (count == 3)
+    return CopyBlock<3>(dst, src);
+  if (count == 4)
+    return CopyBlock<4>(dst, src);
+  if (count < 8)
+    return CopyBlockOverlap<4>(dst, src, count);
+  if (count == 8)
+    return CopyBlock<8>(dst, src);
+  if (count < 16)
+    return CopyBlockOverlap<8>(dst, src, count);
+  if (count == 16)
+    return CopyBlock<16>(dst, src);
+  if (count < 32)
+    return CopyBlockOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyBlockOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyBlockOverlap<64>(dst, src, count);
+#if defined(__AVX__)
+  if (count < 256)
+    return CopyBlockOverlap<128>(dst, src, count);
+#endif
+  // kRepMovsBSize == -1 : Only CopyAlignedBlocks is used.
+  // kRepMovsBSize ==  0 : Only CopyRepMovsb is used.
+  // else CopyAlignedBlocks is used up to kRepMovsBSize and then CopyRepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAlignedBlocks<BEST_SIZE>(dst, src, count);
+  return CopyRepMovsb(dst, src, count);
+}
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_x86(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src),
+             size);
+  return dst;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/string/x86/memcpy_arch_specific.h.inc b/libc/src/string/x86/memcpy_arch_specific.h.inc
deleted file mode 100644
index 60610d4c73d25..0000000000000
--- a/libc/src/string/x86/memcpy_arch_specific.h.inc
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "src/string/memory_utils/memcpy_utils.h"
-
-namespace __llvm_libc {
-
-static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
-                         size_t count) {
-  // FIXME: Add MSVC support with
-  // #include <intrin.h>
-  // __movsb(reinterpret_cast<unsigned char *>(dst),
-  //         reinterpret_cast<const unsigned char *>(src), count);
-  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
-}
-
-#if defined(__AVX__)
-#define BEST_SIZE 64
-#else
-#define BEST_SIZE 32
-#endif
-
-static void CopyGE128(char *__restrict dst, const char *__restrict src,
-                      size_t count) {
-#if defined(__AVX__)
-  if (count < 256)
-    return CopyOverlap<128>(dst, src, count);
-#endif
-  // kRepMovsBSize == -1 : Only CopyAligned is used.
-  // kRepMovsBSize ==  0 : Only RepMovsb is used.
-  // else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
-  constexpr size_t kRepMovsBSize = -1;
-  if (count <= kRepMovsBSize)
-    return CopyAligned<BEST_SIZE>(dst, src, count);
-  CopyRepMovsb(dst, src, count);
-}
-
-} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
index 491c632216b7a..7e32fb4f3080a 100644
--- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -83,37 +83,37 @@ TEST(MemcpyUtilsTest, CopyTrivial) {
   auto &trace = GetTrace();
 
   trace.Clear();
-  Copy<1>(I(0), I(0));
+  CopyBlock<1>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1");
   EXPECT_STREQ(trace.Read(), "1");
 
   trace.Clear();
-  Copy<2>(I(0), I(0));
+  CopyBlock<2>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11");
   EXPECT_STREQ(trace.Read(), "11");
 
   trace.Clear();
-  Copy<4>(I(0), I(0));
+  CopyBlock<4>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1111");
   EXPECT_STREQ(trace.Read(), "1111");
 
   trace.Clear();
-  Copy<8>(I(0), I(0));
+  CopyBlock<8>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11111111");
   EXPECT_STREQ(trace.Read(), "11111111");
 
   trace.Clear();
-  Copy<16>(I(0), I(0));
+  CopyBlock<16>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1111111111111111");
   EXPECT_STREQ(trace.Read(), "1111111111111111");
 
   trace.Clear();
-  Copy<32>(I(0), I(0));
+  CopyBlock<32>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
   EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
 
   trace.Clear();
-  Copy<64>(I(0), I(0));
+  CopyBlock<64>(I(0), I(0));
   EXPECT_STREQ(
       trace.Write(),
       "1111111111111111111111111111111111111111111111111111111111111111");
@@ -126,41 +126,41 @@ TEST(MemcpyUtilsTest, CopyOffset) {
   auto &trace = GetTrace();
 
   trace.Clear();
-  Copy<1>(I(3), I(1));
+  CopyBlock<1>(I(3), I(1));
   EXPECT_STREQ(trace.Write(), "0001");
   EXPECT_STREQ(trace.Read(), "01");
 
   trace.Clear();
-  Copy<1>(I(2), I(1));
+  CopyBlock<1>(I(2), I(1));
   EXPECT_STREQ(trace.Write(), "001");
   EXPECT_STREQ(trace.Read(), "01");
 }
 
-TEST(MemcpyUtilsTest, CopyOverlap) {
+TEST(MemcpyUtilsTest, CopyBlockOverlap) {
   auto &trace = GetTrace();
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 2);
+  CopyBlockOverlap<2>(I(0), I(0), 2);
   EXPECT_STREQ(trace.Write(), "22");
   EXPECT_STREQ(trace.Read(), "22");
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 3);
+  CopyBlockOverlap<2>(I(0), I(0), 3);
   EXPECT_STREQ(trace.Write(), "121");
   EXPECT_STREQ(trace.Read(), "121");
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 4);
+  CopyBlockOverlap<2>(I(0), I(0), 4);
   EXPECT_STREQ(trace.Write(), "1111");
   EXPECT_STREQ(trace.Read(), "1111");
 
   trace.Clear();
-  CopyOverlap<4>(I(2), I(1), 7);
+  CopyBlockOverlap<4>(I(2), I(1), 7);
   EXPECT_STREQ(trace.Write(), "001112111");
   EXPECT_STREQ(trace.Read(), "01112111");
 }
 
-TEST(MemcpyUtilsTest, CopyAligned) {
+TEST(MemcpyUtilsTest, CopyAlignedBlocks) {
   auto &trace = GetTrace();
   // Destination is aligned already.
   //   "1111000000000"
@@ -169,7 +169,7 @@ TEST(MemcpyUtilsTest, CopyAligned) {
   // + "0000000001111"
   // = "1111111112221"
   trace.Clear();
-  CopyAligned<4>(I(0), I(0), 13);
+  CopyAlignedBlocks<4>(I(0), I(0), 13);
   EXPECT_STREQ(trace.Write(), "1111111112221");
   EXPECT_STREQ(trace.Read(), "1111111112221");
 
@@ -180,7 +180,7 @@ TEST(MemcpyUtilsTest, CopyAligned) {
   // + "00000000001111"
   // = "01112111112211"
   trace.Clear();
-  CopyAligned<4>(I(1), I(0), 13);
+  CopyAlignedBlocks<4>(I(1), I(0), 13);
   EXPECT_STREQ(trace.Write(), "01112111112211");
   EXPECT_STREQ(trace.Read(), "1112111112211");
 }
@@ -191,7 +191,7 @@ TEST(MemcpyUtilsTest, MaxReloads) {
     for (size_t count = 64; count < 768; ++count) {
       trace.Clear();
       // We should never reload more than twice when copying from count = 2x32.
-      CopyAligned<32>(I(alignment), I(0), count);
+      CopyAlignedBlocks<32>(I(alignment), I(0), count);
       const char *const written = trace.Write();
       // First bytes are untouched.
       for (size_t i = 0; i < alignment; ++i)

From c1c9eb0ab7d20e61f0fb345a60694bda0487c0da Mon Sep 17 00:00:00 2001
From: Yi Kong <yikong@google.com>
Date: Tue, 26 May 2020 19:13:10 +0800
Subject: [PATCH 089/770] [Transforms] Check validity of profile reader before
 invoking it

Although an invalid sampling profile would fail the compilation anyway,
this avoids crashing the compiler.
---
 llvm/lib/Transforms/IPO/SampleProfile.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index eb0747fde6d34..697341443273a 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1843,9 +1843,9 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
 
 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
                                       ProfileSummaryInfo *_PSI, CallGraph *CG) {
-  GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
   if (!ProfileIsValid)
     return false;
+  GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
 
   PSI = _PSI;
   if (M.getProfileSummary(/* IsCS */ false) == nullptr)

From f368040c14f4bdac718798db28299a68adc42695 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 26 May 2020 07:56:50 -0400
Subject: [PATCH 090/770] [DAGCombiner] try to move splat after binop with
 splat constant

binop (splat X), (splat C) --> splat (binop X, C)
binop (splat C), (splat X) --> splat (binop C, X)

We do this in IR, and there's a similar fold for the case with 2
non-constant operands just above the code diff in this patch.

This was discussed in D79718, and the extra shuffle in the test
(llvm/test/CodeGen/X86/vector-fshl-128.ll::sink_splatvar) where it
was noticed disappears because demanded elements analysis is no
longer blocked. The large majority of the test diffs seem to be
benign code scheduling changes, but I do see another type of win:
moving the splat later allows binop narrowing in some cases.

Regressions were avoided on x86 and ARM with the INSERT_VECTOR_ELT
restriction.

Differential Revision: https://reviews.llvm.org/D79886
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  30 +-
 llvm/test/CodeGen/X86/vector-fshl-128.ll      | 394 +++++++++--------
 llvm/test/CodeGen/X86/vector-fshl-256.ll      | 408 ++++++++----------
 llvm/test/CodeGen/X86/vector-fshl-512.ll      | 218 +++++-----
 llvm/test/CodeGen/X86/vector-fshl-rot-128.ll  | 110 ++---
 llvm/test/CodeGen/X86/vector-fshr-128.ll      | 344 ++++++++-------
 llvm/test/CodeGen/X86/vector-fshr-256.ll      | 408 ++++++++----------
 llvm/test/CodeGen/X86/vector-fshr-512.ll      | 216 +++++-----
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll  |  96 ++---
 llvm/test/CodeGen/X86/vector-fshr-rot-256.ll  |  66 ++-
 llvm/test/CodeGen/X86/vector-rotate-128.ll    | 110 ++---
 .../test/CodeGen/X86/vector-shift-ashr-128.ll |   8 +-
 .../test/CodeGen/X86/vector-shift-lshr-128.ll |   4 +-
 13 files changed, 1099 insertions(+), 1313 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 40ceb5b34ad39..7e41b2fffeda1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20474,6 +20474,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
   SDValue Ops[] = {LHS, RHS};
   EVT VT = N->getValueType(0);
   unsigned Opcode = N->getOpcode();
+  SDNodeFlags Flags = N->getFlags();
 
   // See if we can constant fold the vector operation.
   if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
@@ -20497,10 +20498,37 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
         (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
       SDLoc DL(N);
       SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
-                                     RHS.getOperand(0), N->getFlags());
+                                     RHS.getOperand(0), Flags);
       SDValue UndefV = LHS.getOperand(1);
       return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
     }
+
+    // Try to sink a splat shuffle after a binop with a uniform constant.
+    // This is limited to cases where neither the shuffle nor the constant have
+    // undefined elements because that could be poison-unsafe or inhibit
+    // demanded elements analysis. It is further limited to not change a splat
+    // of an inserted scalar because that may be optimized better by
+    // load-folding or other target-specific behaviors.
+    if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
+        Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
+        Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+      // binop (splat X), (splat C) --> splat (binop X, C)
+      SDLoc DL(N);
+      SDValue X = Shuf0->getOperand(0);
+      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
+      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+                                  Shuf0->getMask());
+    }
+    if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
+        Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
+        Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+      // binop (splat C), (splat X) --> splat (binop C, X)
+      SDLoc DL(N);
+      SDValue X = Shuf1->getOperand(0);
+      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
+      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+                                  Shuf1->getMask());
+    }
   }
 
   // The following pattern is likely to emerge with vector reduction ops. Moving
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 144a0457e63e5..0c5e19f24dbe5 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1182,7 +1182,6 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    psllq %xmm2, %xmm3
@@ -1190,6 +1189,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; SSE2-NEXT:    psubq %xmm2, %xmm4
 ; SSE2-NEXT:    psrlq %xmm4, %xmm1
 ; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
@@ -1202,29 +1202,29 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; SSE41-LABEL: splatvar_funnnel_v2i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psllq %xmm4, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    psllq %xmm2, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [64,64]
-; SSE41-NEXT:    psubq %xmm4, %xmm0
+; SSE41-NEXT:    psubq %xmm2, %xmm0
 ; SSE41-NEXT:    psrlq %xmm0, %xmm1
-; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm1, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpeqq %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    movapd %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movapd %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_funnnel_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
@@ -1232,13 +1232,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
@@ -1247,13 +1247,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1262,13 +1262,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VL-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1277,13 +1277,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1293,13 +1293,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1308,13 +1308,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1328,13 +1328,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
@@ -1342,13 +1342,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; XOPAVX2-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
@@ -1356,29 +1356,25 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; X32-SSE-NEXT:    pxor %xmm4, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
+; X32-SSE-NEXT:    pand %xmm4, %xmm5
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [64,0,64,0]
+; X32-SSE-NEXT:    psubq %xmm3, %xmm4
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; X32-SSE-NEXT:    psrlq %xmm4, %xmm1
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT:    psllq %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    psllq %xmm4, %xmm5
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
-; X32-SSE-NEXT:    psubq %xmm2, %xmm3
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X32-SSE-NEXT:    psrlq %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X32-SSE-NEXT:    orpd %xmm5, %xmm1
-; X32-SSE-NEXT:    pxor %xmm3, %xmm3
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pandn %xmm1, %xmm2
-; X32-SSE-NEXT:    por %xmm2, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm3
+; X32-SSE-NEXT:    pand %xmm5, %xmm0
+; X32-SSE-NEXT:    pandn %xmm3, %xmm5
+; X32-SSE-NEXT:    por %xmm5, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
   %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
@@ -1388,8 +1384,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    xorps %xmm4, %xmm4
 ; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
@@ -1401,6 +1396,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; SSE2-NEXT:    movd %ecx, %xmm4
 ; SSE2-NEXT:    psrld %xmm4, %xmm1
 ; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pandn %xmm1, %xmm2
@@ -1410,25 +1406,24 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; SSE41-LABEL: splatvar_funnnel_v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,0,0]
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; SSE41-NEXT:    movdqa %xmm3, %xmm2
-; SSE41-NEXT:    pslld %xmm0, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pslld %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [32,32,32,32]
-; SSE41-NEXT:    psubd %xmm4, %xmm0
+; SSE41-NEXT:    psubd %xmm2, %xmm0
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    psrld %xmm0, %xmm1
-; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm1, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movaps %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_funnnel_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
@@ -1437,6 +1432,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1444,7 +1440,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1454,6 +1449,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX2-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1462,7 +1458,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1472,6 +1467,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1480,7 +1476,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm3, %xmm0, %xmm3
@@ -1489,6 +1484,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VL-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1497,7 +1493,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1507,6 +1502,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1516,7 +1512,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1526,6 +1521,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1534,7 +1530,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm3, %xmm0, %xmm3
@@ -1543,6 +1538,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1556,7 +1552,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; XOPAVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
@@ -1565,6 +1560,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; XOPAVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1572,7 +1568,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1582,6 +1577,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; XOPAVX2-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1589,8 +1585,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v4i32:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm2
 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
 ; X32-SSE-NEXT:    xorps %xmm4, %xmm4
 ; X32-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
@@ -1602,6 +1597,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; X32-SSE-NEXT:    movd %ecx, %xmm4
 ; X32-SSE-NEXT:    psrld %xmm4, %xmm1
 ; X32-SSE-NEXT:    por %xmm5, %xmm1
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; X32-SSE-NEXT:    pcmpeqd %xmm3, %xmm2
 ; X32-SSE-NEXT:    pand %xmm2, %xmm0
 ; X32-SSE-NEXT:    pandn %xmm1, %xmm2
@@ -1615,13 +1611,10 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    psubw %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpeqw %xmm2, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,0,2,3,4,5,6,7]
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
@@ -1630,35 +1623,36 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrlw %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm1, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpeqw %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: splatvar_funnnel_v8i16:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; SSE41-NEXT:    movdqa %xmm3, %xmm2
-; SSE41-NEXT:    psllw %xmm0, %xmm2
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psllw %xmm0, %xmm4
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
-; SSE41-NEXT:    psubw %xmm4, %xmm0
+; SSE41-NEXT:    psubw %xmm2, %xmm0
 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT:    psrlw %xmm0, %xmm1
-; SSE41-NEXT:    por %xmm1, %xmm2
+; SSE41-NEXT:    por %xmm1, %xmm4
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_funnnel_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1667,6 +1661,8 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1674,7 +1670,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1683,6 +1678,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1690,7 +1686,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1699,6 +1694,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1706,7 +1702,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1715,6 +1710,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512VL-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1723,7 +1719,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1732,6 +1727,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1741,7 +1737,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1750,6 +1745,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; AVX512VBMI2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1758,7 +1754,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1767,6 +1762,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %xmm1, %xmm0
@@ -1780,8 +1776,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1790,6 +1784,8 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1797,7 +1793,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
@@ -1806,6 +1801,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1813,13 +1809,10 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; X32-SSE-NEXT:    psubw %xmm2, %xmm3
-; X32-SSE-NEXT:    pxor %xmm4, %xmm4
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm4
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,0,2,3,4,5,6,7]
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
@@ -1828,9 +1821,12 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psrlw %xmm3, %xmm1
 ; X32-SSE-NEXT:    por %xmm5, %xmm1
-; X32-SSE-NEXT:    pand %xmm4, %xmm0
-; X32-SSE-NEXT:    pandn %xmm1, %xmm4
-; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0]
+; X32-SSE-NEXT:    pxor %xmm3, %xmm3
+; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm3
+; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pandn %xmm1, %xmm3
+; X32-SSE-NEXT:    por %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
   %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
@@ -1840,62 +1836,63 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE2-NEXT:    psubb %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpeqb %xmm2, %xmm4
-; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    psllw %xmm2, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE2-NEXT:    psllw %xmm2, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
-; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    psrlw %xmm3, %xmm1
-; SSE2-NEXT:    psrlw %xmm3, %xmm2
-; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psllw %xmm3, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT:    psllw %xmm3, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT:    psubb %xmm2, %xmm4
+; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    psrlw %xmm4, %xmm1
+; SSE2-NEXT:    psrlw %xmm4, %xmm5
+; SSE2-NEXT:    psrlw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm4, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: splatvar_funnnel_v16i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    psllw %xmm5, %xmm4
+; SSE41-NEXT:    psllw %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
-; SSE41-NEXT:    psllw %xmm5, %xmm7
-; SSE41-NEXT:    pshufb %xmm0, %xmm7
-; SSE41-NEXT:    pand %xmm7, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT:    psubb %xmm2, %xmm5
-; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    psrlw %xmm5, %xmm1
-; SSE41-NEXT:    psrlw %xmm5, %xmm6
-; SSE41-NEXT:    pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT:    pand %xmm1, %xmm6
-; SSE41-NEXT:    por %xmm6, %xmm4
+; SSE41-NEXT:    psllw %xmm0, %xmm6
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm6
+; SSE41-NEXT:    pand %xmm6, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    psubb %xmm2, %xmm6
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psrlw %xmm6, %xmm1
+; SSE41-NEXT:    psrlw %xmm6, %xmm5
+; SSE41-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm1, %xmm5
+; SSE41-NEXT:    por %xmm5, %xmm4
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
@@ -1903,30 +1900,29 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX1-LABEL: splatvar_funnnel_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsllw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm2
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm3, %xmm0, %xmm4
@@ -1943,6 +1939,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX2-NEXT:    vpbroadcastb %xmm4, %xmm4
 ; AVX2-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1950,9 +1947,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; AVX512F-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1970,9 +1967,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1991,9 +1988,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
@@ -2012,9 +2009,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
@@ -2032,9 +2029,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512VLBW-NEXT:    vpsllvw %ymm4, %ymm3, %ymm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -2052,9 +2049,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512VLVBMI2-NEXT:    vpsllvw %ymm4, %ymm3, %ymm3
 ; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -2072,9 +2069,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm4
 ; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm5
 ; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm1, %xmm1
@@ -2085,8 +2082,8 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm3
 ; XOPAVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
@@ -2098,38 +2095,39 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE-NEXT:    psubb %xmm2, %xmm3
-; X32-SSE-NEXT:    pxor %xmm4, %xmm4
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm4
-; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    psllw %xmm2, %xmm5
-; X32-SSE-NEXT:    pcmpeqd %xmm6, %xmm6
-; X32-SSE-NEXT:    psllw %xmm2, %xmm6
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
-; X32-SSE-NEXT:    pand %xmm5, %xmm6
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    psrlw %xmm3, %xmm1
-; X32-SSE-NEXT:    psrlw %xmm3, %xmm2
-; X32-SSE-NEXT:    psrlw $8, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    psllw %xmm3, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm6, %xmm6
+; X32-SSE-NEXT:    psllw %xmm3, %xmm6
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm4, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X32-SSE-NEXT:    psubb %xmm2, %xmm4
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    psrlw %xmm4, %xmm1
+; X32-SSE-NEXT:    psrlw %xmm4, %xmm5
+; X32-SSE-NEXT:    psrlw $8, %xmm5
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm4
+; X32-SSE-NEXT:    por %xmm3, %xmm4
 ; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
-; X32-SSE-NEXT:    por %xmm6, %xmm2
-; X32-SSE-NEXT:    pand %xmm4, %xmm0
-; X32-SSE-NEXT:    pandn %xmm2, %xmm4
-; X32-SSE-NEXT:    por %xmm4, %xmm0
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpeqb %xmm1, %xmm2
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pandn %xmm4, %xmm2
+; X32-SSE-NEXT:    por %xmm2, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
@@ -2166,9 +2164,8 @@ define void @sink_splatvar(i32* %p, i32 %shift_amt) {
 ;
 ; SSE41-LABEL: sink_splatvar:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movd %esi, %xmm0
+; SSE41-NEXT:    movd %esi, %xmm1
 ; SSE41-NEXT:    movq $-1024, %rax # imm = 0xFC00
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [32,32,32,32]
 ; SSE41-NEXT:    psubd %xmm1, %xmm0
@@ -2192,7 +2189,6 @@ define void @sink_splatvar(i32* %p, i32 %shift_amt) {
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vmovd %esi, %xmm0
 ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [32,32,32,32]
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 4903104054479..860c2d576c728 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -906,39 +906,35 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
-; AVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpsrlq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpsrlq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -947,14 +943,14 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
@@ -962,13 +958,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VL-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
@@ -977,14 +973,14 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
@@ -993,14 +989,14 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1008,13 +1004,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1028,39 +1024,35 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm4
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
-; XOPAVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; XOPAVX1-NEXT:    vpsrlq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsrlq %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT:    vpsrlq %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
-; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsllq %xmm2, %ymm0, %ymm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; XOPAVX2-NEXT:    vpsrlq %xmm4, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -1073,37 +1065,31 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpslld %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpsrld %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpslld %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrld %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1111,6 +1097,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX2-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
@@ -1119,9 +1106,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512F-NEXT:    vpslld %xmm3, %ymm0, %ymm3
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1129,6 +1115,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1136,8 +1123,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm3, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1145,6 +1131,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VL-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1153,9 +1140,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm3, %ymm0, %ymm3
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1163,6 +1149,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1171,9 +1158,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VBMI2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1181,6 +1167,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
 ; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1188,8 +1175,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm3, %ymm0, %ymm3
 ; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1197,6 +1183,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1210,37 +1197,31 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT:    vpslld %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
-; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; XOPAVX1-NEXT:    vpsrld %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; XOPAVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT:    vpslld %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm4
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT:    vpsrld %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
-; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; XOPAVX2-NEXT:    vpslld %xmm3, %ymm0, %ymm3
 ; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1248,6 +1229,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; XOPAVX2-NEXT:    vpsrld %xmm4, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
@@ -1260,30 +1242,25 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v16i16:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT:    vpsrlw %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
@@ -1291,8 +1268,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX2-LABEL: splatvar_funnnel_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1300,6 +1276,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -1307,8 +1284,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1316,6 +1292,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -1323,8 +1300,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1332,6 +1308,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -1340,8 +1317,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512BW-LABEL: splatvar_funnnel_v16i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1349,6 +1325,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1357,8 +1334,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1366,6 +1342,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1373,8 +1350,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1382,6 +1358,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1395,37 +1372,31 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm4
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
-; XOPAVX1-NEXT:    vpsrlw %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; XOPAVX1-NEXT:    vpsrlw %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; XOPAVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
 ; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1433,6 +1404,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -1445,41 +1417,33 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm9, %xmm9
-; AVX1-NEXT:    vpsllw %xmm4, %xmm9, %xmm7
-; AVX1-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
-; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT:    vpsubb %xmm5, %xmm10, %xmm3
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vpsrlw %xmm3, %xmm6, %xmm6
-; AVX1-NEXT:    vpsrlw %xmm3, %xmm9, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpsubb %xmm2, %xmm10, %xmm6
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw %xmm6, %xmm9, %xmm6
-; AVX1-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm4, %ymm1
-; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm8, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm8, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpcmpeqb %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
@@ -1487,8 +1451,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
 ; AVX2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -1504,6 +1467,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -1511,8 +1475,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -1528,6 +1491,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512F-NEXT:    vpbroadcastb %xmm4, %ymm4
 ; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -1535,8 +1499,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -1552,6 +1515,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VL-NEXT:    vpbroadcastb %xmm4, %ymm4
 ; AVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -1560,9 +1524,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1580,9 +1544,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1599,9 +1563,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512VLBW-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1618,9 +1582,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512VLVBMI2-NEXT:    vpsllvw %zmm4, %zmm3, %zmm3
 ; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1637,36 +1601,30 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
-; XOPAVX1-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
+; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm6, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; XOPAVX1-NEXT:    vorps %ymm1, %ymm4, %ymm1
+; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm6
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm4, %xmm7
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX1-NEXT:    vorps %ymm1, %ymm5, %ymm1
-; XOPAVX1-NEXT:    vpcomeqb %xmm8, %xmm4, %xmm3
-; XOPAVX1-NEXT:    vpcomeqb %xmm8, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm4
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index c65d3e7a49480..20cfba41a1c46 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -564,13 +564,14 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -578,13 +579,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512VL-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -592,13 +593,14 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -612,13 +614,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsllq %xmm2, %zmm0, %zmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -637,8 +639,8 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v16i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512F-NEXT:    vpslld %xmm3, %zmm0, %zmm3
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -646,6 +648,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -653,8 +656,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm3, %zmm0, %zmm3
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -662,6 +664,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512VL-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -669,8 +672,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm3, %zmm0, %zmm3
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -678,6 +681,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -691,8 +695,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm3, %zmm0, %zmm3
 ; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -700,6 +703,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm4, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpord %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -718,64 +722,53 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm4
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
-; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
-; AVX512F-NEXT:    vpsrlw %xmm6, %ymm7, %ymm6
-; AVX512F-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX512F-NEXT:    vpsrlw %xmm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
 ; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
-; AVX512VL-NEXT:    vpsrlw %xmm6, %ymm7, %ymm6
-; AVX512VL-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsrlw %xmm5, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
 ; AVX512VL-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -783,6 +776,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -796,8 +790,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -805,6 +798,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm4, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -823,90 +817,73 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT:    vpsllw %xmm3, %xmm5, %xmm6
+; AVX512F-NEXT:    vpsllw %xmm4, %xmm5, %xmm6
 ; AVX512F-NEXT:    vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT:    vpsubb %xmm4, %xmm6, %xmm7
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
-; AVX512F-NEXT:    vpsrlw %xmm7, %ymm8, %ymm8
-; AVX512F-NEXT:    vpsrlw %xmm7, %xmm5, %xmm7
-; AVX512F-NEXT:    vpsrlw $8, %xmm7, %xmm7
-; AVX512F-NEXT:    vpbroadcastb %xmm7, %ymm7
-; AVX512F-NEXT:    vpand %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vpsubb %xmm2, %xmm6, %xmm6
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpsrlw %xmm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw %xmm6, %xmm5, %xmm5
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm4
+; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm6, %ymm6
+; AVX512F-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
 ; AVX512F-NEXT:    vpsrlw $8, %xmm5, %xmm5
 ; AVX512F-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
 ; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT:    vpsllw %xmm3, %xmm5, %xmm6
+; AVX512VL-NEXT:    vpsllw %xmm4, %xmm5, %xmm6
 ; AVX512VL-NEXT:    vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT:    vpsubb %xmm4, %xmm6, %xmm7
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
-; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpsrlw %xmm7, %xmm5, %xmm7
-; AVX512VL-NEXT:    vpsrlw $8, %xmm7, %xmm7
-; AVX512VL-NEXT:    vpbroadcastb %xmm7, %ymm7
-; AVX512VL-NEXT:    vpand %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vpsubb %xmm2, %xmm6, %xmm6
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vpsrlw %xmm6, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlw %xmm6, %xmm5, %xmm5
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
 ; AVX512VL-NEXT:    vpsrlw $8, %xmm5, %xmm5
 ; AVX512VL-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
 ; AVX512VL-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -922,6 +899,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512BW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -929,8 +907,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ;
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
 ; AVX512VBMI2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -946,6 +923,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VBMI2-NEXT:    vpandq %zmm4, %zmm1, %zmm1
 ; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -953,8 +931,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
 ; AVX512VLBW-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -970,6 +947,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0
@@ -977,8 +955,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -994,6 +971,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpandq %zmm4, %zmm1, %zmm1
 ; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512VLVBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VLVBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 4d5b148b362e8..fb667a1106222 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -779,7 +779,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 ;
 ; SSE41-LABEL: splatvar_funnnel_v4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
@@ -793,7 +792,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 ;
 ; AVX1-LABEL: splatvar_funnnel_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
@@ -806,7 +804,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
@@ -881,15 +878,13 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psllw %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    psubw %xmm1, %xmm2
-; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllw %xmm1, %xmm3
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrlw %xmm2, %xmm0
@@ -898,8 +893,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 ;
 ; SSE41-LABEL: splatvar_funnnel_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
@@ -911,36 +904,20 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 ; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: splatvar_funnnel_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: splatvar_funnnel_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: splatvar_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_funnnel_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
@@ -966,15 +943,13 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psllw %xmm2, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
 ; X32-SSE-NEXT:    psubw %xmm1, %xmm2
-; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    psllw %xmm1, %xmm3
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psrlw %xmm2, %xmm0
@@ -988,9 +963,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; SSE2-NEXT:    psubb %xmm1, %xmm2
@@ -1021,44 +993,42 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; SSE41-LABEL: splatvar_funnnel_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pshufb %xmm3, %xmm1
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psllw %xmm4, %xmm2
+; SSE41-NEXT:    psllw %xmm3, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT:    psllw %xmm4, %xmm6
-; SSE41-NEXT:    pshufb %xmm3, %xmm6
-; SSE41-NEXT:    pand %xmm6, %xmm2
+; SSE41-NEXT:    psllw %xmm3, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufb %xmm3, %xmm5
+; SSE41-NEXT:    pand %xmm5, %xmm2
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; SSE41-NEXT:    psubb %xmm1, %xmm3
 ; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    psrlw %xmm1, %xmm0
-; SSE41-NEXT:    psrlw %xmm1, %xmm5
-; SSE41-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT:    pand %xmm0, %xmm5
-; SSE41-NEXT:    por %xmm5, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm4
+; SSE41-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm0, %xmm4
+; SSE41-NEXT:    por %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_funnnel_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -1066,7 +1036,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm3
@@ -1173,9 +1142,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; X32-SSE-NEXT:    psubb %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index ca67324439f07..f09ded8d95f74 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1196,7 +1196,6 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    psrlq %xmm2, %xmm3
@@ -1204,6 +1203,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; SSE2-NEXT:    psubq %xmm2, %xmm4
 ; SSE2-NEXT:    psllq %xmm4, %xmm0
 ; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
@@ -1217,7 +1217,6 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; SSE41-LABEL: splatvar_funnnel_v2i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq %xmm2, %xmm0
@@ -1225,6 +1224,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; SSE41-NEXT:    psubq %xmm2, %xmm4
 ; SSE41-NEXT:    psllq %xmm4, %xmm3
 ; SSE41-NEXT:    por %xmm0, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
 ; SSE41-NEXT:    pcmpeqq %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
@@ -1233,13 +1233,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX1-LABEL: splatvar_funnnel_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
@@ -1247,13 +1247,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX2-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
@@ -1262,13 +1262,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1277,13 +1277,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VL-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
@@ -1291,13 +1291,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1307,13 +1307,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1322,13 +1322,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vptestnmq %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -1342,13 +1342,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
@@ -1356,13 +1356,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsrlq %xmm2, %xmm1, %xmm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; XOPAVX2-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
@@ -1370,30 +1370,26 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X32-SSE-NEXT:    psrlq %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
-; X32-SSE-NEXT:    psrlq %xmm4, %xmm5
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
-; X32-SSE-NEXT:    psubq %xmm2, %xmm3
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
+; X32-SSE-NEXT:    pxor %xmm5, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm5
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,0,3,2]
+; X32-SSE-NEXT:    pand %xmm5, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm5 = [64,0,64,0]
+; X32-SSE-NEXT:    psubq %xmm4, %xmm5
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
-; X32-SSE-NEXT:    psllq %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; X32-SSE-NEXT:    psllq %xmm3, %xmm0
+; X32-SSE-NEXT:    psllq %xmm5, %xmm4
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; X32-SSE-NEXT:    psllq %xmm5, %xmm0
 ; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; X32-SSE-NEXT:    orpd %xmm5, %xmm0
-; X32-SSE-NEXT:    pxor %xmm3, %xmm3
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm3
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pandn %xmm0, %xmm2
-; X32-SSE-NEXT:    por %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psrlq %xmm2, %xmm4
+; X32-SSE-NEXT:    por %xmm0, %xmm4
+; X32-SSE-NEXT:    pand %xmm3, %xmm1
+; X32-SSE-NEXT:    pandn %xmm4, %xmm3
+; X32-SSE-NEXT:    por %xmm1, %xmm3
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
   %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
@@ -1403,8 +1399,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    xorps %xmm4, %xmm4
 ; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
@@ -1416,6 +1411,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; SSE2-NEXT:    movd %ecx, %xmm4
 ; SSE2-NEXT:    pslld %xmm4, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm2
@@ -1426,7 +1422,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; SSE41-LABEL: splatvar_funnnel_v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
 ; SSE41-NEXT:    movdqa %xmm1, %xmm4
@@ -1436,6 +1431,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    pslld %xmm0, %xmm3
 ; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3
@@ -1444,7 +1440,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; AVX1-LABEL: splatvar_funnnel_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
@@ -1453,6 +1448,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
@@ -1460,7 +1456,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1470,6 +1465,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX2-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
@@ -1478,7 +1474,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1488,6 +1483,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1496,7 +1492,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
@@ -1505,6 +1500,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VL-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
@@ -1512,7 +1508,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1522,6 +1517,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1531,7 +1527,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1541,6 +1536,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1549,7 +1545,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
@@ -1558,6 +1553,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -1571,7 +1567,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
@@ -1580,6 +1575,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; XOPAVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
@@ -1587,7 +1583,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
 ; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
@@ -1597,6 +1592,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; XOPAVX2-NEXT:    vpslld %xmm4, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
@@ -1604,8 +1600,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v4i32:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm2
 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
 ; X32-SSE-NEXT:    xorps %xmm4, %xmm4
 ; X32-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
@@ -1617,6 +1612,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 ; X32-SSE-NEXT:    movd %ecx, %xmm4
 ; X32-SSE-NEXT:    pslld %xmm4, %xmm0
 ; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; X32-SSE-NEXT:    pcmpeqd %xmm3, %xmm2
 ; X32-SSE-NEXT:    pand %xmm2, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm2
@@ -1631,21 +1627,21 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %
 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; SSE2-NEXT:    psubw %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT:    psubw %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    psrlw %xmm2, %xmm5
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    psrlw %xmm3, %xmm5
-; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    psllw %xmm4, %xmm0
+; SSE2-NEXT:    psllw %xmm3, %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm2
 ; SSE2-NEXT:    por %xmm1, %xmm2
@@ -1655,8 +1651,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; SSE41-LABEL: splatvar_funnnel_v8i16:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; SSE41-NEXT:    movdqa %xmm1, %xmm4
@@ -1666,6 +1660,8 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT:    psllw %xmm0, %xmm3
 ; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
 ; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
@@ -1674,8 +1670,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX1-LABEL: splatvar_funnnel_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1684,6 +1678,8 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -1691,7 +1687,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1700,6 +1695,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -1707,7 +1703,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1716,6 +1711,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -1723,7 +1719,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1732,6 +1727,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512VL-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -1740,7 +1736,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1749,6 +1744,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1758,7 +1754,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1767,6 +1762,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX512VBMI2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1775,7 +1771,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1784,6 +1779,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vptestnmw %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -1797,8 +1793,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1807,6 +1801,8 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -1814,7 +1810,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
@@ -1823,6 +1818,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -1830,21 +1826,21 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT:    psubw %xmm3, %xmm4
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpeqw %xmm3, %xmm2
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT:    psubw %xmm2, %xmm3
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
+; X32-SSE-NEXT:    psrlw %xmm2, %xmm5
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
-; X32-SSE-NEXT:    psrlw %xmm3, %xmm5
-; X32-SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    psllw %xmm4, %xmm0
+; X32-SSE-NEXT:    psllw %xmm3, %xmm0
 ; X32-SSE-NEXT:    por %xmm5, %xmm0
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; X32-SSE-NEXT:    pxor %xmm2, %xmm2
+; X32-SSE-NEXT:    pcmpeqw %xmm3, %xmm2
 ; X32-SSE-NEXT:    pand %xmm2, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm2
 ; X32-SSE-NEXT:    por %xmm1, %xmm2
@@ -1858,63 +1854,63 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE2-NEXT:    psubb %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    psrlw %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psrlw %xmm3, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE2-NEXT:    pcmpeqd %xmm6, %xmm6
 ; SSE2-NEXT:    psrlw %xmm3, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE2-NEXT:    psrlw $8, %xmm6
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
-; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT:    psubb %xmm2, %xmm4
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psllw %xmm4, %xmm0
-; SSE2-NEXT:    psllw %xmm4, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    por %xmm6, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psllw %xmm4, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: splatvar_funnnel_v16i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
-; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    psrlw %xmm4, %xmm5
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrlw %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
-; SSE41-NEXT:    psrlw %xmm4, %xmm7
-; SSE41-NEXT:    pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT:    pand %xmm5, %xmm7
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT:    psubb %xmm2, %xmm4
-; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    psllw %xmm4, %xmm3
-; SSE41-NEXT:    psllw %xmm4, %xmm6
-; SSE41-NEXT:    pshufb %xmm0, %xmm6
-; SSE41-NEXT:    pand %xmm6, %xmm3
-; SSE41-NEXT:    por %xmm7, %xmm3
+; SSE41-NEXT:    psrlw %xmm0, %xmm6
+; SSE41-NEXT:    pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm4, %xmm6
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    psubb %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    psllw %xmm0, %xmm3
+; SSE41-NEXT:    psllw %xmm0, %xmm5
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm5
+; SSE41-NEXT:    pand %xmm5, %xmm3
+; SSE41-NEXT:    por %xmm6, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
 ; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
@@ -1922,30 +1918,29 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX1-LABEL: splatvar_funnnel_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm3, %xmm1, %xmm4
@@ -1962,6 +1957,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX2-NEXT:    vpbroadcastb %xmm4, %xmm4
 ; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
@@ -1969,9 +1965,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1989,9 +1985,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm3, %zmm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -2010,9 +2006,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -2031,9 +2027,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -2051,9 +2047,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512VLBW-NEXT:    vpsrlvw %ymm4, %ymm3, %ymm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -2070,9 +2066,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm4, %ymm3, %ymm3
 ; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -2089,9 +2085,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -2104,8 +2100,8 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
 ; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
@@ -2119,39 +2115,39 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE-NEXT:    psubb %xmm3, %xmm4
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pcmpeqb %xmm3, %xmm2
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
-; X32-SSE-NEXT:    psrlw %xmm3, %xmm5
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X32-SSE-NEXT:    psrlw %xmm3, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm5, %xmm5
 ; X32-SSE-NEXT:    pcmpeqd %xmm6, %xmm6
 ; X32-SSE-NEXT:    psrlw %xmm3, %xmm6
-; X32-SSE-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X32-SSE-NEXT:    psrlw $8, %xmm6
 ; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
-; X32-SSE-NEXT:    pand %xmm5, %xmm6
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm4, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X32-SSE-NEXT:    psubb %xmm2, %xmm4
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psllw %xmm4, %xmm0
-; X32-SSE-NEXT:    psllw %xmm4, %xmm3
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
-; X32-SSE-NEXT:    pand %xmm0, %xmm3
-; X32-SSE-NEXT:    por %xmm6, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm1
-; X32-SSE-NEXT:    pandn %xmm3, %xmm2
-; X32-SSE-NEXT:    por %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE-NEXT:    psllw %xmm4, %xmm5
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X32-SSE-NEXT:    pand %xmm0, %xmm4
+; X32-SSE-NEXT:    por %xmm3, %xmm4
+; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
+; X32-SSE-NEXT:    pxor %xmm0, %xmm0
+; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT:    pand %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm4, %xmm0
+; X32-SSE-NEXT:    por %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index c57b9699861b7..e40e3cdfbd65a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -910,39 +910,35 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm4
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
-; AVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpsllq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpsllq %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX2-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
@@ -951,14 +947,14 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -966,13 +962,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VL-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
@@ -980,14 +976,14 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -996,14 +992,14 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VBMI2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
 ; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -1011,13 +1007,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -1031,39 +1027,35 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpsrlq %xmm2, %xmm1, %xmm4
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [64,64]
-; XOPAVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm6
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; XOPAVX1-NEXT:    vpsllq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsllq %xmm5, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpcomeqq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
-; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsrlq %xmm2, %ymm1, %ymm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; XOPAVX2-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; XOPAVX2-NEXT:    vpsllq %xmm4, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
@@ -1076,37 +1068,31 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpsrld %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpslld %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; AVX1-NEXT:    vpslld %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpslld %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX2-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1114,6 +1100,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX2-NEXT:    vpslld %xmm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
@@ -1122,9 +1109,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1132,6 +1118,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpslld %xmm4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -1139,8 +1126,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1148,6 +1134,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VL-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
@@ -1155,9 +1142,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1165,6 +1151,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm4, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -1173,9 +1160,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512VBMI2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VBMI2-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
 ; AVX512VBMI2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1183,6 +1169,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VBMI2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VBMI2-NEXT:    vpslld %xmm4, %ymm0, %ymm0
 ; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -1190,8 +1177,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
 ; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1199,6 +1185,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm4, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vptestnmd %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -1212,37 +1199,31 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT:    vpsrld %xmm3, %xmm1, %xmm3
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
-; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm6
-; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; XOPAVX1-NEXT:    vpslld %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
-; XOPAVX1-NEXT:    vpslld %xmm5, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
+; XOPAVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpslld %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpslld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpcomeqd %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
-; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; XOPAVX2-NEXT:    vpsrld %xmm3, %ymm1, %ymm3
 ; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -1250,6 +1231,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; XOPAVX2-NEXT:    vpslld %xmm4, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
@@ -1262,30 +1244,25 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v16i16:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpsrlw %xmm3, %xmm4, %xmm4
-; AVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vpsllw %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -1293,8 +1270,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX2-LABEL: splatvar_funnnel_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1302,6 +1278,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1309,8 +1286,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1318,6 +1294,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1325,8 +1302,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1334,6 +1310,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1342,8 +1319,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512BW-LABEL: splatvar_funnnel_v16i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1351,6 +1327,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -1359,8 +1336,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1368,6 +1344,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VBMI2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512VBMI2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -1375,8 +1352,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1384,6 +1360,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vptestnmw %ymm2, %ymm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -1397,37 +1374,31 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm4, %xmm4
-; XOPAVX1-NEXT:    vpsrlw %xmm3, %xmm1, %xmm3
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; XOPAVX1-NEXT:    vpsllw %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; XOPAVX1-NEXT:    vpsllw %xmm5, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpcomeqw %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
-; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -1435,6 +1406,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; XOPAVX2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1447,40 +1419,33 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm8, %xmm7
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT:    vpsubb %xmm5, %xmm7, %xmm6
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsllw %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpsllw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
 ; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubb %xmm2, %xmm7, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpsubb %xmm2, %xmm6, %xmm6
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpsllw %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vpsllw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpsllw %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm0, %ymm9, %ymm0
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm5, %xmm4
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -1488,8 +1453,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX2-LABEL: splatvar_funnnel_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm3, %ymm1, %ymm4
 ; AVX2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -1505,6 +1469,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1512,8 +1477,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm4
 ; AVX512F-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -1529,6 +1493,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512F-NEXT:    vpbroadcastb %xmm4, %ymm4
 ; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1536,8 +1501,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm4
 ; AVX512VL-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -1553,6 +1517,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VL-NEXT:    vpbroadcastb %xmm4, %ymm4
 ; AVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1561,9 +1526,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1581,9 +1546,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512VBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1600,9 +1565,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512VLBW-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1618,9 +1583,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %ymm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
 ; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm4, %zmm3, %zmm3
 ; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -1636,43 +1601,36 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm3, %xmm5
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm6
-; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm6
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm8, %xmm7
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm6, %xmm6
-; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm8, %xmm7
-; XOPAVX1-NEXT:    vpshlb %xmm7, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm6, %xmm6
+; XOPAVX1-NEXT:    vpshlb %xmm5, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; XOPAVX1-NEXT:    vorps %ymm5, %ymm0, %ymm0
-; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm4, %xmm4
+; XOPAVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    vpcomeqb %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
 ; XOPAVX1-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
-; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX2-NEXT:    vpsubb %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
-; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm4
-; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm3
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; XOPAVX2-NEXT:    vpsubb %ymm2, %ymm4, %ymm4
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index ef4097addc60d..fd59d40cefc88 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -556,39 +556,41 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512F-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
 ; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512VL-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512BW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
@@ -602,13 +604,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip){1to2}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlq %xmm2, %zmm1, %zmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [64,64]
 ; AVX512VLBW-NEXT:    vpsubq %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT:    vpsllq %xmm4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpbroadcastq %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmq %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -627,8 +629,8 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v16i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512F-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -636,14 +638,14 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512F-NEXT:    vpslld %xmm4, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512F-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VL-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -651,14 +653,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT:    vpslld %xmm4, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512VL-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512BW-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
 ; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -666,6 +669,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT:    vpslld %xmm4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
@@ -679,8 +683,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; AVX512VLBW-NEXT:    vpsrld %xmm3, %zmm1, %zmm3
 ; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
@@ -688,6 +691,7 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT:    vpslld %xmm4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -706,64 +710,53 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm4
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm7
-; AVX512F-NEXT:    vpsllw %xmm6, %ymm7, %ymm6
-; AVX512F-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX512F-NEXT:    vpsllw %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
 ; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm3
 ; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpcmpeqw %ymm0, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpcmpeqw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpternlogq $202, %zmm3, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm7
-; AVX512VL-NEXT:    vpsllw %xmm6, %ymm7, %ymm6
-; AVX512VL-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsllw %xmm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm3
 ; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqw %ymm0, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpcmpeqw %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vpternlogq $202, %zmm3, %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm3
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -771,6 +764,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpbroadcastw %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
@@ -784,8 +778,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm3
 ; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
@@ -793,6 +786,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpbroadcastw %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
@@ -811,88 +805,73 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT:    vpsrlw %xmm3, %xmm5, %xmm6
+; AVX512F-NEXT:    vpsrlw %xmm4, %xmm5, %xmm6
 ; AVX512F-NEXT:    vpsrlw $8, %xmm6, %xmm6
 ; AVX512F-NEXT:    vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT:    vpsubb %xmm4, %xmm6, %xmm7
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm8
-; AVX512F-NEXT:    vpsllw %xmm7, %ymm8, %ymm8
-; AVX512F-NEXT:    vpsllw %xmm7, %xmm5, %xmm7
-; AVX512F-NEXT:    vpbroadcastb %xmm7, %ymm7
-; AVX512F-NEXT:    vpand %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vpsubb %xmm2, %xmm6, %xmm6
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpsllw %xmm6, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw %xmm6, %xmm5, %xmm5
+; AVX512F-NEXT:    vpsrlw %xmm4, %ymm1, %ymm4
+; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm6, %ymm6
+; AVX512F-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
 ; AVX512F-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
 ; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm3
 ; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpcmpeqb %ymm0, %ymm4, %ymm4
 ; AVX512F-NEXT:    vpcmpeqb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpternlogq $202, %zmm3, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT:    vpsrlw %xmm3, %xmm5, %xmm6
+; AVX512VL-NEXT:    vpsrlw %xmm4, %xmm5, %xmm6
 ; AVX512VL-NEXT:    vpsrlw $8, %xmm6, %xmm6
 ; AVX512VL-NEXT:    vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm3
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT:    vpsubb %xmm4, %xmm6, %xmm7
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm8
-; AVX512VL-NEXT:    vpsllw %xmm7, %ymm8, %ymm8
-; AVX512VL-NEXT:    vpsllw %xmm7, %xmm5, %xmm7
-; AVX512VL-NEXT:    vpbroadcastb %xmm7, %ymm7
-; AVX512VL-NEXT:    vpand %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vpsubb %xmm2, %xmm6, %xmm6
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vpsllw %xmm6, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsllw %xmm6, %xmm5, %xmm5
+; AVX512VL-NEXT:    vpsrlw %xmm4, %ymm1, %ymm4
+; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
 ; AVX512VL-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm3
 ; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpeqb %ymm0, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpcmpeqb %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vpternlogq $202, %zmm3, %zmm1, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
 ; AVX512BW-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -908,14 +887,14 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512BW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512BW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
-; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
 ; AVX512VBMI2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -931,14 +910,14 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VBMI2-NEXT:    vpandq %zmm4, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512VBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
 ; AVX512VLBW-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -954,14 +933,14 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512VLBW-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT:    vpsrlw %xmm3, %zmm1, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
@@ -977,6 +956,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm4, %zmm4
 ; AVX512VLVBMI2-NEXT:    vpandq %zmm4, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm2, %zmm2
 ; AVX512VLVBMI2-NEXT:    vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLVBMI2-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLVBMI2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index e923df1c01423..d0966b8fab6e0 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -766,17 +766,17 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
@@ -824,7 +824,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 ;
 ; SSE41-LABEL: splatvar_funnnel_v4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    psubd %xmm1, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
@@ -840,7 +839,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 ;
 ; AVX1-LABEL: splatvar_funnnel_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
@@ -855,7 +853,6 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
@@ -901,17 +898,17 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
@@ -937,17 +934,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind
 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    psubw %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psllw %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    psubw %xmm2, %xmm1
-; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllw %xmm2, %xmm3
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrlw %xmm1, %xmm0
@@ -956,8 +951,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 ;
 ; SSE41-LABEL: splatvar_funnnel_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    psubw %xmm1, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
@@ -971,40 +964,22 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 ; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: splatvar_funnnel_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: splatvar_funnnel_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: splatvar_funnnel_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_funnnel_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
@@ -1019,34 +994,32 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-SSE-NEXT:    pxor %xmm2, %xmm2
 ; X32-SSE-NEXT:    psubw %xmm1, %xmm2
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; X32-SSE-NEXT:    pand %xmm2, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psllw %xmm1, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
 ; X32-SSE-NEXT:    psubw %xmm2, %xmm1
-; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    psllw %xmm2, %xmm3
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
@@ -1060,9 +1033,6 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    psubb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
@@ -1095,7 +1065,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ; SSE41-LABEL: splatvar_funnnel_v16i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pshufb %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    psubb %xmm1, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
@@ -1121,7 +1090,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ; AVX1-LABEL: splatvar_funnnel_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
@@ -1142,7 +1110,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
@@ -1239,24 +1206,21 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; X32-SSE-LABEL: splatvar_funnnel_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-SSE-NEXT:    pxor %xmm2, %xmm2
 ; X32-SSE-NEXT:    psubb %xmm1, %xmm2
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 723a9dc51bc82..930795283a24e 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -623,9 +623,9 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
@@ -634,12 +634,11 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT:    vprotq %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vprotq %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
@@ -651,9 +650,9 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind
 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -671,9 +670,9 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
@@ -715,9 +714,9 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vprotd %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
@@ -726,12 +725,11 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT:    vpsubd %ymm1, %ymm2, %ymm1
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT:    vprotd %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vprotd %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
@@ -743,10 +741,10 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind
 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -764,9 +762,9 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
 ;
 ; AVX2-LABEL: splatvar_funnnel_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
@@ -779,9 +777,9 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
 ;
 ; AVX512-LABEL: splatvar_funnnel_v16i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
@@ -794,10 +792,10 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vprotw %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
@@ -806,12 +804,11 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT:    vpsubw %ymm1, %ymm2, %ymm1
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT:    vprotw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vprotw %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
@@ -824,8 +821,8 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ; AVX1-LABEL: splatvar_funnnel_v32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
@@ -852,9 +849,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ;
 ; AVX2-LABEL: splatvar_funnnel_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
@@ -875,9 +872,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
@@ -898,9 +895,9 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
@@ -956,8 +953,8 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vprotb %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
@@ -966,12 +963,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
 ;
 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT:    vprotb %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vprotb %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index c4ce14c6cc056..7dad18a324d73 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -749,7 +749,6 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; SSE41-LABEL: splatvar_rotate_v4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
@@ -763,7 +762,6 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX1-LABEL: splatvar_rotate_v4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
@@ -776,7 +774,6 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX2-LABEL: splatvar_rotate_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
@@ -854,15 +851,13 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-LABEL: splatvar_rotate_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psllw %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
 ; SSE2-NEXT:    psubw %xmm1, %xmm2
-; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllw %xmm1, %xmm3
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrlw %xmm2, %xmm0
@@ -871,8 +866,6 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; SSE41-LABEL: splatvar_rotate_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
@@ -884,36 +877,20 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: splatvar_rotate_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: splatvar_rotate_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: splatvar_rotate_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_rotate_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
@@ -939,15 +916,13 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; X32-SSE-LABEL: splatvar_rotate_v8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,0,0]
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    psllw %xmm2, %xmm3
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
 ; X32-SSE-NEXT:    psubw %xmm1, %xmm2
-; X32-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    psllw %xmm1, %xmm3
 ; X32-SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
 ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    psrlw %xmm2, %xmm0
@@ -964,9 +939,6 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-LABEL: splatvar_rotate_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; SSE2-NEXT:    psubb %xmm1, %xmm2
@@ -997,44 +969,42 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; SSE41-LABEL: splatvar_rotate_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pshufb %xmm3, %xmm1
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psllw %xmm4, %xmm2
+; SSE41-NEXT:    psllw %xmm3, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT:    psllw %xmm4, %xmm6
-; SSE41-NEXT:    pshufb %xmm3, %xmm6
-; SSE41-NEXT:    pand %xmm6, %xmm2
+; SSE41-NEXT:    psllw %xmm3, %xmm5
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufb %xmm3, %xmm5
+; SSE41-NEXT:    pand %xmm5, %xmm2
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; SSE41-NEXT:    psubb %xmm1, %xmm3
 ; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    psrlw %xmm1, %xmm0
-; SSE41-NEXT:    psrlw %xmm1, %xmm5
-; SSE41-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT:    pand %xmm0, %xmm5
-; SSE41-NEXT:    por %xmm5, %xmm2
+; SSE41-NEXT:    psrlw %xmm1, %xmm4
+; SSE41-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    pand %xmm0, %xmm4
+; SSE41-NEXT:    por %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_rotate_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm4
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
@@ -1042,7 +1012,6 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; AVX2-LABEL: splatvar_rotate_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm3
@@ -1137,9 +1106,6 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; X32-SSE-LABEL: splatvar_rotate_v16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; X32-SSE-NEXT:    psubb %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 9ad8d44acb24b..dc0d0a1168b71 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -645,17 +645,17 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; XOPAVX1-LABEL: splatvar_shift_v2i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_shift_v2i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
@@ -848,16 +848,16 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; XOPAVX1-LABEL: splatvar_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index a587a43f1bbb1..1a2889ab861e2 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -698,16 +698,16 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; XOPAVX1-LABEL: splatvar_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;

From 4d20e31f736c76785e03367c036183474459ef9a Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Tue, 26 May 2020 19:24:05 +0700
Subject: [PATCH 091/770] [FPEnv] Intrinsic llvm.roundeven

This intrinsic implements the IEEE-754 operation roundToIntegralTiesToEven:
it rounds to the nearest integer value, rounding halfway cases to even.
It was the one IEEE-754 rounding operation still missing from LLVM; with
this intrinsic, LLVM supports all of the rounding operations defined by
the standard.

Differential Revision: https://reviews.llvm.org/D75670
---
 llvm/docs/LangRef.rst                         | 74 ++++++++++++++++
 .../llvm/Analysis/TargetLibraryInfo.def       |  9 ++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  3 +
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  2 +
 llvm/include/llvm/IR/ConstrainedOps.def       |  1 +
 llvm/include/llvm/IR/Intrinsics.td            |  4 +
 llvm/include/llvm/IR/RuntimeLibcalls.def      |  5 ++
 llvm/lib/Analysis/ConstantFolding.cpp         |  7 ++
 llvm/lib/Analysis/InstructionSimplify.cpp     |  2 +
 llvm/lib/Analysis/TargetLibraryInfo.cpp       |  3 +
 llvm/lib/Analysis/ValueTracking.cpp           |  5 ++
 llvm/lib/Analysis/VectorUtils.cpp             |  1 +
 llvm/lib/CodeGen/IntrinsicLowering.cpp        |  4 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  9 ++
 .../SelectionDAG/LegalizeFloatTypes.cpp       | 25 ++++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  2 +
 .../SelectionDAG/LegalizeVectorOps.cpp        |  1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  3 +
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  1 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  2 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |  2 +
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  2 +
 .../InstCombine/InstCombineCalls.cpp          |  1 +
 .../InstCombine/InstCombineCasts.cpp          |  1 +
 .../lib/Transforms/Utils/SimplifyLibCalls.cpp |  2 +
 llvm/test/CodeGen/Generic/fpoperations.ll     | 21 +++++
 .../ExecutionEngine/Interpreter/intrinsics.ll |  4 +
 .../InstCombine/double-float-shrink-2.ll      | 86 +++++++++++++++++++
 .../InstCombine/float-shrink-compare.ll       | 54 ++++++++++++
 .../InstSimplify/known-never-nan.ll           | 11 +++
 .../InstSimplify/round-intrinsics.ll          | 11 +++
 llvm/test/Transforms/LICM/hoist-round.ll      |  5 +-
 .../Transforms/LoopVectorize/intrinsic.ll     | 52 +++++++++++
 .../Analysis/TargetLibraryInfoTest.cpp        |  3 +
 llvm/unittests/IR/IRBuilderTest.cpp           | 25 ++++++
 35 files changed, 442 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/Generic/fpoperations.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8bcad09964e20..01f41a7ea3f17 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -13160,6 +13160,44 @@ Semantics:
 This function returns the same values as the libm ``round``
 functions would, and handles error conditions in the same way.
 
+'``llvm.roundeven.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.roundeven`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.roundeven.f32(float  %Val)
+      declare double    @llvm.roundeven.f64(double %Val)
+      declare x86_fp80  @llvm.roundeven.f80(x86_fp80  %Val)
+      declare fp128     @llvm.roundeven.f128(fp128 %Val)
+      declare ppc_fp128 @llvm.roundeven.ppcf128(ppc_fp128  %Val)
+
+Overview:
+"""""""""
+
+The '``llvm.roundeven.*``' intrinsics return the operand rounded to the nearest
+integer in floating-point format, rounding halfway cases to even (that is, to
+the nearest value that is an even integer).
+
+Arguments:
+""""""""""
+
+The argument and return value are floating-point numbers of the same type.
+
+Semantics:
+""""""""""
+
+This function implements the IEEE-754 operation ``roundToIntegralTiesToEven``.
+It also behaves in the same way as the C standard function ``roundeven``, except
+that it does not raise floating-point exceptions.
+
+
 '``llvm.lround.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -18174,6 +18212,42 @@ This function returns the same values as the libm ``round`` functions
 would and handles error conditions in the same way.
 
 
+'``llvm.experimental.constrained.roundeven``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.roundeven(<type> <op1>,
+                                               metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.roundeven``' intrinsic returns the first
+operand rounded to the nearest integer in floating-point format, rounding
+halfway cases to even (that is, to the nearest value that is an even integer),
+regardless of the current rounding direction.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second argument specifies the exception behavior as described above.
+
+Semantics:
+""""""""""
+
+This function implements the IEEE-754 operation ``roundToIntegralTiesToEven``.
+It also behaves in the same way as the C standard function ``roundeven`` and can
+signal the invalid operation exception for a signaling NaN operand.
+
+
 '``llvm.experimental.constrained.lround``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
index f782c56d96a56..0022e7b8b5569 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
@@ -1158,6 +1158,15 @@ TLI_DEFINE_STRING_INTERNAL("rmdir")
 /// double round(double x);
 TLI_DEFINE_ENUM_INTERNAL(round)
 TLI_DEFINE_STRING_INTERNAL("round")
+/// double roundeven(double x);
+TLI_DEFINE_ENUM_INTERNAL(roundeven)
+TLI_DEFINE_STRING_INTERNAL("roundeven")
+/// float roundevenf(float x);
+TLI_DEFINE_ENUM_INTERNAL(roundevenf)
+TLI_DEFINE_STRING_INTERNAL("roundevenf")
+/// long double roundevenl(long double x);
+TLI_DEFINE_ENUM_INTERNAL(roundevenl)
+TLI_DEFINE_STRING_INTERNAL("roundevenl")
 /// float roundf(float x);
 TLI_DEFINE_ENUM_INTERNAL(roundf)
 TLI_DEFINE_STRING_INTERNAL("roundf")
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 7866e71853cf3..cc751a5b47898 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1328,6 +1328,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::round:
       ISDs.push_back(ISD::FROUND);
       break;
+    case Intrinsic::roundeven:
+      ISDs.push_back(ISD::FROUNDEVEN);
+      break;
     case Intrinsic::pow:
       ISDs.push_back(ISD::FPOW);
       break;
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 839e82d9d84f7..f081a53263eff 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -376,6 +376,7 @@ enum NodeType {
   STRICT_FCEIL,
   STRICT_FFLOOR,
   STRICT_FROUND,
+  STRICT_FROUNDEVEN,
   STRICT_FTRUNC,
   STRICT_LROUND,
   STRICT_LLROUND,
@@ -752,6 +753,7 @@ enum NodeType {
   FRINT,
   FNEARBYINT,
   FROUND,
+  FROUNDEVEN,
   FFLOOR,
   LROUND,
   LLROUND,
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def
index 9be92b36da9f0..ecba68fe0c0e3 100644
--- a/llvm/include/llvm/IR/ConstrainedOps.def
+++ b/llvm/include/llvm/IR/ConstrainedOps.def
@@ -91,6 +91,7 @@ DAG_FUNCTION(pow,             2, 1, experimental_constrained_pow,        FPOW)
 DAG_FUNCTION(powi,            2, 1, experimental_constrained_powi,       FPOWI)
 DAG_FUNCTION(rint,            1, 1, experimental_constrained_rint,       FRINT)
 DAG_FUNCTION(round,           1, 0, experimental_constrained_round,      FROUND)
+DAG_FUNCTION(roundeven,       1, 0, experimental_constrained_roundeven,  FROUNDEVEN)
 DAG_FUNCTION(sin,             1, 1, experimental_constrained_sin,        FSIN)
 DAG_FUNCTION(sqrt,            1, 1, experimental_constrained_sqrt,       FSQRT)
 DAG_FUNCTION(trunc,           1, 0, experimental_constrained_trunc,      FTRUNC)
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 51df06cee3587..7bfb25b0ed7dd 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -579,6 +579,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
   def int_rint  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_nearbyint : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_round : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_roundeven    : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_canonicalize : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>],
                                    [IntrNoMem]>;
 
@@ -783,6 +784,9 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in {
   def int_experimental_constrained_round : Intrinsic<[ llvm_anyfloat_ty ],
                                                      [ LLVMMatchType<0>,
                                                       llvm_metadata_ty ]>;
+  def int_experimental_constrained_roundeven : Intrinsic<[ llvm_anyfloat_ty ],
+                                                         [ LLVMMatchType<0>,
+                                                           llvm_metadata_ty ]>;
   def int_experimental_constrained_trunc : Intrinsic<[ llvm_anyfloat_ty ],
                                                      [ LLVMMatchType<0>,
                                                        llvm_metadata_ty ]>;
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index fe2c32e3c975e..903db6c704987 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -234,6 +234,11 @@ HANDLE_LIBCALL(ROUND_F64, "round")
 HANDLE_LIBCALL(ROUND_F80, "roundl")
 HANDLE_LIBCALL(ROUND_F128, "roundl")
 HANDLE_LIBCALL(ROUND_PPCF128, "roundl")
+HANDLE_LIBCALL(ROUNDEVEN_F32, "roundevenf")
+HANDLE_LIBCALL(ROUNDEVEN_F64, "roundeven")
+HANDLE_LIBCALL(ROUNDEVEN_F80, "roundevenl")
+HANDLE_LIBCALL(ROUNDEVEN_F128, "roundevenl")
+HANDLE_LIBCALL(ROUNDEVEN_PPCF128, "roundevenl")
 HANDLE_LIBCALL(FLOOR_F32, "floorf")
 HANDLE_LIBCALL(FLOOR_F64, "floor")
 HANDLE_LIBCALL(FLOOR_F80, "floorl")
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 4fdc73cdbe570..7eafc7a6623f7 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1493,6 +1493,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::ceil:
   case Intrinsic::floor:
   case Intrinsic::round:
+  case Intrinsic::roundeven:
   case Intrinsic::trunc:
   case Intrinsic::nearbyint:
   case Intrinsic::rint:
@@ -1501,6 +1502,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::experimental_constrained_ceil:
   case Intrinsic::experimental_constrained_floor:
   case Intrinsic::experimental_constrained_round:
+  case Intrinsic::experimental_constrained_roundeven:
   case Intrinsic::experimental_constrained_trunc:
   case Intrinsic::experimental_constrained_nearbyint:
   case Intrinsic::experimental_constrained_rint:
@@ -1785,6 +1787,11 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       return ConstantFP::get(Ty->getContext(), U);
     }
 
+    if (IntrinsicID == Intrinsic::roundeven) {
+      U.roundToIntegral(APFloat::rmNearestTiesToEven);
+      return ConstantFP::get(Ty->getContext(), U);
+    }
+
     if (IntrinsicID == Intrinsic::ceil) {
       U.roundToIntegral(APFloat::rmTowardPositive);
       return ConstantFP::get(Ty->getContext(), U);
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 15f5a9c672c8d..45850e41f978b 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5061,6 +5061,7 @@ static bool IsIdempotent(Intrinsic::ID ID) {
   case Intrinsic::rint:
   case Intrinsic::nearbyint:
   case Intrinsic::round:
+  case Intrinsic::roundeven:
   case Intrinsic::canonicalize:
     return true;
   }
@@ -5176,6 +5177,7 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
   case Intrinsic::trunc:
   case Intrinsic::ceil:
   case Intrinsic::round:
+  case Intrinsic::roundeven:
   case Intrinsic::nearbyint:
   case Intrinsic::rint: {
     // floor (sitofp x) -> sitofp x
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index cae71d130d79b..336480e8b9d99 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1341,6 +1341,9 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
   case LibFunc_round:
   case LibFunc_roundf:
   case LibFunc_roundl:
+  case LibFunc_roundeven:
+  case LibFunc_roundevenf:
+  case LibFunc_roundevenl:
   case LibFunc_sin:
   case LibFunc_sinf:
   case LibFunc_sinh:
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 1b73a2062095c..545dab7714df7 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3227,6 +3227,10 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
   case LibFunc_roundf:
   case LibFunc_roundl:
     return Intrinsic::round;
+  case LibFunc_roundeven:
+  case LibFunc_roundevenf:
+  case LibFunc_roundevenl:
+    return Intrinsic::roundeven;
   case LibFunc_pow:
   case LibFunc_powf:
   case LibFunc_powl:
@@ -3567,6 +3571,7 @@ bool llvm::isKnownNeverNaN(const Value *V, const TargetLibraryInfo *TLI,
     case Intrinsic::rint:
     case Intrinsic::nearbyint:
     case Intrinsic::round:
+    case Intrinsic::roundeven:
       return isKnownNeverNaN(II->getArgOperand(0), TLI, Depth + 1);
     case Intrinsic::sqrt:
       return isKnownNeverNaN(II->getArgOperand(0), TLI, Depth + 1) &&
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 8a8bb19f36637..23531b65ea32d 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -78,6 +78,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::rint:
   case Intrinsic::nearbyint:
   case Intrinsic::round:
+  case Intrinsic::roundeven:
   case Intrinsic::pow:
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index d6635a6337aa9..e37c21e765977 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -421,6 +421,10 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     ReplaceFPIntrinsicWithCall(CI, "roundf", "round", "roundl");
     break;
   }
+  case Intrinsic::roundeven: {
+    ReplaceFPIntrinsicWithCall(CI, "roundevenf", "roundeven", "roundevenl");
+    break;
+  }
   case Intrinsic::copysign: {
     ReplaceFPIntrinsicWithCall(CI, "copysignf", "copysign", "copysignl");
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 8bf6cb514144b..2ffcc859f8051 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4107,6 +4107,14 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                     RTLIB::ROUND_F128,
                     RTLIB::ROUND_PPCF128, Results);
     break;
+  case ISD::FROUNDEVEN:
+  case ISD::STRICT_FROUNDEVEN:
+    ExpandFPLibCall(Node, RTLIB::ROUNDEVEN_F32,
+                    RTLIB::ROUNDEVEN_F64,
+                    RTLIB::ROUNDEVEN_F80,
+                    RTLIB::ROUNDEVEN_F128,
+                    RTLIB::ROUNDEVEN_PPCF128, Results);
+    break;
   case ISD::FPOWI:
   case ISD::STRICT_FPOWI: {
     RTLIB::Libcall LC;
@@ -4601,6 +4609,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   case ISD::FRINT:
   case ISD::FNEARBYINT:
   case ISD::FROUND:
+  case ISD::FROUNDEVEN:
   case ISD::FTRUNC:
   case ISD::FNEG:
   case ISD::FSQRT:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 37e5abaae3eae..7e8ad28f9b143 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -113,6 +113,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FRINT:       R = SoftenFloatRes_FRINT(N); break;
     case ISD::STRICT_FROUND:
     case ISD::FROUND:      R = SoftenFloatRes_FROUND(N); break;
+    case ISD::STRICT_FROUNDEVEN:
+    case ISD::FROUNDEVEN:  R = SoftenFloatRes_FROUNDEVEN(N); break;
     case ISD::STRICT_FSIN:
     case ISD::FSIN:        R = SoftenFloatRes_FSIN(N); break;
     case ISD::STRICT_FSQRT:
@@ -616,6 +618,15 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
                                               RTLIB::ROUND_PPCF128));
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUNDEVEN(SDNode *N) {
+  return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+                                              RTLIB::ROUNDEVEN_F32,
+                                              RTLIB::ROUNDEVEN_F64,
+                                              RTLIB::ROUNDEVEN_F80,
+                                              RTLIB::ROUNDEVEN_F128,
+                                              RTLIB::ROUNDEVEN_PPCF128));
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
   return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
                                               RTLIB::SIN_F32,
@@ -1178,6 +1189,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::FRINT:      ExpandFloatRes_FRINT(N, Lo, Hi); break;
   case ISD::STRICT_FROUND:
   case ISD::FROUND:     ExpandFloatRes_FROUND(N, Lo, Hi); break;
+  case ISD::STRICT_FROUNDEVEN:
+  case ISD::FROUNDEVEN: ExpandFloatRes_FROUNDEVEN(N, Lo, Hi); break;
   case ISD::STRICT_FSIN:
   case ISD::FSIN:       ExpandFloatRes_FSIN(N, Lo, Hi); break;
   case ISD::STRICT_FSQRT:
@@ -1504,6 +1517,16 @@ void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N,
                                        RTLIB::ROUND_PPCF128), Lo, Hi);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_FROUNDEVEN(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+                                       RTLIB::ROUNDEVEN_F32,
+                                       RTLIB::ROUNDEVEN_F64,
+                                       RTLIB::ROUNDEVEN_F80,
+                                       RTLIB::ROUNDEVEN_F128,
+                                       RTLIB::ROUNDEVEN_PPCF128), Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
                                            SDValue &Lo, SDValue &Hi) {
   ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
@@ -2136,6 +2159,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FNEG:
     case ISD::FRINT:
     case ISD::FROUND:
+    case ISD::FROUNDEVEN:
     case ISD::FSIN:
     case ISD::FSQRT:
     case ISD::FTRUNC:
@@ -2476,6 +2500,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
   case ISD::FREEZE:
   case ISD::FRINT:
   case ISD::FROUND:
+  case ISD::FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index b729565ef7e73..4bc75ceb4928e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -530,6 +530,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftenFloatRes_FREM(SDNode *N);
   SDValue SoftenFloatRes_FRINT(SDNode *N);
   SDValue SoftenFloatRes_FROUND(SDNode *N);
+  SDValue SoftenFloatRes_FROUNDEVEN(SDNode *N);
   SDValue SoftenFloatRes_FSIN(SDNode *N);
   SDValue SoftenFloatRes_FSQRT(SDNode *N);
   SDValue SoftenFloatRes_FSUB(SDNode *N);
@@ -603,6 +604,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandFloatRes_FREM      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FRINT     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FROUND    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FROUNDEVEN(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSIN      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSQRT     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSUB      (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8f746ec45f6c6..93ce338ff2327 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -427,6 +427,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FRINT:
   case ISD::FNEARBYINT:
   case ISD::FROUND:
+  case ISD::FROUNDEVEN:
   case ISD::FFLOOR:
   case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 6601dc68223eb..ff2c8d3a8db22 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -95,6 +95,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_UINT:
   case ISD::FRINT:
   case ISD::FROUND:
+  case ISD::FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
@@ -888,6 +889,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_UINT:
   case ISD::FRINT:
   case ISD::FROUND:
+  case ISD::FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
@@ -2825,6 +2827,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FNEARBYINT:
   case ISD::FRINT:
   case ISD::FROUND:
+  case ISD::FROUNDEVEN:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2dcab73b177b7..cfb15d6ca9d7c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4103,6 +4103,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
   case ISD::FFLOOR:
   case ISD::FCEIL:
   case ISD::FROUND:
+  case ISD::FROUNDEVEN:
   case ISD::FRINT:
   case ISD::FNEARBYINT: {
     if (SNaN)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index efdf696f87944..dd03e415910cc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6072,6 +6072,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::rint:
   case Intrinsic::nearbyint:
   case Intrinsic::round:
+  case Intrinsic::roundeven:
   case Intrinsic::canonicalize: {
     unsigned Opcode;
     switch (Intrinsic) {
@@ -6086,6 +6087,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
     case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
     case Intrinsic::round:     Opcode = ISD::FROUND;     break;
+    case Intrinsic::roundeven: Opcode = ISD::FROUNDEVEN; break;
     case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
     }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 816b1dcded2e8..7f9b8b7b28a38 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -211,6 +211,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::STRICT_FNEARBYINT:          return "strict_fnearbyint";
   case ISD::FROUND:                     return "fround";
   case ISD::STRICT_FROUND:              return "strict_fround";
+  case ISD::FROUNDEVEN:                 return "froundeven";
+  case ISD::STRICT_FROUNDEVEN:          return "strict_froundeven";
   case ISD::FEXP:                       return "fexp";
   case ISD::STRICT_FEXP:                return "strict_fexp";
   case ISD::FEXP2:                      return "fexp2";
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b8062672efec9..62c3af95f9528 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -693,6 +693,7 @@ void TargetLoweringBase::initActions() {
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
+    setOperationAction(ISD::FROUNDEVEN, VT, Expand);
     setOperationAction(ISD::FPOWI, VT, Expand);
 
     // These operations default to expand for vector types.
@@ -758,6 +759,7 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::FRINT,      VT, Expand);
     setOperationAction(ISD::FTRUNC,     VT, Expand);
     setOperationAction(ISD::FROUND,     VT, Expand);
+    setOperationAction(ISD::FROUNDEVEN, VT, Expand);
     setOperationAction(ISD::LROUND,     VT, Expand);
     setOperationAction(ISD::LLROUND,    VT, Expand);
     setOperationAction(ISD::LRINT,      VT, Expand);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 118013a387647..7e20d241bbab5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2422,6 +2422,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::ceil:
   case Intrinsic::floor:
   case Intrinsic::round:
+  case Intrinsic::roundeven:
   case Intrinsic::nearbyint:
   case Intrinsic::rint:
   case Intrinsic::trunc: {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index c68f9e8980071..714d1ae8aaec3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1741,6 +1741,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
     case Intrinsic::nearbyint:
     case Intrinsic::rint:
     case Intrinsic::round:
+    case Intrinsic::roundeven:
     case Intrinsic::trunc: {
       Value *Src = II->getArgOperand(0);
       if (!Src->hasOneUse())
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 828f4ee5bbe46..c32db981ee7c2 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2930,6 +2930,8 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
     return replaceUnaryCall(CI, Builder, Intrinsic::floor);
   case LibFunc_round:
     return replaceUnaryCall(CI, Builder, Intrinsic::round);
+  case LibFunc_roundeven:
+    return replaceUnaryCall(CI, Builder, Intrinsic::roundeven);
   case LibFunc_nearbyint:
     return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
   case LibFunc_rint:
diff --git a/llvm/test/CodeGen/Generic/fpoperations.ll b/llvm/test/CodeGen/Generic/fpoperations.ll
new file mode 100644
index 0000000000000..53dd307db2492
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/fpoperations.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s | FileCheck %s
+
+; This test checks the default lowering of intrinsics that operate on
+; floating-point values. MSP430 is used as the target because it does not have
+; native FP support, so it won't get custom lowering for these intrinsics.
+;
+; REQUIRES: msp430-registered-target
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
+target triple = "msp430---elf"
+
+
+define float @roundeven_01(float %x) {
+entry:
+  %res = call float @llvm.roundeven.f32(float %x)
+  ret float %res
+}
+; CHECK-LABEL: roundeven_01:
+; CHECK: call #roundeven
+
+declare float @llvm.roundeven.f32(float %x)
diff --git a/llvm/test/ExecutionEngine/Interpreter/intrinsics.ll b/llvm/test/ExecutionEngine/Interpreter/intrinsics.ll
index 49d0bbee30484..468b6b7ab24eb 100644
--- a/llvm/test/ExecutionEngine/Interpreter/intrinsics.ll
+++ b/llvm/test/ExecutionEngine/Interpreter/intrinsics.ll
@@ -13,6 +13,8 @@ declare float  @llvm.trunc.f32(float)
 declare double @llvm.trunc.f64(double)
 declare float  @llvm.round.f32(float)
 declare double @llvm.round.f64(double)
+declare float  @llvm.roundeven.f32(float)
+declare double @llvm.roundeven.f64(double)
 declare float  @llvm.copysign.f32(float, float)
 declare double @llvm.copysign.f64(double, double)
 
@@ -29,6 +31,8 @@ define i32 @main() {
   %trunc64 = call double @llvm.trunc.f64(double 0.000000e+00)
   %round32 = call float @llvm.round.f32(float 0.000000e+00)
   %round64 = call double @llvm.round.f64(double 0.000000e+00)
+  %roundeven32 = call float @llvm.roundeven.f32(float 0.000000e+00)
+  %roundeven64 = call double @llvm.roundeven.f64(double 0.000000e+00)
   %copysign32 = call float @llvm.copysign.f32(float 0.000000e+00, float 0.000000e+00)
   %copysign64 = call double @llvm.copysign.f64(double 0.000000e+00, double 0.000000e+00)
   ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
index 76e497bd68fc7..3a8f224b1d814 100644
--- a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
+++ b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
@@ -10,6 +10,7 @@
 declare double @floor(double)
 declare double @ceil(double)
 declare double @round(double)
+declare double @roundeven(double)
 declare double @nearbyint(double)
 declare double @trunc(double)
 declare double @fabs(double)
@@ -32,6 +33,9 @@ declare <2 x float> @llvm.rint.v2f32(<2 x float>)
 declare double @llvm.round.f64(double)
 declare <2 x double> @llvm.round.v2f64(<2 x double>)
 
+declare double @llvm.roundeven.f64(double)
+declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
+
 declare double @llvm.trunc.f64(double)
 declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
 
@@ -71,6 +75,18 @@ define float @test_shrink_libcall_round(float %C) {
   ret float %F
 }
 
+define float @test_shrink_libcall_roundeven(float %C) {
+; CHECK-LABEL: @test_shrink_libcall_roundeven(
+; CHECK-NEXT:    [[F:%.*]] = call float @llvm.roundeven.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[F]]
+;
+  %D = fpext float %C to double
+  ; --> roundeven
+  %E = call double @roundeven(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
 define float @test_shrink_libcall_nearbyint(float %C) {
 ; CHECK-LABEL: @test_shrink_libcall_nearbyint(
 ; CHECK-NEXT:    [[F:%.*]] = call float @llvm.nearbyint.f32(float [[C:%.*]])
@@ -186,6 +202,17 @@ define float @test_shrink_intrin_round(float %C) {
   ret float %F
 }
 
+define float @test_shrink_intrin_roundeven(float %C) {
+; CHECK-LABEL: @test_shrink_intrin_roundeven(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.roundeven.f32(float [[C:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %D = fpext float %C to double
+  %E = call double @llvm.roundeven.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
 define float @test_shrink_intrin_trunc(float %C) {
 ; CHECK-LABEL: @test_shrink_intrin_trunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.trunc.f32(float [[C:%.*]])
@@ -292,6 +319,23 @@ define <2 x float> @test_shrink_intrin_round_multi_use(<2 x float> %C) {
   ret <2 x float> %F
 }
 
+define <2 x float> @test_shrink_intrin_roundeven_multi_use(<2 x float> %C) {
+; CHECK-LABEL: @test_shrink_intrin_roundeven_multi_use(
+; CHECK-NEXT:    [[D:%.*]] = fpext <2 x float> [[C:%.*]] to <2 x double>
+; CHECK-NEXT:    [[E:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> [[D]])
+; CHECK-NEXT:    [[F:%.*]] = fptrunc <2 x double> [[E]] to <2 x float>
+; CHECK-NEXT:    call void @use_v2f64(<2 x double> [[D]])
+; CHECK-NEXT:    call void @use_v2f64(<2 x double> [[E]])
+; CHECK-NEXT:    ret <2 x float> [[F]]
+;
+  %D = fpext <2 x float> %C to <2 x double>
+  %E = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %D)
+  %F = fptrunc <2 x double> %E to <2 x float>
+  call void @use_v2f64(<2 x double> %D)
+  call void @use_v2f64(<2 x double> %E)
+  ret <2 x float> %F
+}
+
 define <2 x float> @test_shrink_intrin_trunc_multi_use(<2 x float> %C) {
 ; CHECK-LABEL: @test_shrink_intrin_trunc_multi_use(
 ; CHECK-NEXT:    [[D:%.*]] = fpext <2 x float> [[C:%.*]] to <2 x double>
@@ -352,6 +396,17 @@ define float @test_no_shrink_intrin_round(double %D) {
   ret float %F
 }
 
+define float @test_no_shrink_intrin_roundeven(double %D) {
+; CHECK-LABEL: @test_no_shrink_intrin_roundeven(
+; CHECK-NEXT:    [[E:%.*]] = call double @llvm.roundeven.f64(double [[D:%.*]])
+; CHECK-NEXT:    [[F:%.*]] = fptrunc double [[E]] to float
+; CHECK-NEXT:    ret float [[F]]
+;
+  %E = call double @llvm.roundeven.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
 define float @test_no_shrink_intrin_nearbyint(double %D) {
 ; CHECK-LABEL: @test_no_shrink_intrin_nearbyint(
 ; CHECK-NEXT:    [[E:%.*]] = call double @llvm.nearbyint.f64(double [[D:%.*]])
@@ -424,6 +479,15 @@ define float @test_shrink_float_convertible_constant_intrin_round() {
   ret float %F
 }
 
+define float @test_shrink_float_convertible_constant_intrin_roundeven() {
+; CHECK-LABEL: @test_shrink_float_convertible_constant_intrin_roundeven(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %E = call double @llvm.roundeven.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
 define float @test_shrink_float_convertible_constant_intrin_nearbyint() {
 ; CHECK-LABEL: @test_shrink_float_convertible_constant_intrin_nearbyint(
 ; CHECK-NEXT:    ret float 2.000000e+00
@@ -494,6 +558,17 @@ define half @test_no_shrink_mismatched_type_intrin_round(double %D) {
   ret half %F
 }
 
+define half @test_no_shrink_mismatched_type_intrin_roundeven(double %D) {
+; CHECK-LABEL: @test_no_shrink_mismatched_type_intrin_roundeven(
+; CHECK-NEXT:    [[E:%.*]] = call double @llvm.roundeven.f64(double [[D:%.*]])
+; CHECK-NEXT:    [[F:%.*]] = fptrunc double [[E]] to half
+; CHECK-NEXT:    ret half [[F]]
+;
+  %E = call double @llvm.roundeven.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
 define half @test_no_shrink_mismatched_type_intrin_nearbyint(double %D) {
 ; CHECK-LABEL: @test_no_shrink_mismatched_type_intrin_nearbyint(
 ; CHECK-NEXT:    [[E:%.*]] = call double @llvm.nearbyint.f64(double [[D:%.*]])
@@ -573,6 +648,17 @@ define <2 x double> @test_shrink_intrin_round_fp16_vec(<2 x half> %C) {
   ret <2 x double> %E
 }
 
+define <2 x double> @test_shrink_intrin_roundeven_fp16_vec(<2 x half> %C) {
+; CHECK-LABEL: @test_shrink_intrin_roundeven_fp16_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.roundeven.v2f16(<2 x half> [[C:%.*]])
+; CHECK-NEXT:    [[E:%.*]] = fpext <2 x half> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    ret <2 x double> [[E]]
+;
+  %D = fpext <2 x  half> %C to <2 x double>
+  %E = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %D)
+  ret <2 x double> %E
+}
+
 define float @test_shrink_intrin_nearbyint_fp16_src(half %C) {
 ; CHECK-LABEL: @test_shrink_intrin_nearbyint_fp16_src(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call half @llvm.nearbyint.f16(half [[C:%.*]])
diff --git a/llvm/test/Transforms/InstCombine/float-shrink-compare.ll b/llvm/test/Transforms/InstCombine/float-shrink-compare.ll
index ca2f6d1c23cb2..aa0dd5e3007d8 100644
--- a/llvm/test/Transforms/InstCombine/float-shrink-compare.ll
+++ b/llvm/test/Transforms/InstCombine/float-shrink-compare.ll
@@ -160,6 +160,32 @@ define i1 @test6_intrin(float %x, float %y) {
   ret i1 %cmp
 }
 
+define i1 @test6a(float %x, float %y) {
+; CHECK-LABEL: @test6a(
+; CHECK-NEXT:    [[ROUND:%.*]] = call float @llvm.roundeven.f32(float %x)
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ROUND]], %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %x.ext = fpext float %x to double
+  %round = call double @roundeven(double %x.ext) nounwind readnone
+  %y.ext = fpext float %y to double
+  %cmp = fcmp oeq double %round, %y.ext
+  ret i1 %cmp
+}
+
+define i1 @test6a_intrin(float %x, float %y) {
+; CHECK-LABEL: @test6a_intrin(
+; CHECK-NEXT:    [[ROUND:%.*]] = call float @llvm.roundeven.f32(float %x)
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ROUND]], %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %x.ext = fpext float %x to double
+  %round = call double @llvm.roundeven.f64(double %x.ext) nounwind readnone
+  %y.ext = fpext float %y to double
+  %cmp = fcmp oeq double %round, %y.ext
+  ret i1 %cmp
+}
+
 define i1 @test7(float %x, float %y) {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:    [[TRUNC:%.*]] = call float @llvm.trunc.f32(float %x)
@@ -329,6 +355,32 @@ define i1 @test13_intrin(float %x, float %y) {
   ret i1 %cmp
 }
 
+define i1 @test13a(float %x, float %y) {
+; CHECK-LABEL: @test13a(
+; CHECK-NEXT:    [[ROUND:%.*]] = call float @llvm.roundeven.f32(float %x)
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ROUND]], %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %x.ext = fpext float %x to double
+  %y.ext = fpext float %y to double
+  %round = call double @roundeven(double %x.ext) nounwind readnone
+  %cmp = fcmp oeq double %y.ext, %round
+  ret i1 %cmp
+}
+
+define i1 @test13a_intrin(float %x, float %y) {
+; CHECK-LABEL: @test13a_intrin(
+; CHECK-NEXT:    [[ROUND:%.*]] = call float @llvm.roundeven.f32(float %x)
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ROUND]], %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %x.ext = fpext float %x to double
+  %y.ext = fpext float %y to double
+  %round = call double @llvm.roundeven.f64(double %x.ext) nounwind readnone
+  %cmp = fcmp oeq double %y.ext, %round
+  ret i1 %cmp
+}
+
 define i1 @test14(float %x, float %y) {
 ; CHECK-LABEL: @test14(
 ; CHECK-NEXT:    [[TRUNC:%.*]] = call float @llvm.trunc.f32(float %x)
@@ -462,6 +514,7 @@ declare double @floor(double) nounwind readnone
 declare double @nearbyint(double) nounwind readnone
 declare double @rint(double) nounwind readnone
 declare double @round(double) nounwind readnone
+declare double @roundeven(double) nounwind readnone
 declare double @trunc(double) nounwind readnone
 declare double @fmin(double, double) nounwind readnone
 declare double @fmax(double, double) nounwind readnone
@@ -471,4 +524,5 @@ declare double @llvm.ceil.f64(double) nounwind readnone
 declare double @llvm.floor.f64(double) nounwind readnone
 declare double @llvm.nearbyint.f64(double) nounwind readnone
 declare double @llvm.round.f64(double) nounwind readnone
+declare double @llvm.roundeven.f64(double) nounwind readnone
 declare double @llvm.trunc.f64(double) nounwind readnone
diff --git a/llvm/test/Transforms/InstSimplify/known-never-nan.ll b/llvm/test/Transforms/InstSimplify/known-never-nan.ll
index 109775607bcc3..c2c26e6ee975f 100644
--- a/llvm/test/Transforms/InstSimplify/known-never-nan.ll
+++ b/llvm/test/Transforms/InstSimplify/known-never-nan.ll
@@ -147,6 +147,16 @@ define i1 @round_nnan_src(double %arg) {
   ret i1 %tmp
 }
 
+define i1 @roundeven_nnan_src(double %arg) {
+; CHECK-LABEL: @roundeven_nnan_src(
+; CHECK-NEXT:    ret i1 false
+;
+  %nnan = fadd nnan double %arg, 1.0
+  %op = call double @llvm.roundeven.f64(double %nnan)
+  %tmp = fcmp uno double %op, %op
+  ret i1 %tmp
+}
+
 define i1 @known_nan_select(i1 %cond, double %arg0, double %arg1) {
 ; CHECK-LABEL: @known_nan_select(
 ; CHECK-NEXT:    ret i1 true
@@ -416,3 +426,4 @@ declare double @llvm.trunc.f64(double)
 declare double @llvm.rint.f64(double)
 declare double @llvm.nearbyint.f64(double)
 declare double @llvm.round.f64(double)
+declare double @llvm.roundeven.f64(double)
diff --git a/llvm/test/Transforms/InstSimplify/round-intrinsics.ll b/llvm/test/Transforms/InstSimplify/round-intrinsics.ll
index 42c78e000acd2..3b63bd6be6f78 100644
--- a/llvm/test/Transforms/InstSimplify/round-intrinsics.ll
+++ b/llvm/test/Transforms/InstSimplify/round-intrinsics.ll
@@ -81,6 +81,16 @@ define float @uitofp_round(i32 %arg) {
   ret float %round
 }
 
+define float @uitofp_roundeven(i32 %arg) {
+; CHECK-LABEL: @uitofp_roundeven(
+; CHECK-NEXT:    [[CVT:%.*]] = uitofp i32 [[ARG:%.*]] to float
+; CHECK-NEXT:    ret float [[CVT]]
+;
+  %cvt = uitofp i32 %arg to float
+  %round = call float @llvm.roundeven.f32(float %cvt)
+  ret float %round
+}
+
 define float @sitofp_nearbyint(i32 %arg) {
 ; CHECK-LABEL: @sitofp_nearbyint(
 ; CHECK-NEXT:    [[CVT:%.*]] = sitofp i32 [[ARG:%.*]] to float
@@ -125,6 +135,7 @@ declare float @llvm.floor.f32(float) #0
 declare float @llvm.trunc.f32(float) #0
 declare float @llvm.ceil.f32(float) #0
 declare float @llvm.round.f32(float) #0
+declare float @llvm.roundeven.f32(float) #0
 declare float @llvm.nearbyint.f32(float) #0
 declare float @llvm.rint.f32(float) #0
 
diff --git a/llvm/test/Transforms/LICM/hoist-round.ll b/llvm/test/Transforms/LICM/hoist-round.ll
index 10f75be4d3270..c48847b40dbc2 100644
--- a/llvm/test/Transforms/LICM/hoist-round.ll
+++ b/llvm/test/Transforms/LICM/hoist-round.ll
@@ -20,6 +20,7 @@ target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:3
 ; CHECK: call float @llvm.minnum.f32
 ; CHECK: call float @llvm.maxnum.f32
 ; CHECK: call float @llvm.powi.f32
+; CHECK: call float @llvm.roundeven.f32
 ; CHECK: for.body:
 
 define void @test(float %arg1, float %arg2) {
@@ -45,7 +46,8 @@ for.body:
   %tmp.11 = call float @llvm.minimum.f32(float %tmp.10, float %arg2)
   %tmp.12 = call float @llvm.maximum.f32(float %tmp.11, float %arg2)
   %tmp.13 = call float @llvm.powi.f32(float %tmp.12, i32 4)
-  call void @consume(float %tmp.13)
+  %tmp.14 = call float @llvm.roundeven.f32(float %tmp.13)
+  call void @consume(float %tmp.14)
   %IND.new = add i32 %IND, 1
   br label %for.head
 
@@ -68,3 +70,4 @@ declare float @llvm.maxnum.f32(float, float)
 declare float @llvm.minimum.f32(float, float)
 declare float @llvm.maximum.f32(float, float)
 declare float @llvm.powi.f32(float, i32)
+declare float @llvm.roundeven.f32(float)
diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
index 50cdb73ae8ec9..c2036c611334d 100644
--- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
@@ -832,6 +832,58 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.round.f64(double) nounwind readnone
 
+;CHECK-LABEL: @roundeven_f32(
+;CHECK: llvm.roundeven.v4f32
+;CHECK: ret void
+define void @roundeven_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.roundeven.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.roundeven.f32(float) nounwind readnone
+
+;CHECK-LABEL: @roundeven_f64(
+;CHECK: llvm.roundeven.v4f64
+;CHECK: ret void
+define void @roundeven_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %call = tail call double @llvm.roundeven.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.roundeven.f64(double) nounwind readnone
+
 ;CHECK-LABEL: @fma_f32(
 ;CHECK: llvm.fma.v4f32
 ;CHECK: ret void
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
index f21081467c1eb..bd5fefc013e88 100644
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -276,6 +276,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
       "declare double @round(double)\n"
       "declare float @roundf(float)\n"
       "declare x86_fp80 @roundl(x86_fp80)\n"
+      "declare double @roundeven(double)\n"
+      "declare float @roundevenf(float)\n"
+      "declare x86_fp80 @roundevenl(x86_fp80)\n"
       "declare i32 @scanf(i8*, ...)\n"
       "declare void @setbuf(%struct*, i8*)\n"
       "declare i32 @setitimer(i32, %struct*, %struct*)\n"
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 5af5bc87944b6..bf230597a589f 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -121,6 +121,12 @@ TEST_F(IRBuilderTest, Intrinsics) {
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
+
+  Call = Builder.CreateUnaryIntrinsic(Intrinsic::roundeven, V);
+  II = cast<IntrinsicInst>(Call);
+  EXPECT_EQ(II->getIntrinsicID(), Intrinsic::roundeven);
+  EXPECT_FALSE(II->hasNoInfs());
+  EXPECT_FALSE(II->hasNoNaNs());
 }
 
 TEST_F(IRBuilderTest, IntrinsicsWithScalableVectors) {
@@ -307,6 +313,25 @@ TEST_F(IRBuilderTest, ConstrainedFP) {
   EXPECT_FALSE(verifyModule(*M));
 }
 
+TEST_F(IRBuilderTest, ConstrainedFPIntrinsics) {
+  // A constrained roundeven call built through the IRBuilder must keep its
+  // intrinsic ID and report the exception behavior set as the builder default.
+  IRBuilder<> Builder(BB);
+  Type *DoubleTy = Type::getDoubleTy(Ctx);
+  auto *GVDouble = new GlobalVariable(*M, DoubleTy, true,
+                                      GlobalValue::ExternalLinkage, nullptr);
+  Value *VDouble = Builder.CreateLoad(GVDouble->getValueType(), GVDouble);
+  Builder.setDefaultConstrainedExcept(fp::ebStrict);
+  Builder.setDefaultConstrainedRounding(RoundingMode::TowardZero);
+  Function *Fn = Intrinsic::getDeclaration(
+      M.get(), Intrinsic::experimental_constrained_roundeven, {DoubleTy});
+  Value *V = Builder.CreateConstrainedFPCall(Fn, {VDouble});
+  auto *CII = cast<ConstrainedFPIntrinsic>(V);
+  EXPECT_EQ(Intrinsic::experimental_constrained_roundeven,
+            CII->getIntrinsicID());
+  EXPECT_EQ(fp::ebStrict, CII->getExceptionBehavior());
+}
+
 TEST_F(IRBuilderTest, Lifetime) {
   IRBuilder<> Builder(BB);
   AllocaInst *Var1 = Builder.CreateAlloca(Builder.getInt8Ty());

From 6c906f7785dad3a1dea5357cfde0762952c2a2bd Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Tue, 26 May 2020 11:30:27 +0100
Subject: [PATCH 092/770] [Sema] Diagnose more cases of static data members in
 local or unnamed classes

We currently diagnose static data members directly contained in unnamed classes,
but we should also diagnose when they're in a class that is nested (directly or
indirectly) in an unnamed class. Do this by iterating up the list of parent
DeclContexts and checking if any is an unnamed class.

Similarly also check for function or method DeclContexts (which includes things
like blocks and openmp captured statements) as then the class is considered to
be a local class, which means static data members aren't allowed.

Differential Revision: https://reviews.llvm.org/D80295
---
 clang/lib/Sema/SemaDecl.cpp             | 30 +++++++++++++++++++------
 clang/test/OpenMP/for_loop_messages.cpp | 10 +++++++++
 clang/test/SemaCXX/anonymous-struct.cpp | 18 +++++++++++++++
 clang/test/SemaCXX/blocks.cpp           | 13 +++++++++++
 4 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 74a4fd8a06de3..6fe48c860864b 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -6885,18 +6885,34 @@ NamedDecl *Sema::ActOnVariableDeclarator(
 
     if (SC == SC_Static && CurContext->isRecord()) {
       if (const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(DC)) {
-        // C++ [class.static.data]p2:
-        //   A static data member shall not be a direct member of an unnamed
-        //   or local class
-        // FIXME: or of a (possibly indirectly) nested class thereof.
-        if (RD->isLocalClass()) {
+        // Walk up the enclosing DeclContexts to check for any that are
+        // incompatible with static data members.
+        const DeclContext *FunctionOrMethod = nullptr;
+        const CXXRecordDecl *AnonStruct = nullptr;
+        for (DeclContext *Ctxt = DC; Ctxt; Ctxt = Ctxt->getParent()) {
+          if (Ctxt->isFunctionOrMethod()) {
+            FunctionOrMethod = Ctxt;
+            break;
+          }
+          const CXXRecordDecl *ParentDecl = dyn_cast<CXXRecordDecl>(Ctxt);
+          if (ParentDecl && !ParentDecl->getDeclName()) {
+            AnonStruct = ParentDecl;
+            break;
+          }
+        }
+        if (FunctionOrMethod) {
+          // C++ [class.static.data]p5: A local class shall not have static data
+          // members.
           Diag(D.getIdentifierLoc(),
                diag::err_static_data_member_not_allowed_in_local_class)
             << Name << RD->getDeclName() << RD->getTagKind();
-        } else if (!RD->getDeclName()) {
+        } else if (AnonStruct) {
+          // C++ [class.static.data]p4: Unnamed classes and classes contained
+          // directly or indirectly within unnamed classes shall not contain
+          // static data members.
           Diag(D.getIdentifierLoc(),
                diag::err_static_data_member_not_allowed_in_anon_struct)
-            << Name << RD->getTagKind();
+            << Name << AnonStruct->getTagKind();
           Invalid = true;
         } else if (RD->isUnion()) {
           // C++98 [class.union]p1: If a union contains a static data member,
diff --git a/clang/test/OpenMP/for_loop_messages.cpp b/clang/test/OpenMP/for_loop_messages.cpp
index 73c69ede6d120..087db755273a2 100644
--- a/clang/test/OpenMP/for_loop_messages.cpp
+++ b/clang/test/OpenMP/for_loop_messages.cpp
@@ -831,3 +831,13 @@ void test_nowait() {
   for (int i = 0; i < 16; ++i)
     ;
 }
+
+void test_static_data_member() {
+#pragma omp parallel
+#pragma omp for
+  for (int i = 0; i < 16; ++i) {
+    class X {
+      static int x; // expected-error {{static data member 'x' not allowed in local class 'X'}}
+    };
+  }
+}
diff --git a/clang/test/SemaCXX/anonymous-struct.cpp b/clang/test/SemaCXX/anonymous-struct.cpp
index 10f6711dd340a..333b8f724f4e1 100644
--- a/clang/test/SemaCXX/anonymous-struct.cpp
+++ b/clang/test/SemaCXX/anonymous-struct.cpp
@@ -153,3 +153,21 @@ typedef struct {
   const Empty E;
 } C;
 } // namespace ImplicitDecls
+
+struct {
+  static int x; // expected-error {{static data member 'x' not allowed in anonymous struct}}
+} static_member_1;
+
+class {
+  struct A {
+    static int x; // expected-error {{static data member 'x' not allowed in anonymous class}}
+  } x;
+} static_member_2;
+
+union {
+  struct A {
+    struct B {
+      static int x; // expected-error {{static data member 'x' not allowed in anonymous union}}
+    } x;
+  } x;
+} static_member_3;
diff --git a/clang/test/SemaCXX/blocks.cpp b/clang/test/SemaCXX/blocks.cpp
index aacf63cfab420..5d0aa2af73601 100644
--- a/clang/test/SemaCXX/blocks.cpp
+++ b/clang/test/SemaCXX/blocks.cpp
@@ -153,3 +153,16 @@ void f() {
   auto some_block = ^{ (void)s; };
 }
 }
+
+void static_data_member() {
+  auto block = ^{
+    class X {
+      static int x; // expected-error {{static data member 'x' not allowed in local class 'X'}}
+    };
+    class Y {
+      struct Z {
+        static int z; // expected-error {{static data member 'z' not allowed in local struct 'Z'}}
+      };
+    };
+  };
+}

From 049c16ba93fa77df7984353b1a0124ed64fc0439 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Tue, 26 May 2020 13:23:23 +0100
Subject: [PATCH 093/770] [ARM] MVE VMINV/VMAXV test additions. NFC

---
 llvm/test/CodeGen/Thumb2/mve-vmaxv.ll | 384 ++++++++++++++++++++++++--
 1 file changed, 360 insertions(+), 24 deletions(-)

diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
index f96c2f422a3fa..36c201cced56c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
@@ -14,8 +14,8 @@ declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
 declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
 declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
 
-define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8_i32(<16 x i8> %s1) {
-; CHECK-LABEL: vmaxv_s_v16i8_i32:
+define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8(<16 x i8> %s1) {
+; CHECK-LABEL: vmaxv_s_v16i8:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    mvn r0, #127
 ; CHECK-NEXT:    vmaxv.s8 r0, q0
@@ -24,8 +24,8 @@ define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8_i32(<16 x i8> %s1) {
   ret i8 %r
 }
 
-define arm_aapcs_vfpcc i16 @vmaxv_s_v8i16_i32(<8 x i16> %s1) {
-; CHECK-LABEL: vmaxv_s_v8i16_i32:
+define arm_aapcs_vfpcc i16 @vmaxv_s_v8i16(<8 x i16> %s1) {
+; CHECK-LABEL: vmaxv_s_v8i16:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movw r0, #32768
 ; CHECK-NEXT:    movt r0, #65535
@@ -35,8 +35,8 @@ define arm_aapcs_vfpcc i16 @vmaxv_s_v8i16_i32(<8 x i16> %s1) {
   ret i16 %r
 }
 
-define arm_aapcs_vfpcc i32 @vmaxv_s_v4i32_i32(<4 x i32> %s1) {
-; CHECK-LABEL: vmaxv_s_v4i32_i32:
+define arm_aapcs_vfpcc i32 @vmaxv_s_v4i32(<4 x i32> %s1) {
+; CHECK-LABEL: vmaxv_s_v4i32:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    mov.w r0, #-2147483648
 ; CHECK-NEXT:    vmaxv.s32 r0, q0
@@ -45,8 +45,8 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v4i32_i32(<4 x i32> %s1) {
   ret i32 %r
 }
 
-define arm_aapcs_vfpcc i8 @vmaxv_u_v16i8_i32(<16 x i8> %s1) {
-; CHECK-LABEL: vmaxv_u_v16i8_i32:
+define arm_aapcs_vfpcc i8 @vmaxv_u_v16i8(<16 x i8> %s1) {
+; CHECK-LABEL: vmaxv_u_v16i8:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    vmaxv.u8 r0, q0
@@ -55,8 +55,8 @@ define arm_aapcs_vfpcc i8 @vmaxv_u_v16i8_i32(<16 x i8> %s1) {
   ret i8 %r
 }
 
-define arm_aapcs_vfpcc i16 @vmaxv_u_v8i16_i32(<8 x i16> %s1) {
-; CHECK-LABEL: vmaxv_u_v8i16_i32:
+define arm_aapcs_vfpcc i16 @vmaxv_u_v8i16(<8 x i16> %s1) {
+; CHECK-LABEL: vmaxv_u_v8i16:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    vmaxv.u16 r0, q0
@@ -65,8 +65,8 @@ define arm_aapcs_vfpcc i16 @vmaxv_u_v8i16_i32(<8 x i16> %s1) {
   ret i16 %r
 }
 
-define arm_aapcs_vfpcc i32 @vmaxv_u_v4i32_i32(<4 x i32> %s1) {
-; CHECK-LABEL: vmaxv_u_v4i32_i32:
+define arm_aapcs_vfpcc i32 @vmaxv_u_v4i32(<4 x i32> %s1) {
+; CHECK-LABEL: vmaxv_u_v4i32:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    vmaxv.u32 r0, q0
@@ -75,8 +75,8 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v4i32_i32(<4 x i32> %s1) {
   ret i32 %r
 }
 
-define arm_aapcs_vfpcc i8 @vminv_s_v16i8_i32(<16 x i8> %s1) {
-; CHECK-LABEL: vminv_s_v16i8_i32:
+define arm_aapcs_vfpcc i8 @vminv_s_v16i8(<16 x i8> %s1) {
+; CHECK-LABEL: vminv_s_v16i8:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movs r0, #127
 ; CHECK-NEXT:    vminv.s8 r0, q0
@@ -85,8 +85,8 @@ define arm_aapcs_vfpcc i8 @vminv_s_v16i8_i32(<16 x i8> %s1) {
   ret i8 %r
 }
 
-define arm_aapcs_vfpcc i16 @vminv_s_v8i16_i32(<8 x i16> %s1) {
-; CHECK-LABEL: vminv_s_v8i16_i32:
+define arm_aapcs_vfpcc i16 @vminv_s_v8i16(<8 x i16> %s1) {
+; CHECK-LABEL: vminv_s_v8i16:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movw r0, #32767
 ; CHECK-NEXT:    vminv.s16 r0, q0
@@ -95,8 +95,8 @@ define arm_aapcs_vfpcc i16 @vminv_s_v8i16_i32(<8 x i16> %s1) {
   ret i16 %r
 }
 
-define arm_aapcs_vfpcc i32 @vminv_s_v4i32_i32(<4 x i32> %s1) {
-; CHECK-LABEL: vminv_s_v4i32_i32:
+define arm_aapcs_vfpcc i32 @vminv_s_v4i32(<4 x i32> %s1) {
+; CHECK-LABEL: vminv_s_v4i32:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    mvn r0, #-2147483648
 ; CHECK-NEXT:    vminv.s32 r0, q0
@@ -105,8 +105,8 @@ define arm_aapcs_vfpcc i32 @vminv_s_v4i32_i32(<4 x i32> %s1) {
   ret i32 %r
 }
 
-define arm_aapcs_vfpcc i8 @vminv_u_v16i8_i32(<16 x i8> %s1) {
-; CHECK-LABEL: vminv_u_v16i8_i32:
+define arm_aapcs_vfpcc i8 @vminv_u_v16i8(<16 x i8> %s1) {
+; CHECK-LABEL: vminv_u_v16i8:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movs r0, #255
 ; CHECK-NEXT:    vminv.u8 r0, q0
@@ -115,8 +115,8 @@ define arm_aapcs_vfpcc i8 @vminv_u_v16i8_i32(<16 x i8> %s1) {
   ret i8 %r
 }
 
-define arm_aapcs_vfpcc i16 @vminv_u_v8i16_i32(<8 x i16> %s1) {
-; CHECK-LABEL: vminv_u_v8i16_i32:
+define arm_aapcs_vfpcc i16 @vminv_u_v8i16(<8 x i16> %s1) {
+; CHECK-LABEL: vminv_u_v8i16:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movw r0, #65535
 ; CHECK-NEXT:    vminv.u16 r0, q0
@@ -125,8 +125,8 @@ define arm_aapcs_vfpcc i16 @vminv_u_v8i16_i32(<8 x i16> %s1) {
   ret i16 %r
 }
 
-define arm_aapcs_vfpcc i32 @vminv_u_v4i32_i32(<4 x i32> %s1) {
-; CHECK-LABEL: vminv_u_v4i32_i32:
+define arm_aapcs_vfpcc i32 @vminv_u_v4i32(<4 x i32> %s1) {
+; CHECK-LABEL: vminv_u_v4i32:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    mov.w r0, #-1
 ; CHECK-NEXT:    vminv.u32 r0, q0
@@ -134,3 +134,339 @@ define arm_aapcs_vfpcc i32 @vminv_u_v4i32_i32(<4 x i32> %s1) {
   %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1)
   ret i32 %r
 }
+
+
+
+define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8_i8(<16 x i8> %s1, i8 %s2) {
+; CHECK-LABEL: vmaxv_s_v16i8_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mvn r1, #127
+; CHECK-NEXT:    sxtb r3, r0
+; CHECK-NEXT:    vmaxv.s8 r1, q0
+; CHECK-NEXT:    sxtb r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1)
+  %c = icmp sgt i8 %r, %s2
+  %s = select i1 %c, i8 %r, i8 %s2
+  ret i8 %s
+}
+
+define arm_aapcs_vfpcc i32 @vmaxv_s_v16i8_i32(<16 x i8> %s1, i32 %s2) {
+; CHECK-LABEL: vmaxv_s_v16i8_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mvn r1, #127
+; CHECK-NEXT:    vmaxv.s8 r1, q0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1)
+  %rs = sext i8 %r to i32
+  %c = icmp sgt i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i16 @vmaxv_s_v8i16_i16(<8 x i16> %s1, i16 %s2) {
+; CHECK-LABEL: vmaxv_s_v8i16_i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movw r1, #32768
+; CHECK-NEXT:    sxth r3, r0
+; CHECK-NEXT:    movt r1, #65535
+; CHECK-NEXT:    vmaxv.s16 r1, q0
+; CHECK-NEXT:    sxth r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1)
+  %c = icmp sgt i16 %r, %s2
+  %s = select i1 %c, i16 %r, i16 %s2
+  ret i16 %s
+}
+
+define arm_aapcs_vfpcc i32 @vmaxv_s_v8i16_i32(<8 x i16> %s1, i32 %s2) {
+; CHECK-LABEL: vmaxv_s_v8i16_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movw r1, #32768
+; CHECK-NEXT:    movt r1, #65535
+; CHECK-NEXT:    vmaxv.s16 r1, q0
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1)
+  %rs = sext i16 %r to i32
+  %c = icmp sgt i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i32 @vmaxv_s_v4i32_i32(<4 x i32> %s1, i32 %s2) {
+; CHECK-LABEL: vmaxv_s_v4i32_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov.w r1, #-2147483648
+; CHECK-NEXT:    vmaxv.s32 r1, q0
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1)
+  %c = icmp sgt i32 %r, %s2
+  %s = select i1 %c, i32 %r, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i8 @vmaxv_u_v16i8_i8(<16 x i8> %s1, i8 %s2) {
+; CHECK-LABEL: vmaxv_u_v16i8_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    uxtb r3, r0
+; CHECK-NEXT:    vmaxv.u8 r1, q0
+; CHECK-NEXT:    uxtb r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1)
+  %c = icmp ugt i8 %r, %s2
+  %s = select i1 %c, i8 %r, i8 %s2
+  ret i8 %s
+}
+
+define arm_aapcs_vfpcc i32 @vmaxv_u_v16i8_i32(<16 x i8> %s1, i32 %s2) {
+; CHECK-LABEL: vmaxv_u_v16i8_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmaxv.u8 r1, q0
+; CHECK-NEXT:    uxtb r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1)
+  %rs = zext i8 %r to i32
+  %c = icmp ugt i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i16 @vmaxv_u_v8i16_i16(<8 x i16> %s1, i16 %s2) {
+; CHECK-LABEL: vmaxv_u_v8i16_i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    uxth r3, r0
+; CHECK-NEXT:    vmaxv.u16 r1, q0
+; CHECK-NEXT:    uxth r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1)
+  %c = icmp ugt i16 %r, %s2
+  %s = select i1 %c, i16 %r, i16 %s2
+  ret i16 %s
+}
+
+define arm_aapcs_vfpcc i32 @vmaxv_u_v8i16_i32(<8 x i16> %s1, i32 %s2) {
+; CHECK-LABEL: vmaxv_u_v8i16_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmaxv.u16 r1, q0
+; CHECK-NEXT:    uxth r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1)
+  %rs = zext i16 %r to i32
+  %c = icmp ugt i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i32 @vmaxv_u_v4i32_i32(<4 x i32> %s1, i32 %s2) {
+; CHECK-LABEL: vmaxv_u_v4i32_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmaxv.u32 r1, q0
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1)
+  %c = icmp ugt i32 %r, %s2
+  %s = select i1 %c, i32 %r, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i8 @vminv_s_v16i8_i8(<16 x i8> %s1, i8 %s2) {
+; CHECK-LABEL: vminv_s_v16i8_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #127
+; CHECK-NEXT:    sxtb r3, r0
+; CHECK-NEXT:    vminv.s8 r1, q0
+; CHECK-NEXT:    sxtb r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1)
+  %c = icmp slt i8 %r, %s2
+  %s = select i1 %c, i8 %r, i8 %s2
+  ret i8 %s
+}
+
+define arm_aapcs_vfpcc i32 @vminv_s_v16i8_i32(<16 x i8> %s1, i32 %s2) {
+; CHECK-LABEL: vminv_s_v16i8_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #127
+; CHECK-NEXT:    vminv.s8 r1, q0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1)
+  %rs = sext i8 %r to i32
+  %c = icmp slt i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i16 @vminv_s_v8i16_i16(<8 x i16> %s1, i16 %s2) {
+; CHECK-LABEL: vminv_s_v8i16_i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movw r1, #32767
+; CHECK-NEXT:    sxth r3, r0
+; CHECK-NEXT:    vminv.s16 r1, q0
+; CHECK-NEXT:    sxth r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1)
+  %c = icmp slt i16 %r, %s2
+  %s = select i1 %c, i16 %r, i16 %s2
+  ret i16 %s
+}
+
+define arm_aapcs_vfpcc i32 @vminv_s_v8i16_i32(<8 x i16> %s1, i32 %s2) {
+; CHECK-LABEL: vminv_s_v8i16_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movw r1, #32767
+; CHECK-NEXT:    vminv.s16 r1, q0
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1)
+  %rs = sext i16 %r to i32
+  %c = icmp slt i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i32 @vminv_s_v4i32_i32(<4 x i32> %s1, i32 %s2) {
+; CHECK-LABEL: vminv_s_v4i32_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mvn r1, #-2147483648
+; CHECK-NEXT:    vminv.s32 r1, q0
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1)
+  %c = icmp slt i32 %r, %s2
+  %s = select i1 %c, i32 %r, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i8 @vminv_u_v16i8_i8(<16 x i8> %s1, i8 %s2) {
+; CHECK-LABEL: vminv_u_v16i8_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #255
+; CHECK-NEXT:    uxtb r3, r0
+; CHECK-NEXT:    vminv.u8 r1, q0
+; CHECK-NEXT:    uxtb r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1)
+  %c = icmp ult i8 %r, %s2
+  %s = select i1 %c, i8 %r, i8 %s2
+  ret i8 %s
+}
+
+define arm_aapcs_vfpcc i32 @vminv_u_v16i8_i32(<16 x i8> %s1, i32 %s2) {
+; CHECK-LABEL: vminv_u_v16i8_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #255
+; CHECK-NEXT:    vminv.u8 r1, q0
+; CHECK-NEXT:    uxtb r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1)
+  %rs = zext i8 %r to i32
+  %c = icmp ult i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i16 @vminv_u_v8i16_i16(<8 x i16> %s1, i16 %s2) {
+; CHECK-LABEL: vminv_u_v8i16_i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movw r1, #65535
+; CHECK-NEXT:    uxth r3, r0
+; CHECK-NEXT:    vminv.u16 r1, q0
+; CHECK-NEXT:    uxth r2, r1
+; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1)
+  %c = icmp ult i16 %r, %s2
+  %s = select i1 %c, i16 %r, i16 %s2
+  ret i16 %s
+}
+
+define arm_aapcs_vfpcc i32 @vminv_u_v8i16_i32(<8 x i16> %s1, i32 %s2) {
+; CHECK-LABEL: vminv_u_v8i16_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movw r1, #65535
+; CHECK-NEXT:    vminv.u16 r1, q0
+; CHECK-NEXT:    uxth r1, r1
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1)
+  %rs = zext i16 %r to i32
+  %c = icmp ult i32 %rs, %s2
+  %s = select i1 %c, i32 %rs, i32 %s2
+  ret i32 %s
+}
+
+define arm_aapcs_vfpcc i32 @vminv_u_v4i32_i32(<4 x i32> %s1, i32 %s2) {
+; CHECK-LABEL: vminv_u_v4i32_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov.w r1, #-1
+; CHECK-NEXT:    vminv.u32 r1, q0
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    bx lr
+  %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1)
+  %c = icmp ult i32 %r, %s2
+  %s = select i1 %c, i32 %r, i32 %s2
+  ret i32 %s
+}

From ff2743bf047deac7ef6cc6c3efd30ff05e55b2ad Mon Sep 17 00:00:00 2001
From: Yitzhak Mandelbaum <yitzhakm@google.com>
Date: Tue, 19 May 2020 14:26:46 -0400
Subject: [PATCH 094/770] [libTooling] In Transformer, allow atomic changes to
 span multiple files.

Summary:
Currently, all changes returned by a single application of a rule must fit in
one atomic change and therefore must apply to one file. However, there are
patterns in which a single rule will want to modify multiple files; for example,
a header and implementation to change a declaration and its definition. This
patch relaxes Transformer, libTooling's interpreter of RewriteRules, to support
multiple changes.

Reviewers: gribozavr

Subscribers: mgrang, jfb, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80239
---
 clang/lib/Tooling/Transformer/Transformer.cpp | 42 ++++++++++++-------
 clang/unittests/Tooling/TransformerTest.cpp   | 42 +++++++++++++++++++
 2 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/clang/lib/Tooling/Transformer/Transformer.cpp b/clang/lib/Tooling/Transformer/Transformer.cpp
index 93c2c0912d213..71340bf2f676d 100644
--- a/clang/lib/Tooling/Transformer/Transformer.cpp
+++ b/clang/lib/Tooling/Transformer/Transformer.cpp
@@ -12,6 +12,7 @@
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Tooling/Refactoring/AtomicChange.h"
 #include "llvm/Support/Error.h"
+#include <map>
 #include <utility>
 #include <vector>
 
@@ -45,28 +46,39 @@ void Transformer::run(const MatchFinder::MatchResult &Result) {
     return;
   }
 
-  // Record the results in the AtomicChange, anchored at the location of the
-  // first change.
-  AtomicChange AC(*Result.SourceManager,
-                  (*Transformations)[0].Range.getBegin());
+  // Group the transformations, by file, into AtomicChanges, each anchored by
+  // the location of the first change in that file.
+  std::map<FileID, AtomicChange> ChangesByFileID;
   for (const auto &T : *Transformations) {
+    auto ID = Result.SourceManager->getFileID(T.Range.getBegin());
+    auto Iter = ChangesByFileID
+                    .emplace(ID, AtomicChange(*Result.SourceManager,
+                                              T.Range.getBegin()))
+                    .first;
+    auto &AC = Iter->second;
     if (auto Err = AC.replace(*Result.SourceManager, T.Range, T.Replacement)) {
       Consumer(std::move(Err));
       return;
     }
   }
 
-  for (const auto &I : Case.AddedIncludes) {
-    auto &Header = I.first;
-    switch (I.second) {
-    case transformer::IncludeFormat::Quoted:
-      AC.addHeader(Header);
-      break;
-    case transformer::IncludeFormat::Angled:
-      AC.addHeader((llvm::Twine("<") + Header + ">").str());
-      break;
+  for (auto &IDChangePair : ChangesByFileID) {
+    auto &AC = IDChangePair.second;
+    // FIXME: this will add includes to *all* changed files, which may not be
+    // the intent. We should upgrade the representation to allow associating
+    // headers with specific edits.
+    for (const auto &I : Case.AddedIncludes) {
+      auto &Header = I.first;
+      switch (I.second) {
+      case transformer::IncludeFormat::Quoted:
+        AC.addHeader(Header);
+        break;
+      case transformer::IncludeFormat::Angled:
+        AC.addHeader((llvm::Twine("<") + Header + ">").str());
+        break;
+      }
     }
-  }
 
-  Consumer(std::move(AC));
+    Consumer(std::move(AC));
+  }
 }
diff --git a/clang/unittests/Tooling/TransformerTest.cpp b/clang/unittests/Tooling/TransformerTest.cpp
index 1d955cf5e9b80..c8c6db059fedf 100644
--- a/clang/unittests/Tooling/TransformerTest.cpp
+++ b/clang/unittests/Tooling/TransformerTest.cpp
@@ -817,4 +817,46 @@ TEST(TransformerDeathTest, OrderedRuleTypes) {
                "Matcher must be.*node matcher");
 }
 #endif
+
+// Edits are able to span multiple files; in this case, a header and an
+// implementation file.
+TEST_F(TransformerTest, MultipleFiles) {
+  std::string Header = R"cc(void RemoveThisFunction();)cc";
+  std::string Source = R"cc(#include "input.h"
+                            void RemoveThisFunction();)cc";
+  Transformer T(
+      makeRule(functionDecl(hasName("RemoveThisFunction")), changeTo(cat(""))),
+      consumer());
+  T.registerMatchers(&MatchFinder);
+  auto Factory = newFrontendActionFactory(&MatchFinder);
+  EXPECT_TRUE(runToolOnCodeWithArgs(
+      Factory->create(), Source, std::vector<std::string>(), "input.cc",
+      "clang-tool", std::make_shared<PCHContainerOperations>(),
+      {{"input.h", Header}}));
+
+  std::sort(Changes.begin(), Changes.end(),
+            [](const AtomicChange &L, const AtomicChange &R) {
+              return L.getFilePath() < R.getFilePath();
+            });
+
+  ASSERT_EQ(Changes[0].getFilePath(), "./input.h");
+  EXPECT_THAT(Changes[0].getInsertedHeaders(), IsEmpty());
+  EXPECT_THAT(Changes[0].getRemovedHeaders(), IsEmpty());
+  llvm::Expected<std::string> UpdatedCode =
+      clang::tooling::applyAllReplacements(Header,
+                                           Changes[0].getReplacements());
+  ASSERT_TRUE(static_cast<bool>(UpdatedCode))
+      << "Could not update code: " << llvm::toString(UpdatedCode.takeError());
+  EXPECT_EQ(format(*UpdatedCode), format(R"cc(;)cc"));
+
+  ASSERT_EQ(Changes[1].getFilePath(), "input.cc");
+  EXPECT_THAT(Changes[1].getInsertedHeaders(), IsEmpty());
+  EXPECT_THAT(Changes[1].getRemovedHeaders(), IsEmpty());
+  UpdatedCode = clang::tooling::applyAllReplacements(
+      Source, Changes[1].getReplacements());
+  ASSERT_TRUE(static_cast<bool>(UpdatedCode))
+      << "Could not update code: " << llvm::toString(UpdatedCode.takeError());
+  EXPECT_EQ(format(*UpdatedCode), format(R"cc(#include "input.h"
+                        ;)cc"));
+}
 } // namespace

From a3b5ccddcc3512432fc386b9197e6f103e190894 Mon Sep 17 00:00:00 2001
From: Tharindu Rusira <tharindurusira@gmail.com>
Date: Tue, 26 May 2020 00:48:06 -0600
Subject: [PATCH 095/770] Update DialectConversion.md

Fix a typo on line 164: `baz.add` should be `bar.add`, so that the sentence
reads `bar.add` -> `foo.add`.
---
 mlir/docs/DialectConversion.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md
index 0835527ae201f..1f9ec14b6a96c 100644
--- a/mlir/docs/DialectConversion.md
+++ b/mlir/docs/DialectConversion.md
@@ -161,7 +161,7 @@ a set of legal ones.
 As an example, say you define a target that supports one operation: `foo.add`.
 When providing the following patterns: [`bar.add` -> `baz.add`, `baz.add` ->
 `foo.add`], the framework will automatically detect that it can legalize
-`baz.add` -> `foo.add` even though a direct conversion does not exist. This
+`bar.add` -> `foo.add` even though a direct conversion does not exist. This
 means that you don’t have to define a direct legalization pattern for `bar.add`
 -> `foo.add`.
 

From 9578a54f5007e8a02cef449dd151da27837b388e Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Tue, 26 May 2020 09:16:54 -0400
Subject: [PATCH 096/770] [mlir][Vector] Add vector contraction to outerproduct
 lowering

This revision adds the additional lowering and exposes the patterns at a finer granularity for better programmatic reuse. The unit test makes use of the finer grained pattern for simpler checks.

As the ContractionOpLowering is exposed programmatically, cleanup opportunities appear and static class methods are turned into free functions with static visibility.

Differential Revision: https://reviews.llvm.org/D80375
---
 mlir/include/mlir/Dialect/Vector/VectorOps.h  |  21 +-
 mlir/include/mlir/Dialect/Vector/VectorOps.td |   5 +
 .../mlir/Dialect/Vector/VectorTransforms.h    | 117 ++-
 mlir/lib/Dialect/Vector/VectorOps.cpp         |   7 +
 mlir/lib/Dialect/Vector/VectorTransforms.cpp  | 751 ++++++++++--------
 .../Vector/vector-contract-transforms.mlir    |  34 +-
 .../lib/Transforms/TestVectorTransforms.cpp   |  19 +-
 7 files changed, 598 insertions(+), 356 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.h b/mlir/include/mlir/Dialect/Vector/VectorOps.h
index 6394fae213750..423c72da64712 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.h
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.h
@@ -25,13 +25,6 @@ class MLIRContext;
 class OwningRewritePatternList;
 namespace vector {
 
-/// Structure to control the behavior of vector transform patterns.
-struct VectorTransformsOptions {
-  /// Let vector.contract lower to vector.matrix_multiply and LLVM matrix
-  /// intrinsics.
-  bool lowerToLLVMMatrixIntrinsics = false;
-};
-
 /// Collect a set of vector-to-vector canonicalization patterns.
 void populateVectorToVectorCanonicalizationPatterns(
     OwningRewritePatternList &patterns, MLIRContext *context);
@@ -51,6 +44,20 @@ void populateVectorToVectorTransformationPatterns(
 void populateVectorSlicesLoweringPatterns(OwningRewritePatternList &patterns,
                                           MLIRContext *context);
 
+/// Enum to control the lowering of `vector.contract` operations.
+enum class VectorContractLowering {
+  /// Progressively lower to finer grained `vector.contract` and `vector.fma`.
+  FMA = 0,
+  /// Lower to `vector.matrix_multiply`, maps 1-1 to LLVM matrix intrinsics.
+  Matmul = 1,
+  /// Lower to `vector.outerproduct`.
+  OuterProduct = 2,
+};
+/// Structure to control the behavior of vector transform patterns.
+struct VectorTransformsOptions {
+  VectorContractLowering vectorContractLowering = VectorContractLowering::FMA;
+};
+
 /// Collect a set of transformation patterns that are related to contracting
 /// or expanding vector operations:
 ///   ContractionOpLowering,
diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td
index 264c8ad034c82..1b978e44dd6ab 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td
@@ -686,6 +686,11 @@ def Vector_OuterProductOp :
     return %3: vector<4x8xf32>
     ```
   }];
+  let builders = [
+    // Build an op without mask, use the type of `acc` as the return type.
+    OpBuilder<
+    "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, "
+    "Value acc">];
   let extraClassDeclaration = [{
     VectorType getOperandVectorTypeLHS() {
       return lhs().getType().cast<VectorType>();
diff --git a/mlir/include/mlir/Dialect/Vector/VectorTransforms.h b/mlir/include/mlir/Dialect/Vector/VectorTransforms.h
index 337ac75f7cbbc..08aa579d651b4 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorTransforms.h
+++ b/mlir/include/mlir/Dialect/Vector/VectorTransforms.h
@@ -9,6 +9,7 @@
 #ifndef DIALECT_VECTOR_VECTORTRANSFORMS_H_
 #define DIALECT_VECTOR_VECTORTRANSFORMS_H_
 
+#include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/IR/PatternMatch.h"
 
 namespace mlir {
@@ -22,13 +23,6 @@ void populateVectorToVectorConversionPatterns(
     ArrayRef<int64_t> coarseVectorShape = {},
     ArrayRef<int64_t> fineVectorShape = {});
 
-////////////////////////////////////////////////////////////////////////////////
-// The following Declarative Rewrite Rule (DRR) helpers are used in rewrite
-// patterns. As such, they must not call into `rewriter.erase/replace` APIs and
-// it is the responsibility of the enclosing PatternRewriter to erase on
-// success.
-////////////////////////////////////////////////////////////////////////////////
-
 namespace vector {
 
 // Entry point for unrolling declarative pattern rewrites.
@@ -69,6 +63,115 @@ unrollSingleResultOpMatchingType(OpBuilder &builder, Operation *op,
                                  ArrayRef<int64_t> targetShape);
 
 } // namespace vector
+
+//===----------------------------------------------------------------------===//
+// Finer-grained patterns exposed for more control over individual lowerings.
+//===----------------------------------------------------------------------===//
+
+/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul
+/// semantics to:
+/// ```
+///    %flattened_a = vector.shape_cast %a
+///    %flattened_b = vector.shape_cast %b
+///    %flattened_d = vector.matmul %flattened_a, %flattened_b
+///    %d = vector.shape_cast %flattened_d
+///    %e = add %c, %d
+/// ```
+/// `vector.matmul` later lowers to `llvm.matrix.multiply`.
+///
+/// This only kicks in when VectorTransformsOptions is set to Matmul and
+/// the vector.contract op is a row-major matrix multiply.
+class ContractionOpToMatmulOpLowering
+    : public OpRewritePattern<vector::ContractionOp> {
+public:
+  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+
+  ContractionOpToMatmulOpLowering(
+      vector::VectorTransformsOptions vectorTransformsOptions,
+      MLIRContext *context)
+      : OpRewritePattern<vector::ContractionOp>(context),
+        vectorTransformsOptions(vectorTransformsOptions) {}
+
+  LogicalResult match(vector::ContractionOp op) const override;
+  void rewrite(vector::ContractionOp op,
+               PatternRewriter &rewriter) const override;
+
+private:
+  /// Options to control the vector patterns.
+  vector::VectorTransformsOptions vectorTransformsOptions;
+};
+
+/// Progressive lowering of a `vector.contract %a, %b, %c` with row-major matmul
+/// semantics to a reduction_size-unrolled sequence:
+/// ```
+///    %at = vector.transpose %a, [1, 0]
+///    %bRow0 = vector.extract %b[0]
+///    %atRow0 = vector.extract %at[0]
+///    %c0 = vector.outerproduct %atRow0, %bRow0, %c
+///    ...
+///    %bRowK = vector.extract %b[K]
+///    %atRowK = vector.extract %at[K]
+///    %cK = vector.outerproduct %atRowK, %bRowK, %cK-1
+/// ```
+///
+/// This only kicks in when VectorTransformsOptions is set to OuterProduct and
+/// the vector.contract op is a row-major matrix multiply.
+class ContractionOpToOuterProductOpLowering
+    : public OpRewritePattern<vector::ContractionOp> {
+public:
+  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+  ContractionOpToOuterProductOpLowering(
+      vector::VectorTransformsOptions vectorTransformsOptions,
+      MLIRContext *context)
+      : OpRewritePattern<vector::ContractionOp>(context),
+        vectorTransformsOptions(vectorTransformsOptions) {}
+
+  LogicalResult match(vector::ContractionOp op) const override;
+  void rewrite(vector::ContractionOp op,
+               PatternRewriter &rewriter) const override;
+
+private:
+  /// Options to control the vector patterns.
+  vector::VectorTransformsOptions vectorTransformsOptions;
+};
+
+/// Progressive lowering of ContractionOp.
+///
+/// One:
+///   %x = vector.contract with at least one free/batch dimension
+/// is replaced by:
+///   %a = vector.contract with one less free/batch dimension
+///   %b = vector.contract with one less free/batch dimension
+///   ..
+///   %x = combine %a %b ..
+/// until a pure contraction is reached (no free/batch dimensions),
+/// which is replaced by a fma/reduction op.
+///
+/// This only kicks in when either VectorTransformsOptions is set to FMA or when
+/// other contraction patterns fail.
+class ContractionOpLowering : public OpRewritePattern<vector::ContractionOp> {
+public:
+  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+
+  ContractionOpLowering(vector::VectorTransformsOptions vectorTransformsOptions,
+                        MLIRContext *context)
+      : OpRewritePattern<vector::ContractionOp>(context),
+        vectorTransformsOptions(vectorTransformsOptions) {}
+
+  LogicalResult matchAndRewrite(vector::ContractionOp op,
+                                PatternRewriter &rewriter) const override;
+
+private:
+  /// Options to control the vector patterns.
+  vector::VectorTransformsOptions vectorTransformsOptions;
+  // Lower one parallel dimension.
+  Value lowerParallel(vector::ContractionOp op, int64_t lhsIndex,
+                      int64_t rhsIndex, PatternRewriter &rewriter) const;
+  // Lower one reduction dimension.
+  Value lowerReduction(vector::ContractionOp op,
+                       PatternRewriter &rewriter) const;
+};
+
 } // namespace mlir
 
 #endif // DIALECT_VECTOR_VECTORTRANSFORMS_H_
diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp
index 5439233c96b15..1574edb344941 100644
--- a/mlir/lib/Dialect/Vector/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/VectorOps.cpp
@@ -957,6 +957,13 @@ static LogicalResult verify(InsertStridedSliceOp op) {
 // OuterProductOp
 //===----------------------------------------------------------------------===//
 
+/// Build an op without mask, use the type of `acc` as the return type.
+void OuterProductOp::build(OpBuilder &builder, OperationState &result,
+                           Value lhs, Value rhs, Value acc) {
+  result.addOperands({lhs, rhs, acc});
+  result.addTypes(acc.getType());
+}
+
 static void print(OpAsmPrinter &p, OuterProductOp op) {
   p << op.getOperationName() << " " << op.lhs() << ", " << op.rhs();
   if (!op.acc().empty())
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
index 1c1de155d8b63..44ff03a04f223 100644
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -39,6 +39,120 @@
 using namespace mlir;
 using llvm::dbgs;
 
+// Returns the position of the result of `map` that is dim `index`, if any.
+static Optional<int64_t> getResultIndex(AffineMap map, int64_t index) {
+  for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
+    int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
+    if (idx == index)
+      return i;
+  }
+  return None;
+}
+
+// Helper to construct iterator types with one index removed.
+static SmallVector<Attribute, 4> adjustIter(ArrayAttr iteratorTypes,
+                                            int64_t index) {
+  SmallVector<Attribute, 4> results;
+  for (auto it : llvm::enumerate(iteratorTypes)) {
+    int64_t idx = it.index();
+    if (idx == index)
+      continue;
+    results.push_back(it.value());
+  }
+  return results;
+}
+
+// Helper to construct an affine map with one index removed.
+static AffineMap adjustMap(AffineMap map, int64_t index,
+                           PatternRewriter &rewriter) {
+  auto *ctx = rewriter.getContext();
+  SmallVector<AffineExpr, 4> results;
+  for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
+    int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
+    if (idx == index)
+      continue;
+    // Re-insert remaining indices, but renamed when occurring
+    // after the removed index.
+    auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx);
+    results.push_back(targetExpr);
+  }
+  return AffineMap::get(map.getNumDims() - 1, 0, results, ctx);
+}
+
+// Drops dimension `index` from `tp`; a rank-1 vector yields its element type.
+static Type adjustType(VectorType tp, int64_t index) {
+  int64_t rank = tp.getRank();
+  Type eltType = tp.getElementType();
+  if (rank == 1) {
+    assert(index == 0 && "index for scalar result out of bounds");
+    return eltType;
+  }
+  SmallVector<int64_t, 4> adjustedShape;
+  for (int64_t i = 0; i < rank; ++i) {
+    // Omit dimension at the given index.
+    if (i == index)
+      continue;
+    // Otherwise, add dimension back.
+    adjustedShape.push_back(tp.getDimSize(i));
+  }
+  return VectorType::get(adjustedShape, eltType);
+}
+
+// Helper method to possibly drop a dimension in a load.
+// TODO(ajcbik): use a reshaping vector load (and share lowering code)
+static Value reshapeLoad(Location loc, Value val, VectorType type,
+                         int64_t index, int64_t pos,
+                         PatternRewriter &rewriter) {
+  if (index == -1)
+    return val;
+  Type lowType = adjustType(type, 0);
+  // At extraction dimension?
+  if (index == 0) {
+    auto posAttr = rewriter.getI64ArrayAttr(pos);
+    return rewriter.create<vector::ExtractOp>(loc, lowType, val, posAttr);
+  }
+  // Unroll leading dimensions.
+  VectorType vType = lowType.cast<VectorType>();
+  VectorType resType = adjustType(type, index).cast<VectorType>();
+  Value result =
+      rewriter.create<ConstantOp>(loc, resType, rewriter.getZeroAttr(resType));
+  for (int64_t d = 0, e = resType.getDimSize(0); d < e; d++) {
+    auto posAttr = rewriter.getI64ArrayAttr(d);
+    Value ext = rewriter.create<vector::ExtractOp>(loc, vType, val, posAttr);
+    Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter);
+    result =
+        rewriter.create<vector::InsertOp>(loc, resType, load, result, posAttr);
+  }
+  return result;
+}
+
+// Helper method to possibly drop a dimension in a store.
+// TODO(ajcbik): use a reshaping vector store (and share lowering code)
+static Value reshapeStore(Location loc, Value val, Value result,
+                          VectorType type, int64_t index, int64_t pos,
+                          PatternRewriter &rewriter) {
+  // Unmodified?
+  if (index == -1)
+    return val;
+  // At insertion dimension?
+  if (index == 0) {
+    auto posAttr = rewriter.getI64ArrayAttr(pos);
+    return rewriter.create<vector::InsertOp>(loc, type, val, result, posAttr);
+  }
+  // Unroll leading dimensions.
+  Type lowType = adjustType(type, 0);
+  VectorType vType = lowType.cast<VectorType>();
+  Type insType = adjustType(vType, 0);
+  for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) {
+    auto posAttr = rewriter.getI64ArrayAttr(d);
+    Value ext = rewriter.create<vector::ExtractOp>(loc, vType, result, posAttr);
+    Value ins = rewriter.create<vector::ExtractOp>(loc, insType, val, posAttr);
+    Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter);
+    result = rewriter.create<vector::InsertOp>(loc, type, sto, result, posAttr);
+  }
+  return result;
+}
+
 // Clones `op` into a new operations that takes `operands` and returns
 // `resultTypes`.
 static Operation *cloneOpWithOperandsAndTypes(OpBuilder &builder, Location loc,
@@ -1252,343 +1366,6 @@ class CreateMaskOpLowering : public OpRewritePattern<vector::CreateMaskOp> {
   }
 };
 
-/// Progressive lowering of ContractionOp.
-/// One:
-///   %x = vector.contract with at least one free/batch dimension
-/// is replaced by:
-///   %a = vector.contract with one less free/batch dimension
-///   %b = vector.contract with one less free/batch dimension
-///   ..
-///   %x = combine %a %b ..
-/// until a pure contraction is reached (no free/batch dimensions),
-/// which is replaced by a fma/reduction op.
-///
-/// TODO(ajcbik): break down into transpose/reshape/cast ops
-///               when they become available to avoid code dup
-/// TODO(ajcbik): investigate lowering order impact on performance
-class ContractionOpLowering : public OpRewritePattern<vector::ContractionOp> {
-public:
-  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
-
-  ContractionOpLowering(vector::VectorTransformsOptions vectorTransformsOptions,
-                        MLIRContext *context)
-      : OpRewritePattern<vector::ContractionOp>(context),
-        vectorTransformsOptions(vectorTransformsOptions) {}
-
-  LogicalResult matchAndRewrite(vector::ContractionOp op,
-                                PatternRewriter &rewriter) const override {
-    // TODO(ajcbik): implement masks
-    if (llvm::size(op.masks()) != 0)
-      return failure();
-
-    // TODO(ntv, ajcbik): implement benefits, cost models, separate this out in
-    // a new pattern.
-    if (vectorTransformsOptions.lowerToLLVMMatrixIntrinsics &&
-        isRowMajorMatmul(op.indexing_maps())) {
-      VectorType lhsType = op.getLhsType();
-      VectorType rhsType = op.getRhsType();
-      unsigned lhsRows = op.getLhsType().getShape()[0];
-      unsigned lhsColumns = op.getLhsType().getShape()[1];
-      unsigned rhsColumns = op.getRhsType().getShape()[1];
-
-      Type flattenedLHSType =
-          VectorType::get(lhsType.getNumElements(), lhsType.getElementType());
-      Type flattenedRHSType =
-          VectorType::get(rhsType.getNumElements(), rhsType.getElementType());
-      auto lhs = rewriter.create<vector::ShapeCastOp>(
-          op.getLoc(), flattenedLHSType, op.lhs());
-      auto rhs = rewriter.create<vector::ShapeCastOp>(
-          op.getLoc(), flattenedRHSType, op.rhs());
-
-      Value mul = rewriter.create<vector::MatmulOp>(
-          op.getLoc(), lhs, rhs, lhsRows, lhsColumns, rhsColumns);
-      mul = rewriter.create<vector::ShapeCastOp>(op.getLoc(),
-                                                 op.acc().getType(), mul);
-      Type elementType = op.getLhsType().getElementType();
-      assert(elementType.isIntOrFloat());
-      if (elementType.isa<IntegerType>())
-        rewriter.replaceOpWithNewOp<AddIOp>(op, op.acc(), mul);
-      else
-        rewriter.replaceOpWithNewOp<AddFOp>(op, op.acc(), mul);
-      return success();
-    }
-
-    // Find first batch dimension in LHS/RHS, and lower when found.
-    std::vector<std::pair<int64_t, int64_t>> batchDimMap = op.getBatchDimMap();
-    if (!batchDimMap.empty()) {
-      int64_t lhsIndex = batchDimMap[0].first;
-      int64_t rhsIndex = batchDimMap[0].second;
-      rewriter.replaceOp(op, lowerParallel(op, lhsIndex, rhsIndex, rewriter));
-      return success();
-    }
-
-    // Collect contracting dimensions.
-    std::vector<std::pair<int64_t, int64_t>> contractingDimMap =
-        op.getContractingDimMap();
-    DenseSet<int64_t> lhsContractingDimSet;
-    DenseSet<int64_t> rhsContractingDimSet;
-    for (auto &dimPair : contractingDimMap) {
-      lhsContractingDimSet.insert(dimPair.first);
-      rhsContractingDimSet.insert(dimPair.second);
-    }
-
-    // Find first free dimension in LHS, and lower when found.
-    VectorType lhsType = op.getLhsType();
-    for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e;
-         ++lhsIndex) {
-      if (lhsContractingDimSet.count(lhsIndex) == 0) {
-        rewriter.replaceOp(
-            op, lowerParallel(op, lhsIndex, /*rhsIndex=*/-1, rewriter));
-        return success();
-      }
-    }
-
-    // Find first free dimension in RHS, and lower when found.
-    VectorType rhsType = op.getRhsType();
-    for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e;
-         ++rhsIndex) {
-      if (rhsContractingDimSet.count(rhsIndex) == 0) {
-        rewriter.replaceOp(
-            op, lowerParallel(op, /*lhsIndex=*/-1, rhsIndex, rewriter));
-        return success();
-      }
-    }
-
-    // Lower the first remaining reduction dimension.
-    if (!contractingDimMap.empty()) {
-      rewriter.replaceOp(op, lowerReduction(op, rewriter));
-      return success();
-    }
-
-    return failure();
-  }
-
-private:
-  // Lower one parallel dimension.
-  // TODO(ajcbik): consider reusing existing contract unrolling
-  Value lowerParallel(vector::ContractionOp op, int64_t lhsIndex,
-                      int64_t rhsIndex, PatternRewriter &rewriter) const {
-    VectorType lhsType = op.getLhsType();
-    VectorType rhsType = op.getRhsType();
-    VectorType resType = op.getResultType().cast<VectorType>();
-    // Find the iterator type index and result index.
-    SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
-    int64_t iterIndex = -1;
-    int64_t dimSize = -1;
-    if (lhsIndex >= 0) {
-      iterIndex =
-          iMap[0].getResult(lhsIndex).cast<AffineDimExpr>().getPosition();
-      assert((rhsIndex < 0 || iterIndex == iMap[1]
-                                               .getResult(rhsIndex)
-                                               .cast<AffineDimExpr>()
-                                               .getPosition()) &&
-             "parallel index should be free in LHS or batch in LHS/RHS");
-      dimSize = lhsType.getDimSize(lhsIndex);
-    } else {
-      assert(rhsIndex >= 0 && "missing parallel index");
-      iterIndex =
-          iMap[1].getResult(rhsIndex).cast<AffineDimExpr>().getPosition();
-      dimSize = rhsType.getDimSize(rhsIndex);
-    }
-    assert(iterIndex >= 0 && "parallel index not listed in operand mapping");
-    Optional<int64_t> lookup = getResultIndex(iMap[2], iterIndex);
-    assert(lookup.hasValue() && "parallel index not listed in reduction");
-    int64_t resIndex = lookup.getValue();
-    // Construct new iterator types and affine map array attribute.
-    SmallVector<AffineMap, 4> lowIndexingMaps;
-    lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
-    lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
-    lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
-    auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
-    auto lowIter =
-        rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
-    // Unroll into a series of lower dimensional vector.contract ops.
-    Location loc = op.getLoc();
-    Value result = rewriter.create<ConstantOp>(loc, resType,
-                                               rewriter.getZeroAttr(resType));
-    for (int64_t d = 0; d < dimSize; ++d) {
-      auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
-      auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
-      auto acc = reshapeLoad(loc, op.acc(), resType, resIndex, d, rewriter);
-      Value lowContract = rewriter.create<vector::ContractionOp>(
-          loc, lhs, rhs, acc, lowAffine, lowIter);
-      result = reshapeStore(loc, lowContract, result, resType, resIndex, d,
-                            rewriter);
-    }
-    return result;
-  }
-
-  // Lower one reduction dimension.
-  Value lowerReduction(vector::ContractionOp op,
-                       PatternRewriter &rewriter) const {
-    auto loc = op.getLoc();
-    VectorType lhsType = op.getLhsType();
-    VectorType rhsType = op.getRhsType();
-    Type resType = op.getResultType();
-    assert(!resType.isa<VectorType>());
-    // Use iterator index 0.
-    int64_t iterIndex = 0;
-    SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
-    Optional<int64_t> lookupLhs = getResultIndex(iMap[0], iterIndex);
-    Optional<int64_t> lookupRhs = getResultIndex(iMap[1], iterIndex);
-    assert(lookupLhs.hasValue() && "missing LHS parallel index");
-    assert(lookupRhs.hasValue() && "missing RHS parallel index");
-    int64_t lhsIndex = lookupLhs.getValue();
-    int64_t rhsIndex = lookupRhs.getValue();
-    int64_t dimSize = lhsType.getDimSize(lhsIndex);
-    assert(dimSize == rhsType.getDimSize(rhsIndex) && "corrupt shape");
-    // Base case.
-    if (lhsType.getRank() == 1) {
-      assert(rhsType.getRank() == 1 && "corrupt contraction");
-      Value zero = rewriter.create<ConstantOp>(loc, lhsType,
-                                               rewriter.getZeroAttr(lhsType));
-      Value fma = rewriter.create<vector::FMAOp>(loc, op.lhs(), op.rhs(), zero);
-      StringAttr kind = rewriter.getStringAttr("add");
-      return rewriter.create<vector::ReductionOp>(loc, resType, kind, fma,
-                                                  op.acc());
-    }
-    // Construct new iterator types and affine map array attribute.
-    SmallVector<AffineMap, 4> lowIndexingMaps;
-    lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
-    lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
-    lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
-    auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
-    auto lowIter =
-        rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
-    // Unroll into a series of lower dimensional vector.contract ops.
-    // By feeding the initial accumulator into the first contraction,
-    // and the result of each contraction into the next, eventually
-    // the sum of all reductions is computed.
-    Value result = op.acc();
-    for (int64_t d = 0; d < dimSize; ++d) {
-      auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
-      auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
-      result = rewriter.create<vector::ContractionOp>(loc, lhs, rhs, result,
-                                                      lowAffine, lowIter);
-    }
-    return result;
-  }
-
-  // Helper to find an index in an affine map.
-  static Optional<int64_t> getResultIndex(AffineMap map, int64_t index) {
-    for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
-      int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
-      if (idx == index)
-        return i;
-    }
-    return None;
-  }
-
-  // Helper to construct iterator types with one index removed.
-  static SmallVector<Attribute, 4> adjustIter(ArrayAttr iteratorTypes,
-                                              int64_t index) {
-    SmallVector<Attribute, 4> results;
-    for (auto it : llvm::enumerate(iteratorTypes)) {
-      int64_t idx = it.index();
-      if (idx == index)
-        continue;
-      results.push_back(it.value());
-    }
-    return results;
-  }
-
-  // Helper to construct an affine map with one index removed.
-  static AffineMap adjustMap(AffineMap map, int64_t index,
-                             PatternRewriter &rewriter) {
-    auto *ctx = rewriter.getContext();
-    SmallVector<AffineExpr, 4> results;
-    for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
-      int64_t idx = map.getResult(i).cast<AffineDimExpr>().getPosition();
-      if (idx == index)
-        continue;
-      // Re-insert remaining indices, but renamed when occurring
-      // after the removed index.
-      auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx);
-      results.push_back(targetExpr);
-    }
-    return AffineMap::get(map.getNumDims() - 1, 0, results, ctx);
-  }
-
-  // Helper to drop dimension from vector type.
-  static Type adjustType(VectorType tp, int64_t index) {
-    int64_t rank = tp.getRank();
-    Type eltType = tp.getElementType();
-    if (rank == 1) {
-      assert(index == 0 && "index for scalar result out of bounds");
-      return eltType;
-    }
-    SmallVector<int64_t, 4> adjustedShape;
-    for (int64_t i = 0; i < rank; ++i) {
-      // Omit dimension at the given index.
-      if (i == index)
-        continue;
-      // Otherwise, add dimension back.
-      adjustedShape.push_back(tp.getDimSize(i));
-    }
-    return VectorType::get(adjustedShape, eltType);
-  }
-
-  // Helper method to possibly drop a dimension in a load.
-  // TODO(ajcbik): use a reshaping vector load (and share lowering code)
-  static Value reshapeLoad(Location loc, Value val, VectorType type,
-                           int64_t index, int64_t pos,
-                           PatternRewriter &rewriter) {
-    if (index == -1)
-      return val;
-    Type lowType = adjustType(type, 0);
-    // At extraction dimension?
-    if (index == 0) {
-      auto posAttr = rewriter.getI64ArrayAttr(pos);
-      return rewriter.create<vector::ExtractOp>(loc, lowType, val, posAttr);
-    }
-    // Unroll leading dimensions.
-    VectorType vType = lowType.cast<VectorType>();
-    VectorType resType = adjustType(type, index).cast<VectorType>();
-    Value result = rewriter.create<ConstantOp>(loc, resType,
-                                               rewriter.getZeroAttr(resType));
-    for (int64_t d = 0, e = resType.getDimSize(0); d < e; d++) {
-      auto posAttr = rewriter.getI64ArrayAttr(d);
-      Value ext = rewriter.create<vector::ExtractOp>(loc, vType, val, posAttr);
-      Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter);
-      result = rewriter.create<vector::InsertOp>(loc, resType, load, result,
-                                                 posAttr);
-    }
-    return result;
-  }
-
-  // Helper method to possibly drop a dimension in a store.
-  // TODO(ajcbik): use a reshaping vector store (and share lowering code)
-  static Value reshapeStore(Location loc, Value val, Value result,
-                            VectorType type, int64_t index, int64_t pos,
-                            PatternRewriter &rewriter) {
-    // Unmodified?
-    if (index == -1)
-      return val;
-    // At insertion dimension?
-    if (index == 0) {
-      auto posAttr = rewriter.getI64ArrayAttr(pos);
-      return rewriter.create<vector::InsertOp>(loc, type, val, result, posAttr);
-    }
-    // Unroll leading dimensions.
-    Type lowType = adjustType(type, 0);
-    VectorType vType = lowType.cast<VectorType>();
-    Type insType = adjustType(vType, 0);
-    for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) {
-      auto posAttr = rewriter.getI64ArrayAttr(d);
-      Value ext =
-          rewriter.create<vector::ExtractOp>(loc, vType, result, posAttr);
-      Value ins =
-          rewriter.create<vector::ExtractOp>(loc, insType, val, posAttr);
-      Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter);
-      result =
-          rewriter.create<vector::InsertOp>(loc, type, sto, result, posAttr);
-    }
-    return result;
-  }
-
-  vector::VectorTransformsOptions vectorTransformsOptions;
-};
-
 /// ShapeOp 2D -> 1D downcast serves the purpose of flattening 2-D to 1-D
 /// vectors progressively on the way to target llvm.matrix intrinsics.
 /// This iterates over the most major dimension of the 2-D vector and performs
@@ -1656,6 +1433,302 @@ class ShapeCastOp2DUpCastRewritePattern
 
 } // namespace
 
+namespace mlir {
+
+/// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul
+/// semantics to:
+/// ```
+///    %flattened_a = vector.shape_cast %a
+///    %flattened_b = vector.shape_cast %b
+///    %flattened_d = vector.matmul %flattened_a, %flattened_b
+///    %d = vector.shape_cast %flattened_d
+///    %e = add %c, %d
+/// ```
+/// `vector.matmul` later lowers to `llvm.matrix.multiply`.
+///
+/// This only kicks in when VectorTransformsOptions is set to Matmul and
+/// the vector.contract op is a row-major matrix multiply.
+LogicalResult
+ContractionOpToMatmulOpLowering::match(vector::ContractionOp op) const {
+  // TODO(ajcbik): implement masks
+  if (llvm::size(op.masks()) != 0)
+    return failure();
+
+  if (vectorTransformsOptions.vectorContractLowering !=
+          vector::VectorContractLowering::Matmul ||
+      !isRowMajorMatmul(op.indexing_maps()))
+    return failure();
+  return success();
+}
+
+void ContractionOpToMatmulOpLowering::rewrite(vector::ContractionOp op,
+                                              PatternRewriter &rewriter) const {
+  VectorType lhsType = op.getLhsType();
+  VectorType rhsType = op.getRhsType();
+  unsigned lhsRows = op.getLhsType().getShape()[0];
+  unsigned lhsColumns = op.getLhsType().getShape()[1];
+  unsigned rhsColumns = op.getRhsType().getShape()[1];
+
+  Type flattenedLHSType =
+      VectorType::get(lhsType.getNumElements(), lhsType.getElementType());
+  Type flattenedRHSType =
+      VectorType::get(rhsType.getNumElements(), rhsType.getElementType());
+  auto lhs = rewriter.create<vector::ShapeCastOp>(op.getLoc(), flattenedLHSType,
+                                                  op.lhs());
+  auto rhs = rewriter.create<vector::ShapeCastOp>(op.getLoc(), flattenedRHSType,
+                                                  op.rhs());
+
+  Value mul = rewriter.create<vector::MatmulOp>(op.getLoc(), lhs, rhs, lhsRows,
+                                                lhsColumns, rhsColumns);
+  mul = rewriter.create<vector::ShapeCastOp>(op.getLoc(), op.acc().getType(),
+                                             mul);
+  Type elementType = op.getLhsType().getElementType();
+  assert(elementType.isIntOrFloat());
+  if (elementType.isa<IntegerType>())
+    rewriter.replaceOpWithNewOp<AddIOp>(op, op.acc(), mul);
+  else
+    rewriter.replaceOpWithNewOp<AddFOp>(op, op.acc(), mul);
+}
+
+/// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul
+/// semantics to a reduction_size-unrolled sequence:
+/// ```
+///    %at = vector.transpose %a, [1, 0]
+///    %bRow0 = vector.extract %b[0]
+///    %atRow0 = vector.extract %at[0]
+///    %c0 = vector.outerproduct %atRow0, %bRow0, %c
+///    ...
+///    %bRowK = vector.extract %b[K]
+///    %atRowK = vector.extract %at[K]
+///    %cK = vector.outerproduct %atRowK, %bRowK, %cK-1
+/// ```
+///
+/// This only kicks in when VectorTransformsOptions is set to OuterProduct and
+/// the vector.contract op is a row-major matrix multiply.
+void ContractionOpToOuterProductOpLowering::rewrite(
+    vector::ContractionOp op, PatternRewriter &rewriter) const {
+  VectorType lhsType = op.getLhsType();
+  // TODO(ntv) other modes.
+  // We know we are in row-major.
+  bool transposeLhs = false;
+  unsigned reductionSize =
+      transposeLhs ? lhsType.getShape()[0] : lhsType.getShape()[1];
+
+  // If transposeLhs == false (i.e. lhs(m, reductionSize)), we need to
+  // transpose it to extract the proper vector<m x f32>. Otherwise, just take
+  // the lhs.
+  Value lhs = transposeLhs
+                  ? op.lhs()
+                  : rewriter.create<vector::TransposeOp>(
+                        op.getLoc(), op.lhs(), ArrayRef<int64_t>{1, 0});
+  Value res = op.acc();
+  // ExtractOp does not allow dynamic indexing, we must unroll explicitly.
+  for (unsigned k = 0; k < reductionSize; ++k) {
+    Value a = rewriter.create<vector::ExtractOp>(op.getLoc(), lhs, k);
+    Value b = rewriter.create<vector::ExtractOp>(op.getLoc(), op.rhs(), k);
+    res = rewriter.create<vector::OuterProductOp>(op.getLoc(), a, b, res);
+  }
+  rewriter.replaceOp(op, res);
+}
+
+LogicalResult
+ContractionOpToOuterProductOpLowering ::match(vector::ContractionOp op) const {
+  // TODO(ajcbik): implement masks
+  if (llvm::size(op.masks()) != 0)
+    return failure();
+
+  if (vectorTransformsOptions.vectorContractLowering !=
+          vector::VectorContractLowering::OuterProduct ||
+      !isRowMajorMatmul(op.indexing_maps()))
+    return failure();
+  return success();
+}
+
+/// Progressive lowering of ContractionOp.
+/// One:
+///   %x = vector.contract with at least one free/batch dimension
+/// is replaced by:
+///   %a = vector.contract with one less free/batch dimension
+///   %b = vector.contract with one less free/batch dimension
+///   ..
+///   %x = combine %a %b ..
+/// until a pure contraction is reached (no free/batch dimensions),
+/// which is replaced by a fma/reduction op.
+///
+/// TODO(ajcbik): break down into transpose/reshape/cast ops
+///               when they become available to avoid code dup
+/// TODO(ajcbik): investigate lowering order impact on performance
+LogicalResult
+ContractionOpLowering::matchAndRewrite(vector::ContractionOp op,
+                                       PatternRewriter &rewriter) const {
+
+  // TODO(ajcbik): implement masks.
+  if (llvm::size(op.masks()) != 0)
+    return failure();
+
+  // TODO(ntv, ajcbik): implement benefits, cost models.
+  MLIRContext *ctx = op.getContext();
+  ContractionOpToMatmulOpLowering pat1(vectorTransformsOptions, ctx);
+  if (succeeded(pat1.match(op)))
+    return failure();
+  ContractionOpToOuterProductOpLowering pat2(vectorTransformsOptions, ctx);
+  if (succeeded(pat2.match(op)))
+    return failure();
+
+  // Find first batch dimension in LHS/RHS, and lower when found.
+  std::vector<std::pair<int64_t, int64_t>> batchDimMap = op.getBatchDimMap();
+  if (!batchDimMap.empty()) {
+    int64_t lhsIndex = batchDimMap[0].first;
+    int64_t rhsIndex = batchDimMap[0].second;
+    rewriter.replaceOp(op, lowerParallel(op, lhsIndex, rhsIndex, rewriter));
+    return success();
+  }
+
+  // Collect contracting dimensions.
+  std::vector<std::pair<int64_t, int64_t>> contractingDimMap =
+      op.getContractingDimMap();
+  DenseSet<int64_t> lhsContractingDimSet;
+  DenseSet<int64_t> rhsContractingDimSet;
+  for (auto &dimPair : contractingDimMap) {
+    lhsContractingDimSet.insert(dimPair.first);
+    rhsContractingDimSet.insert(dimPair.second);
+  }
+
+  // Find first free dimension in LHS, and lower when found.
+  VectorType lhsType = op.getLhsType();
+  for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e; ++lhsIndex) {
+    if (lhsContractingDimSet.count(lhsIndex) == 0) {
+      rewriter.replaceOp(
+          op, lowerParallel(op, lhsIndex, /*rhsIndex=*/-1, rewriter));
+      return success();
+    }
+  }
+
+  // Find first free dimension in RHS, and lower when found.
+  VectorType rhsType = op.getRhsType();
+  for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e; ++rhsIndex) {
+    if (rhsContractingDimSet.count(rhsIndex) == 0) {
+      rewriter.replaceOp(
+          op, lowerParallel(op, /*lhsIndex=*/-1, rhsIndex, rewriter));
+      return success();
+    }
+  }
+
+  // Lower the first remaining reduction dimension.
+  if (!contractingDimMap.empty()) {
+    rewriter.replaceOp(op, lowerReduction(op, rewriter));
+    return success();
+  }
+
+  return failure();
+}
+
+// Lower one parallel dimension.
+// TODO(ajcbik): consider reusing existing contract unrolling
+Value ContractionOpLowering::lowerParallel(vector::ContractionOp op,
+                                           int64_t lhsIndex, int64_t rhsIndex,
+                                           PatternRewriter &rewriter) const {
+  VectorType lhsType = op.getLhsType();
+  VectorType rhsType = op.getRhsType();
+  VectorType resType = op.getResultType().cast<VectorType>();
+  // Find the iterator type index and result index.
+  SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
+  int64_t iterIndex = -1;
+  int64_t dimSize = -1;
+  if (lhsIndex >= 0) {
+    iterIndex = iMap[0].getResult(lhsIndex).cast<AffineDimExpr>().getPosition();
+    assert(
+        (rhsIndex < 0 ||
+         iterIndex ==
+             iMap[1].getResult(rhsIndex).cast<AffineDimExpr>().getPosition()) &&
+        "parallel index should be free in LHS or batch in LHS/RHS");
+    dimSize = lhsType.getDimSize(lhsIndex);
+  } else {
+    assert(rhsIndex >= 0 && "missing parallel index");
+    iterIndex = iMap[1].getResult(rhsIndex).cast<AffineDimExpr>().getPosition();
+    dimSize = rhsType.getDimSize(rhsIndex);
+  }
+  assert(iterIndex >= 0 && "parallel index not listed in operand mapping");
+  Optional<int64_t> lookup = getResultIndex(iMap[2], iterIndex);
+  assert(lookup.hasValue() && "parallel index not listed in reduction");
+  int64_t resIndex = lookup.getValue();
+  // Construct new iterator types and affine map array attribute.
+  SmallVector<AffineMap, 4> lowIndexingMaps;
+  lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
+  lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
+  lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
+  auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
+  auto lowIter =
+      rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
+  // Unroll into a series of lower dimensional vector.contract ops.
+  Location loc = op.getLoc();
+  Value result =
+      rewriter.create<ConstantOp>(loc, resType, rewriter.getZeroAttr(resType));
+  for (int64_t d = 0; d < dimSize; ++d) {
+    auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
+    auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
+    auto acc = reshapeLoad(loc, op.acc(), resType, resIndex, d, rewriter);
+    Value lowContract = rewriter.create<vector::ContractionOp>(
+        loc, lhs, rhs, acc, lowAffine, lowIter);
+    result =
+        reshapeStore(loc, lowContract, result, resType, resIndex, d, rewriter);
+  }
+  return result;
+}
+
+// Lower one reduction dimension.
+Value ContractionOpLowering::lowerReduction(vector::ContractionOp op,
+                                            PatternRewriter &rewriter) const {
+  auto loc = op.getLoc();
+  VectorType lhsType = op.getLhsType();
+  VectorType rhsType = op.getRhsType();
+  Type resType = op.getResultType();
+  assert(!resType.isa<VectorType>());
+  // Use iterator index 0.
+  int64_t iterIndex = 0;
+  SmallVector<AffineMap, 4> iMap = op.getIndexingMaps();
+  Optional<int64_t> lookupLhs = getResultIndex(iMap[0], iterIndex);
+  Optional<int64_t> lookupRhs = getResultIndex(iMap[1], iterIndex);
+  assert(lookupLhs.hasValue() && "missing LHS reduction index");
+  assert(lookupRhs.hasValue() && "missing RHS reduction index");
+  int64_t lhsIndex = lookupLhs.getValue();
+  int64_t rhsIndex = lookupRhs.getValue();
+  int64_t dimSize = lhsType.getDimSize(lhsIndex);
+  assert(dimSize == rhsType.getDimSize(rhsIndex) && "corrupt shape");
+  // Base case: rank-1 operands become an fma and an "add" reduction with acc.
+  if (lhsType.getRank() == 1) {
+    assert(rhsType.getRank() == 1 && "corrupt contraction");
+    Value zero = rewriter.create<ConstantOp>(loc, lhsType,
+                                             rewriter.getZeroAttr(lhsType));
+    Value fma = rewriter.create<vector::FMAOp>(loc, op.lhs(), op.rhs(), zero);
+    StringAttr kind = rewriter.getStringAttr("add");
+    return rewriter.create<vector::ReductionOp>(loc, resType, kind, fma,
+                                                op.acc());
+  }
+  // Construct new iterator types and affine map array attribute.
+  SmallVector<AffineMap, 4> lowIndexingMaps;
+  lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter));
+  lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter));
+  lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter));
+  auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps);
+  auto lowIter =
+      rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex));
+  // Unroll into a series of lower dimensional vector.contract ops.
+  // By feeding the initial accumulator into the first contraction,
+  // and the result of each contraction into the next, eventually
+  // the sum of all reductions is computed.
+  Value result = op.acc();
+  for (int64_t d = 0; d < dimSize; ++d) {
+    auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter);
+    auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter);
+    result = rewriter.create<vector::ContractionOp>(loc, lhs, rhs, result,
+                                                    lowAffine, lowIter);
+  }
+  return result;
+}
+
+} // namespace mlir
+
 // TODO(andydavis) Add pattern to rewrite ExtractSlices(ConstantMaskOp).
 // TODO(andydavis) Add this as DRR pattern.
 void mlir::vector::populateVectorToVectorTransformationPatterns(
@@ -1685,6 +1758,8 @@ void mlir::vector::populateVectorContractLoweringPatterns(
                   ShapeCastOp2DDownCastRewritePattern,
                   ShapeCastOp2DUpCastRewritePattern,
                   TransposeOpLowering>(context);
+  patterns.insert<ContractionOpLowering,
+                  ContractionOpToMatmulOpLowering,
+                  ContractionOpToOuterProductOpLowering>(parameters, context);
   // clang-format on
-  patterns.insert<ContractionOpLowering>(parameters, context);
 }
diff --git a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir
index 72270dab11538..7eea3baa8d87c 100644
--- a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt %s -test-vector-contraction-conversion | FileCheck %s
-// RUN: mlir-opt %s -test-vector-contraction-conversion=vector-lower-matrix-intrinsics=1 | FileCheck %s --check-prefix=MATRIX
+// RUN: mlir-opt %s -test-vector-contraction-conversion | FileCheck %s --dump-input-on-failure
+// RUN: mlir-opt %s -test-vector-contraction-conversion=vector-lower-matrix-intrinsics=1 | FileCheck %s --check-prefix=MATRIX --dump-input-on-failure
+// RUN: mlir-opt %s -test-vector-contraction-conversion=vector-outerproduct=1 | FileCheck %s --check-prefix=OUTERPRODUCT --dump-input-on-failure
 
 #dotp_accesses = [
   affine_map<(i) -> (i)>,
@@ -382,6 +383,35 @@ func @shape_casts(%a: vector<2x2xf32>) -> (vector<4xf32>, vector<2x2xf32>) {
 //      MATRIX:  %[[mm4:.*]] = vector.extract_strided_slice %[[mm1]] {offsets = [3], sizes = [3], strides = [1]} : vector<6xf32> to vector<3xf32>
 //      MATRIX:  %[[mm5:.*]] = vector.insert %[[mm4]], %[[mm3]] [1] : vector<3xf32> into vector<2x3xf32>
 //      MATRIX:  %[[mm6:.*]] = addf %[[C]], %[[mm5]] : vector<2x3xf32>
+
+// OUTERPRODUCT-LABEL: func @matmul
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x4xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<4x3xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<2x3xf32>
+//      OUTERPRODUCT: %[[At:.*]] = vector.transpose %[[A]], [1, 0]
+// OUTERPRODUCT-SAME:  : vector<2x4xf32> to vector<4x2xf32>
+//
+//      OUTERPRODUCT: %[[a0:.*]] = vector.extract %[[At]][0] : vector<4x2xf32>
+//      OUTERPRODUCT: %[[b0:.*]] = vector.extract %[[B]][0] : vector<4x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
+// OUTERPRODUCT-SAME:  : vector<2xf32>, vector<3xf32>
+//
+//      OUTERPRODUCT: %[[a1:.*]] = vector.extract %[[At]][1] : vector<4x2xf32>
+//      OUTERPRODUCT: %[[b1:.*]] = vector.extract %[[B]][1] : vector<4x3xf32>
+//      OUTERPRODUCT: %[[c1:.*]] = vector.outerproduct %[[a1]], %[[b1]], %[[c0]]
+// OUTERPRODUCT-SAME:  : vector<2xf32>, vector<3xf32>
+//
+//      OUTERPRODUCT: %[[a2:.*]] = vector.extract %[[At]][2] : vector<4x2xf32>
+//      OUTERPRODUCT: %[[b2:.*]] = vector.extract %[[B]][2] : vector<4x3xf32>
+//      OUTERPRODUCT: %[[c2:.*]] = vector.outerproduct %[[a2]], %[[b2]], %[[c1]]
+// OUTERPRODUCT-SAME:  : vector<2xf32>, vector<3xf32>
+//
+//      OUTERPRODUCT: %[[a3:.*]] = vector.extract %[[At]][3] : vector<4x2xf32>
+//      OUTERPRODUCT: %[[b3:.*]] = vector.extract %[[B]][3] : vector<4x3xf32>
+//      OUTERPRODUCT: %[[c3:.*]] = vector.outerproduct %[[a3]], %[[b3]], %[[c2]]
+// OUTERPRODUCT-SAME:  : vector<2xf32>, vector<3xf32>
+//
+//      OUTERPRODUCT: return %[[c3]] : vector<2x3xf32>
 func @matmul(%arg0: vector<2x4xf32>,
                           %arg1: vector<4x3xf32>,
                           %arg2: vector<2x3xf32>) -> vector<2x3xf32> {
diff --git a/mlir/test/lib/Transforms/TestVectorTransforms.cpp b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
index c57540bc2ef70..65024dbe3acda 100644
--- a/mlir/test/lib/Transforms/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
@@ -51,11 +51,26 @@ struct TestVectorContractionConversion
       *this, "vector-lower-matrix-intrinsics",
       llvm::cl::desc("Lower vector.contract to llvm.intr.matrix.multiply"),
       llvm::cl::init(false)};
+  Option<bool> lowerToOuterProduct{
+      *this, "vector-outerproduct",
+      llvm::cl::desc("Lower vector.contract to vector.outerproduct"),
+      llvm::cl::init(false)};
 
   void runOnFunction() override {
     OwningRewritePatternList patterns;
-    VectorTransformsOptions options{
-        /*lowerToLLVMMatrixIntrinsics=*/lowerToLLVMMatrixIntrinsics};
+    if (lowerToOuterProduct) {
+      VectorContractLowering lowering = VectorContractLowering::OuterProduct;
+      VectorTransformsOptions options{lowering};
+      patterns.insert<ContractionOpToOuterProductOpLowering>(options,
+                                                             &getContext());
+      applyPatternsAndFoldGreedily(getFunction(), patterns);
+      return;
+    }
+
+    VectorContractLowering lowering = VectorContractLowering::FMA;
+    if (lowerToLLVMMatrixIntrinsics)
+      lowering = VectorContractLowering::Matmul;
+    VectorTransformsOptions options{lowering};
     populateVectorContractLoweringPatterns(patterns, &getContext(), options);
     applyPatternsAndFoldGreedily(getFunction(), patterns);
   }

From 792575ff323b714d03215951c6fff105f1074aac Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Tue, 26 May 2020 14:47:02 +0100
Subject: [PATCH 097/770] [NFC][ARM][AArch64] More code size tests

Add analysis runs for icmp, fcmp and select instructions.
---
 llvm/test/Analysis/CostModel/AArch64/cmp.ll   |  52 +++
 .../test/Analysis/CostModel/AArch64/select.ll |  47 ++-
 llvm/test/Analysis/CostModel/ARM/cmps.ll      | 154 +++++++
 llvm/test/Analysis/CostModel/ARM/icmps.ll     |  56 ---
 llvm/test/Analysis/CostModel/ARM/select.ll    | 378 ++++++++++++------
 5 files changed, 492 insertions(+), 195 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/cmp.ll
 create mode 100644 llvm/test/Analysis/CostModel/ARM/cmps.ll
 delete mode 100644 llvm/test/Analysis/CostModel/ARM/icmps.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
new file mode 100644
index 0000000000000..c8512bb2664ce
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -cost-model -analyze -mtriple=aarch64-- < %s | FileCheck %s --check-prefix=CHECK-THROUGHPUT
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=aarch64-- < %s | FileCheck %s --check-prefix=CHECK-SIZE
+
+define i32 @cmps() {
+; CHECK-THROUGHPUT-LABEL: 'cmps'
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = icmp slt i8 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = icmp ult i16 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a2 = icmp sge i32 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a3 = icmp ne i64 undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4 = icmp slt <16 x i8> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = icmp ult <8 x i16> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a6 = icmp sge <4 x i32> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-SIZE-LABEL: 'cmps'
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = icmp slt i8 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a1 = icmp ult i16 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a2 = icmp sge i32 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a3 = icmp ne i64 undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a4 = icmp slt <16 x i8> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = icmp ult <8 x i16> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a6 = icmp sge <4 x i32> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %a0 = icmp slt i8 undef, undef
+  %a1 = icmp ult i16 undef, undef
+  %a2 = icmp sge i32 undef, undef
+  %a3 = icmp ne i64 undef, undef
+  %a4 = icmp slt <16 x i8> undef, undef
+  %a5 = icmp ult <8 x i16> undef, undef
+  %a6 = icmp sge <4 x i32> undef, undef
+  %a7 = fcmp oge half undef, undef
+  %a8 = fcmp ogt float undef, undef
+  %a9 = fcmp ogt double undef, undef
+  %a10 = fcmp olt <8 x half> undef, undef
+  %a11 = fcmp oge <4 x float> undef, undef
+  %a12 = fcmp oge <2 x double> undef, undef
+  ret i32 undef
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/select.ll b/llvm/test/Analysis/CostModel/AArch64/select.ll
index 1a1248e661c58..25af9af1c6e93 100644
--- a/llvm/test/Analysis/CostModel/AArch64/select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/select.ll
@@ -1,37 +1,56 @@
-; RUN: opt < %s  -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s  -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-THROUGHPUT
+; RUN: opt < %s  -cost-model -analyze -cost-kind=code-size -mtriple=aarch64-- | FileCheck %s --check-prefix=CHECK-SIZE
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 
-; CHECK-LABEL: select
 define void @select() {
     ; Scalar values
-  ; CHECK: cost of 1 {{.*}} select
+; CHECK-THROUGHPUT-LABEL: 'select'
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-SIZE-LABEL: 'select'
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
   %v1 = select i1 undef, i8 undef, i8 undef
-  ; CHECK: cost of 1 {{.*}} select
   %v2 = select i1 undef, i16 undef, i16 undef
-  ; CHECK: cost of 1 {{.*}} select
   %v3 = select i1 undef, i32 undef, i32 undef
-  ; CHECK: cost of 1 {{.*}} select
   %v4 = select i1 undef, i64 undef, i64 undef
-  ; CHECK: cost of 1 {{.*}} select
   %v5 = select i1 undef, float undef, float undef
-  ; CHECK: cost of 1 {{.*}} select
   %v6 = select i1 undef, double undef, double undef
 
-  ; CHECK: cost of 16 {{.*}} select
   %v13b = select <16 x i1>  undef, <16 x i16> undef, <16 x i16> undef
 
-  ; CHECK: cost of 8 {{.*}} select
   %v15b = select <8 x i1>  undef, <8 x i32> undef, <8 x i32> undef
-  ; CHECK: cost of 16 {{.*}} select
   %v15c = select <16 x i1>  undef, <16 x i32> undef, <16 x i32> undef
 
   ; Vector values - check for vectors of i64s that have a high cost because
   ; they end up scalarized.
-  ; CHECK: cost of 80 {{.*}} select
   %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-  ; CHECK: cost of 160 {{.*}} select
   %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-  ; CHECK: cost of 320 {{.*}} select
   %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
 
   ret void
diff --git a/llvm/test/Analysis/CostModel/ARM/cmps.ll b/llvm/test/Analysis/CostModel/ARM/cmps.ll
new file mode 100644
index 0000000000000..d7d84b2388b0e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/cmps.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
+; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
+; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
+
+define i32 @cmps() {
+; CHECK-MVE-RECIP-LABEL: 'cmps'
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-MAIN-RECIP-LABEL: 'cmps'
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8M-BASE-RECIP-LABEL: 'cmps'
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-V8R-RECIP-LABEL: 'cmps'
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-V8R-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-MVE-SIZE-LABEL: 'cmps'
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; CHECK-V8M-MAIN-SIZE-LABEL: 'cmps'
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-V8M-MAIN-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; CHECK-V8M-BASE-SIZE-LABEL: 'cmps'
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-V8M-BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; CHECK-V8R-SIZE-LABEL: 'cmps'
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-V8R-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+  %a = icmp slt i8 undef, undef
+  %b = icmp ult i16 undef, undef
+  %c = icmp sge i32 undef, undef
+  %d = icmp ne i64 undef, undef
+  %e = icmp slt <16 x i8> undef, undef
+  %f = icmp ult <8 x i16> undef, undef
+  %g = icmp sge <4 x i32> undef, undef
+  %a7 = fcmp oge half undef, undef
+  %a8 = fcmp ogt float undef, undef
+  %a9 = fcmp ogt double undef, undef
+  %a10 = fcmp olt <8 x half> undef, undef
+  %a11 = fcmp oge <4 x float> undef, undef
+  %a12 = fcmp oge <2 x double> undef, undef
+  ret i32 undef
+}
diff --git a/llvm/test/Analysis/CostModel/ARM/icmps.ll b/llvm/test/Analysis/CostModel/ARM/icmps.ll
deleted file mode 100644
index 962ddb499347e..0000000000000
--- a/llvm/test/Analysis/CostModel/ARM/icmps.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
-; RUN: opt -cost-model -analyze -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
-; RUN: opt -cost-model -analyze -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R
-
-define i32 @icmps() {
-; CHECK-MVE-LABEL: 'icmps'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = icmp slt <16 x i8> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f = icmp ult <8 x i16> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %g = icmp sge <4 x i32> undef, undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; CHECK-V8M-MAIN-LABEL: 'icmps'
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %e = icmp slt <16 x i8> undef, undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %f = icmp ult <8 x i16> undef, undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %g = icmp sge <4 x i32> undef, undef
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; CHECK-V8M-BASE-LABEL: 'icmps'
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %e = icmp slt <16 x i8> undef, undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %f = icmp ult <8 x i16> undef, undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %g = icmp sge <4 x i32> undef, undef
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; CHECK-V8R-LABEL: 'icmps'
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a = icmp slt i8 undef, undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-  %a = icmp slt i8 undef, undef
-  %b = icmp ult i16 undef, undef
-  %c = icmp sge i32 undef, undef
-  %d = icmp ne i64 undef, undef
-  %e = icmp slt <16 x i8> undef, undef
-  %f = icmp ult <8 x i16> undef, undef
-  %g = icmp sge <4 x i32> undef, undef
-  ret i32 undef
-}
diff --git a/llvm/test/Analysis/CostModel/ARM/select.ll b/llvm/test/Analysis/CostModel/ARM/select.ll
index df62ff0f09aef..1e350f139afe8 100644
--- a/llvm/test/Analysis/CostModel/ARM/select.ll
+++ b/llvm/test/Analysis/CostModel/ARM/select.ll
@@ -1,135 +1,263 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE
-; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON
-; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1
-; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2
+; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-RECIP
+; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-RECIP
+; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-RECIP
+; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
+; RUN: opt < %s  -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp | FileCheck %s --check-prefix=CHECK-MVE-SIZE
+; RUN: opt < %s  -cost-model -analyze -cost-kind=code-size -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s --check-prefix=CHECK-NEON-SIZE
+; RUN: opt < %s  -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.base | FileCheck %s --check-prefix=CHECK-THUMB1-SIZE
+; RUN: opt < %s  -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
 
 define void @selects() {
-; CHECK-MVE-LABEL: 'selects'
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+  ; Scalar values
+; CHECK-MVE-RECIP-LABEL: 'selects'
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 576 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; CHECK-NEON-LABEL: 'selects'
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEON-RECIP-LABEL: 'selects'
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; CHECK-THUMB1-LABEL: 'selects'
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-THUMB1-RECIP-LABEL: 'selects'
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB1-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; CHECK-THUMB2-LABEL: 'selects'
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
-; CHECK-THUMB2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-THUMB2-RECIP-LABEL: 'selects'
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB2-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-MVE-SIZE-LABEL: 'selects'
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; CHECK-NEON-SIZE-LABEL: 'selects'
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; CHECK-THUMB1-SIZE-LABEL: 'selects'
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB1-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; CHECK-THUMB2-SIZE-LABEL: 'selects'
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = select i1 undef, i16 undef, i16 undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3 = select i1 undef, i32 undef, i32 undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-THUMB2-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  ; Scalar values
   %v1 = select i1 undef, i8 undef, i8 undef
   %v2 = select i1 undef, i16 undef, i16 undef
   %v3 = select i1 undef, i32 undef, i32 undef

From 222e0e58a87649623b3d16ce3fef56a6a0555be3 Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Mon, 25 May 2020 12:05:57 +0200
Subject: [PATCH 098/770] [MLIR] Helper class referencing MemRefType to unify
 runner implementations.

Summary:
Add DynamicMemRefType, which can reference either one of the statically ranked StridedMemRefTypes or an UnrankedMemRefType, so that runner utils only need to be implemented once.

There is definitely room for more cleanup and unification, but I will keep that for follow-ups.

Reviewers: nicolasvasilache

Reviewed By: nicolasvasilache

Subscribers: mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80513
---
 .../mlir/ExecutionEngine/CRunnerUtils.h       |  43 ++++-
 .../mlir/ExecutionEngine/RunnerUtils.h        | 150 ++++++++++--------
 mlir/lib/ExecutionEngine/RunnerUtils.cpp      |  47 +-----
 .../test/mlir-cpu-runner/unranked_memref.mlir |   4 -
 mlir/test/mlir-cpu-runner/utils.mlir          |   7 +-
 .../cuda-runtime-wrappers.cpp                 |  21 ++-
 6 files changed, 132 insertions(+), 140 deletions(-)

diff --git a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
index 8155820d63473..bc59d3de20860 100644
--- a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
@@ -33,12 +33,6 @@
 
 #include <cstdint>
 
-template <int N>
-void dropFront(int64_t arr[N], int64_t *res) {
-  for (unsigned i = 1; i < N; ++i)
-    *(res + i - 1) = arr[i];
-}
-
 //===----------------------------------------------------------------------===//
 // Codegen-compatible structures for Vector type.
 //===----------------------------------------------------------------------===//
@@ -129,6 +123,10 @@ struct StridedMemRefType {
     res.basePtr = basePtr;
     res.data = data;
     res.offset = offset + idx * strides[0];
+    auto dropFront = [](const int64_t *arr, int64_t *res) {
+      for (unsigned i = 1; i < N; ++i)
+        res[i - 1] = arr[i];
+    };
     dropFront<N>(sizes, res.sizes);
     dropFront<N>(strides, res.strides);
     return res;
@@ -164,6 +162,39 @@ struct UnrankedMemRefType {
   void *descriptor;
 };
 
+//===----------------------------------------------------------------------===//
+// DynamicMemRefType type.
+//===----------------------------------------------------------------------===//
+// A reference to one of the StridedMemRef types.
+template <typename T>
+class DynamicMemRefType {
+public:
+  explicit DynamicMemRefType(const StridedMemRefType<T, 0> &mem_ref)
+      : rank(0), basePtr(mem_ref.basePtr), data(mem_ref.data),
+        offset(mem_ref.offset), sizes(nullptr), strides(nullptr) {}
+  template <int N>
+  explicit DynamicMemRefType(const StridedMemRefType<T, N> &mem_ref)
+      : rank(N), basePtr(mem_ref.basePtr), data(mem_ref.data),
+        offset(mem_ref.offset), sizes(mem_ref.sizes), strides(mem_ref.strides) {
+  }
+  explicit DynamicMemRefType(const UnrankedMemRefType<T> &mem_ref)
+      : rank(mem_ref.rank) {
+    auto *desc = static_cast<StridedMemRefType<T, 1> *>(mem_ref.descriptor);
+    basePtr = desc->basePtr;
+    data = desc->data;
+    offset = desc->offset;
+    sizes = rank == 0 ? nullptr : desc->sizes;
+    strides = sizes + rank;
+  }
+
+  int64_t rank;
+  T *basePtr;
+  T *data;
+  int64_t offset;
+  const int64_t *sizes;
+  const int64_t *strides;
+};
+
 //===----------------------------------------------------------------------===//
 // Small runtime support "lib" for vector.print lowering during codegen.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/ExecutionEngine/RunnerUtils.h b/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
index 5f239a4c146ea..7729b9c887967 100644
--- a/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
@@ -35,29 +35,35 @@
 
 #include "mlir/ExecutionEngine/CRunnerUtils.h"
 
-template <typename StreamType, typename T, int N>
-void printMemRefMetaData(StreamType &os, StridedMemRefType<T, N> &V) {
-  static_assert(N > 0, "Expected N > 0");
-  os << "Memref base@ = " << reinterpret_cast<void *>(V.data) << " rank = " << N
-     << " offset = " << V.offset << " sizes = [" << V.sizes[0];
-  for (unsigned i = 1; i < N; ++i)
-    os << ", " << V.sizes[i];
-  os << "] strides = [" << V.strides[0];
-  for (unsigned i = 1; i < N; ++i)
-    os << ", " << V.strides[i];
+template <typename T, typename StreamType>
+void printMemRefMetaData(StreamType &os, const DynamicMemRefType<T> &V) {
+  os << "base@ = " << reinterpret_cast<void *>(V.data) << " rank = " << V.rank
+     << " offset = " << V.offset;
+  auto print = [&](const int64_t *ptr) {
+    if (V.rank == 0)
+      return;
+    os << ptr[0];
+    for (int64_t i = 1; i < V.rank; ++i)
+      os << ", " << ptr[i];
+  };
+  os << " sizes = [";
+  print(V.sizes);
+  os << "] strides = [";
+  print(V.strides);
   os << "]";
 }
 
-template <typename StreamType, typename T>
-void printMemRefMetaData(StreamType &os, StridedMemRefType<T, 0> &V) {
-  os << "Memref base@ = " << reinterpret_cast<void *>(V.data) << " rank = 0"
-     << " offset = " << V.offset;
+template <typename StreamType, typename T, int N>
+void printMemRefMetaData(StreamType &os, StridedMemRefType<T, N> &V) {
+  static_assert(N >= 0, "Expected N > 0");
+  os << "MemRef ";
+  printMemRefMetaData(os, DynamicMemRefType<T>(V));
 }
 
-template <typename T, typename StreamType>
+template <typename StreamType, typename T>
 void printUnrankedMemRefMetaData(StreamType &os, UnrankedMemRefType<T> &V) {
-  os << "Unranked Memref rank = " << V.rank << " "
-     << "descriptor@ = " << reinterpret_cast<void *>(V.descriptor) << "\n";
+  os << "Unranked MemRef ";
+  printMemRefMetaData(os, DynamicMemRefType<T>(V));
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -118,88 +124,92 @@ std::ostream &operator<<(std::ostream &os, const Vector<T, M, Dims...> &v) {
   return os;
 }
 
-template <typename T, int N> struct MemRefDataPrinter {
-  static void print(std::ostream &os, T *base, int64_t rank, int64_t offset,
-                    int64_t *sizes, int64_t *strides);
-  static void printFirst(std::ostream &os, T *base, int64_t rank,
-                         int64_t offset, int64_t *sizes, int64_t *strides);
-  static void printLast(std::ostream &os, T *base, int64_t rank, int64_t offset,
-                        int64_t *sizes, int64_t *strides);
-};
-
-template <typename T> struct MemRefDataPrinter<T, 0> {
-  static void print(std::ostream &os, T *base, int64_t rank, int64_t offset,
-                    int64_t *sizes = nullptr, int64_t *strides = nullptr);
+template <typename T>
+struct MemRefDataPrinter {
+  static void print(std::ostream &os, T *base, int64_t dim, int64_t rank,
+                    int64_t offset, const int64_t *sizes,
+                    const int64_t *strides);
+  static void printFirst(std::ostream &os, T *base, int64_t dim, int64_t rank,
+                         int64_t offset, const int64_t *sizes,
+                         const int64_t *strides);
+  static void printLast(std::ostream &os, T *base, int64_t dim, int64_t rank,
+                        int64_t offset, const int64_t *sizes,
+                        const int64_t *strides);
 };
 
-template <typename T, int N>
-void MemRefDataPrinter<T, N>::printFirst(std::ostream &os, T *base,
-                                         int64_t rank, int64_t offset,
-                                         int64_t *sizes, int64_t *strides) {
+template <typename T>
+void MemRefDataPrinter<T>::printFirst(std::ostream &os, T *base, int64_t dim,
+                                      int64_t rank, int64_t offset,
+                                      const int64_t *sizes,
+                                      const int64_t *strides) {
   os << "[";
-  MemRefDataPrinter<T, N - 1>::print(os, base, rank, offset, sizes + 1,
-                                     strides + 1);
+  print(os, base, dim - 1, rank, offset, sizes + 1, strides + 1);
   // If single element, close square bracket and return early.
   if (sizes[0] <= 1) {
     os << "]";
     return;
   }
   os << ", ";
-  if (N > 1)
+  if (dim > 1)
     os << "\n";
 }
 
-template <typename T, int N>
-void MemRefDataPrinter<T, N>::print(std::ostream &os, T *base, int64_t rank,
-                                    int64_t offset, int64_t *sizes,
-                                    int64_t *strides) {
-  printFirst(os, base, rank, offset, sizes, strides);
+template <typename T>
+void MemRefDataPrinter<T>::print(std::ostream &os, T *base, int64_t dim,
+                                 int64_t rank, int64_t offset,
+                                 const int64_t *sizes, const int64_t *strides) {
+  if (dim == 0) {
+    os << base[offset];
+    return;
+  }
+  printFirst(os, base, dim, rank, offset, sizes, strides);
   for (unsigned i = 1; i + 1 < sizes[0]; ++i) {
-    printSpace(os, rank - N + 1);
-    MemRefDataPrinter<T, N - 1>::print(os, base, rank, offset + i * strides[0],
-                                       sizes + 1, strides + 1);
+    printSpace(os, rank - dim + 1);
+    print(os, base, dim - 1, rank, offset + i * strides[0], sizes + 1,
+          strides + 1);
     os << ", ";
-    if (N > 1)
+    if (dim > 1)
       os << "\n";
   }
   if (sizes[0] <= 1)
     return;
-  printLast(os, base, rank, offset, sizes, strides);
+  printLast(os, base, dim, rank, offset, sizes, strides);
 }
 
-template <typename T, int N>
-void MemRefDataPrinter<T, N>::printLast(std::ostream &os, T *base, int64_t rank,
-                                        int64_t offset, int64_t *sizes,
-                                        int64_t *strides) {
-  printSpace(os, rank - N + 1);
-  MemRefDataPrinter<T, N - 1>::print(os, base, rank,
-                                     offset + (sizes[0] - 1) * (*strides),
-                                     sizes + 1, strides + 1);
+template <typename T>
+void MemRefDataPrinter<T>::printLast(std::ostream &os, T *base, int64_t dim,
+                                     int64_t rank, int64_t offset,
+                                     const int64_t *sizes,
+                                     const int64_t *strides) {
+  printSpace(os, rank - dim + 1);
+  print(os, base, dim - 1, rank, offset + (sizes[0] - 1) * (*strides),
+        sizes + 1, strides + 1);
   os << "]";
 }
 
 template <typename T>
-void MemRefDataPrinter<T, 0>::print(std::ostream &os, T *base, int64_t rank,
-                                    int64_t offset, int64_t *sizes,
-                                    int64_t *strides) {
-  os << base[offset];
-}
-
-template <typename T, int N> void printMemRef(StridedMemRefType<T, N> &M) {
-  static_assert(N > 0, "Expected N > 0");
+void printMemRef(const DynamicMemRefType<T> &M) {
   printMemRefMetaData(std::cout, M);
   std::cout << " data = " << std::endl;
-  MemRefDataPrinter<T, N>::print(std::cout, M.data, N, M.offset, M.sizes,
-                                 M.strides);
+  if (M.rank == 0)
+    std::cout << "[";
+  MemRefDataPrinter<T>::print(std::cout, M.data, M.rank, M.rank, M.offset,
+                              M.sizes, M.strides);
+  if (M.rank == 0)
+    std::cout << "]";
   std::cout << std::endl;
 }
 
-template <typename T> void printMemRef(StridedMemRefType<T, 0> &M) {
-  printMemRefMetaData(std::cout, M);
-  std::cout << " data = " << std::endl;
-  std::cout << "[";
-  MemRefDataPrinter<T, 0>::print(std::cout, M.data, 0, M.offset);
-  std::cout << "]" << std::endl;
+template <typename T, int N>
+void printMemRef(StridedMemRefType<T, N> &M) {
+  std::cout << "Memref ";
+  printMemRef(DynamicMemRefType<T>(M));
+}
+
+template <typename T>
+void printMemRef(UnrankedMemRefType<T> &M) {
+  std::cout << "Unranked Memref ";
+  printMemRef(DynamicMemRefType<T>(M));
 }
 } // namespace impl
 
diff --git a/mlir/lib/ExecutionEngine/RunnerUtils.cpp b/mlir/lib/ExecutionEngine/RunnerUtils.cpp
index 7991eca61994f..7497ebdacf689 100644
--- a/mlir/lib/ExecutionEngine/RunnerUtils.cpp
+++ b/mlir/lib/ExecutionEngine/RunnerUtils.cpp
@@ -24,57 +24,16 @@ extern "C" void _mlir_ciface_print_memref_vector_4x4xf32(
   impl::printMemRef(*M);
 }
 
-#define MEMREF_CASE(TYPE, RANK)                                                \
-  case RANK:                                                                   \
-    impl::printMemRef(*(static_cast<StridedMemRefType<TYPE, RANK> *>(ptr)));   \
-    break
-
 extern "C" void _mlir_ciface_print_memref_i8(UnrankedMemRefType<int8_t> *M) {
-  printUnrankedMemRefMetaData(std::cout, *M);
-  int64_t rank = M->rank;
-  void *ptr = M->descriptor;
-
-  switch (rank) {
-    MEMREF_CASE(int8_t, 0);
-    MEMREF_CASE(int8_t, 1);
-    MEMREF_CASE(int8_t, 2);
-    MEMREF_CASE(int8_t, 3);
-    MEMREF_CASE(int8_t, 4);
-  default:
-    assert(0 && "Unsupported rank to print");
-  }
+  impl::printMemRef(*M);
 }
 
 extern "C" void _mlir_ciface_print_memref_i32(UnrankedMemRefType<int32_t> *M) {
-  printUnrankedMemRefMetaData(std::cout, *M);
-  int64_t rank = M->rank;
-  void *ptr = M->descriptor;
-
-  switch (rank) {
-    MEMREF_CASE(int32_t, 0);
-    MEMREF_CASE(int32_t, 1);
-    MEMREF_CASE(int32_t, 2);
-    MEMREF_CASE(int32_t, 3);
-    MEMREF_CASE(int32_t, 4);
-  default:
-    assert(0 && "Unsupported rank to print");
-  }
+  impl::printMemRef(*M);
 }
 
 extern "C" void _mlir_ciface_print_memref_f32(UnrankedMemRefType<float> *M) {
-  printUnrankedMemRefMetaData(std::cout, *M);
-  int64_t rank = M->rank;
-  void *ptr = M->descriptor;
-
-  switch (rank) {
-    MEMREF_CASE(float, 0);
-    MEMREF_CASE(float, 1);
-    MEMREF_CASE(float, 2);
-    MEMREF_CASE(float, 3);
-    MEMREF_CASE(float, 4);
-  default:
-    assert(0 && "Unsupported rank to print");
-  }
+  impl::printMemRef(*M);
 }
 
 extern "C" void print_memref_i32(int64_t rank, void *ptr) {
diff --git a/mlir/test/mlir-cpu-runner/unranked_memref.mlir b/mlir/test/mlir-cpu-runner/unranked_memref.mlir
index aa54b56b06b74..0eb68ac033687 100644
--- a/mlir/test/mlir-cpu-runner/unranked_memref.mlir
+++ b/mlir/test/mlir-cpu-runner/unranked_memref.mlir
@@ -1,25 +1,21 @@
 // RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm -convert-std-to-llvm | mlir-cpu-runner -e main -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_test_cblas%shlibext,%linalg_test_lib_dir/libmlir_test_cblas_interface%shlibext | FileCheck %s
 
-// CHECK: rank = 2
 // CHECK: rank = 2
 // CHECK-SAME: sizes = [10, 3]
 // CHECK-SAME: strides = [3, 1]
 // CHECK-COUNT-10: [10, 10, 10]
 //
 // CHECK: rank = 2
-// CHECK: rank = 2
 // CHECK-SAME: sizes = [10, 3]
 // CHECK-SAME: strides = [3, 1]
 // CHECK-COUNT-10: [5, 5, 5]
 //
 // CHECK: rank = 2
-// CHECK: rank = 2
 // CHECK-SAME: sizes = [10, 3]
 // CHECK-SAME: strides = [3, 1]
 // CHECK-COUNT-10: [2, 2, 2]
 //
 // CHECK: rank = 0
-// CHECK: rank = 0
 // 122 is ASCII for 'z'.
 // CHECK: [z]
 func @main() -> () {
diff --git a/mlir/test/mlir-cpu-runner/utils.mlir b/mlir/test/mlir-cpu-runner/utils.mlir
index d3ab6177eb65c..65957400bf7f2 100644
--- a/mlir/test/mlir-cpu-runner/utils.mlir
+++ b/mlir/test/mlir-cpu-runner/utils.mlir
@@ -12,8 +12,7 @@ func @print_0d() {
   dealloc %A : memref<f32>
   return
 }
-// PRINT-0D: Unranked Memref rank = 0 descriptor@ = {{.*}}
-// PRINT-0D: Memref base@ = {{.*}} rank = 0 offset = 0 data =
+// PRINT-0D: Unranked Memref base@ = {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
 // PRINT-0D: [2]
 
 func @print_1d() {
@@ -26,7 +25,7 @@ func @print_1d() {
   dealloc %A : memref<16xf32>
   return
 }
-// PRINT-1D: Memref base@ = {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
+// PRINT-1D: Unranked Memref base@ = {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
 // PRINT-1D-NEXT: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
 
 func @print_3d() {
@@ -43,7 +42,7 @@ func @print_3d() {
   dealloc %A : memref<3x4x5xf32>
   return
 }
-// PRINT-3D: Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [3, 4, 5] strides = [20, 5, 1] data =
+// PRINT-3D: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [3, 4, 5] strides = [20, 5, 1] data =
 // PRINT-3D-COUNT-4: {{.*[[:space:]].*}}2,    2,    2,    2,    2
 // PRINT-3D-COUNT-4: {{.*[[:space:]].*}}2,    2,    2,    2,    2
 // PRINT-3D-COUNT-2: {{.*[[:space:]].*}}2,    2,    2,    2,    2
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
index dbe78a55c0b1f..705fa9f00930a 100644
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
@@ -83,10 +83,10 @@ extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
 // Allows to register a MemRef with the CUDA runtime. Initializes array with
 // value. Helpful until we have transfer functions implemented.
 template <typename T>
-void mcuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
-                              llvm::ArrayRef<int64_t> strides, T value) {
-  assert(sizes.size() == strides.size());
-  llvm::SmallVector<int64_t, 4> denseStrides(strides.size());
+void mcuMemHostRegisterMemRef(const DynamicMemRefType<T> &mem_ref, T value) {
+  llvm::SmallVector<int64_t, 4> denseStrides(mem_ref.rank);
+  llvm::ArrayRef<int64_t> sizes(mem_ref.sizes, mem_ref.rank);
+  llvm::ArrayRef<int64_t> strides(mem_ref.strides, mem_ref.rank);
 
   std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
                    std::multiplies<int64_t>());
@@ -98,20 +98,17 @@ void mcuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
   denseStrides.back() = 1;
   assert(strides == llvm::makeArrayRef(denseStrides));
 
+  auto *pointer = mem_ref.data + mem_ref.offset;
   std::fill_n(pointer, count, value);
   mgpuMemHostRegister(pointer, count * sizeof(T));
 }
 
 extern "C" void mcuMemHostRegisterFloat(int64_t rank, void *ptr) {
-  auto *desc = static_cast<StridedMemRefType<float, 1> *>(ptr);
-  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
-  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
-  mcuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 1.23f);
+  UnrankedMemRefType<float> mem_ref = {rank, ptr};
+  mcuMemHostRegisterMemRef(DynamicMemRefType<float>(mem_ref), 1.23f);
 }
 
 extern "C" void mcuMemHostRegisterInt32(int64_t rank, void *ptr) {
-  auto *desc = static_cast<StridedMemRefType<int32_t, 1> *>(ptr);
-  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
-  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
-  mcuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 123);
+  UnrankedMemRefType<int32_t> mem_ref = {rank, ptr};
+  mcuMemHostRegisterMemRef(DynamicMemRefType<int32_t>(mem_ref), 123);
 }

From 2dd7714b8d264f6436b56582e4448f6a003a61fc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 19 May 2020 23:03:39 -0400
Subject: [PATCH 099/770] AMDGPU/GlobalISel: Don't select boolean phi by
 default

This is currently missing most of the hard parts to lower correctly,
so disable it for now. This fixes at least one OpenCL conformance test
and allows it to pass with fallback. Hide this behind an option for
now.
---
 .../Target/AMDGPU/AMDGPUInstructionSelector.cpp    | 14 ++++++++++++++
 .../AMDGPU/GlobalISel/divergent-control-flow.ll    |  2 +-
 .../CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir  |  2 +-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 95795f87faaf2..242a108f156c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -39,6 +39,12 @@
 using namespace llvm;
 using namespace MIPatternMatch;
 
+static cl::opt<bool> AllowRiskySelect(
+  "amdgpu-global-isel-risky-select",
+  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
+  cl::init(false),
+  cl::ReallyHidden);
+
 #define GET_GLOBALISEL_IMPL
 #define AMDGPUSubtarget GCNSubtarget
 #include "AMDGPUGenGlobalISel.inc"
@@ -196,6 +202,14 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
   const Register DefReg = I.getOperand(0).getReg();
   const LLT DefTy = MRI->getType(DefReg);
+  if (DefTy == LLT::scalar(1)) {
+    if (!AllowRiskySelect) {
+      LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
+      return false;
+    }
+
+    LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
+  }
 
   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index bd313de000df4..9c47fab05aa05 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
 
 ; Make sure the branch targets are correct after lowering llvm.amdgcn.if
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
index c0bfa38812226..b4ef0caebfc13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
+# RUN: llc -march=amdgcn -amdgpu-global-isel-risky-select -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
 
 ---
 name:            g_phi_s32_ss_sbranch

From 099a875f28d0131a6ae85af91b9eb8627917fbbe Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Tue, 26 May 2020 09:58:25 -0500
Subject: [PATCH 100/770] [PowerPC] Unaligned FP default should apply to
 scalars only

As reported in PR45186, we could be in a situation where we don't
want to handle unaligned memory accesses for FP scalars but still
have VSX (which allows unaligned access for vectors). Change the
default to only apply to scalars.

Fixes: https://bugs.llvm.org/show_bug.cgi?id=45186
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |   3 +-
 llvm/test/CodeGen/PowerPC/pr45186.ll        | 132 ++++++++++++++++++++
 2 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr45186.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2f9ff293c2775..42df83831113a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15659,7 +15659,8 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (!VT.isSimple())
     return false;
 
-  if (VT.isFloatingPoint() && !Subtarget.allowsUnalignedFPAccess())
+  if (VT.isFloatingPoint() && !VT.isVector() &&
+      !Subtarget.allowsUnalignedFPAccess())
     return false;
 
   if (VT.getSimpleVT().isVector()) {
diff --git a/llvm/test/CodeGen/PowerPC/pr45186.ll b/llvm/test/CodeGen/PowerPC/pr45186.ll
new file mode 100644
index 0000000000000..92f748e3ef5a3
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr45186.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -ppc-asm-full-reg-names -mtriple=powerpc64-- -mattr=+vsx \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
+%struct.anon = type { i64, i64 }
+
+@d = local_unnamed_addr global %struct.anon zeroinitializer, align 8
+
+; Function Attrs: norecurse nounwind readonly
+define i64 @e(i8* nocapture readonly %f) local_unnamed_addr #0 {
+; CHECK-LABEL: e:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ldx r3, 0, r3
+; CHECK-NEXT:    blr
+entry:
+  %0 = load i8, i8* %f, align 1
+  %conv = zext i8 %0 to i64
+  %shl = shl nuw i64 %conv, 56
+  %arrayidx1 = getelementptr inbounds i8, i8* %f, i64 1
+  %1 = load i8, i8* %arrayidx1, align 1
+  %conv2 = zext i8 %1 to i64
+  %shl3 = shl nuw nsw i64 %conv2, 48
+  %or = or i64 %shl3, %shl
+  %arrayidx4 = getelementptr inbounds i8, i8* %f, i64 2
+  %2 = load i8, i8* %arrayidx4, align 1
+  %conv5 = zext i8 %2 to i64
+  %shl6 = shl nuw nsw i64 %conv5, 40
+  %or7 = or i64 %or, %shl6
+  %arrayidx8 = getelementptr inbounds i8, i8* %f, i64 3
+  %3 = load i8, i8* %arrayidx8, align 1
+  %conv9 = zext i8 %3 to i64
+  %shl10 = shl nuw nsw i64 %conv9, 32
+  %or11 = or i64 %or7, %shl10
+  %arrayidx12 = getelementptr inbounds i8, i8* %f, i64 4
+  %4 = load i8, i8* %arrayidx12, align 1
+  %conv13 = zext i8 %4 to i64
+  %shl14 = shl nuw nsw i64 %conv13, 24
+  %or15 = or i64 %or11, %shl14
+  %arrayidx16 = getelementptr inbounds i8, i8* %f, i64 5
+  %5 = load i8, i8* %arrayidx16, align 1
+  %conv17 = zext i8 %5 to i64
+  %shl18 = shl nuw nsw i64 %conv17, 16
+  %or20 = or i64 %or15, %shl18
+  %arrayidx21 = getelementptr inbounds i8, i8* %f, i64 6
+  %6 = load i8, i8* %arrayidx21, align 1
+  %conv22 = zext i8 %6 to i64
+  %shl23 = shl nuw nsw i64 %conv22, 8
+  %or25 = or i64 %or20, %shl23
+  %arrayidx26 = getelementptr inbounds i8, i8* %f, i64 7
+  %7 = load i8, i8* %arrayidx26, align 1
+  %conv27 = zext i8 %7 to i64
+  %or28 = or i64 %or25, %conv27
+  ret i64 %or28
+}
+
+; Function Attrs: nofree norecurse nounwind
+define void @g() local_unnamed_addr #0 {
+; CHECK-LABEL: g:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis r3, r2, .LC0@toc@ha
+; CHECK-NEXT:    addis r4, r2, .LC1@toc@ha
+; CHECK-NEXT:    ld r3, .LC0@toc@l(r3)
+; CHECK-NEXT:    ld r4, .LC1@toc@l(r4)
+; CHECK-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-NEXT:    stxvd2x vs0, 0, r4
+; CHECK-NEXT:    blr
+entry:
+  %0 = load i8, i8* getelementptr inbounds (i8, i8* bitcast (void ()* @g to i8*), i64 8), align 1
+  %conv.i = zext i8 %0 to i64
+  %shl.i = shl nuw i64 %conv.i, 56
+  %1 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 9), align 1
+  %conv2.i = zext i8 %1 to i64
+  %shl3.i = shl nuw nsw i64 %conv2.i, 48
+  %or.i = or i64 %shl3.i, %shl.i
+  %2 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 10), align 1
+  %conv5.i = zext i8 %2 to i64
+  %shl6.i = shl nuw nsw i64 %conv5.i, 40
+  %or7.i = or i64 %or.i, %shl6.i
+  %3 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 11), align 1
+  %conv9.i = zext i8 %3 to i64
+  %shl10.i = shl nuw nsw i64 %conv9.i, 32
+  %or11.i = or i64 %or7.i, %shl10.i
+  %4 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 12), align 1
+  %conv13.i = zext i8 %4 to i64
+  %shl14.i = shl nuw nsw i64 %conv13.i, 24
+  %or15.i = or i64 %or11.i, %shl14.i
+  %5 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 13), align 1
+  %conv17.i = zext i8 %5 to i64
+  %shl18.i = shl nuw nsw i64 %conv17.i, 16
+  %or20.i = or i64 %or15.i, %shl18.i
+  %6 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 14), align 1
+  %conv22.i = zext i8 %6 to i64
+  %shl23.i = shl nuw nsw i64 %conv22.i, 8
+  %or25.i = or i64 %or20.i, %shl23.i
+  %7 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 15), align 1
+  %conv27.i = zext i8 %7 to i64
+  %or28.i = or i64 %or25.i, %conv27.i
+  store i64 %or28.i, i64* getelementptr inbounds (%struct.anon, %struct.anon* @d, i64 0, i32 1), align 8
+  %8 = load i8, i8* bitcast (void ()* @g to i8*), align 1
+  %conv.i2 = zext i8 %8 to i64
+  %shl.i3 = shl nuw i64 %conv.i2, 56
+  %9 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 1), align 1
+  %conv2.i4 = zext i8 %9 to i64
+  %shl3.i5 = shl nuw nsw i64 %conv2.i4, 48
+  %or.i6 = or i64 %shl3.i5, %shl.i3
+  %10 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 2), align 1
+  %conv5.i7 = zext i8 %10 to i64
+  %shl6.i8 = shl nuw nsw i64 %conv5.i7, 40
+  %or7.i9 = or i64 %or.i6, %shl6.i8
+  %11 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 3), align 1
+  %conv9.i10 = zext i8 %11 to i64
+  %shl10.i11 = shl nuw nsw i64 %conv9.i10, 32
+  %or11.i12 = or i64 %or7.i9, %shl10.i11
+  %12 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 4), align 1
+  %conv13.i13 = zext i8 %12 to i64
+  %shl14.i14 = shl nuw nsw i64 %conv13.i13, 24
+  %or15.i15 = or i64 %or11.i12, %shl14.i14
+  %13 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 5), align 1
+  %conv17.i16 = zext i8 %13 to i64
+  %shl18.i17 = shl nuw nsw i64 %conv17.i16, 16
+  %or20.i18 = or i64 %or15.i15, %shl18.i17
+  %14 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 6), align 1
+  %conv22.i19 = zext i8 %14 to i64
+  %shl23.i20 = shl nuw nsw i64 %conv22.i19, 8
+  %or25.i21 = or i64 %or20.i18, %shl23.i20
+  %15 = load i8, i8* getelementptr (i8, i8* bitcast (void ()* @g to i8*), i64 7), align 1
+  %conv27.i22 = zext i8 %15 to i64
+  %or28.i23 = or i64 %or25.i21, %conv27.i22
+  store i64 %or28.i23, i64* getelementptr inbounds (%struct.anon, %struct.anon* @d, i64 0, i32 0), align 8
+  ret void
+}
+
+attributes #0 = { nounwind }

From e72cba975735c2202b254621d79fb9dbbed08d39 Mon Sep 17 00:00:00 2001
From: Daniel Frampton <Daniel.Frampton@microsoft.com>
Date: Thu, 21 May 2020 11:03:24 +0100
Subject: [PATCH 101/770] Use configure depends to trigger reconfiguration when
 LLVMBuild files change

Summary:
The existing logic has a workaround where configure_file is used to write a single dummy file output many times.

CMake has a feature to more directly add the dependency and avoid the dummy file (it is available in the minimum version specified).

Reviewers: theraven

Reviewed By: theraven

Subscribers: theraven, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80218
---
 llvm/utils/llvm-build/llvmbuild/main.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/llvm/utils/llvm-build/llvmbuild/main.py b/llvm/utils/llvm-build/llvmbuild/main.py
index 99b82ad5e20c3..4f64c52608f9c 100644
--- a/llvm/utils/llvm-build/llvmbuild/main.py
+++ b/llvm/utils/llvm-build/llvmbuild/main.py
@@ -563,19 +563,10 @@ def write_cmake_fragment(self, output_path, enabled_optional_components):
         f.write("""
 # LLVMBuild CMake fragment dependencies.
 #
-# CMake has no builtin way to declare that the configuration depends on
-# a particular file. However, a side effect of configure_file is to add
-# said input file to CMake's internal dependency list. So, we use that
-# and a dummy output file to communicate the dependency information to
-# CMake.
-#
-# FIXME: File a CMake RFE to get a properly supported version of this
-# feature.
 """)
         for dep in dependencies:
             f.write("""\
-configure_file(\"%s\"
-               ${CMAKE_CURRENT_BINARY_DIR}/DummyConfigureOutput)\n""" % (
+set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS \"%s\")\n""" % (
                 cmake_quote_path(dep),))
 
         # Write the properties we use to encode the required library dependency

From 8bc03d2168241f7b12265e9cd7e4eb7655709f34 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 15 May 2020 18:33:01 -0400
Subject: [PATCH 102/770] GlobalISel: Merge G_PTR_MASK with llvm.ptrmask
 intrinsic

Confusingly, these were unrelated and had different semantics. The
G_PTR_MASK instruction predates the llvm.ptrmask intrinsic, but has a
different format. G_PTR_MASK only allows clearing the low bits of a
pointer, and only a constant number of bits. The ptrmask intrinsic
allows an arbitrary mask. Replace G_PTR_MASK to match the intrinsic.

Only selects the cases that look like the old instruction. More work
is needed to select the general case. Also new legalization code is
still needed to deal with the case where the incoming mask size does
not match the pointer size, which has a specified behavior in the
langref.
---
 llvm/docs/GlobalISel/GenericOpcode.rst        |   8 +-
 .../CodeGen/GlobalISel/MachineIRBuilder.h     |  14 +-
 llvm/include/llvm/Support/TargetOpcodes.def   |   5 +-
 llvm/include/llvm/Target/GenericOpcodes.td    |   8 +-
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |   2 +
 .../CodeGen/GlobalISel/MachineIRBuilder.cpp   |  19 +-
 llvm/lib/CodeGen/MachineVerifier.cpp          |  16 +
 .../AArch64/AArch64InstructionSelector.cpp    |  13 +-
 .../Target/AArch64/AArch64LegalizerInfo.cpp   |   4 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  15 +-
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   2 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   7 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   1 +
 .../AArch64/GlobalISel/legalize-vaarg.mir     |  17 +-
 .../GlobalISel/legalizer-info-validation.mir  |   4 +-
 .../CodeGen/AArch64/GlobalISel/select.mir     |   5 +-
 .../GlobalISel/inst-select-ptr-mask.mir       | 475 -----------
 .../AMDGPU/GlobalISel/inst-select-ptrmask.mir | 800 ++++++++++++++++++
 .../AMDGPU/GlobalISel/irtranslator-ptrmask.ll | 161 ++++
 .../GlobalISel/regbankselect-ptrmask.mir      |  90 ++
 llvm/test/MachineVerifier/test_g_ptrmask.mir  |  54 ++
 21 files changed, 1195 insertions(+), 525 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-mask.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrmask.mir
 create mode 100644 llvm/test/MachineVerifier/test_g_ptrmask.mir

diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index 7c418a0c05c17..6372192c0088f 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -287,14 +287,16 @@ typically bytes but this may vary between targets.
   There are currently no in-tree targets that use this with addressable units
   not equal to 8 bit.
 
-G_PTR_MASK
+G_PTRMASK
 ^^^^^^^^^^
 
-Zero the least significant N bits of a pointer.
+Zero out the bits of a pointer that are not set in an arbitrary mask. The
+mask type must be an integer, and the number of vector elements must match
+for all operands. This corresponds to :ref:`i_intr_llvm_ptrmask`.
 
 .. code-block:: none
 
-  %1:_(p0) = G_PTR_MASK %0, 3
+  %2:_(p0) = G_PTRMASK %0, %1
 
 G_SMIN, G_SMAX, G_UMIN, G_UMAX
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 19f65468791d4..0252a324de231 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -453,9 +453,15 @@ class MachineIRBuilder {
                                                   const LLT ValueTy,
                                                   uint64_t Value);
 
-  /// Build and insert \p Res = G_PTR_MASK \p Op0, \p NumBits
+  /// Build and insert \p Res = G_PTRMASK \p Op0, \p Op1
+  MachineInstrBuilder buildPtrMask(const DstOp &Res, const SrcOp &Op0,
+                                   const SrcOp &Op1) {
+    return buildInstr(TargetOpcode::G_PTRMASK, {Res}, {Op0, Op1});
+  }
+
+  /// Build and insert \p Res = G_PTRMASK \p Op0, \p G_CONSTANT ~((1 << NumBits) - 1)
   ///
-  /// G_PTR_MASK clears the low bits of a pointer operand without destroying its
+  /// This clears the low bits of a pointer operand without destroying its
   /// pointer properties. This has the effect of rounding the address *down* to
   /// a specified alignment in bits.
   ///
@@ -466,8 +472,8 @@ class MachineIRBuilder {
   ///      be cleared in \p Op0.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildPtrMask(const DstOp &Res, const SrcOp &Op0,
-                                   uint32_t NumBits);
+  MachineInstrBuilder buildMaskLowPtrBits(const DstOp &Res, const SrcOp &Op0,
+                                          uint32_t NumBits);
 
   /// Build and insert \p Res, \p CarryOut = G_UADDO \p Op0, \p Op1
   ///
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 8385af9de2a41..eae831f3353b0 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -556,9 +556,8 @@ HANDLE_TARGET_OPCODE(G_FMAXIMUM)
 /// Generic pointer offset
 HANDLE_TARGET_OPCODE(G_PTR_ADD)
 
-/// Clear the specified number of low bits in a pointer. This rounds the value
-/// *down* to the given alignment.
-HANDLE_TARGET_OPCODE(G_PTR_MASK)
+/// Clear the bits of a pointer that are not set in the mask operand.
+HANDLE_TARGET_OPCODE(G_PTRMASK)
 
 /// Generic signed integer minimum.
 HANDLE_TARGET_OPCODE(G_SMIN)
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 441f1580dd254..5ba7844c8c9ee 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -358,9 +358,11 @@ def G_PTR_ADD : GenericInstruction {
   let hasSideEffects = 0;
 }
 
-def G_PTR_MASK : GenericInstruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src, unknown:$bits);
+// Generic pointer mask. type1 should be an integer with the same
+// bitwidth as the pointer type.
+def G_PTRMASK : GenericInstruction {
+  let OutOperandList = (outs ptype0:$dst);
+  let InOperandList = (ins ptype0:$src, type1:$bits);
   let hasSideEffects = 0;
 }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index eba352aedb071..df965e466698d 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1281,6 +1281,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
       return TargetOpcode::G_INTRINSIC_TRUNC;
     case Intrinsic::readcyclecounter:
       return TargetOpcode::G_READCYCLECOUNTER;
+    case Intrinsic::ptrmask:
+      return TargetOpcode::G_PTRMASK;
   }
   return Intrinsic::not_intrinsic;
 }
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 7af8332bbce34..510572e6d4121 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -237,17 +237,17 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
   return buildPtrAdd(Res, Op0, Cst.getReg(0));
 }
 
-MachineInstrBuilder MachineIRBuilder::buildPtrMask(const DstOp &Res,
-                                                   const SrcOp &Op0,
-                                                   uint32_t NumBits) {
-  assert(Res.getLLTTy(*getMRI()).isPointer() &&
-         Res.getLLTTy(*getMRI()) == Op0.getLLTTy(*getMRI()) && "type mismatch");
-
-  auto MIB = buildInstr(TargetOpcode::G_PTR_MASK);
-  Res.addDefToMIB(*getMRI(), MIB);
-  Op0.addSrcToMIB(MIB);
-  MIB.addImm(NumBits);
-  return MIB;
+// Clear the low NumBits bits of the pointer Op0. G_PTRMASK keeps only the
+// pointer bits that are set in its mask operand, so the mask needs the low
+// NumBits bits clear and every higher bit set (e.g. -16 for NumBits == 4).
+MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res,
+                                                          const SrcOp &Op0,
+                                                          uint32_t NumBits) {
+  LLT PtrTy = Res.getLLTTy(*getMRI());
+  LLT MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+  Register MaskReg = getMRI()->createGenericVirtualRegister(MaskTy);
+  buildConstant(MaskReg, maskTrailingZeros<uint64_t>(NumBits));
+  return buildPtrMask(Res, Op0, MaskReg);
 }
 
 MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index f626c12916077..f07856d799c9f 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1102,6 +1102,22 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
     // TODO: Is the offset allowed to be a scalar with a vector?
     break;
   }
+  case TargetOpcode::G_PTRMASK: {
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+    LLT MaskTy = MRI->getType(MI->getOperand(2).getReg());
+    if (!DstTy.isValid() || !SrcTy.isValid() || !MaskTy.isValid())
+      break;
+
+    if (!DstTy.getScalarType().isPointer())
+      report("ptrmask result type must be a pointer", MI);
+
+    if (!MaskTy.getScalarType().isScalar())
+      report("ptrmask mask type must be an integer", MI);
+
+    verifyVectorElementMatch(DstTy, MaskTy, MI);
+    break;
+  }
   case TargetOpcode::G_SEXT:
   case TargetOpcode::G_ZEXT:
   case TargetOpcode::G_ANYEXT:
diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 5789d1d2531c7..57eaf140a6380 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -2383,14 +2383,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     return true;
   }
 
-  case TargetOpcode::G_PTR_MASK: {
-    uint64_t Align = I.getOperand(2).getImm();
-    if (Align >= 64 || Align == 0)
+  case TargetOpcode::G_PTRMASK: {
+    Register MaskReg = I.getOperand(2).getReg();
+    Optional<int64_t> MaskVal = getConstantVRegVal(MaskReg, MRI);
+    // TODO: Implement arbitrary cases
+    if (!MaskVal || !isShiftedMask_64(*MaskVal))
       return false;
 
-    uint64_t Mask = ~((1ULL << Align) - 1);
+    uint64_t Mask = *MaskVal;
     I.setDesc(TII.get(AArch64::ANDXri));
-    I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+    I.getOperand(2).ChangeToImmediate(
+        AArch64_AM::encodeLogicalImmediate(Mask, 64));
 
     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
   }
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 4378f5dbd8f93..3caa9026c9d8e 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -108,7 +108,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .legalFor({{p0, s64}})
       .clampScalar(1, s64, s64);
 
-  getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0});
+  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
 
   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
       .legalFor({s32, s64})
@@ -744,7 +744,7 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
     auto AlignMinus1 =
         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
-    DstPtr = MIRBuilder.buildPtrMask(PtrTy, ListTmp, Log2(Alignment));
+    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
   } else
     DstPtr = List;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 242a108f156c7..e47f25f7828d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2231,9 +2231,14 @@ bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
 }
 
-bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
-  uint64_t Align = I.getOperand(2).getImm();
-  const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);
+bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
+  Register MaskReg = I.getOperand(2).getReg();
+  Optional<int64_t> MaskVal = getConstantVRegVal(MaskReg, *MRI);
+  // TODO: Implement arbitrary cases
+  if (!MaskVal || !isShiftedMask_64(*MaskVal))
+    return false;
+
+  const uint64_t Mask = *MaskVal;
 
   MachineBasicBlock *BB = I.getParent();
 
@@ -2731,8 +2736,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_FRAME_INDEX:
   case TargetOpcode::G_GLOBAL_VALUE:
     return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
-  case TargetOpcode::G_PTR_MASK:
-    return selectG_PTR_MASK(I);
+  case TargetOpcode::G_PTRMASK:
+    return selectG_PTRMASK(I);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return selectG_EXTRACT_VECTOR_ELT(I);
   case TargetOpcode::G_INSERT_VECTOR_ELT:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 3e3a3d8326c22..0ac6788c69b8a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -130,7 +130,7 @@ class AMDGPUInstructionSelector : public InstructionSelector {
   bool selectG_SELECT(MachineInstr &I) const;
   bool selectG_BRCOND(MachineInstr &I) const;
   bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const;
-  bool selectG_PTR_MASK(MachineInstr &I) const;
+  bool selectG_PTRMASK(MachineInstr &I) const;
   bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
   bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
   bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 63106df9e2a22..c24996b93fa06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -560,7 +560,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0);
   }
 
-  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
+  getActionDefinitionsBuilder(G_PTR_ADD)
+    .scalarize(0)
+    .alwaysLegal();
+
+  // TODO: Clamp mask to pointer sizes
+  getActionDefinitionsBuilder(G_PTRMASK)
     .scalarize(0)
     .alwaysLegal();
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1cfc7ccc6cb7b..f11563a66d410 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3228,6 +3228,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     LLVM_FALLTHROUGH;
   }
   case AMDGPU::G_PTR_ADD:
+  case AMDGPU::G_PTRMASK:
   case AMDGPU::G_ADD:
   case AMDGPU::G_SUB:
   case AMDGPU::G_MUL:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
index 7446fde7ba08d..a0cc566771189 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
@@ -15,17 +15,18 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
     ; CHECK: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load 8)
     ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD]], [[C]](s64)
-    ; CHECK: G_STORE [[GEP]](p0), [[COPY]](p0) :: (store 8)
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD]], [[C]](s64)
+    ; CHECK: G_STORE [[PTR_ADD]](p0), [[COPY]](p0) :: (store 8)
     ; CHECK: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load 8)
-    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD1]], [[C]](s64)
-    ; CHECK: G_STORE [[GEP1]](p0), [[COPY]](p0) :: (store 8)
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD1]], [[C]](s64)
+    ; CHECK: G_STORE [[PTR_ADD1]](p0), [[COPY]](p0) :: (store 8)
     ; CHECK: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load 8)
     ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-    ; CHECK: [[GEP2:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD2]], [[C1]](s64)
-    ; CHECK: [[PTR_MASK:%[0-9]+]]:_(p0) = G_PTR_MASK [[GEP2]], 4
-    ; CHECK: [[GEP3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_MASK]], [[C]](s64)
-    ; CHECK: G_STORE [[GEP3]](p0), [[COPY]](p0) :: (store 8)
+    ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD2]], [[C1]](s64)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[PTR_ADD2]], [[C2]](s64)
+    ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTRMASK]], [[C]](s64)
+    ; CHECK: G_STORE [[PTR_ADD3]](p0), [[COPY]](p0) :: (store 8)
     %0:_(p0) = COPY $x0
 
     %1:_(s8) = G_VAARG %0(p0), 1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 1b63f672aabec..4a68d09c545a0 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -441,8 +441,8 @@
 # DEBUG-NEXT: G_PTR_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 2, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
-# DEBUG-NEXT: G_PTR_MASK (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: G_PTRMASK (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG: G_SMIN (opcode {{[0-9]+}}): 1 type index
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select.mir
index 2e38f1ce62e98..ca4091180b1a6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select.mir
@@ -125,12 +125,13 @@ legalized:       true
 regBankSelected: true
 
 # CHECK:  body:
-# CHECK: %1:gpr64sp = ANDXri %0, 8060
+# CHECK: %2:gpr64sp = ANDXri %0, 8060
 body:             |
   bb.0:
       liveins: $x0
     %0:gpr(p0) = COPY $x0
-    %1:gpr(p0) = G_PTR_MASK %0, 3
+    %const:gpr(s64) = G_CONSTANT i64 -8
+    %1:gpr(p0) = G_PTRMASK %0, %const
     $x0 = COPY %1(p0)
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-mask.mir
deleted file mode 100644
index fcc9565ce9ee4..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-mask.mir
+++ /dev/null
@@ -1,475 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck %s
-
----
-name:  ptr_mask_p3_sgpr_sgpr_1
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_sgpr_sgpr_1
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
-    %0:sgpr(p3) = COPY $sgpr0
-    %1:sgpr(p3) = G_PTR_MASK %0, 1
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_sgpr_sgpr_2
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_sgpr_sgpr_2
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
-    %0:sgpr(p3) = COPY $sgpr0
-    %1:sgpr(p3) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_sgpr_sgpr_3
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_sgpr_sgpr_3
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
-    %0:sgpr(p3) = COPY $sgpr0
-    %1:sgpr(p3) = G_PTR_MASK %0, 3
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_sgpr_sgpr_4
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_sgpr_sgpr_4
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
-    %0:sgpr(p3) = COPY $sgpr0
-    %1:sgpr(p3) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_sgpr_sgpr_29
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_sgpr_sgpr_29
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
-    %0:sgpr(p3) = COPY $sgpr0
-    %1:sgpr(p3) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_sgpr_sgpr_1
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_sgpr_sgpr_1
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2
-    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:sgpr(p0) = COPY $sgpr0_sgpr1
-    %1:sgpr(p0) = G_PTR_MASK %0, 1
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_sgpr_sgpr_2
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_sgpr_sgpr_2
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
-    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:sgpr(p0) = COPY $sgpr0_sgpr1
-    %1:sgpr(p0) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_sgpr_sgpr_3
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_sgpr_sgpr_3
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8
-    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:sgpr(p0) = COPY $sgpr0_sgpr1
-    %1:sgpr(p0) = G_PTR_MASK %0, 3
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_sgpr_sgpr_4
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_sgpr_sgpr_4
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:sgpr(p0) = COPY $sgpr0_sgpr1
-    %1:sgpr(p0) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_sgpr_sgpr_29
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_sgpr_sgpr_29
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
-    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
-    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:sgpr(p0) = COPY $sgpr0_sgpr1
-    %1:sgpr(p0) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_vgpr_vgpr_1
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_vgpr_vgpr_1
-    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
-    %0:vgpr(p3) = COPY $vgpr0
-    %1:vgpr(p3) = G_PTR_MASK %0, 1
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_vgpr_vgpr_2
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_vgpr_vgpr_2
-    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
-    %0:vgpr(p3) = COPY $vgpr0
-    %1:vgpr(p3) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_vgpr_vgpr_3
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_vgpr_vgpr_3
-    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
-    %0:vgpr(p3) = COPY $vgpr0
-    %1:vgpr(p3) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_vgpr_vgpr_4
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_vgpr_vgpr_4
-    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
-    %0:vgpr(p3) = COPY $vgpr0
-    %1:vgpr(p3) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_vgpr_vgpr_29
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_vgpr_vgpr_29
-    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
-    %0:vgpr(p3) = COPY $vgpr0
-    %1:vgpr(p3) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_vgpr_vgpr_1
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_vgpr_vgpr_1
-    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec
-    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:vgpr(p0) = COPY $vgpr0_vgpr1
-    %1:vgpr(p0) = G_PTR_MASK %0, 1
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_vgpr_vgpr_2
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_vgpr_vgpr_2
-    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
-    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:vgpr(p0) = COPY $vgpr0_vgpr1
-    %1:vgpr(p0) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_vgpr_vgpr_3
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_vgpr_vgpr_3
-    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
-    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:vgpr(p0) = COPY $vgpr0_vgpr1
-    %1:vgpr(p0) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_vgpr_vgpr_4
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_vgpr_vgpr_4
-    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:vgpr(p0) = COPY $vgpr0_vgpr1
-    %1:vgpr(p0) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_vgpr_vgpr_29
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $vgpr0_vgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_vgpr_vgpr_29
-    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
-    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:vgpr(p0) = COPY $vgpr0_vgpr1
-    %1:vgpr(p0) = G_PTR_MASK %0, 4
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p3_vgpr_sgpr_2
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0
-
-    ; CHECK-LABEL: name: ptr_mask_p3_vgpr_sgpr_2
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
-    %0:sgpr(p3) = COPY $sgpr0
-    %1:vgpr(p3) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
-
----
-name:  ptr_mask_p0_vgpr_sgpr_2
-legalized:       true
-regBankSelected: true
-
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1
-
-    ; CHECK-LABEL: name: ptr_mask_p0_vgpr_sgpr_2
-    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
-    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
-    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
-    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
-    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
-    %0:sgpr(p0) = COPY $sgpr0_sgpr1
-    %1:vgpr(p0) = G_PTR_MASK %0, 2
-    S_ENDPGM 0, implicit %1
-
-...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
new file mode 100644
index 0000000000000..1f8325018af2a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
@@ -0,0 +1,800 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s  | FileCheck %s
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_sgpr
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_sgpr
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[PTRMASK:%[0-9]+]]:sgpr(p3) = G_PTRMASK [[COPY]], [[COPY1]](s32)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p3)
+    %0:sgpr(p3) = COPY $sgpr0
+    %1:sgpr(s32) = COPY $sgpr1
+    %2:sgpr(p3) = G_PTRMASK %0, %1
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_0xf0f0f0f0
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_0xf0f0f0f0
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+    ; CHECK: %const:sgpr(s32) = G_CONSTANT i32 -252645136
+    ; CHECK: [[PTRMASK:%[0-9]+]]:sgpr(p3) = G_PTRMASK [[COPY]], %const(s32)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p3)
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32 -252645136
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_clearhi1
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearhi1
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32  -2147483648
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_clearhi2
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearhi2
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1073741824
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32  -1073741824
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_clearlo1
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo1
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32 -2
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_clearlo2
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo2
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32 -4
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_clearlo3
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo3
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32 -8
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_clearlo4
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo4
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32 -16
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_s32_sgpr_sgpr_clearlo29
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo29
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -536870912
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32 -536870912
+    %1:sgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_sgpr
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_sgpr
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+    ; CHECK: [[PTRMASK:%[0-9]+]]:sgpr(p0) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p0)
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %1:sgpr(s64) = COPY $sgpr2_sgpr3
+    %2:sgpr(p0) = G_PTRMASK %0, %1
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_sgpr_0xf0f0f0f0f0f0f0f0
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_sgpr_0xf0f0f0f0f0f0f0f0
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 -1085102592571150096
+    ; CHECK: [[PTRMASK:%[0-9]+]]:sgpr(p0) = G_PTRMASK [[COPY]], [[C]](s64)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p0)
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %1:sgpr(s64) = G_CONSTANT i64 -1085102592571150096
+    %2:sgpr(p0) = G_PTRMASK %0, %1
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name:  ptrmask_p0_s32_sgpr_sgpr_sgpr
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2
+
+    ; CHECK-LABEL: name: ptrmask_p0_s32_sgpr_sgpr_sgpr
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
+    ; CHECK: [[PTRMASK:%[0-9]+]]:sgpr(p0) = G_PTRMASK [[COPY]], [[COPY1]](s32)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p0)
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %1:sgpr(s32) = COPY $sgpr2
+    %2:sgpr(p0) = G_PTRMASK %0, %1
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clearhi1
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clearhi1
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -9223372036854775808
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 -9223372036854775808
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clearhi32
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clearhi32
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4294967296
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 -4294967296
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clear_32
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clear_32
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967296
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 4294967296
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clearlo1
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clearlo1
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 -2
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clearlo2
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clearlo2
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 -4
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clearlo3
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clearlo3
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 -8
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clearlo4
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clearlo4
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 -16
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_sgpr_sgpr_clearlo29
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_sgpr_sgpr_clearlo29
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -536870912
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_AND_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s64) = G_CONSTANT i64 -536870912
+    %1:sgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_vgpr_vgpr_0xf0f0f0f0
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_0xf0f0f0f0
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; CHECK: %const:vgpr(s32) = G_CONSTANT i32 -252645136
+    ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p3) = G_PTRMASK [[COPY]], %const(s32)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p3)
+    %0:vgpr(p3) = COPY $vgpr0
+    %const:vgpr(s32) = G_CONSTANT i32 -252645136
+    %1:vgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_vgpr_vgpr_clearlo1
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo1
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %const:vgpr(s32) = G_CONSTANT i32 -2
+    %1:vgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_vgpr_vgpr_clearlo2
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo2
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %const:vgpr(s32) = G_CONSTANT i32 -4
+    %1:vgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_vgpr_vgpr_clearlo3
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo3
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %const:vgpr(s32) = G_CONSTANT i32 -8
+    %1:vgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_vgpr_vgpr_clearlo4
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo4
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %const:vgpr(s32) = G_CONSTANT i32 -16
+    %1:vgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_vgpr_vgpr_clearlo29
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo29
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -536870912, implicit $exec
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %const:vgpr(s32) = G_CONSTANT i32 -536870912
+    %1:vgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_vgpr_vgpr
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_vgpr_vgpr
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p0) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p0)
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = COPY $vgpr2_vgpr3
+    %2:vgpr(p0) = G_PTRMASK %0, %1
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_vgpr_vgpr_0xf0f0f0f0f0f0f0f0
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_vgpr_vgpr_0xf0f0f0f0f0f0f0f0
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 -1085102592571150096
+    ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p0) = G_PTRMASK [[COPY]], [[C]](s64)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p0)
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %1:vgpr(s64) = G_CONSTANT i64 -1085102592571150096
+    %2:vgpr(p0) = G_PTRMASK %0, %1
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name:  ptrmask_p0_s32_vgpr_vgpr_vgpr
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ptrmask_p0_s32_vgpr_vgpr_vgpr
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+    ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p0) = G_PTRMASK [[COPY]], [[COPY1]](s32)
+    ; CHECK: S_ENDPGM 0, implicit [[PTRMASK]](p0)
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %1:vgpr(s32) = COPY $vgpr2
+    %2:vgpr(p0) = G_PTRMASK %0, %1
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_vgpr_clearlo1
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_vgpr_clearlo1
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %const:vgpr(s64) = G_CONSTANT i64 -2
+    %1:vgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_vgpr_clearlo2
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_vgpr_clearlo2
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %const:vgpr(s64) = G_CONSTANT i64 -4
+    %1:vgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_vgpr_clearlo3
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_vgpr_clearlo3
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %const:vgpr(s64) = G_CONSTANT i64 -8 ; -8 == ~7, i.e. a mask that clears the low 3 bits
+    %1:vgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_vgpr_clearlo4
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_vgpr_clearlo4
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %const:vgpr(s64) = G_CONSTANT i64 -16
+    %1:vgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_vgpr_clearlo29
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_vgpr_clearlo29
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -536870912, implicit $exec
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:vgpr(p0) = COPY $vgpr0_vgpr1
+    %const:vgpr(s64) = G_CONSTANT i64 -536870912
+    %1:vgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p3_vgpr_sgpr_clearlo2
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: ptrmask_p3_vgpr_sgpr_clearlo2
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+    %0:sgpr(p3) = COPY $sgpr0
+    %const:sgpr(s32) = G_CONSTANT i32 -4
+    %1:vgpr(p3) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+name:  ptrmask_p0_s64_vgpr_sgpr_clearlo2
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64_vgpr_sgpr_clearlo2
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+    %0:sgpr(p0) = COPY $sgpr0_sgpr1
+    %const:sgpr(s32) = G_CONSTANT i32 -4
+    %1:vgpr(p0) = G_PTRMASK %0, %const
+    S_ENDPGM 0, implicit %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
new file mode 100644
index 0000000000000..cc1c75e404e05
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stop-after=irtranslator < %s | FileCheck %s
+
+define i8* @ptrmask_flat_i64(i8* %ptr, i64 %mask) {
+  ; CHECK-LABEL: name: ptrmask_flat_i64
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[MV1]](s64)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK:   S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %ptr, i64 %mask)
+  ret i8* %masked
+}
+
+define i8* @ptrmask_flat_i32(i8* %ptr, i32 %mask) {
+  ; CHECK-LABEL: name: ptrmask_flat_i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[COPY2]](s32)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0, implicit $vgpr1
+  %masked = call i8* @llvm.ptrmask.p0i8.i32(i8* %ptr, i32 %mask)
+  ret i8* %masked
+}
+
+define i8* @ptrmask_flat_i16(i8* %ptr, i16 %mask) {
+  ; CHECK-LABEL: name: ptrmask_flat_i16
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[TRUNC]](s16)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0, implicit $vgpr1
+  %masked = call i8* @llvm.ptrmask.p0i8.i16(i8* %ptr, i16 %mask)
+  ret i8* %masked
+}
+
+define i8* @ptrmask_flat_i1(i8* %ptr, i1 %mask) {
+  ; CHECK-LABEL: name: ptrmask_flat_i1
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[TRUNC]](s1)
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0, implicit $vgpr1
+  %masked = call i8* @llvm.ptrmask.p0i8.i1(i8* %ptr, i1 %mask)
+  ret i8* %masked
+}
+
+define i8 addrspace(3)* @ptrmask_local_i64(i8 addrspace(3)* %ptr, i64 %mask) {
+  ; CHECK-LABEL: name: ptrmask_local_i64
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[MV]](s64)
+  ; CHECK:   $vgpr0 = COPY [[PTRMASK]](p3)
+  ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
+  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask)
+  ret i8 addrspace(3)* %masked
+}
+
+define i8 addrspace(3)* @ptrmask_local_i32(i8 addrspace(3)* %ptr, i32 %mask) {
+  ; CHECK-LABEL: name: ptrmask_local_i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[COPY1]](s32)
+  ; CHECK:   $vgpr0 = COPY [[PTRMASK]](p3)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
+  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask)
+  ret i8 addrspace(3)* %masked
+}
+
+define i8 addrspace(3)* @ptrmask_local_i16(i8 addrspace(3)* %ptr, i16 %mask) {
+  ; CHECK-LABEL: name: ptrmask_local_i16
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[TRUNC]](s16)
+  ; CHECK:   $vgpr0 = COPY [[PTRMASK]](p3)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
+  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask)
+  ret i8 addrspace(3)* %masked
+}
+
+define i8 addrspace(3)* @ptrmask_local_i1(i8 addrspace(3)* %ptr, i1 %mask) {
+  ; CHECK-LABEL: name: ptrmask_local_i1
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[TRUNC]](s1)
+  ; CHECK:   $vgpr0 = COPY [[PTRMASK]](p3)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
+  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i1(i8 addrspace(3)* %ptr, i1 %mask)
+  ret i8 addrspace(3)* %masked
+}
+
+; FIXME: The vector form of llvm.ptrmask does not seem to work yet.
+; define <2 x i8*> @ptrmask_flat_i64_v2(<2 x i8*> %ptr, <2 x i64> %mask) {
+;   %masked = call <2 x i8*> @llvm.ptrmask.v2p0i8.v2i64(<2 x i8*> %ptr, <2 x i64> %mask)
+;   ret <2 x i8*> %masked
+; }
+
+declare i8* @llvm.ptrmask.p0i8.i64(i8*, i64)
+declare i8* @llvm.ptrmask.p0i8.i32(i8*, i32)
+declare i8* @llvm.ptrmask.p0i8.i16(i8*, i16)
+declare i8* @llvm.ptrmask.p0i8.i1(i8*, i1)
+declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)*, i64)
+declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)*, i32)
+declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)*, i16)
+declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i1(i8 addrspace(3)*, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrmask.mir
new file mode 100644
index 0000000000000..0449b162968f9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrmask.mir
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+---
+name: ptrmask_p1_s_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p1_s_k
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+    ; CHECK: [[PTRMASK:%[0-9]+]]:sgpr(p1) = G_PTRMASK [[COPY]], [[C]](s64)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(p1) = G_PTRMASK %0, %1
+...
+
+---
+name: ptrmask_p1_s_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+    ; CHECK-LABEL: name: ptrmask_p1_s_s
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+    ; CHECK: [[PTRMASK:%[0-9]+]]:sgpr(p1) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s64) = COPY $sgpr2_sgpr3
+    %2:_(p1) = G_PTRMASK %0, %1
+...
+
+---
+name: ptrmask_p1_v_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p1_v_k
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
+    ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p1) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(p1) = G_PTRMASK %0, %1
+...
+
+---
+name: ptrmask_p1_v_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p1_v_s
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+    ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p1) = G_PTRMASK [[COPY]], [[COPY2]](s64)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $sgpr0_sgpr1
+    %2:_(p1) = G_PTRMASK %0, %1
+...
+
+---
+name: ptrmask_p1_v_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: ptrmask_p1_v_v
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p1) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(p1) = G_PTRMASK %0, %1
+...
diff --git a/llvm/test/MachineVerifier/test_g_ptrmask.mir b/llvm/test/MachineVerifier/test_g_ptrmask.mir
new file mode 100644
index 0000000000000..6edf1b1d894aa
--- /dev/null
+++ b/llvm/test/MachineVerifier/test_g_ptrmask.mir
@@ -0,0 +1,54 @@
+# REQUIRES: aarch64-registered-target
+# RUN: not --crash llc -o - -march=arm64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+
+---
+name:            test_ptr_mask
+tracksRegLiveness: true
+liveins:
+body:             |
+  bb.0:
+
+    %0:_(p0) = G_IMPLICIT_DEF
+    %1:_(p0) = G_IMPLICIT_DEF
+    %2:_(s64) = G_IMPLICIT_DEF
+
+    ; CHECK:  Bad machine code: Type mismatch in generic instruction
+    ; CHECK: Bad machine code: ptrmask result type must be a pointer
+    %3:_(s64) = G_PTRMASK %0, %2
+
+    ; CHECK:  Bad machine code: Type mismatch in generic instruction
+    %4:_(p0) = G_PTRMASK %2, %2
+
+    ; CHECK: Bad machine code: ptrmask mask type must be an integer
+    %5:_(p0) = G_PTRMASK %0, %0
+
+    %6:_(<2 x p0>) = G_IMPLICIT_DEF
+    %7:_(<2 x s64>) = G_IMPLICIT_DEF
+
+    ; CHECK: Bad machine code: Type mismatch in generic instruction
+    ; CHECK: Bad machine code: ptrmask result type must be a pointer
+    %8:_(<2 x s64>) = G_PTRMASK %6, %7
+
+    ; CHECK:  Bad machine code: Type mismatch in generic instruction
+    %9:_(<2 x p0>) = G_PTRMASK %7, %7
+
+    ; CHECK: Bad machine code: Type mismatch in generic instruction
+    ; CHECK: Bad machine code: ptrmask mask type must be an integer
+    ; CHECK: Bad machine code: operand types must be all-vector or all-scalar
+    %10:_(<2 x p0>) = G_PTRMASK %0, %0
+
+    ; CHECK: Bad machine code: Type mismatch in generic instruction
+    %11:_(p0) = G_PTRMASK %6, %2
+
+    ; CHECK: Bad machine code: operand types must be all-vector or all-scalar
+    %12:_(p0) = G_PTRMASK %0, %7
+
+    ; CHECK: Bad machine code: operand types must be all-vector or all-scalar
+    %13:_(<2 x p0>) = G_PTRMASK %6, %2
+
+    %14:_(<4 x p0>) = G_IMPLICIT_DEF
+
+    ; CHECK: Bad machine code: operand types must preserve number of vector elements
+    %15:_(<4 x p0>) = G_PTRMASK %14, %8
+
+...

From b59b3640bcbdfc6cf4b35ff3a6ad5f524a073b45 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Fri, 22 May 2020 17:33:03 -0700
Subject: [PATCH 103/770] Debug Info: Mark os_log helper functions as
 artificial

The os_log helper functions are linkonce_odr and supposed to be
uniqued across TUs, so attaching a DW_AT_decl_line to them is highly
misleading. By setting the function decl to implicit, CGDebugInfo
properly marks the functions as artificial and uses a default file /
line 0 location for the function.

rdar://problem/63450824

Differential Revision: https://reviews.llvm.org/D80463
---
 clang/lib/CodeGen/CGBuiltin.cpp       |  2 ++
 clang/test/CodeGen/debug-info-oslog.c | 15 +++++++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 clang/test/CodeGen/debug-info-oslog.c

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ddd9a68a8edb7..bef0ad27145f3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -1271,6 +1271,8 @@ llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction(
   FunctionDecl *FD = FunctionDecl::Create(
       Ctx, Ctx.getTranslationUnitDecl(), SourceLocation(), SourceLocation(), II,
       FuncionTy, nullptr, SC_PrivateExtern, false, false);
+  // Avoid generating debug location info for the function.
+  FD->setImplicit();
 
   StartFunction(FD, ReturnTy, Fn, FI, Args);
 
diff --git a/clang/test/CodeGen/debug-info-oslog.c b/clang/test/CodeGen/debug-info-oslog.c
new file mode 100644
index 0000000000000..c32c79eb8a6f4
--- /dev/null
+++ b/clang/test/CodeGen/debug-info-oslog.c
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -triple x86_64-darwin-apple -debug-info-kind=limited \
+// RUN:   %s -emit-llvm -o -  | FileCheck %s
+void test_builtin_os_log(void *buf, int i, const char *data) {
+  __builtin_os_log_format(buf, "%d", i);
+}
+
+// CHECK: define linkonce_odr {{.*}}@__os_log_helper_1_0_1_4_0(
+// CHECK-SAME:   !dbg ![[OS_LOG_HELPER:[0-9]+]]
+
+// This helper is going to be uniqued, so it should not have a line
+// number between file and type.
+
+// CHECK: distinct !DISubprogram(name: "__os_log_helper_1_0_1_4_0",
+// CHECK-SAME:                   file: !{{[0-9]+}}, type
+// CHECK-SAME:                   flags: DIFlagArtificial

From 6b7d51ad4a16579b0a7d41c77715be4d9e266d8c Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Tue, 26 May 2020 08:53:02 -0700
Subject: [PATCH 104/770] Add missing forward decl to unbreak the modular build

---
 clang/include/clang/Index/IndexingOptions.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/include/clang/Index/IndexingOptions.h b/clang/include/clang/Index/IndexingOptions.h
index 2dd276998abf7..9f5c03d1b3b94 100644
--- a/clang/include/clang/Index/IndexingOptions.h
+++ b/clang/include/clang/Index/IndexingOptions.h
@@ -14,6 +14,7 @@
 #include <string>
 
 namespace clang {
+class Decl;
 namespace index {
 
 struct IndexingOptions {

From 50d4b22ca0dd8f25a2ab2cb53a04627b2504ecfe Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 20 May 2020 13:26:10 -0400
Subject: [PATCH 105/770] AMDGPU/GlobalISel: Fix assert on 16-bit G_EXTRACT
 results

I consider this to be a hack, since we probably should not mark any
16-bit extract as legal, and require all extracts to be done on
multiples of 32. There are quite a few more battles to fight in the
legalizer for sub-dword vectors, so just select this for now so we can
pass OpenCL conformance without crashing.

Also fix the same assert for G_INSERTs. Unlike G_EXTRACT there's not a
trivial way to select this so just fail on it.
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 11 +++-
 .../AMDGPU/GlobalISel/inst-select-extract.mir | 57 +++++++++++++++++++
 .../GlobalISel/inst-select-insert.xfail.mir   | 19 +++++++
 3 files changed, 85 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e47f25f7828d5..aee6c0dd8a8e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -499,13 +499,18 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
   LLT DstTy = MRI->getType(DstReg);
   LLT SrcTy = MRI->getType(SrcReg);
   const unsigned SrcSize = SrcTy.getSizeInBits();
-  const unsigned DstSize = DstTy.getSizeInBits();
+  unsigned DstSize = DstTy.getSizeInBits();
 
   // TODO: Should handle any multiple of 32 offset.
   unsigned Offset = I.getOperand(2).getImm();
   if (Offset % 32 != 0 || DstSize > 128)
     return false;
 
+  // 16-bit operations really use 32-bit registers.
+  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
+  if (DstSize == 16)
+    DstSize = 32;
+
   const TargetRegisterClass *DstRC =
     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
@@ -728,7 +733,9 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
   unsigned InsSize = Src1Ty.getSizeInBits();
 
   int64_t Offset = I.getOperand(3).getImm();
-  if (Offset % 32 != 0)
+
+  // FIXME: These cases should have been illegal and unnecessary to check here.
+  if (Offset % 32 != 0 || InsSize % 32 != 0)
     return false;
 
   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir
index 795ebc6a1a9a6..df16e9c1f0917 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir
@@ -255,3 +255,60 @@ body: |
     S_ENDPGM 0, implicit %1
 
 ...
+
+# FIXME: Probably should not be legal
+---
+name:            extract_sgpr_s16_from_v4s16_offset0
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: extract_sgpr_s16_from_v4s16_offset0
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; CHECK: S_ENDPGM 0, implicit [[COPY1]]
+    %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1
+    %1:sgpr(s16) = G_EXTRACT %0, 0
+    S_ENDPGM 0, implicit %1
+
+...
+
+# FIXME: Probably should not be legal
+---
+name:            extract_sgpr_s16_from_v4s16_offset32
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: extract_sgpr_s16_from_v4s16_offset32
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: S_ENDPGM 0, implicit [[COPY1]]
+    %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1
+    %1:sgpr(s16) = G_EXTRACT %0, 32
+    S_ENDPGM 0, implicit %1
+
+...
+
+# FIXME: Probably should not be legal
+---
+name:            extract_sgpr_s16_from_v6s16_offset32
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2
+    ; CHECK-LABEL: name: extract_sgpr_s16_from_v6s16_offset32
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr_96 = COPY $sgpr0_sgpr1_sgpr2
+    ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; CHECK: S_ENDPGM 0, implicit [[COPY1]]
+    %0:sgpr(<6 x s16>) = COPY $sgpr0_sgpr1_sgpr2
+    %1:sgpr(s16) = G_EXTRACT %0, 32
+    S_ENDPGM 0, implicit %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir
new file mode 100644
index 0000000000000..5e58e8b633ec4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir
@@ -0,0 +1,19 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+# FIXME: This should not be legal and this test should be deleted
+# ERR: remark: <unknown>:0:0: cannot select: %3:sgpr(<4 x s16>) = G_INSERT %0:sgpr, %2:sgpr(s16), 0 (in function: insert_sgpr_2s16_to_v4s16_offset0)
+---
+name:            insert_sgpr_2s16_to_v4s16_offset0
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2
+    %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1
+    %1:sgpr(s32) = COPY $sgpr2
+    %2:sgpr(s16) = G_TRUNC %1
+    %3:sgpr(<4 x s16>) = G_INSERT %0, %2, 0
+    S_ENDPGM 0, implicit %3
+
+...

From 5bd97eb28aff252a3a9e8b0ef00d563b557f5580 Mon Sep 17 00:00:00 2001
From: Sanne Wouda <Sanne.Wouda@arm.com>
Date: Tue, 26 May 2020 17:11:32 +0100
Subject: [PATCH 106/770] Fix MemoryLocation.h use without Instructions.h

MemoryLocation.h was changed to only include Instruction.h.  However,
cast<> still needs the full definition, so move MemoryLocation::getOrNone
to the cpp file.
---
 llvm/include/llvm/Analysis/MemoryLocation.h | 17 +----------------
 llvm/lib/Analysis/MemoryLocation.cpp        | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Analysis/MemoryLocation.h b/llvm/include/llvm/Analysis/MemoryLocation.h
index ce70df66ab7a8..f7bb15d256fde 100644
--- a/llvm/include/llvm/Analysis/MemoryLocation.h
+++ b/llvm/include/llvm/Analysis/MemoryLocation.h
@@ -221,22 +221,7 @@ class MemoryLocation {
   static MemoryLocation get(const Instruction *Inst) {
     return *MemoryLocation::getOrNone(Inst);
   }
-  static Optional<MemoryLocation> getOrNone(const Instruction *Inst) {
-    switch (Inst->getOpcode()) {
-    case Instruction::Load:
-      return get(cast<LoadInst>(Inst));
-    case Instruction::Store:
-      return get(cast<StoreInst>(Inst));
-    case Instruction::VAArg:
-      return get(cast<VAArgInst>(Inst));
-    case Instruction::AtomicCmpXchg:
-      return get(cast<AtomicCmpXchgInst>(Inst));
-    case Instruction::AtomicRMW:
-      return get(cast<AtomicRMWInst>(Inst));
-    default:
-      return None;
-    }
-  }
+  static Optional<MemoryLocation> getOrNone(const Instruction *Inst);
 
   /// Return a location representing the source of a memory transfer.
   static MemoryLocation getForSource(const MemTransferInst *MTI);
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 103cdea148e5e..4c31d6786ed8e 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -83,6 +83,23 @@ MemoryLocation MemoryLocation::get(const AtomicRMWInst *RMWI) {
                         AATags);
 }
 
+Optional<MemoryLocation> MemoryLocation::getOrNone(const Instruction *Inst) {
+  switch (Inst->getOpcode()) {
+  case Instruction::Load:
+    return get(cast<LoadInst>(Inst));
+  case Instruction::Store:
+    return get(cast<StoreInst>(Inst));
+  case Instruction::VAArg:
+    return get(cast<VAArgInst>(Inst));
+  case Instruction::AtomicCmpXchg:
+    return get(cast<AtomicCmpXchgInst>(Inst));
+  case Instruction::AtomicRMW:
+    return get(cast<AtomicRMWInst>(Inst));
+  default:
+    return None;
+  }
+}
+
 MemoryLocation MemoryLocation::getForSource(const MemTransferInst *MTI) {
   return getForSource(cast<AnyMemTransferInst>(MTI));
 }

From d6c8736287371f1c9eba3629819209c5fb54e546 Mon Sep 17 00:00:00 2001
From: Sean Fertile <sd.fertile@gmail.com>
Date: Tue, 26 May 2020 10:37:51 -0400
Subject: [PATCH 107/770] [PowerPC][AIX] Spill CSRs to the ABI specified stack
 offsets.

Extend the CSR save/restore insertion code to support both 32-bit and
64-bit AIX.

Differential Revision: https://reviews.llvm.org/D79252
---
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp  |  55 ++--
 .../CodeGen/PowerPC/aix-calleesavedregs.ll    |   4 +-
 llvm/test/CodeGen/PowerPC/aix-cc-abi.ll       |   2 +-
 llvm/test/CodeGen/PowerPC/aix-cc-byval-mem.ll |  12 +-
 llvm/test/CodeGen/PowerPC/aix-csr.ll          | 270 ++++++++++++++++++
 llvm/test/CodeGen/PowerPC/aix32-crsave.mir    |  28 +-
 llvm/test/CodeGen/PowerPC/ppc64-crsave.mir    |  35 +--
 7 files changed, 350 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-csr.ll

diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index a21a9c6f50adf..7da24f03bc7a7 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -96,11 +96,6 @@ PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
 // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
 const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
     unsigned &NumEntries) const {
-  // Early exit if not using the SVR4 ABI.
-  if (!Subtarget.isSVR4ABI()) {
-    NumEntries = 0;
-    return nullptr;
-  }
 
 // Floating-point register save area offsets.
 #define CALLEE_SAVED_FPRS \
@@ -123,7 +118,8 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
       {PPC::F15, -136},   \
       {PPC::F14, -144}
 
-// 32-bit general purpose register save area offsets.
+// 32-bit general purpose register save area offsets shared by ELF and
+// AIX. AIX has an extra CSR with r13.
 #define CALLEE_SAVED_GPRS32 \
       {PPC::R31, -4},       \
       {PPC::R30, -8},       \
@@ -183,7 +179,7 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
   // Note that the offsets here overlap, but this is fixed up in
   // processFunctionBeforeFrameFinalized.
 
-  static const SpillSlot Offsets[] = {
+  static const SpillSlot ELFOffsets32[] = {
       CALLEE_SAVED_FPRS,
       CALLEE_SAVED_GPRS32,
 
@@ -218,25 +214,48 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
       {PPC::S15, -136},
       {PPC::S14, -144}};
 
-  static const SpillSlot Offsets64[] = {
+  static const SpillSlot ELFOffsets64[] = {
       CALLEE_SAVED_FPRS,
       CALLEE_SAVED_GPRS64,
 
       // VRSAVE save area offset.
       {PPC::VRSAVE, -4},
-
       CALLEE_SAVED_VRS
   };
 
-  if (Subtarget.isPPC64()) {
-    NumEntries = array_lengthof(Offsets64);
+  static const SpillSlot AIXOffsets32[] = {
+    CALLEE_SAVED_FPRS,
+    CALLEE_SAVED_GPRS32,
+    // Add AIX's extra CSR.
+    {PPC::R13, -76},
+    // TODO Update when we add vector support for AIX.
+  };
 
-    return Offsets64;
-  } else {
-    NumEntries = array_lengthof(Offsets);
+  static const SpillSlot AIXOffsets64[] = {
+    CALLEE_SAVED_FPRS,
+    CALLEE_SAVED_GPRS64,
+    // TODO Update when we add vector support for AIX.
+  };
+
+  if (Subtarget.is64BitELFABI()) {
+    NumEntries = array_lengthof(ELFOffsets64);
+    return ELFOffsets64;
+  }
 
-    return Offsets;
+  if (Subtarget.is32BitELFABI()) {
+    NumEntries = array_lengthof(ELFOffsets32);
+    return ELFOffsets32;
   }
+
+  assert(Subtarget.isAIXABI() && "Unexpected ABI.");
+
+  if (Subtarget.isPPC64()) {
+    NumEntries = array_lengthof(AIXOffsets64);
+    return AIXOffsets64;
+  }
+
+  NumEntries = array_lengthof(AIXOffsets32);
+  return AIXOffsets32;
 }
 
 /// RemoveVRSaveCode - We have found that this function does not need any code
@@ -1805,12 +1824,6 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
 
 void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
                                                        RegScavenger *RS) const {
-  // Early exit if not using the SVR4 ABI.
-  if (!Subtarget.isSVR4ABI()) {
-    addScavengingSpillSlot(MF, RS);
-    return;
-  }
-
   // Get callee saved register information.
   MachineFrameInfo &MFI = MF.getFrameInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
diff --git a/llvm/test/CodeGen/PowerPC/aix-calleesavedregs.ll b/llvm/test/CodeGen/PowerPC/aix-calleesavedregs.ll
index e0826b5031292..5f89222e1eb70 100644
--- a/llvm/test/CodeGen/PowerPC/aix-calleesavedregs.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-calleesavedregs.ll
@@ -7,5 +7,5 @@ define void @usethirteen() {
     ret void
 }
 
-; CHECK: stw 13, -4(1)
-; CHECK: lwz 13, -4(1)
+; CHECK: stw 13, -76(1)
+; CHECK: lwz 13, -76(1)
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
index 36f070e7f162d..4e4d921bd6862 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
@@ -2337,7 +2337,7 @@ define void @caller_mix() {
 
 ; ASM64PWR4:      mflr 0
 ; ASM64PWR4-DAG:  std 0, 16(1)
-; ASM64PWR4-DAG:  stdu 1, -240(1)
+; ASM64PWR4-DAG:  stdu 1, -256(1)
 ; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 112(1)
 ; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 120(1)
 ; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 128(1)
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-mem.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mem.ll
index 3f24a43ad6b2c..95b009c13ae50 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-mem.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mem.ll
@@ -203,7 +203,7 @@ entry:
 
 ; CHECKASM-LABEL: .call_test_byval_mem3:
 
-; ASM32BIT:       stwu 1, -96(1)
+; ASM32BIT:       stwu 1, -112(1)
 ; ASM32BIT-DAG:   lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2)
 ; ASM32BIT-DAG:   addi 3, 1, 56
 ; ASM32BIT-DAG:   addi 4, [[REG]], 24
@@ -216,7 +216,7 @@ entry:
 ; ASM32BIT-DAG:   lwz 9, 16([[REG]])
 ; ASM32BIT-DAG:   lwz 10, 20([[REG]])
 ; ASM32BIT:       bl .test_byval_mem3
-; ASM32BIT:       addi 1, 1, 96
+; ASM32BIT:       addi 1, 1, 112
 
 ; The memcpy call was inlined in 64-bit so MIR test is redundant and omitted.
 ; ASM64BIT:       stdu 1, -128(1)
@@ -319,7 +319,7 @@ entry:
 ; 32BIT-NEXT:     BL_NOP <mcsymbol .test_byval_mem4>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
 ; 32BIT-NEXT:     ADJCALLSTACKUP 316, 0, implicit-def dead $r1, implicit $r1
 
-; ASM32BIT:       stwu 1, -320(1)
+; ASM32BIT:       stwu 1, -336(1)
 ; ASM32BIT-NEXT:  stw [[REG1:[0-9]+]], {{[0-9]+}}(1)
 ; ASM32BIT:       lwz [[REG1]], LC{{[0-9]+}}(2)
 ; ASM32BIT-DAG:   lhz [[REG2:[0-9]+]], 28([[REG1]])
@@ -338,7 +338,7 @@ entry:
 ; ASM32BIT-DAG:   lwz 9, 20([[REG1]])
 ; ASM32BIT-DAG:   lwz 10, 24([[REG1]])
 ; ASM32BIT:       bl .test_byval_mem4
-; ASM32BIT:       addi 1, 1, 320
+; ASM32BIT:       addi 1, 1, 336
 
 ; Confirm the expected memcpy call is independent of the call to test_byval_mem4.
 ; 64BIT:          ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
@@ -363,7 +363,7 @@ entry:
 ; 64BIT-NEXT:     BL8_NOP <mcsymbol .test_byval_mem4>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1
 ; 64BIT-NEXT:     ADJCALLSTACKUP 344, 0, implicit-def dead $r1, implicit $r1
 
-; ASM64BIT:       stdu 1, -352(1)
+; ASM64BIT:       stdu 1, -368(1)
 ; ASM64BIT-DAG:   ld [[REG1:[0-9]+]], LC{{[0-9]+}}(2)
 ; ASM64BIT-DAG:   addi 3, 1, 112
 ; ASM64BIT-DAG:   addi 4, [[REG1]], 24
@@ -383,7 +383,7 @@ entry:
 ; ASM64BIT-DAG:   ld 9, 8([[REG1]])
 ; ASM64BIT-DAG:   ld 10, 16([[REG1]])
 ; ASM64BIT:       bl .test_byval_mem4
-; ASM64BIT:       addi 1, 1, 352
+; ASM64BIT:       addi 1, 1, 368
 
 define void @test_byval_mem4(i32, %struct_S31* byval(%struct_S31) align 1, %struct_S256* byval(%struct_S256) align 1 %s) {
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/aix-csr.ll b/llvm/test/CodeGen/PowerPC/aix-csr.ll
new file mode 100644
index 0000000000000..74da61b0ad67a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-csr.ll
@@ -0,0 +1,270 @@
+; RUN: llc -mtriple=powerpc64-unknown-aix-xcoff -verify-machineinstrs \
+; RUN: -mcpu=pwr4 -mattr=-altivec -stop-after=prologepilog < %s | \
+; RUN: FileCheck --check-prefix=MIR64 %s
+
+; RUN: llc -mtriple=powerpc64-unknown-aix-xcoff -verify-machineinstrs \
+; RUN: -mcpu=pwr4 -mattr=-altivec < %s | FileCheck --check-prefix=ASM64 %s
+
+; RUN: llc -mtriple=powerpc-unknown-aix-xcoff -verify-machineinstrs \
+; RUN: -mcpu=pwr4 -mattr=-altivec -stop-after=prologepilog < %s | \
+; RUN: FileCheck --check-prefix=MIR32 %s
+
+; RUN: llc -mtriple=powerpc-unknown-aix-xcoff -verify-machineinstrs \
+; RUN: -mcpu=pwr4 -mattr=-altivec < %s | FileCheck --check-prefix=ASM32 %s
+
+define dso_local signext i32 @gprs_only(i32 signext %i) {
+entry:
+  call void asm sideeffect "", "~{r16},~{r22},~{r30}"()
+  ret i32 %i
+}
+
+; MIR64:       name:            gprs_only
+; MIR64-LABEL: fixedStack:
+; MIR64-NEXT:   - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:       callee-saved-register: '$x30', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:   - { id: 1, type: spill-slot, offset: -80, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:       callee-saved-register: '$x22', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:   - { id: 2, type: spill-slot, offset: -128, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:       callee-saved-register: '$x16', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  stack:           []
+
+; MIR32:       name:            gprs_only
+; MIR32-LABEL: fixedStack:
+; MIR32:        - { id: 0, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:       callee-saved-register: '$r30', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:   - { id: 1, type: spill-slot, offset: -40, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:       callee-saved-register: '$r22', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:   - { id: 2, type: spill-slot, offset: -64, size: 4, alignment: 16, stack-id: default,
+; MIR32-NEXT:       callee-saved-register: '$r16', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:       debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  stack:           []
+
+
+; MIR64: liveins: $x3, $x16, $x22, $x30
+
+; MIR64-DAG: STD killed $x16, -128, $x1 :: (store 8 into %fixed-stack.2, align 16)
+; MIR64-DAG: STD killed $x22, -80, $x1 :: (store 8 into %fixed-stack.1, align 16)
+; MIR64-DAG: STD killed $x30, -16, $x1 :: (store 8 into %fixed-stack.0, align 16)
+
+; MIR64:     INLINEASM
+
+; MIR64-DAG: $x30 = LD -16, $x1 :: (load 8 from %fixed-stack.0, align 16)
+; MIR64-DAG: $x22 = LD -80, $x1 :: (load 8 from %fixed-stack.1, align 16)
+; MIR64-DAG: $x16 = LD -128, $x1 :: (load 8 from %fixed-stack.2, align 16)
+; MIR64:     BLR8 implicit $lr8, implicit $rm, implicit $x3
+
+
+; MIR32: liveins: $r3, $r16, $r22, $r30
+
+; MIR32-DAG: STW killed $r16, -64, $r1 :: (store 4 into %fixed-stack.2, align 16)
+; MIR32-DAG: STW killed $r22, -40, $r1 :: (store 4 into %fixed-stack.1, align 8)
+; MIR32-DAG: STW killed $r30, -8, $r1 :: (store 4 into %fixed-stack.0, align 8)
+
+; MIR32:     INLINEASM
+
+; MIR32-DAG: $r30 = LWZ -8, $r1 :: (load 4 from %fixed-stack.0, align 8)
+; MIR32-DAG: $r22 = LWZ -40, $r1 :: (load 4 from %fixed-stack.1, align 8)
+; MIR32-DAG: $r16 = LWZ -64, $r1 :: (load 4 from %fixed-stack.2, align 16)
+; MIR32:     BLR implicit $lr, implicit $rm, implicit $r3
+
+
+; ASM64-LABEL: .gprs_only:
+; ASM64-DAG:      std 16, -128(1)                 # 8-byte Folded Spill
+; ASM64-DAG:      std 22, -80(1)                  # 8-byte Folded Spill
+; ASM64-DAG:      std 30, -16(1)                  # 8-byte Folded Spill
+; ASM64:          #APP
+; ASM64-DAG:      ld 30, -16(1)                   # 8-byte Folded Reload
+; ASM64-DAG:      ld 22, -80(1)                   # 8-byte Folded Reload
+; ASM64-DAG:      ld 16, -128(1)                  # 8-byte Folded Reload
+; ASM64:          blr
+
+; ASM32-LABEL: .gprs_only:
+; ASM32-DAG:     stw 16, -64(1)                  # 4-byte Folded Spill
+; ASM32-DAG:     stw 22, -40(1)                  # 4-byte Folded Spill
+; ASM32-DAG:     stw 30, -8(1)                   # 4-byte Folded Spill
+; ASM32:         #APP
+; ASM32-DAG:     lwz 30, -8(1)                   # 4-byte Folded Reload
+; ASM32-DAG:     lwz 22, -40(1)                  # 4-byte Folded Reload
+; ASM32-DAG:     lwz 16, -64(1)                  # 4-byte Folded Reload
+; ASM32:         blr
+
+
+declare double @dummy(i32 signext);
+
+define dso_local double @fprs_and_gprs(i32 signext %i) {
+  call void asm sideeffect "", "~{r13},~{r14},~{r25},~{r31},~{f14},~{f19},~{f21},~{f31}"()
+  %result = call double @dummy(i32 signext %i)
+  ret double %result
+}
+
+; MIR64:       name:            fprs_and_gprs
+; MIR64-LABEL: fixedStack:
+; MIR64-NEXT:    - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:        callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:    - { id: 1, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:        callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:    - { id: 2, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:        callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:    - { id: 3, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:        callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:    - { id: 4, type: spill-slot, offset: -152, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:        callee-saved-register: '$x31', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:    - { id: 5, type: spill-slot, offset: -200, size: 8, alignment: 8, stack-id: default,
+; MIR64-NEXT:        callee-saved-register: '$x25', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:    - { id: 6, type: spill-slot, offset: -288, size: 8, alignment: 16, stack-id: default,
+; MIR64-NEXT:        callee-saved-register: '$x14', callee-saved-restored: true, debug-info-variable: '',
+; MIR64-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR64-NEXT:  stack:           []
+
+; MIR32:       name:            fprs_and_gprs
+; MIR32-LABEL: fixedStack:
+; MIR32-NEXT:    - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$f31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:    - { id: 1, type: spill-slot, offset: -88, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$f21', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:    - { id: 2, type: spill-slot, offset: -104, size: 8, alignment: 8, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$f19', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:    - { id: 3, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$f14', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:    - { id: 4, type: spill-slot, offset: -148, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$r31', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:    - { id: 5, type: spill-slot, offset: -172, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$r25', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:    - { id: 6, type: spill-slot, offset: -216, size: 4, alignment: 8, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:    - { id: 7, type: spill-slot, offset: -220, size: 4, alignment: 4, stack-id: default,
+; MIR32-NEXT:        callee-saved-register: '$r13', callee-saved-restored: true, debug-info-variable: '',
+; MIR32-NEXT:        debug-info-expression: '', debug-info-location: '' }
+; MIR32-NEXT:  stack:           []
+
+
+; MIR64: liveins: $x3, $x14, $x25, $x31, $f14, $f19, $f21, $f31
+
+; MIR64:       $x0 = MFLR8 implicit $lr8
+; MIR64-NEXT:  STD killed $x0, 16, $x1
+; MIR64-NEXT:  $x1 = STDU $x1, -400, $x1
+; MIR64-DAG:   STD killed $x14, 112, $x1 :: (store 8 into %fixed-stack.6, align 16)
+; MIR64-DAG:   STD killed $x25, 200, $x1 :: (store 8 into %fixed-stack.5)
+; MIR64-DAG:   STD killed $x31, 248, $x1 :: (store 8 into %fixed-stack.4)
+; MIR64-DAG:   STFD killed $f14, 256, $x1 :: (store 8 into %fixed-stack.3, align 16)
+; MIR64-DAG:   STFD killed $f19, 296, $x1 :: (store 8 into %fixed-stack.2)
+; MIR64-DAG:   STFD killed $f21, 312, $x1 :: (store 8 into %fixed-stack.1)
+; MIR64-DAG:   STFD killed $f31, 392, $x1 :: (store 8 into %fixed-stack.0)
+
+; MIR64:       INLINEASM
+; MIR64-NEXT:  BL8_NOP
+
+; MIR64-DAG:   $f31 = LFD 392, $x1 :: (load 8 from %fixed-stack.0)
+; MIR64-DAG:   $f21 = LFD 312, $x1 :: (load 8 from %fixed-stack.1)
+; MIR64-DAG:   $f19 = LFD 296, $x1 :: (load 8 from %fixed-stack.2)
+; MIR64-DAG:   $f14 = LFD 256, $x1 :: (load 8 from %fixed-stack.3, align 16)
+; MIR64-DAG:   $x31 = LD 248, $x1 :: (load 8 from %fixed-stack.4)
+; MIR64-DAG:   $x25 = LD 200, $x1 :: (load 8 from %fixed-stack.5)
+; MIR64-DAG:   $x14 = LD 112, $x1 :: (load 8 from %fixed-stack.6, align 16)
+; MIR64:       $x1 = ADDI8 $x1, 400
+; MIR64-NEXT:  $x0 = LD 16, $x1
+; MIR64-NEXT:  MTLR8 $x0, implicit-def $lr8
+; MIR64-NEXT:  BLR8 implicit $lr8, implicit $rm, implicit $f1
+
+
+; MIR32: liveins: $r3, $r13, $r14, $r25, $r31, $f14, $f19, $f21, $f31
+
+; MIR32:      $r0 = MFLR implicit $lr
+; MIR32-NEXT: STW killed $r0, 8, $r1
+; MIR32-NEXT: $r1 = STWU $r1, -288, $r1
+; MIR32-DAG:  STW killed $r13, 68, $r1 :: (store 4 into %fixed-stack.7)
+; MIR32-DAG:  STW killed $r14, 72, $r1 :: (store 4 into %fixed-stack.6, align 8)
+; MIR32-DAG:  STW killed $r25, 116, $r1 :: (store 4 into %fixed-stack.5)
+; MIR32-DAG:  STW killed $r31, 140, $r1 :: (store 4 into %fixed-stack.4)
+; MIR32-DAG:  STFD killed $f14, 144, $r1 :: (store 8 into %fixed-stack.3, align 16)
+; MIR32-DAG:  STFD killed $f19, 184, $r1 :: (store 8 into %fixed-stack.2)
+; MIR32-DAG:  STFD killed $f21, 200, $r1 :: (store 8 into %fixed-stack.1)
+; MIR32-DAG:  STFD killed $f31, 280, $r1 :: (store 8 into %fixed-stack.0)
+
+; MIR32:      INLINEASM
+; MIR32:      BL_NOP
+
+; MIR32-DAG:  $f31 = LFD 280, $r1 :: (load 8 from %fixed-stack.0)
+; MIR32-DAG:  $f21 = LFD 200, $r1 :: (load 8 from %fixed-stack.1)
+; MIR32-DAG:  $f19 = LFD 184, $r1 :: (load 8 from %fixed-stack.2)
+; MIR32-DAG:  $f14 = LFD 144, $r1 :: (load 8 from %fixed-stack.3, align 16)
+; MIR32-DAG:  $r31 = LWZ 140, $r1 :: (load 4 from %fixed-stack.4)
+; MIR32-DAG:  $r25 = LWZ 116, $r1 :: (load 4 from %fixed-stack.5)
+; MIR32-DAG:  $r14 = LWZ 72, $r1 :: (load 4 from %fixed-stack.6, align 8)
+; MIR32-DAG:  $r13 = LWZ 68, $r1 :: (load 4 from %fixed-stack.7)
+; MIR32:      $r1 = ADDI $r1, 288
+; MIR32-NEXT: $r0 = LWZ 8, $r1
+; MIR32-NEXT: MTLR $r0, implicit-def $lr
+; MIR32-NEXT: BLR implicit $lr, implicit $rm, implicit $f1
+
+; ASM64-LABEL: .fprs_and_gprs:
+; ASM64:         mflr 0
+; ASM64-NEXT:    std 0, 16(1)
+; ASM64-NEXT:    stdu 1, -400(1)
+; ASM64-DAG:     std 14, 112(1)                  # 8-byte Folded Spill
+; ASM64-DAG:     std 25, 200(1)                  # 8-byte Folded Spill
+; ASM64-DAG:     std 31, 248(1)                  # 8-byte Folded Spill
+; ASM64-DAG:     stfd 14, 256(1)                 # 8-byte Folded Spill
+; ASM64-DAG:     stfd 19, 296(1)                 # 8-byte Folded Spill
+; ASM64-DAG:     stfd 21, 312(1)                 # 8-byte Folded Spill
+; ASM64-DAG:     stfd 31, 392(1)                 # 8-byte Folded Spill
+
+; ASM64:         bl .dummy
+
+; ASM64-DAG:     lfd 31, 392(1)                  # 8-byte Folded Reload
+; ASM64-DAG:     lfd 21, 312(1)                  # 8-byte Folded Reload
+; ASM64-DAG:     lfd 19, 296(1)                  # 8-byte Folded Reload
+; ASM64-DAG:     lfd 14, 256(1)                  # 8-byte Folded Reload
+; ASM64-DAG:     ld 31, 248(1)                   # 8-byte Folded Reload
+; ASM64-DAG:     ld 25, 200(1)                   # 8-byte Folded Reload
+; ASM64-DAG:     ld 14, 112(1)                   # 8-byte Folded Reload
+; ASM64:         addi 1, 1, 400
+; ASM64-NEXT:    ld 0, 16(1)
+; ASM64-NEXT:    mtlr 0
+; ASM64-NEXT:    blr
+
+; ASM32-LABEL: .fprs_and_gprs:
+; ASM32:         mflr 0
+; ASM32-NEXT:    stw 0, 8(1)
+; ASM32-NEXT:    stwu 1, -288(1)
+; ASM32-DAG:     stw 13, 68(1)                   # 4-byte Folded Spill
+; ASM32-DAG:     stw 14, 72(1)                   # 4-byte Folded Spill
+; ASM32-DAG:     stw 25, 116(1)                  # 4-byte Folded Spill
+; ASM32-DAG:     stw 31, 140(1)                  # 4-byte Folded Spill
+; ASM32-DAG:     stfd 14, 144(1)                 # 8-byte Folded Spill
+; ASM32-DAG:     stfd 19, 184(1)                 # 8-byte Folded Spill
+; ASM32-DAG:     stfd 21, 200(1)                 # 8-byte Folded Spill
+; ASM32-DAG:     stfd 31, 280(1)                 # 8-byte Folded Spill
+
+; ASM32:         bl .dummy
+
+; ASM32-DAG:     lfd 31, 280(1)                  # 8-byte Folded Reload
+; ASM32-DAG:     lfd 21, 200(1)                  # 8-byte Folded Reload
+; ASM32-DAG:     lfd 19, 184(1)                  # 8-byte Folded Reload
+; ASM32-DAG:     lfd 14, 144(1)                  # 8-byte Folded Reload
+; ASM32-DAG:     lwz 31, 140(1)                  # 4-byte Folded Reload
+; ASM32-DAG:     lwz 25, 116(1)                  # 4-byte Folded Reload
+; ASM32-DAG:     lwz 14, 72(1)                   # 4-byte Folded Reload
+; ASM32-DAG:     lwz 13, 68(1)                   # 4-byte Folded Reload
+; ASM32:         addi 1, 1, 288
+; ASM32-NEXT:    lwz 0, 8(1)
+; ASM32-NEXT:    mtlr 0
+; ASM32-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/aix32-crsave.mir b/llvm/test/CodeGen/PowerPC/aix32-crsave.mir
index 5a82bff33e973..8faf10233f7db 100644
--- a/llvm/test/CodeGen/PowerPC/aix32-crsave.mir
+++ b/llvm/test/CodeGen/PowerPC/aix32-crsave.mir
@@ -17,20 +17,25 @@ body:             |
     renamable $r3 = COPY $r29
     BLR implicit $lr, implicit $rm, implicit $r3
 
-    ; CHECK-LABEL: fixedStack:
-    ; CHECK:       - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
-    ; CHECK-NEXT:      isImmutable: true, isAliased: false, callee-saved-register: '$cr4',
-    ; CHECK-NEXT:      callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
-    ; CHECK-NEXT:      debug-info-location: '' }
-    ; CHECK-LABEL: stack:
+    ; CHECK-LABEL:  fixedStack:
+    ; CHECK-NEXT:   - { id: 0, type: spill-slot, offset: -12, size: 4, alignment: 4, stack-id: default,
+    ; CHECK-NEXT:       callee-saved-register: '$r29', callee-saved-restored: true, debug-info-variable: '',
+    ; CHECK-NEXT:       debug-info-expression: '', debug-info-location: '' }
+    ; CHECK-NEXT:   - { id: 1, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
+    ; CHECK-NEXT:       isImmutable: true, isAliased: false, callee-saved-register: '$cr4',
+    ; CHECK-NEXT:       callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+    ; CHECK-NEXT:       debug-info-location: '' }
+    ; CHECK-LABEL:  stack:
 
     ; CHECK:      bb.0.entry:
     ; CHECK-NEXT:  liveins: $r3, $r29, $cr2, $cr4
 
     ; CHECK:      $r12 = MFCR implicit killed $cr2, implicit killed $cr4
     ; CHECK-NEXT: STW killed $r12, 4, $r1
+    ; CHECK-NEXT: STW killed $r29, -12, $r1 :: (store 4 into %fixed-stack.0)
 
-    ; CHECK:      $r12 = LWZ 4, $r1
+    ; CHECK:      $r29 = LWZ -12, $r1 :: (load 4 from %fixed-stack.0)
+    ; CHECK-NEXT: $r12 = LWZ 4, $r1
     ; CHECK-NEXT: $cr2 = MTOCRF $r12
     ; CHECK-NEXT: $cr4 = MTOCRF killed $r12
 
@@ -50,7 +55,10 @@ body:             |
     BLR implicit $lr, implicit $rm, implicit $r3
 
     ; CHECK-LABEL: fixedStack:
-    ; CHECK:       - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
+    ; CHECK-NEXT:  - { id: 0, type: spill-slot, offset: -72, size: 4, alignment: 8, stack-id: default,
+    ; CHECK-NEXT:      callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '',
+    ; CHECK-NEXT:      debug-info-expression: '', debug-info-location: '' }
+    ; CHECK-NEXT:  - { id: 1, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
     ; CHECK-NEXT:      isImmutable: true, isAliased: false, callee-saved-register: '$cr3',
     ; CHECK-NEXT:      callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
     ; CHECK-NEXT:      debug-info-location: '' }
@@ -61,6 +69,8 @@ body:             |
 
     ; CHECK:      $r12 = MFCR implicit killed $cr3
     ; CHECK-NEXT: STW killed $r12, 4, $r1
+    ; CHECK-NEXT: STW killed $r14, -72, $r1 :: (store 4 into %fixed-stack.0, align 8)
 
-    ; CHECK:      $r12 = LWZ 4, $r1
+    ; CHECK:      $r14 = LWZ -72, $r1 :: (load 4 from %fixed-stack.0, align 8)
+    ; CHECK-NEXT: $r12 = LWZ 4, $r1
     ; CHECK-NEXT: $cr3 = MTOCRF killed $r12
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir b/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir
index b6a8748c8e3ee..b7c0b7ef8b66b 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir
+++ b/llvm/test/CodeGen/PowerPC/ppc64-crsave.mir
@@ -1,19 +1,16 @@
 # RUN: llc -mtriple powerpc64le-unknown-linux-gnu -x mir -mcpu=pwr8 \
 # RUN: -run-pass=prologepilog --verify-machineinstrs < %s | \
-# RUN: FileCheck %s --check-prefixes=CHECK,SAVEONE,ELF
+# RUN: FileCheck %s --check-prefixes=CHECK,SAVEONE
 
 # RUN: llc -mtriple powerpc64-unknown-linux-gnu -x mir -mcpu=pwr7 \
 # RUN: -run-pass=prologepilog --verify-machineinstrs < %s | \
-# RUN: FileCheck %s --check-prefixes=CHECK,SAVEALL,ELF
+# RUN: FileCheck %s --check-prefixes=CHECK,SAVEALL
 
 
 # RUN: llc -mtriple powerpc64-unknown-aix-xcoff -x mir -mcpu=pwr4 \
 # RUN: -run-pass=prologepilog --verify-machineinstrs < %s | \
 # RUN: FileCheck %s --check-prefixes=CHECK,SAVEALL
 
-# TODO FIXME: We only check the save and restores of the callee saved gpr for
-# ELF becuase AIX callee saved registers haven't been properly implemented yet.
-
 ---
 name:            CRAllSave
 alignment:       16
@@ -30,21 +27,23 @@ body:             |
     BLR8 implicit $lr8, implicit $rm, implicit $x3
 
     ; CHECK-LABEL: fixedStack:
-    ; ELF:          - { id: 1, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
-    ; AIX:          - { id: 0, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
-    ; CHECK:            isImmutable: true, isAliased: false, callee-saved-register: '$cr4',
-    ; CHECK-NEXT:       callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
-    ; CHECK-NEXT:       debug-info-location: '' }
+    ; CHECK-NEXT:     - { id: 0, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default,
+    ; CHECK-NEXT:         callee-saved-register: '$x29', callee-saved-restored: true, debug-info-variable: '',
+    ; CHECK-NEXT:         debug-info-expression: '', debug-info-location: '' }
+    ; CHECK-NEXT:     - { id: 1, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
+    ; CHECK-NEXT:         isImmutable: true, isAliased: false, callee-saved-register: '$cr4',
+    ; CHECK-NEXT:         callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+    ; CHECK-NEXT:         debug-info-location: '' }
     ; CHECK-LABEL:  stack:
 
     ; Verify the proper live-ins have been added in the prologue.
     ; CHECK:    liveins: $x3, $x29, $cr2, $cr4
 
     ; CHECK:     $x12 = MFCR8 implicit killed $cr2, implicit killed $cr4
-    ; ELF-DAG:   STD killed $x29, -24, $x1 :: (store 8 into %fixed-stack.0)
+    ; CHECK-DAG: STD killed $x29, -24, $x1 :: (store 8 into %fixed-stack.0)
     ; CHECK-DAG: STW8 killed $x12, 8, $x1
 
-    ; ELF:       $x29 = LD -24, $x1 :: (load 8 from %fixed-stack.0)
+    ; CHECK:     $x29 = LD -24, $x1 :: (load 8 from %fixed-stack.0)
     ; CHECK:     $x12 = LWZ8 8, $x1
     ; CHECK:     $cr2 = MTOCRF8 $x12
     ; CHECK:     $cr4 = MTOCRF8 killed $x12
@@ -67,9 +66,11 @@ body:             |
     ; CHECK-LABEL: CR2Save
 
     ; CHECK-LABEL: fixedStack:
-    ; ELF:          - { id: 1, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
-    ; AIX:          - { id: 0, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
-    ; CHECK:            isImmutable: true, isAliased: false, callee-saved-register: '$cr2',
+    ; CHECK-NEXT:   - { id: 0, type: spill-slot, offset: -144, size: 8, alignment: 16, stack-id: default,
+    ; CHECK-NEXT:       callee-saved-register: '$x14', callee-saved-restored: true, debug-info-variable: '',
+    ; CHECK-NEXT:       debug-info-expression: '', debug-info-location: '' }
+    ; CHECK-NEXT:   - { id: 1, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
+    ; CHECK-NEXT:       isImmutable: true, isAliased: false, callee-saved-register: '$cr2',
     ; CHECK-NEXT:       callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
     ; CHECK-NEXT:       debug-info-location: '' }
     ; CHECK-LABEL:  stack:
@@ -82,10 +83,10 @@ body:             |
     ; SAVEONE:     $x12 = MFOCRF8 killed $cr2
     ; SAVEALL:     $x12 = MFCR8 implicit killed $cr2
 
-    ; ELF-DAG:   STD killed $x14, -144, $x1 :: (store 8 into %fixed-stack.0, align 16)
+    ; CHECK-DAG: STD killed $x14, -144, $x1 :: (store 8 into %fixed-stack.0, align 16)
     ; CHECK-DAG: STW8 killed $x12, 8, $x1
 
-    ; ELF:       $x14 = LD -144, $x1 :: (load 8 from %fixed-stack.0, align 16)
+    ; CHECK:     $x14 = LD -144, $x1 :: (load 8 from %fixed-stack.0, align 16)
     ; CHECK:     $x12 = LWZ8 8, $x1
     ; CHECK:     $cr2 = MTOCRF8 killed $x12
 

From bae7cf674621b5892a036fabe77692a59e2b115b Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 14 May 2020 13:55:20 -0700
Subject: [PATCH 108/770] [ELF][PPC64] Synthesize _savegpr[01]_{14..31} and
 _restgpr[01]_{14..31}

In the 64-bit ELF V2 ABI Specification: Power Architecture, 2.3.3.1. GPR
Save and Restore Functions defines some special functions which may be
referenced by GCC produced assembly (LLVM does not reference them).

With GCC -Os, when the number of call-saved registers exceeds a certain
threshold, GCC generates `_savegpr0_* _restgpr0_*` calls and expects the
linker to define them. See
https://sourceware.org/pipermail/binutils/2002-February/017444.html and
https://sourceware.org/pipermail/binutils/2004-August/036765.html . This
is weird because libgcc.a would be the natural place. However, the linker
generation approach has the advantage that the linker can generate
multiple copies to avoid long branch thunks. We don't consider the
advantage significant enough to complicate our trunk implementation, so
we take a simple approach.

* Check whether `_savegpr0_{14..31}` are used
* If yes, define needed symbols and add an InputSection with the code sequence.

`_savegpr1_*` `_restgpr0_*` and `_restgpr1_*` are similar.

Reviewed By: sfertile

Differential Revision: https://reviews.llvm.org/D79977
---
 lld/ELF/Arch/PPC64.cpp        | 80 +++++++++++++++++++++++++++++++++++
 lld/ELF/Target.h              |  1 +
 lld/ELF/Writer.cpp            |  2 +
 lld/test/ELF/ppc64-restgpr0.s | 38 +++++++++++++++++
 lld/test/ELF/ppc64-restgpr1.s | 34 +++++++++++++++
 lld/test/ELF/ppc64-savegpr0.s | 36 ++++++++++++++++
 lld/test/ELF/ppc64-savegpr1.s | 34 +++++++++++++++
 lld/test/ELF/ppc64-saveres.s  | 31 ++++++++++++++
 8 files changed, 256 insertions(+)
 create mode 100644 lld/test/ELF/ppc64-restgpr0.s
 create mode 100644 lld/test/ELF/ppc64-restgpr1.s
 create mode 100644 lld/test/ELF/ppc64-savegpr0.s
 create mode 100644 lld/test/ELF/ppc64-savegpr1.s
 create mode 100644 lld/test/ELF/ppc64-saveres.s

diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index b80f96f28b46b..a182c77209aef 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -6,11 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "SymbolTable.h"
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
 #include "Thunks.h"
 #include "lld/Common/ErrorHandler.h"
+#include "lld/Common/Memory.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -104,6 +106,84 @@ bool elf::isPPC64SmallCodeModelTocReloc(RelType type) {
   return type == R_PPC64_TOC16 || type == R_PPC64_TOC16_DS;
 }
 
+static bool addOptional(StringRef name, uint64_t value,
+                        std::vector<Defined *> &defined) {
+  Symbol *sym = symtab->find(name);
+  if (!sym || sym->isDefined())
+    return false; // Not in the symbol table, or already defined: do nothing.
+  sym->resolve(Defined{/*file=*/nullptr, saver.save(name), STB_GLOBAL,
+                       STV_HIDDEN, STT_FUNC, value,
+                       /*size=*/0, /*section=*/nullptr});
+  defined.push_back(cast<Defined>(sym)); // Caller fills in ->section later.
+  return true;
+}
+
+// If from is 14, write ${prefix}14: firstInsn; ${prefix}15: firstInsn+0x200008
+// (next register, displacement+8); ...; ${prefix}31: ...+17*0x200008; $tail.
+// A label is defined only if it is in the symbol table and not yet defined.
+static void writeSequence(MutableArrayRef<uint32_t> buf, const char *prefix,
+                          int from, uint32_t firstInsn,
+                          ArrayRef<uint32_t> tail) {
+  std::vector<Defined *> defined;
+  char name[16];
+  int first = 0; // Index in buf of the lowest-numbered label that got defined.
+  uint32_t *ptr = buf.data();
+  for (int r = from; r < 32; ++r) {
+    format("%s%d", prefix, r).snprint(name, sizeof(name));
+    if (addOptional(name, 4 * (r - from), defined) && defined.size() == 1)
+      first = r - from;
+    write32(ptr++, firstInsn + 0x200008 * (r - from));
+  }
+  for (uint32_t insn : tail)
+    write32(ptr++, insn);
+  assert(ptr == buf.end()); // buf must be sized exactly: (32 - from) + tail.
+
+  if (defined.empty())
+    return; // No label needed defining: synthesize nothing.
+  // The full section content has the extent of [begin, end). We drop unused
+  // instructions and write [first,end).
+  auto *sec = make<InputSection>(
+      nullptr, SHF_ALLOC, SHT_PROGBITS, 4,
+      makeArrayRef(reinterpret_cast<uint8_t *>(buf.data() + first),
+                   4 * (buf.size() - first)),
+      ".text");
+  inputSections.push_back(sec);
+  for (Defined *sym : defined) {
+    sym->section = sec;
+    sym->value -= 4 * first; // Rebase from buf-relative to section-relative.
+  }
+}
+
+// Implements the GPR save and restore functions described by the ELF V2 ABI
+// for compatibility with GCC. With GCC -Os, when the number of call-saved
+// registers exceeds a certain threshold, GCC generates _savegpr0_* _restgpr0_*
+// calls and expects the linker to define them. See
+// https://sourceware.org/pipermail/binutils/2002-February/017444.html and
+// https://sourceware.org/pipermail/binutils/2004-August/036765.html . libgcc.a
+// would be the natural place, but generating them in the linker lets it emit
+// multiple copies to avoid long branch thunks. We don't consider that
+// advantage significant enough, so we synthesize at most one .text section per
+// function family. Each buffer holds 18 instructions (r14..r31) plus its tail.
+// NOTE(review): presumably static because the sections keep pointing at them.
+void elf::addPPC64SaveRestore() {
+  static uint32_t savegpr0[20], restgpr0[21], savegpr1[19], restgpr1[19];
+  constexpr uint32_t blr = 0x4e800020, mtlr_0 = 0x7c0803a6;
+
+  // _restgpr0_14: ld 14, -144(1); _restgpr0_15: ld 15, -136(1); ...
+  // Tail: ld 0, 16(1); mtlr 0; blr
+  writeSequence(restgpr0, "_restgpr0_", 14, 0xe9c1ff70,
+                {0xe8010010, mtlr_0, blr});
+  // _restgpr1_14: ld 14, -144(12); _restgpr1_15: ld 15, -136(12); ...
+  // Tail: blr
+  writeSequence(restgpr1, "_restgpr1_", 14, 0xe9ccff70, {blr});
+  // _savegpr0_14: std 14, -144(1); _savegpr0_15: std 15, -136(1); ...
+  // Tail: std 0, 16(1); blr
+  writeSequence(savegpr0, "_savegpr0_", 14, 0xf9c1ff70, {0xf8010010, blr});
+  // _savegpr1_14: std 14, -144(12); _savegpr1_15: std 15, -136(12); ...
+  // Tail: blr
+  writeSequence(savegpr1, "_savegpr1_", 14, 0xf9ccff70, {blr});
+}
+
 // Find the R_PPC64_ADDR64 in .rela.toc with matching offset.
 template <typename ELFT>
 static std::pair<Defined *, int64_t>
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index a308a41ff4b92..47905ae64a47d 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -213,6 +213,7 @@ unsigned getPPC64GlobalEntryToLocalEntryOffset(uint8_t stOther);
 // the .toc section.
 bool isPPC64SmallCodeModelTocReloc(RelType type);
 
+void addPPC64SaveRestore();
 uint64_t getPPC64TocBase();
 uint64_t getAArch64Page(uint64_t expr);
 
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index f782cd3cbc45a..9a6be7931a286 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -264,6 +264,8 @@ void elf::addReservedSymbols() {
     // glibc *crt1.o has a undefined reference to _SDA_BASE_. Since we don't
     // support Small Data Area, define it arbitrarily as 0.
     addOptionalRegular("_SDA_BASE_", nullptr, 0, STV_HIDDEN);
+  } else if (config->emachine == EM_PPC64) {
+    addPPC64SaveRestore();
   }
 
   // The Power Architecture 64-bit v2 ABI defines a TableOfContents (TOC) which
diff --git a/lld/test/ELF/ppc64-restgpr0.s b/lld/test/ELF/ppc64-restgpr0.s
new file mode 100644
index 0000000000000..3627272dab55f
--- /dev/null
+++ b/lld/test/ELF/ppc64-restgpr0.s
@@ -0,0 +1,38 @@
+# REQUIRES: ppc
+## Test code sequences of synthesized _restgpr0_{14..31}
+
+# RUN: llvm-mc -filetype=obj -triple=ppc64le %s -o %t14.o
+# RUN: ld.lld %t14.o -o %t14
+# RUN: llvm-objdump -d %t14 | FileCheck --check-prefix=R14 %s
+
+# R14-LABEL: <_restgpr0_14>:
+# R14-NEXT:    ld 14, -144(1)
+# R14-NEXT:    ld 15, -136(1)
+# R14-EMPTY:
+# R14-NEXT:  <_restgpr0_16>:
+# R14-NEXT:    ld 16, -128(1)
+# R14:         ld 31, -8(1)
+# R14-NEXT:    ld 0, 16(1)
+# R14-NEXT:    mtlr 0
+# R14-NEXT:    blr
+
+## Don't synthesize _restgpr0_{14..30} because they are unused.
+# RUN: echo 'bl _restgpr0_31' | llvm-mc -filetype=obj -triple=ppc64 - -o %t31.o
+# RUN: ld.lld %t31.o -o %t31
+# RUN: llvm-objdump -d %t31 | FileCheck --check-prefix=R31 %s
+
+# R31-LABEL: Disassembly of section .text:
+# R31-EMPTY:
+# R31-NEXT:  <_restgpr0_31>:
+# R31-NEXT:    ld 31, -8(1)
+# R31-NEXT:    ld 0, 16(1)
+# R31-NEXT:    mtlr 0
+# R31-NEXT:    blr
+
+# RUN: echo 'bl _restgpr0_32' | llvm-mc -filetype=obj -triple=ppc64 - -o %t32.o
+# RUN: not ld.lld %t32.o -o /dev/null
+
+.globl _start
+_start:
+  bl _restgpr0_14
+  bl _restgpr0_16
diff --git a/lld/test/ELF/ppc64-restgpr1.s b/lld/test/ELF/ppc64-restgpr1.s
new file mode 100644
index 0000000000000..e4b97daf06d75
--- /dev/null
+++ b/lld/test/ELF/ppc64-restgpr1.s
@@ -0,0 +1,34 @@
+# REQUIRES: ppc
+## Test code sequences of synthesized _restgpr1_{14..31}
+
+# RUN: llvm-mc -filetype=obj -triple=ppc64le %s -o %t14.o
+# RUN: ld.lld %t14.o -o %t14
+# RUN: llvm-objdump -d %t14 | FileCheck --check-prefix=R14 %s
+
+# R14:       <_restgpr1_14>:
+# R14-NEXT:    ld 14, -144(12)
+# R14-NEXT:    ld 15, -136(12)
+# R14-EMPTY:
+# R14-NEXT:  <_restgpr1_16>:
+# R14-NEXT:    ld 16, -128(12)
+# R14:         ld 31, -8(12)
+# R14-NEXT:    blr
+
+## Don't synthesize _restgpr1_{14..30} because they are unused.
+# RUN: echo 'bl _restgpr1_31' | llvm-mc -filetype=obj -triple=ppc64 - -o %t31.o
+# RUN: ld.lld %t31.o -o %t31
+# RUN: llvm-objdump -d %t31 | FileCheck --check-prefix=R31 %s
+
+# R31-LABEL: Disassembly of section .text:
+# R31-EMPTY:
+# R31-NEXT:  <_restgpr1_31>:
+# R31-NEXT:    ld 31, -8(12)
+# R31-NEXT:    blr
+
+# RUN: echo 'bl _restgpr1_32' | llvm-mc -filetype=obj -triple=ppc64le - -o %t32.o
+# RUN: not ld.lld %t32.o -o /dev/null
+
+.globl _start
+_start:
+  bl _restgpr1_14
+  bl _restgpr1_16
diff --git a/lld/test/ELF/ppc64-savegpr0.s b/lld/test/ELF/ppc64-savegpr0.s
new file mode 100644
index 0000000000000..1e85340b99dfa
--- /dev/null
+++ b/lld/test/ELF/ppc64-savegpr0.s
@@ -0,0 +1,36 @@
+# REQUIRES: ppc
+## Test code sequences of synthesized _savegpr0_{14..31}
+
+# RUN: llvm-mc -filetype=obj -triple=ppc64le %s -o %t14.o
+# RUN: ld.lld %t14.o -o %t14
+# RUN: llvm-objdump -d %t14 | FileCheck --check-prefix=R14 %s
+
+# R14-LABEL: <_savegpr0_14>:
+# R14-NEXT:    std 14, -144(1)
+# R14-NEXT:    std 15, -136(1)
+# R14-EMPTY:
+# R14-NEXT:  <_savegpr0_16>:
+# R14-NEXT:    std 16, -128(1)
+# R14:         std 31, -8(1)
+# R14-NEXT:    std 0, 16(1)
+# R14-NEXT:    blr
+
+## Don't synthesize _savegpr0_{14..30} because they are unused.
+# RUN: echo 'bl _savegpr0_31' | llvm-mc -filetype=obj -triple=ppc64 - -o %t31.o
+# RUN: ld.lld %t31.o -o %t31
+# RUN: llvm-objdump -d %t31 | FileCheck --check-prefix=R31 %s
+
+# R31-LABEL: Disassembly of section .text:
+# R31-EMPTY:
+# R31-NEXT:  <_savegpr0_31>:
+# R31-NEXT:    std 31, -8(1)
+# R31-NEXT:    std 0, 16(1)
+# R31-NEXT:    blr
+
+# RUN: echo 'bl _savegpr0_32' | llvm-mc -filetype=obj -triple=ppc64 - -o %t32.o
+# RUN: not ld.lld %t32.o -o /dev/null
+
+.globl _start
+_start:
+  bl _savegpr0_14
+  bl _savegpr0_16
diff --git a/lld/test/ELF/ppc64-savegpr1.s b/lld/test/ELF/ppc64-savegpr1.s
new file mode 100644
index 0000000000000..abb878285f823
--- /dev/null
+++ b/lld/test/ELF/ppc64-savegpr1.s
@@ -0,0 +1,34 @@
+# REQUIRES: ppc
+## Test code sequences of synthesized _savegpr1_{14..31}
+
+# RUN: llvm-mc -filetype=obj -triple=ppc64le %s -o %t14.o
+# RUN: ld.lld %t14.o -o %t14
+# RUN: llvm-objdump -d %t14 | FileCheck --check-prefix=R14 %s
+
+# R14-LABEL: <_savegpr1_14>:
+# R14-NEXT:    std 14, -144(12)
+# R14-NEXT:    std 15, -136(12)
+# R14-EMPTY:
+# R14-NEXT:  <_savegpr1_16>:
+# R14-NEXT:    std 16, -128(12)
+# R14:         std 31, -8(12)
+# R14-NEXT:    blr
+
+## Don't synthesize _savegpr1_{14..30} because they are unused.
+# RUN: echo 'bl _savegpr1_31' | llvm-mc -filetype=obj -triple=ppc64 - -o %t31.o
+# RUN: ld.lld %t31.o -o %t31
+# RUN: llvm-objdump -d %t31 | FileCheck --check-prefix=R31 %s
+
+# R31-LABEL: Disassembly of section .text:
+# R31-EMPTY:
+# R31-NEXT:  <_savegpr1_31>:
+# R31-NEXT:    std 31, -8(12)
+# R31-NEXT:    blr
+
+# RUN: echo 'bl _savegpr1_32' | llvm-mc -filetype=obj -triple=ppc64le - -o %t32.o
+# RUN: not ld.lld %t32.o -o /dev/null
+
+.globl _start
+_start:
+  bl _savegpr1_14
+  bl _savegpr1_16
diff --git a/lld/test/ELF/ppc64-saveres.s b/lld/test/ELF/ppc64-saveres.s
new file mode 100644
index 0000000000000..70ef71779952a
--- /dev/null
+++ b/lld/test/ELF/ppc64-saveres.s
@@ -0,0 +1,31 @@
+# REQUIRES: ppc
+## Test that some save and restore functions can be synthesized.
+## The code sequences are tested by ppc64-restgpr*.s and ppc64-savegpr*.s
+
+# RUN: llvm-mc -filetype=obj -triple=ppc64le %s -o %t.o
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-readelf -s %t.so | FileCheck --check-prefix=NM %s
+# RUN: llvm-objdump -d %t.so | FileCheck %s
+
+## The synthesized symbols are not exported.
+# NM:      FUNC LOCAL HIDDEN {{.*}} _restgpr0_30
+# NM-NEXT: FUNC LOCAL HIDDEN {{.*}} _restgpr1_30
+# NM-NEXT: FUNC LOCAL HIDDEN {{.*}} _savegpr0_30
+# NM-NEXT: FUNC LOCAL HIDDEN {{.*}} _savegpr1_30
+
+# CHECK: 00000000000[[#%x,RESTGPR0:]] <_restgpr0_30>:
+# CHECK: 00000000000[[#%x,RESTGPR1:]] <_restgpr1_30>:
+# CHECK: 00000000000[[#%x,SAVEGPR0:]] <_savegpr0_30>:
+# CHECK: 00000000000[[#%x,SAVEGPR1:]] <_savegpr1_30>:
+# CHECK-LABEL: <_start>:
+# CHECK-NEXT:    bl 0x[[#RESTGPR0]]
+# CHECK-NEXT:    bl 0x[[#RESTGPR1]]
+# CHECK-NEXT:    bl 0x[[#SAVEGPR0]]
+# CHECK-NEXT:    bl 0x[[#SAVEGPR1]]
+
+.globl _start
+_start:
+  bl _restgpr0_30
+  bl _restgpr1_30
+  bl _savegpr0_30
+  bl _savegpr1_30

From d4086213c6d76fcaa5fa620ad680eaaf886cc66e Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 09:37:14 -0700
Subject: [PATCH 109/770] [dsymutil] Escape CFBundleIdentifier in plist.

Revision 333565 started escaping HTML special characters in the plist
written by dsymutil, but didn't include the updated CFBundleIdentifier.
---
 llvm/test/tools/dsymutil/Inputs/Info.plist      | 2 +-
 llvm/test/tools/dsymutil/X86/darwin-bundle.test | 2 +-
 llvm/tools/dsymutil/dsymutil.cpp                | 4 +++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/test/tools/dsymutil/Inputs/Info.plist b/llvm/test/tools/dsymutil/Inputs/Info.plist
index 97c0ae261f35f..e330c951d68a9 100644
--- a/llvm/test/tools/dsymutil/Inputs/Info.plist
+++ b/llvm/test/tools/dsymutil/Inputs/Info.plist
@@ -5,7 +5,7 @@
 		<key>CFBundleDevelopmentRegion</key>
 		<string>English</string>
 		<key>CFBundleIdentifier</key>
-		<string>custom</string>
+		<string>Foo&amp;Bar</string>
 		<key>CFBundleInfoDictionaryVersion</key>
 		<string>6.0</string>
 		<key>CFBundlePackageType</key>
diff --git a/llvm/test/tools/dsymutil/X86/darwin-bundle.test b/llvm/test/tools/dsymutil/X86/darwin-bundle.test
index 7f1224f30a1ef..d44b25e487054 100644
--- a/llvm/test/tools/dsymutil/X86/darwin-bundle.test
+++ b/llvm/test/tools/dsymutil/X86/darwin-bundle.test
@@ -18,7 +18,7 @@ CHECK-NEXT:         <dict>
 CHECK-NEXT:                 <key>CFBundleDevelopmentRegion</key>
 CHECK-NEXT:                 <string>English</string>
 CHECK-NEXT:                 <key>CFBundleIdentifier</key>
-CHECK-NEXT:                 <string>com.apple.xcode.dsym.custom</string>
+CHECK-NEXT:                 <string>com.apple.xcode.dsym.Foo&amp;Bar</string>
 CHECK-NEXT:                 <key>CFBundleInfoDictionaryVersion</key>
 CHECK-NEXT:                 <string>6.0</string>
 CHECK-NEXT:                 <key>CFBundlePackageType</key>
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
index 32df55611f070..3a32acbec06f9 100644
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -338,7 +338,9 @@ static Error createPlistFile(StringRef Bin, StringRef BundleRoot,
      << "\t\t<key>CFBundleDevelopmentRegion</key>\n"
      << "\t\t<string>English</string>\n"
      << "\t\t<key>CFBundleIdentifier</key>\n"
-     << "\t\t<string>com.apple.xcode.dsym." << BI.IDStr << "</string>\n"
+     << "\t\t<string>com.apple.xcode.dsym.";
+  printHTMLEscaped(BI.IDStr, PL);
+  PL << "</string>\n"
      << "\t\t<key>CFBundleInfoDictionaryVersion</key>\n"
      << "\t\t<string>6.0</string>\n"
      << "\t\t<key>CFBundlePackageType</key>\n"

From fb38b98338cc87442e3451665e82bf1c8ef9388f Mon Sep 17 00:00:00 2001
From: alex-t <alexander.timofeev@amd.com>
Date: Tue, 26 May 2020 19:47:29 +0300
Subject: [PATCH 110/770] [AMDGPU] NFC target dependent requiresUniformRegister
 refactored out

Summary: The target-specific method is encapsulated in the Target Lowering Info.

Reviewers: rampitec, vpykhtin

Reviewed By: rampitec

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70085
---
 llvm/include/llvm/CodeGen/TargetLowering.h          | 13 +++++++------
 .../CodeGen/SelectionDAG/FunctionLoweringInfo.cpp   |  3 +--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp           |  6 ++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h             |  5 +++--
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2689838b3e7cc..70bc6b986d3c4 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -28,6 +28,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -821,12 +822,12 @@ class TargetLoweringBase {
     return RC;
   }
 
-  /// Allows target to decide about the register class of the
-  /// specific value that is live outside the defining block.
-  /// Returns true if the value needs uniform register class.
-  virtual bool requiresUniformRegister(MachineFunction &MF,
-                                       const Value *) const {
-    return false;
+  /// Allows target to decide about the divergence of the
+  /// specific value. Base class implementation returns true
+  /// if the Divergece Analysis exists and reports value as divergent.
+  virtual bool isDivergent(const LegacyDivergenceAnalysis *DA,
+                           MachineFunction &MF, const Value *V) const {
+    return DA && DA->isDivergent(V);
   }
 
   /// Return the 'representative' register class for the specified value
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 7a5fd7d24c681..36e9ea538b6b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -398,8 +398,7 @@ Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
 }
 
 Register FunctionLoweringInfo::CreateRegs(const Value *V) {
-  return CreateRegs(V->getType(), DA && DA->isDivergent(V) &&
-                    !TLI->requiresUniformRegister(*MF, V));
+  return CreateRegs(V->getType(), TLI->isDivergent(DA, *MF, V));
 }
 
 /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2c147fa8947c1..722275e00a137 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11226,6 +11226,12 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
   return RC;
 }
 
+bool SITargetLowering::isDivergent(const LegacyDivergenceAnalysis *DA,
+                                   MachineFunction &MF, const Value *V) const {
+  return !requiresUniformRegister(MF, V) &&
+         TargetLoweringBase::isDivergent(DA, MF, V);
+}
+
 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
 // uniform values (as produced by the mask results of control flow intrinsics)
 // used outside of divergent blocks. The phi users need to also be treated as
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 7ef11eba4f9ce..80f3a87ce0fa9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -416,8 +416,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   virtual const TargetRegisterClass *
   getRegClassFor(MVT VT, bool isDivergent) const override;
-  virtual bool requiresUniformRegister(MachineFunction &MF,
-                                       const Value *V) const override;
+  virtual bool isDivergent(const LegacyDivergenceAnalysis *DA,
+                           MachineFunction &MF, const Value *V) const override;
+  bool requiresUniformRegister(MachineFunction &MF, const Value *V) const;
   Align getPrefLoopAlignment(MachineLoop *ML) const override;
 
   void allocateHSAUserSGPRs(CCState &CCInfo,

From 9786e7552d5564268484357866088d0a054bccaf Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 26 May 2020 12:58:18 -0400
Subject: [PATCH 111/770] Revert "[AMDGPU] NFC target dependent
 requiresUniformRegister refactored out"

This reverts commit fb38b98338cc87442e3451665e82bf1c8ef9388f.

The reverted commit will regress compile time.
---
 llvm/include/llvm/CodeGen/TargetLowering.h          | 13 ++++++-------
 .../CodeGen/SelectionDAG/FunctionLoweringInfo.cpp   |  3 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp           |  6 ------
 llvm/lib/Target/AMDGPU/SIISelLowering.h             |  5 ++---
 4 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 70bc6b986d3c4..2689838b3e7cc 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -28,7 +28,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -822,12 +821,12 @@ class TargetLoweringBase {
     return RC;
   }
 
-  /// Allows target to decide about the divergence of the
-  /// specific value. Base class implementation returns true
-  /// if the Divergece Analysis exists and reports value as divergent.
-  virtual bool isDivergent(const LegacyDivergenceAnalysis *DA,
-                           MachineFunction &MF, const Value *V) const {
-    return DA && DA->isDivergent(V);
+  /// Allows target to decide about the register class of the
+  /// specific value that is live outside the defining block.
+  /// Returns true if the value needs uniform register class.
+  virtual bool requiresUniformRegister(MachineFunction &MF,
+                                       const Value *) const {
+    return false;
   }
 
   /// Return the 'representative' register class for the specified value
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 36e9ea538b6b0..7a5fd7d24c681 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -398,7 +398,8 @@ Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
 }
 
 Register FunctionLoweringInfo::CreateRegs(const Value *V) {
-  return CreateRegs(V->getType(), TLI->isDivergent(DA, *MF, V));
+  return CreateRegs(V->getType(), DA && DA->isDivergent(V) &&
+                    !TLI->requiresUniformRegister(*MF, V));
 }
 
 /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 722275e00a137..2c147fa8947c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11226,12 +11226,6 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
   return RC;
 }
 
-bool SITargetLowering::isDivergent(const LegacyDivergenceAnalysis *DA,
-                                   MachineFunction &MF, const Value *V) const {
-  return !requiresUniformRegister(MF, V) &&
-         TargetLoweringBase::isDivergent(DA, MF, V);
-}
-
 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
 // uniform values (as produced by the mask results of control flow intrinsics)
 // used outside of divergent blocks. The phi users need to also be treated as
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 80f3a87ce0fa9..7ef11eba4f9ce 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -416,9 +416,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   virtual const TargetRegisterClass *
   getRegClassFor(MVT VT, bool isDivergent) const override;
-  virtual bool isDivergent(const LegacyDivergenceAnalysis *DA,
-                           MachineFunction &MF, const Value *V) const override;
-  bool requiresUniformRegister(MachineFunction &MF, const Value *V) const;
+  virtual bool requiresUniformRegister(MachineFunction &MF,
+                                       const Value *V) const override;
   Align getPrefLoopAlignment(MachineLoop *ML) const override;
 
   void allocateHSAUserSGPRs(CCState &CCInfo,

From a0ce2338a0838ccb04e10bd4f8e9ec9d7136e1d2 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 26 May 2020 12:48:22 -0400
Subject: [PATCH 112/770] [InstCombine] reassociate fsub+fadd with FMF to
 increase adds and throughput

The -reassociate pass tends to transform this kind of pattern into
something that is worse for vectorization and codegen. See PR43953:
https://bugs.llvm.org/show_bug.cgi?id=43953
---
 .../InstCombine/InstCombineAddSub.cpp         | 11 +++++++
 llvm/test/Transforms/InstCombine/fsub.ll      | 32 +++++++++++++------
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 85d6f47b205b7..233e0c7b5de72 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2195,6 +2195,17 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
       return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
     }
 
+    // Reassociate fsub/fadd sequences to create more fadd instructions and
+    // reduce dependency chains:
+    // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+    Value *Z;
+    if (match(Op0, m_OneUse(m_c_FAdd(m_OneUse(m_FSub(m_Value(X), m_Value(Y))),
+                                     m_Value(Z))))) {
+      Value *XZ = Builder.CreateFAddFMF(X, Z, &I);
+      Value *YW = Builder.CreateFAddFMF(Y, Op1, &I);
+      return BinaryOperator::CreateFSubFMF(XZ, YW, &I);
+    }
+
     if (Instruction *F = factorizeFAddFSub(I, Builder))
       return F;
 
diff --git a/llvm/test/Transforms/InstCombine/fsub.ll b/llvm/test/Transforms/InstCombine/fsub.ll
index 68e49c21b3b42..a0f02dee2909a 100644
--- a/llvm/test/Transforms/InstCombine/fsub.ll
+++ b/llvm/test/Transforms/InstCombine/fsub.ll
@@ -785,11 +785,13 @@ define float @fneg_fsub_constant(float %x) {
   ret float %sub
 }
 
+; ((w-x) + y) - z --> (w+y) - (x+z)
+
 define float @fsub_fadd_fsub_reassoc(float %w, float %x, float %y, float %z) {
 ; CHECK-LABEL: @fsub_fadd_fsub_reassoc(
-; CHECK-NEXT:    [[S1:%.*]] = fsub reassoc nsz float [[W:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = fadd reassoc nsz float [[S1]], [[Y:%.*]]
-; CHECK-NEXT:    [[S2:%.*]] = fsub reassoc nsz float [[A]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd reassoc nsz float [[W:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd reassoc nsz float [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = fsub reassoc nsz float [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret float [[S2]]
 ;
   %s1 = fsub reassoc nsz float %w, %x
@@ -798,12 +800,14 @@ define float @fsub_fadd_fsub_reassoc(float %w, float %x, float %y, float %z) {
   ret float %s2
 }
 
+; FMF on the last op is enough to do the transform; vectors work too.
+
 define <2 x float> @fsub_fadd_fsub_reassoc_commute(<2 x float> %w, <2 x float> %x, <2 x float> %y, <2 x float> %z) {
 ; CHECK-LABEL: @fsub_fadd_fsub_reassoc_commute(
 ; CHECK-NEXT:    [[D:%.*]] = fdiv <2 x float> [[Y:%.*]], <float 4.200000e+01, float -4.200000e+01>
-; CHECK-NEXT:    [[S1:%.*]] = fsub <2 x float> [[W:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = fadd <2 x float> [[D]], [[S1]]
-; CHECK-NEXT:    [[S2:%.*]] = fsub fast <2 x float> [[A]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[D]], [[W:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = fsub fast <2 x float> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <2 x float> [[S2]]
 ;
   %d = fdiv <2 x float> %y, <float 42.0, float -42.0> ; thwart complexity-based canonicalization
@@ -813,12 +817,14 @@ define <2 x float> @fsub_fadd_fsub_reassoc_commute(<2 x float> %w, <2 x float> %
   ret <2 x float> %s2
 }
 
+; (v-w) + (x-y) - z --> (v+x) - (w+y+z)
+
 define float @fsub_fadd_fsub_reassoc_twice(float %v, float %w, float %x, float %y, float %z) {
 ; CHECK-LABEL: @fsub_fadd_fsub_reassoc_twice(
-; CHECK-NEXT:    [[S1:%.*]] = fsub reassoc nsz float [[V:%.*]], [[W:%.*]]
-; CHECK-NEXT:    [[S2:%.*]] = fsub reassoc nsz float [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = fadd reassoc nsz float [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = fsub reassoc nsz float [[A]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd reassoc nsz float [[W:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd reassoc nsz float [[X:%.*]], [[V:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd reassoc nsz float [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    [[S3:%.*]] = fsub reassoc nsz float [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret float [[S3]]
 ;
   %s1 = fsub reassoc nsz float %v, %w
@@ -828,6 +834,8 @@ define float @fsub_fadd_fsub_reassoc_twice(float %v, float %w, float %x, float %
   ret float %s3
 }
 
+; negative test - FMF
+
 define float @fsub_fadd_fsub_not_reassoc(float %w, float %x, float %y, float %z) {
 ; CHECK-LABEL: @fsub_fadd_fsub_not_reassoc(
 ; CHECK-NEXT:    [[S1:%.*]] = fsub fast float [[W:%.*]], [[X:%.*]]
@@ -841,6 +849,8 @@ define float @fsub_fadd_fsub_not_reassoc(float %w, float %x, float %y, float %z)
   ret float %s2
 }
 
+; negative test - uses
+
 define float @fsub_fadd_fsub_reassoc_use1(float %w, float %x, float %y, float %z) {
 ; CHECK-LABEL: @fsub_fadd_fsub_reassoc_use1(
 ; CHECK-NEXT:    [[S1:%.*]] = fsub fast float [[W:%.*]], [[X:%.*]]
@@ -856,6 +866,8 @@ define float @fsub_fadd_fsub_reassoc_use1(float %w, float %x, float %y, float %z
   ret float %s2
 }
 
+; negative test - uses
+
 define float @fsub_fadd_fsub_reassoc_use2(float %w, float %x, float %y, float %z) {
 ; CHECK-LABEL: @fsub_fadd_fsub_reassoc_use2(
 ; CHECK-NEXT:    [[S1:%.*]] = fsub fast float [[W:%.*]], [[X:%.*]]

From 106ec64fbc7fb5ef28d0368fb1dca18e67e75adf Mon Sep 17 00:00:00 2001
From: Hiroshi Yamauchi <yamauchi@google.com>
Date: Tue, 4 Feb 2020 15:19:33 -0800
Subject: [PATCH 113/770] [PGO] Add memcmp/bcmp size value profiling.

Summary: This adds support for memcmp/bcmp to the existing memcpy/memset value profiling.

Reviewers: davidxl

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79751
---
 .../Instrumentation/PGOInstrumentation.cpp    |  52 +++--
 .../Instrumentation/PGOMemOPSizeOpt.cpp       | 190 +++++++++++++-----
 .../Instrumentation/ValueProfileCollector.cpp |  10 +-
 .../Instrumentation/ValueProfileCollector.h   |   3 +-
 .../Instrumentation/ValueProfilePlugins.inc   |  22 +-
 .../Inputs/memop_size_annotation.proftext     |  22 +-
 .../PGOProfile/memop_size_annotation.ll       |   9 +
 .../Transforms/PGOProfile/memop_size_opt.ll   | 130 ++++++++++--
 8 files changed, 354 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 5fcb0b27d46fe..72eb5cd61b003 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -377,6 +377,7 @@ class PGOInstrumentationGenLegacyPass : public ModulePass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<BlockFrequencyInfoWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
 };
 
@@ -405,6 +406,7 @@ class PGOInstrumentationUseLegacyPass : public ModulePass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<ProfileSummaryInfoWrapperPass>();
     AU.addRequired<BlockFrequencyInfoWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
 };
 
@@ -437,6 +439,7 @@ INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
                       "PGO instrumentation.", false, false)
 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
                     "PGO instrumentation.", false, false)
 
@@ -566,11 +569,11 @@ template <class Edge, class BBInfo> class FuncPGOInstrumentation {
   }
 
   FuncPGOInstrumentation(
-      Function &Func,
+      Function &Func, TargetLibraryInfo &TLI,
       std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
       bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
       BlockFrequencyInfo *BFI = nullptr, bool IsCS = false)
-      : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func),
+      : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
         ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) {
     // This should be done before CFG hash computation.
     SIVisitor.countSelects(Func);
@@ -834,15 +837,16 @@ populateEHOperandBundle(VPCandidateInfo &Cand,
 // Visit all edge and instrument the edges not in MST, and do value profiling.
 // Critical edges will be split.
 static void instrumentOneFunc(
-    Function &F, Module *M, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI,
+    Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI,
+    BlockFrequencyInfo *BFI,
     std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
     bool IsCS) {
   // Split indirectbr critical edges here before computing the MST rather than
   // later in getInstrBB() to avoid invalidating it.
   SplitIndirectBrCriticalEdges(F, BPI, BFI);
 
-  FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, ComdatMembers, true, BPI,
-                                                   BFI, IsCS);
+  FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, TLI, ComdatMembers, true,
+                                                   BPI, BFI, IsCS);
   std::vector<BasicBlock *> InstrumentBBs;
   FuncInfo.getInstrumentBBs(InstrumentBBs);
   unsigned NumCounters =
@@ -997,12 +1001,12 @@ namespace {
 
 class PGOUseFunc {
 public:
-  PGOUseFunc(Function &Func, Module *Modu,
+  PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI,
              std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
              BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
              ProfileSummaryInfo *PSI, bool IsCS)
       : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
-        FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS),
+        FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS),
         FreqAttr(FFA_Normal), IsCS(IsCS) {}
 
   // Read counts for the instrumented BB from profile.
@@ -1504,7 +1508,8 @@ static void collectComdatMembers(
 }
 
 static bool InstrumentAllFunctions(
-    Module &M, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+    Module &M, function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
+    function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
   // For the context-sensitve instrumentation, we should have a separated pass
   // (before LTO/ThinLTO linking) to create these variables.
@@ -1516,9 +1521,10 @@ static bool InstrumentAllFunctions(
   for (auto &F : M) {
     if (F.isDeclaration())
       continue;
+    auto &TLI = LookupTLI(F);
     auto *BPI = LookupBPI(F);
     auto *BFI = LookupBFI(F);
-    instrumentOneFunc(F, &M, BPI, BFI, ComdatMembers, IsCS);
+    instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS);
   }
   return true;
 }
@@ -1534,27 +1540,32 @@ bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
   if (skipModule(M))
     return false;
 
+  auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+    return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  };
   auto LookupBPI = [this](Function &F) {
     return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
   };
   auto LookupBFI = [this](Function &F) {
     return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
   };
-  return InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS);
+  return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS);
 }
 
 PreservedAnalyses PGOInstrumentationGen::run(Module &M,
                                              ModuleAnalysisManager &AM) {
   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+    return FAM.getResult<TargetLibraryAnalysis>(F);
+  };
   auto LookupBPI = [&FAM](Function &F) {
     return &FAM.getResult<BranchProbabilityAnalysis>(F);
   };
-
   auto LookupBFI = [&FAM](Function &F) {
     return &FAM.getResult<BlockFrequencyAnalysis>(F);
   };
 
-  if (!InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS))
+  if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
@@ -1562,6 +1573,7 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M,
 
 static bool annotateAllFunctions(
     Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
+    function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
     function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
     ProfileSummaryInfo *PSI, bool IsCS) {
@@ -1609,12 +1621,13 @@ static bool annotateAllFunctions(
   for (auto &F : M) {
     if (F.isDeclaration())
       continue;
+    auto &TLI = LookupTLI(F);
     auto *BPI = LookupBPI(F);
     auto *BFI = LookupBFI(F);
     // Split indirectbr critical edges here before computing the MST rather than
     // later in getInstrBB() to avoid invalidating it.
     SplitIndirectBrCriticalEdges(F, BPI, BFI);
-    PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS);
+    PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS);
     bool AllZeros = false;
     if (!Func.readCounters(PGOReader.get(), AllZeros))
       continue;
@@ -1695,10 +1708,12 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
                                              ModuleAnalysisManager &AM) {
 
   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+    return FAM.getResult<TargetLibraryAnalysis>(F);
+  };
   auto LookupBPI = [&FAM](Function &F) {
     return &FAM.getResult<BranchProbabilityAnalysis>(F);
   };
-
   auto LookupBFI = [&FAM](Function &F) {
     return &FAM.getResult<BlockFrequencyAnalysis>(F);
   };
@@ -1706,7 +1721,7 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
   auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
 
   if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
-                            LookupBPI, LookupBFI, PSI, IsCS))
+                            LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
@@ -1716,6 +1731,9 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
   if (skipModule(M))
     return false;
 
+  auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+    return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  };
   auto LookupBPI = [this](Function &F) {
     return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
   };
@@ -1724,8 +1742,8 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
   };
 
   auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-  return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, PSI,
-                              IsCS);
+  return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI,
+                              LookupBFI, PSI, IsCS);
 }
 
 static std::string getSimpleNodeName(const BasicBlock *Node) {
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index 9767fda82f3da..bef0e0257f029 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -95,6 +95,11 @@ extern cl::opt<std::string> MemOPSizeRange;
 // This option sets the value that groups large memop sizes
 extern cl::opt<unsigned> MemOPSizeLarge;
 
+static cl::opt<bool>
+    MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(false),
+                       cl::Hidden,
+                       cl::desc("Size-specialize memcmp and bcmp calls"));
+
 namespace {
 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
 public:
@@ -113,6 +118,7 @@ class PGOMemOPSizeOptLegacyPass : public FunctionPass {
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
 };
 } // end anonymous namespace
@@ -122,6 +128,7 @@ INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
                       "Optimize memory intrinsic using its size value profile",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
                     "Optimize memory intrinsic using its size value profile",
                     false, false)
@@ -131,11 +138,90 @@ FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
 }
 
 namespace {
+
+static const char *getMIName(const MemIntrinsic *MI) {
+  switch (MI->getIntrinsicID()) {
+  case Intrinsic::memcpy:
+    return "memcpy";
+  case Intrinsic::memmove:
+    return "memmove";
+  case Intrinsic::memset:
+    return "memset";
+  default:
+    return "unknown";
+  }
+}
+
+// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
+struct MemOp {
+  Instruction *I;
+  MemOp(MemIntrinsic *MI) : I(MI) {}
+  MemOp(CallInst *CI) : I(CI) {}
+  MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
+  CallInst *asCI() { return cast<CallInst>(I); }
+  MemOp clone() {
+    if (auto MI = asMI())
+      return MemOp(cast<MemIntrinsic>(MI->clone()));
+    return MemOp(cast<CallInst>(asCI()->clone()));
+  }
+  Value *getLength() {
+    if (auto MI = asMI())
+      return MI->getLength();
+    return asCI()->getArgOperand(2);
+  }
+  void setLength(Value *Length) {
+    if (auto MI = asMI())
+      return MI->setLength(Length);
+    asCI()->setArgOperand(2, Length);
+  }
+  StringRef getFuncName() {
+    if (auto MI = asMI())
+      return MI->getCalledFunction()->getName();
+    return asCI()->getCalledFunction()->getName();
+  }
+  bool isMemmove() {
+    if (auto MI = asMI())
+      if (MI->getIntrinsicID() == Intrinsic::memmove)
+        return true;
+    return false;
+  }
+  bool isMemcmp(TargetLibraryInfo &TLI) {
+    LibFunc Func;
+    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+        Func == LibFunc_memcmp) {
+      return true;
+    }
+    return false;
+  }
+  bool isBcmp(TargetLibraryInfo &TLI) {
+    LibFunc Func;
+    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+        Func == LibFunc_bcmp) {
+      return true;
+    }
+    return false;
+  }
+  const char *getName(TargetLibraryInfo &TLI) {
+    if (auto MI = asMI())
+      return getMIName(MI);
+    LibFunc Func;
+    if (TLI.getLibFunc(*asCI(), Func)) {
+      if (Func == LibFunc_memcmp)
+        return "memcmp";
+      if (Func == LibFunc_bcmp)
+        return "bcmp";
+    }
+    llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
+    return nullptr;
+  }
+};
+
 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
 public:
   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
-               OptimizationRemarkEmitter &ORE, DominatorTree *DT)
-      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
+               OptimizationRemarkEmitter &ORE, DominatorTree *DT,
+               TargetLibraryInfo &TLI)
+      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
     ValueDataArray =
         std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
     // Get the MemOPSize range information from option MemOPSizeRange,
@@ -147,13 +233,12 @@ class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
     WorkList.clear();
     visit(Func);
 
-    for (auto &MI : WorkList) {
+    for (auto &MO : WorkList) {
       ++NumOfPGOMemOPAnnotate;
-      if (perform(MI)) {
+      if (perform(MO)) {
         Changed = true;
         ++NumOfPGOMemOPOpt;
-        LLVM_DEBUG(dbgs() << "MemOP call: "
-                          << MI->getCalledFunction()->getName()
+        LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
                           << "is Transformed.\n");
       }
     }
@@ -164,7 +249,16 @@ class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
     // Not perform on constant length calls.
     if (dyn_cast<ConstantInt>(Length))
       return;
-    WorkList.push_back(&MI);
+    WorkList.push_back(MemOp(&MI));
+  }
+
+  void visitCallInst(CallInst &CI) {
+    LibFunc Func;
+    if (TLI.getLibFunc(CI, Func) &&
+        (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
+        !dyn_cast<ConstantInt>(CI.getArgOperand(2))) {
+      WorkList.push_back(MemOp(&CI));
+    }
   }
 
 private:
@@ -172,15 +266,16 @@ class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
   BlockFrequencyInfo &BFI;
   OptimizationRemarkEmitter &ORE;
   DominatorTree *DT;
+  TargetLibraryInfo &TLI;
   bool Changed;
-  std::vector<MemIntrinsic *> WorkList;
+  std::vector<MemOp> WorkList;
   // Start of the previse range.
   int64_t PreciseRangeStart;
   // Last value of the previse range.
   int64_t PreciseRangeLast;
   // The space to read the profile annotation.
   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
-  bool perform(MemIntrinsic *MI);
+  bool perform(MemOp MO);
 
   // This kind shows which group the value falls in. For PreciseValue, we have
   // the profile count for that value. LargeGroup groups the values that are in
@@ -196,19 +291,6 @@ class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
   }
 };
 
-static const char *getMIName(const MemIntrinsic *MI) {
-  switch (MI->getIntrinsicID()) {
-  case Intrinsic::memcpy:
-    return "memcpy";
-  case Intrinsic::memmove:
-    return "memmove";
-  case Intrinsic::memset:
-    return "memset";
-  default:
-    return "unknown";
-  }
-}
-
 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
   assert(Count <= TotalCount);
   if (Count < MemOPCountThreshold)
@@ -227,21 +309,23 @@ static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
   return ScaleCount / Denom;
 }
 
-bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
-  assert(MI);
-  if (MI->getIntrinsicID() == Intrinsic::memmove)
+bool MemOPSizeOpt::perform(MemOp MO) {
+  assert(MO.I);
+  if (MO.isMemmove())
+    return false;
+  if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
     return false;
 
   uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
   uint64_t TotalCount;
-  if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+  if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions,
                                 ValueDataArray.get(), NumVals, TotalCount))
     return false;
 
   uint64_t ActualCount = TotalCount;
   uint64_t SavedTotalCount = TotalCount;
   if (MemOPScaleCount) {
-    auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+    auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
     if (!BBEdgeCount)
       return false;
     ActualCount = *BBEdgeCount;
@@ -333,13 +417,13 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
   // }
   // merge_bb:
 
-  BasicBlock *BB = MI->getParent();
+  BasicBlock *BB = MO.I->getParent();
   LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
   LLVM_DEBUG(dbgs() << *BB << "\n");
   auto OrigBBFreq = BFI.getBlockFreq(BB);
 
-  BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
-  BasicBlock::iterator It(*MI);
+  BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
+  BasicBlock::iterator It(*MO.I);
   ++It;
   assert(It != DefaultBB->end());
   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
@@ -351,15 +435,24 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
   auto &Ctx = Func.getContext();
   IRBuilder<> IRB(BB);
   BB->getTerminator()->eraseFromParent();
-  Value *SizeVar = MI->getLength();
+  Value *SizeVar = MO.getLength();
   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+  Type *MemOpTy = MO.I->getType();
+  PHINode *PHI = nullptr;
+  if (!MemOpTy->isVoidTy()) {
+    // Insert a phi for the return values at the merge block.
+    IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
+    PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
+    MO.I->replaceAllUsesWith(PHI);
+    PHI->addIncoming(MO.I, DefaultBB);
+  }
 
   // Clear the value profile data.
-  MI->setMetadata(LLVMContext::MD_prof, nullptr);
+  MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
   // If all promoted, we don't need the MD.prof metadata.
   if (SavedRemainCount > 0 || Version != NumVals)
     // Otherwise we need update with the un-promoted records back.
-    annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
+    annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version),
                       SavedRemainCount, IPVK_MemOPSize, NumVals);
 
   LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
@@ -371,17 +464,18 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
   for (uint64_t SizeId : SizeIds) {
     BasicBlock *CaseBB = BasicBlock::Create(
         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
-    Instruction *NewInst = MI->clone();
+    MemOp NewMO = MO.clone();
     // Fix the argument.
-    auto *MemI = cast<MemIntrinsic>(NewInst);
-    auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
+    auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
     assert(SizeType && "Expected integer type size argument.");
     ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
-    MemI->setLength(CaseSizeId);
-    CaseBB->getInstList().push_back(NewInst);
+    NewMO.setLength(CaseSizeId);
+    CaseBB->getInstList().push_back(NewMO.I);
     IRBuilder<> IRBCase(CaseBB);
     IRBCase.CreateBr(MergeBB);
     SI->addCase(CaseSizeId, CaseBB);
+    if (!MemOpTy->isVoidTy())
+      PHI->addIncoming(NewMO.I, CaseBB);
     if (DT) {
       Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
       Updates.push_back({DominatorTree::Insert, BB, CaseBB});
@@ -399,11 +493,10 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
 
   ORE.emit([&]() {
     using namespace ore;
-    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
-             << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
-             << " with count " << NV("Count", SumForOpt) << " out of "
-             << NV("Total", TotalCount) << " for " << NV("Versions", Version)
-             << " versions";
+    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
+           << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
+           << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
+           << " for " << NV("Versions", Version) << " versions";
   });
 
   return true;
@@ -412,13 +505,13 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
 
 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
                                 OptimizationRemarkEmitter &ORE,
-                                DominatorTree *DT) {
+                                DominatorTree *DT, TargetLibraryInfo &TLI) {
   if (DisableMemOPOPT)
     return false;
 
   if (F.hasFnAttribute(Attribute::OptimizeForSize))
     return false;
-  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
+  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
   MemOPSizeOpt.perform();
   return MemOPSizeOpt.isChanged();
 }
@@ -429,7 +522,9 @@ bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
   auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
+  TargetLibraryInfo &TLI =
+      getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
 }
 
 namespace llvm {
@@ -440,7 +535,8 @@ PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
-  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
+  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
   if (!Changed)
     return PreservedAnalyses::all();
   auto PA = PreservedAnalyses();
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
index 604726d4f40fc..cd4f636ff1320 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
@@ -38,7 +38,7 @@ using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>;
 
 template <> class PluginChain<> {
 public:
-  PluginChain(Function &F) {}
+  PluginChain(Function &F, TargetLibraryInfo &TLI) {}
   void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {}
 };
 
@@ -48,7 +48,8 @@ class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> {
   using Base = PluginChain<Ts...>;
 
 public:
-  PluginChain(Function &F) : PluginChain<Ts...>(F), Plugin(F) {}
+  PluginChain(Function &F, TargetLibraryInfo &TLI)
+      : PluginChain<Ts...>(F, TLI), Plugin(F, TLI) {}
 
   void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {
     if (K == PluginT::Kind)
@@ -65,8 +66,9 @@ class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal
   using PluginChainFinal::PluginChainFinal;
 };
 
-ValueProfileCollector::ValueProfileCollector(Function &F)
-    : PImpl(new ValueProfileCollectorImpl(F)) {}
+ValueProfileCollector::ValueProfileCollector(Function &F,
+                                             TargetLibraryInfo &TLI)
+    : PImpl(new ValueProfileCollectorImpl(F, TLI)) {}
 
 ValueProfileCollector::~ValueProfileCollector() = default;
 
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
index ff883c8d0c779..c3f549c2e7cc5 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
 #define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
 
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
@@ -58,7 +59,7 @@ class ValueProfileCollector {
     Instruction *AnnotatedInst; // Where metadata is attached.
   };
 
-  ValueProfileCollector(Function &Fn);
+  ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI);
   ValueProfileCollector(ValueProfileCollector &&) = delete;
   ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
 
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index 361035b178c85..b5dd9fab24a54 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -23,12 +23,14 @@ using CandidateInfo = ValueProfileCollector::CandidateInfo;
 ///--------------------------- MemIntrinsicPlugin ------------------------------
 class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
   Function &F;
+  TargetLibraryInfo &TLI;
   std::vector<CandidateInfo> *Candidates;
 
 public:
   static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
 
-  MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {}
+  MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI)
+      : F(Fn), TLI(TLI), Candidates(nullptr) {}
 
   void run(std::vector<CandidateInfo> &Cs) {
     Candidates = &Cs;
@@ -45,6 +47,22 @@ public:
     Instruction *AnnotatedInst = &MI;
     Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
   }
+  void visitCallInst(CallInst &CI) {
+    auto *F = CI.getCalledFunction();
+    if (!F)
+      return;
+    LibFunc Func;
+    if (TLI.getLibFunc(CI, Func) &&
+        (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) {
+      Value *Length = CI.getArgOperand(2);
+      // Not instrument constant length calls.
+      if (dyn_cast<ConstantInt>(Length))
+        return;
+      Instruction *InsertPt = &CI;
+      Instruction *AnnotatedInst = &CI;
+      Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+    }
+  }
 };
 
 ///------------------------ IndirectCallPromotionPlugin ------------------------
@@ -54,7 +72,7 @@ class IndirectCallPromotionPlugin {
 public:
   static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
 
-  IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {}
+  IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {}
 
   void run(std::vector<CandidateInfo> &Candidates) {
     std::vector<CallBase *> Result = findIndirectCalls(F);
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext b/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext
index 400b29df30365..cce1a67a94574 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext
@@ -14,7 +14,27 @@ foo
 # ValueKind = IPVK_MemOPSize:
 1
 # NumValueSites:
-1
+3
+9
+7:33
+2:88
+9:72
+4:66
+1:99
+5:55
+6:44
+3:77
+8:22
+9
+7:33
+2:88
+9:72
+4:66
+1:99
+5:55
+6:44
+3:77
+8:22
 9
 7:33
 2:88
diff --git a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll
index a59988462ae64..5884a6ebbb25d 100644
--- a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll
+++ b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll
@@ -33,6 +33,12 @@ for.body3:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false)
 ; MEMOP_ANNOTATION: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false)
 ; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE:[0-9]+]]
+  %memcmp = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv)
+; MEMOP_ANNOTATION: call i32 @memcmp(i8* %dst, i8* %src, i64 %conv)
+; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE]]
+  %bcmp = call i32 @bcmp(i8* %dst, i8* %src, i64 %conv)
+; MEMOP_ANNOTATION: call i32 @bcmp(i8* %dst, i8* %src, i64 %conv)
+; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE]]
 ; MEMOP_ANNOTATION9: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66, i64 5, i64 55, i64 6, i64 44, i64 7, i64 33, i64 8, i64 22}
 ; MEMOP_ANNOTATION4: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72}
   br label %for.inc
@@ -56,4 +62,7 @@ declare void @llvm.lifetime.start(i64, i8* nocapture)
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
 
+declare i32 @memcmp(i8*, i8*, i64)
+declare i32 @bcmp(i8*, i8*, i64)
+
 declare void @llvm.lifetime.end(i64, i8* nocapture)
diff --git a/llvm/test/Transforms/PGOProfile/memop_size_opt.ll b/llvm/test/Transforms/PGOProfile/memop_size_opt.ll
index 8d6215cf9252e..bc79fbc3e37e7 100644
--- a/llvm/test/Transforms/PGOProfile/memop_size_opt.ll
+++ b/llvm/test/Transforms/PGOProfile/memop_size_opt.ll
@@ -1,8 +1,8 @@
-; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT
-; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT
-; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT
+; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefix=MEMOP_OPT
+; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 --pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefix=MEMOP_OPT
+; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pgo-memop-optimize-memcmp-bcmp -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT
 ; RUN: FileCheck %s -input-file=%t.opt.yaml --check-prefix=YAML
-; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT
+; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pgo-memop-optimize-memcmp-bcmp -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT
 ; RUN: FileCheck %s -input-file=%t.opt.yaml --check-prefix=YAML
 
 
@@ -57,12 +57,6 @@ for.body3:
 ; MEMOP_OPT:   br label %[[MERGE_LABEL2]]
 ; MEMOP_OPT: [[MERGE_LABEL2]]:
 ; MEMOP_OPT:   br label %for.inc
-; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99}
-; Should be 457 total left (original total count 556, minus 99 from specialized
-; value 1, which is removed from VP array. Also, we only end up with 5 total
-; values, since the default max number of promotions is 5 and therefore
-; the rest of the values are ignored when extracting the VP metadata.
-; MEMOP_OPT: [[NEWVP]] = !{!"VP", i32 1, i64 457, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66}
 
 for.inc:
   %inc = add nsw i32 %j.0, 1
@@ -79,6 +73,83 @@ for.end6:
   ret void
 }
 
+declare void @consume(i32 %v1, i32 %v2)
+
+define void @foo_memcmp_bcmp(i8* %dst, i8* %src, i8* %dst2, i8* %src2, i32* %a, i32 %n) !prof !27 {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end6, !prof !28
+
+for.body:
+  br label %for.cond1
+
+for.cond1:
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %idx.ext = sext i32 %i.0 to i64
+  %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext
+  %0 = load i32, i32* %add.ptr, align 4
+  %cmp2 = icmp slt i32 %j.0, %0
+  br i1 %cmp2, label %for.body3, label %for.end, !prof !29
+
+for.body3:
+  %add = add nsw i32 %i.0, 1
+  %conv = sext i32 %add to i64
+  %memcmp = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv), !prof !30
+  %bcmp = call i32 @bcmp(i8* %dst2, i8* %src2, i64 %conv), !prof !31
+  call void @consume(i32 %memcmp, i32 %bcmp)
+  br label %for.inc
+
+; MEMOP_OPT:  switch i64 %conv, label %[[DEFAULT_LABEL:.*]] [
+; MEMOP_OPT:    i64 1, label %[[CASE_1_LABEL:.*]]
+; MEMOP_OPT:  ], !prof [[SWITCH_BW:![0-9]+]]
+; MEMOP_OPT: [[CASE_1_LABEL]]:
+; MEMOP_OPT:   %[[RV:.*]] = call i32 @memcmp(i8* %dst, i8* %src, i64 1)
+; MEMOP_OPT:   br label %[[MERGE_LABEL:.*]]
+; MEMOP_OPT: [[DEFAULT_LABEL]]:
+; MEMOP_OPT:   %[[RVD:.*]] = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv), !prof [[NEWVP:![0-9]+]]
+; MEMOP_OPT:   br label %[[MERGE_LABEL]]
+; MEMOP_OPT: [[MERGE_LABEL]]:
+; MEMOP_OPT:  %[[PHI:.*]] = phi i32 [ %[[RVD]], %[[DEFAULT_LABEL]] ], [ %[[RV]], %[[CASE_1_LABEL]] ]
+; MEMOP_OPT:  switch i64 %conv, label %[[DEFAULT_LABEL2:.*]] [
+; MEMOP_OPT:    i64 1, label %[[CASE_1_LABEL2:.*]]
+; MEMOP_OPT:  ], !prof [[SWITCH_BW:![0-9]+]]
+; MEMOP_OPT: [[CASE_1_LABEL2]]:
+; MEMOP_OPT:   %[[RV2:.*]] = call i32 @bcmp(i8* %dst2, i8* %src2, i64 1)
+; MEMOP_OPT:   br label %[[MERGE_LABEL2:.*]]
+; MEMOP_OPT: [[DEFAULT_LABEL2]]:
+; MEMOP_OPT:   %[[RVD2:.*]] = call i32 @bcmp(i8* %dst2, i8* %src2, i64 %conv), !prof [[NEWVP]]
+; MEMOP_OPT:   br label %[[MERGE_LABEL2]]
+; MEMOP_OPT: [[MERGE_LABEL2]]:
+; MEMOP_OPT:   %[[PHI2:.*]] = phi i32 [ %[[RVD2]], %[[DEFAULT_LABEL2]] ], [ %[[RV2]], %[[CASE_1_LABEL2]] ]
+; MEMOP_OPT:   call void @consume(i32 %[[PHI]], i32 %[[PHI2]])
+; MEMOP_OPT:   br label %for.inc
+
+for.inc:
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:
+  br label %for.inc4
+
+for.inc4:
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:
+  ret void
+}
+
+; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99}
+; Should be 457 total left (original total count 556, minus 99 from specialized
+; value 1, which is removed from VP array. Also, we only end up with 5 total
+; values, since the default max number of promotions is 5 and therefore
+; the rest of the values are ignored when extracting the VP metadata.
+; MEMOP_OPT: [[NEWVP]] = !{!"VP", i32 1, i64 457, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66}
+
 !llvm.module.flags = !{!0}
 
 !0 = !{i32 1, !"ProfileSummary", !1}
@@ -118,6 +189,9 @@ declare void @llvm.lifetime.start(i64, i8* nocapture)
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
 
+declare i32 @memcmp(i8*, i8*, i64)
+declare i32 @bcmp(i8*, i8*, i64)
+
 declare void @llvm.lifetime.end(i64, i8* nocapture)
 
 ; YAML:      --- !Passed
@@ -127,7 +201,7 @@ declare void @llvm.lifetime.end(i64, i8* nocapture)
 ; YAML-NEXT: Hotness:         0
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'optimized '
-; YAML-NEXT:   - Intrinsic:       memcpy
+; YAML-NEXT:   - Memop:           memcpy
 ; YAML-NEXT:   - String:          ' with count '
 ; YAML-NEXT:   - Count:           '99'
 ; YAML-NEXT:   - String:          ' out of '
@@ -143,7 +217,39 @@ declare void @llvm.lifetime.end(i64, i8* nocapture)
 ; YAML-NEXT: Hotness:         0
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'optimized '
-; YAML-NEXT:   - Intrinsic:       memcpy
+; YAML-NEXT:   - Memop:           memcpy
+; YAML-NEXT:   - String:          ' with count '
+; YAML-NEXT:   - Count:           '99'
+; YAML-NEXT:   - String:          ' out of '
+; YAML-NEXT:   - Total:           '556'
+; YAML-NEXT:   - String:          ' for '
+; YAML-NEXT:   - Versions:        '1'
+; YAML-NEXT:   - String:          ' versions'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Passed
+; YAML-NEXT: Pass:            pgo-memop-opt
+; YAML-NEXT: Name:            memopt-opt
+; YAML-NEXT: Function:        foo_memcmp_bcmp
+; YAML-NEXT: Hotness:         0
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'optimized '
+; YAML-NEXT:   - Memop:           memcmp
+; YAML-NEXT:   - String:          ' with count '
+; YAML-NEXT:   - Count:           '99'
+; YAML-NEXT:   - String:          ' out of '
+; YAML-NEXT:   - Total:           '556'
+; YAML-NEXT:   - String:          ' for '
+; YAML-NEXT:   - Versions:        '1'
+; YAML-NEXT:   - String:          ' versions'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Passed
+; YAML-NEXT: Pass:            pgo-memop-opt
+; YAML-NEXT: Name:            memopt-opt
+; YAML-NEXT: Function:        foo_memcmp_bcmp
+; YAML-NEXT: Hotness:         0
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'optimized '
+; YAML-NEXT:   - Memop:           bcmp
 ; YAML-NEXT:   - String:          ' with count '
 ; YAML-NEXT:   - Count:           '99'
 ; YAML-NEXT:   - String:          ' out of '

From 3e62289f42d21e7e1f9a8b1d6f970740b22f5d47 Mon Sep 17 00:00:00 2001
From: Sean Fertile <sd.fertile@gmail.com>
Date: Tue, 26 May 2020 13:06:50 -0400
Subject: [PATCH 114/770] [PowerPC][NFC] Add colon to TODO's and fix
 indentation.

---
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 7da24f03bc7a7..3ec6788d077b5 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -224,17 +224,17 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
   };
 
   static const SpillSlot AIXOffsets32[] = {
-    CALLEE_SAVED_FPRS,
-    CALLEE_SAVED_GPRS32,
-    // Add AIX's extra CSR.
-    {PPC::R13, -76},
-    // TODO Update when we add vector support for AIX.
+      CALLEE_SAVED_FPRS,
+      CALLEE_SAVED_GPRS32,
+      // Add AIX's extra CSR.
+      {PPC::R13, -76},
+      // TODO: Update when we add vector support for AIX.
   };
 
   static const SpillSlot AIXOffsets64[] = {
-    CALLEE_SAVED_FPRS,
-    CALLEE_SAVED_GPRS64,
-    // TODO Update when we add vector support for AIX.
+      CALLEE_SAVED_FPRS,
+      CALLEE_SAVED_GPRS64,
+      // TODO: Update when we add vector support for AIX.
   };
 
   if (Subtarget.is64BitELFABI()) {

From 2c7d63257d8e33ff721af78045d2be6bac54da05 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 26 May 2020 20:40:45 +0300
Subject: [PATCH 115/770] [MSSA][Doc] Clobbers, more info on Defs / Def chain

- Added more info about what we refer to as a clobber in MSSA.
- Added more info about MemoryDefs and how there is a single Def chain.
- The doc portrayed MSSA as modeling the heap while it is modeling
  the whole memory, so I changed the wording to not be heap-specific.

Differential Revision: https://reviews.llvm.org/D80000
---
 llvm/docs/MemorySSA.rst | 81 ++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 25 deletions(-)

diff --git a/llvm/docs/MemorySSA.rst b/llvm/docs/MemorySSA.rst
index 1669117fcf560..4f96c83a032d2 100644
--- a/llvm/docs/MemorySSA.rst
+++ b/llvm/docs/MemorySSA.rst
@@ -14,20 +14,22 @@ interactions between various memory operations. Its goal is to replace
 unless you're very careful, use of ``MemoryDependenceAnalysis`` can easily
 result in quadratic-time algorithms in LLVM. Additionally, ``MemorySSA`` doesn't
 have as many arbitrary limits as ``MemoryDependenceAnalysis``, so you should get
-better results, too.
+better results, too. One common use of ``MemorySSA`` is to quickly find out
+that something definitely cannot happen (for example, reason that a hoist
+out of a loop can't happen).
 
 At a high level, one of the goals of ``MemorySSA`` is to provide an SSA based
 form for memory, complete with def-use and use-def chains, which
 enables users to quickly find may-def and may-uses of memory operations.
 It can also be thought of as a way to cheaply give versions to the complete
-state of heap memory, and associate memory operations with those versions.
+state of memory, and associate memory operations with those versions.
 
 This document goes over how ``MemorySSA`` is structured, and some basic
 intuition on how ``MemorySSA`` works.
 
 A paper on MemorySSA (with notes about how it's implemented in GCC) `can be
 found here <http://www.airs.com/dnovillo/Papers/mem-ssa.pdf>`_. Though, it's
-relatively out-of-date; the paper references multiple heap partitions, but GCC
+relatively out-of-date; the paper references multiple memory partitions, but GCC
 eventually swapped to just using one, like we now have in LLVM.  Like
 GCC's, LLVM's MemorySSA is intraprocedural.
 
@@ -41,9 +43,29 @@ structure that maps ``Instruction``\ s to ``MemoryAccess``\ es, which are
 
 Each ``MemoryAccess`` can be one of three types:
 
+- ``MemoryDef``
 - ``MemoryPhi``
 - ``MemoryUse``
-- ``MemoryDef``
+
+``MemoryDef``\ s are operations which may either modify memory, or which
+introduce some kind of ordering constraints. Examples of ``MemoryDef``\ s
+include ``store``\ s, function calls, ``load``\ s with ``acquire`` (or higher)
+ordering, volatile operations, memory fences, etc. A ``MemoryDef``
+always introduces a new version of the entire memory and is linked with a single
+``MemoryDef/MemoryPhi`` which is the version of memory that the new
+version is based on. This implies that there is a *single*
+``Def`` chain that connects all the ``Def``\ s, either directly
+or indirectly. For example in:
+
+.. code-block:: llvm
+  b = MemoryDef(a)
+  c = MemoryDef(b)
+  d = MemoryDef(c)
+
+``d`` is connected directly with ``c`` and indirectly with ``b``.
+This means that ``d`` potentially clobbers (see below) ``c`` *or*
+``b`` *or* both. This in turn implies that without the use of `The walker_`,
+initially every ``MemoryDef`` clobbers every other ``MemoryDef``.
 
 ``MemoryPhi``\ s are ``PhiNode``\ s, but for memory operations. If at any
 point we have two (or more) ``MemoryDef``\ s that could flow into a
@@ -61,11 +83,6 @@ reach a phi node may or may not clobber a given variable).
 ``MemoryUse``\ s are operations which use but don't modify memory. An example of
 a ``MemoryUse`` is a ``load``, or a ``readonly`` function call.
 
-``MemoryDef``\ s are operations which may either modify memory, or which
-introduce some kind of ordering constraints. Examples of ``MemoryDef``\ s
-include ``store``\ s, function calls, ``load``\ s with ``acquire`` (or higher)
-ordering, volatile operations, memory fences, etc.
-
 Every function that exists has a special ``MemoryDef`` called ``liveOnEntry``.
 It dominates every ``MemoryAccess`` in the function that ``MemorySSA`` is being
 run on, and implies that we've hit the top of the function. It's the only
@@ -75,14 +92,28 @@ defined before the function begins.
 
 An example of all of this overlaid on LLVM IR (obtained by running ``opt
 -passes='print<memoryssa>' -disable-output`` on an ``.ll`` file) is below. When
-viewing this example, it may be helpful to view it in terms of clobbers. The
-operands of a given ``MemoryAccess`` are all (potential) clobbers of said
-MemoryAccess, and the value produced by a ``MemoryAccess`` can act as a clobber
-for other ``MemoryAccess``\ es. Another useful way of looking at it is in
-terms of heap versions.  In that view, operands of a given
-``MemoryAccess`` are the version of the heap before the operation, and
-if the access produces a value, the value is the new version of the heap
-after the operation.
+viewing this example, it may be helpful to view it in terms of clobbers.
+The operands of a given ``MemoryAccess`` are all (potential) clobbers of said
+``MemoryAccess``, and the value produced by a ``MemoryAccess`` can act as a clobber
+for other ``MemoryAccess``\ es.
+
+If a ``MemoryAccess`` is a *clobber* of another, it means that these two
+``MemoryAccess``\ es may access the same memory. For example, ``x = MemoryDef(y)``
+means that ``x`` potentially modifies memory that ``y`` modifies/constrains
+(or has modified / constrained).
+In the same manner, ``a = MemoryPhi({BB1,b},{BB2,c})`` means that
+anyone that uses ``a`` is accessing memory potentially modified / constrained
+by either ``b`` or ``c`` (or both).  And finally, ``MemoryUse(x)`` means
+that this use accesses memory that ``x`` has modified / constrained
+(as an example, think that if ``x = MemoryDef(...)``
+and ``MemoryUse(x)`` are in the same loop, the use can't
+be hoisted outside alone).
+
+Another useful way of looking at it is in terms of memory versions.
+In that view, operands of a given ``MemoryAccess`` are the version
+of the entire memory before the operation, and if the access produces
+a value (i.e. ``MemoryDef/MemoryPhi``),
+the value is the new version of the memory after the operation.
 
 .. code-block:: llvm
 
@@ -96,7 +127,7 @@ after the operation.
     br label %while.cond
 
   while.cond:
-    ; 6 = MemoryPhi({%0,1},{if.end,4})
+    ; 6 = MemoryPhi({entry,1},{if.end,4})
     br i1 undef, label %if.then, label %if.else
 
   if.then:
@@ -148,8 +179,8 @@ Going from the top down:
   reaching definition is ``5``.
 - ``MemoryUse(1)`` notes that ``load i8, i8* %p3`` is just a user of memory,
   and the last thing that could clobber this use is above ``while.cond`` (e.g.
-  the store to ``%p3``). In heap versioning parlance, it really only depends on
-  the heap version 1, and is unaffected by the new heap versions generated since
+  the store to ``%p3``). In memory versioning parlance, it really only depends on
+  the memory version 1, and is unaffected by the new memory versions generated since
   then.
 
 As an aside, ``MemoryAccess`` is a ``Value`` mostly for convenience; it's not
@@ -222,7 +253,7 @@ second ``MemoryUse`` in ``if.end`` has an operand of ``1``, which is a
 value numbering, etc, faster and easier.
 
 It is not possible to optimize ``MemoryDef`` in the same way, as we
-restrict ``MemorySSA`` to one heap variable and, thus, one Phi node
+restrict ``MemorySSA`` to one memory variable and, thus, one Phi node
 per block.
 
 
@@ -320,14 +351,14 @@ Precision
 
 ``MemorySSA`` in LLVM deliberately trades off precision for speed.
 Let us think about memory variables as if they were disjoint partitions of the
-heap (that is, if you have one variable, as above, it represents the entire
-heap, and if you have multiple variables, each one represents some
-disjoint portion of the heap)
+memory (that is, if you have one variable, as above, it represents the entire
+memory, and if you have multiple variables, each one represents some
+disjoint portion of the memory)
 
 First, because alias analysis results conflict with each other, and
 each result may be what an analysis wants (IE
 TBAA may say no-alias, and something else may say must-alias), it is
-not possible to partition the heap the way every optimization wants.
+not possible to partition the memory the way every optimization wants.
 Second, some alias analysis results are not transitive (IE A noalias B,
 and B noalias C, does not mean A noalias C), so it is not possible to
 come up with a precise partitioning in all cases without variables to

From 8f1156a7d004d97e9f75484a00dc4278698fd8ea Mon Sep 17 00:00:00 2001
From: mydeveloperday <mydeveloperday@gmail.com>
Date: Tue, 26 May 2020 18:47:56 +0100
Subject: [PATCH 116/770] [clang-format] Fix an ObjC regression introduced with
 new [[likely]][[unlikely]] support in if/else clauses

Summary:
{D80144} introduced an ObjC regression.

Only parse the `[]` if what follows is really an attribute

Reviewers: krasimir, JakeMerdichAMD

Reviewed By: krasimir

Subscribers: rdwampler, aaron.ballman, curdeius, cfe-commits

Tags: #clang, #clang-format

Differential Revision: https://reviews.llvm.org/D80547
---
 clang/lib/Format/UnwrappedLineParser.cpp  | 49 ++++++++++++++++++++++-
 clang/lib/Format/UnwrappedLineParser.h    |  1 +
 clang/unittests/Format/FormatTest.cpp     |  5 +++
 clang/unittests/Format/FormatTestObjC.cpp | 19 +++++++++
 4 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 03b6e0c9ef744..b8da2c23b55ac 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1962,7 +1962,7 @@ void UnwrappedLineParser::parseIfThenElse() {
   if (FormatTok->Tok.is(tok::l_paren))
     parseParens();
   // handle [[likely]] / [[unlikely]]
-  if (FormatTok->is(tok::l_square))
+  if (FormatTok->is(tok::l_square) && tryToParseSimpleAttribute())
     parseSquare();
   bool NeedsUnwrappedLine = false;
   if (FormatTok->Tok.is(tok::l_brace)) {
@@ -1981,7 +1981,7 @@ void UnwrappedLineParser::parseIfThenElse() {
   if (FormatTok->Tok.is(tok::kw_else)) {
     nextToken();
     // handle [[likely]] / [[unlikely]]
-    if (FormatTok->is(tok::l_square))
+    if (FormatTok->Tok.is(tok::l_square) && tryToParseSimpleAttribute())
       parseSquare();
     if (FormatTok->Tok.is(tok::l_brace)) {
       CompoundStatementIndenter Indenter(this, Style, Line->Level);
@@ -2343,6 +2343,51 @@ bool UnwrappedLineParser::parseEnum() {
   // "} n, m;" will end up in one unwrapped line.
 }
 
+namespace {
+// A class used to set and restore the Token position when peeking
+// ahead in the token source.
+class ScopedTokenPosition {
+  unsigned StoredPosition;
+  FormatTokenSource *Tokens;
+
+public:
+  ScopedTokenPosition(FormatTokenSource *Tokens) : Tokens(Tokens) {
+    assert(Tokens && "Tokens expected to not be null");
+    StoredPosition = Tokens->getPosition();
+  }
+
+  ~ScopedTokenPosition() { Tokens->setPosition(StoredPosition); }
+};
+} // namespace
+
+// Look ahead to see if we have a simple [[...]] attribute. The token
+// position is always rewound to the original position afterwards.
+bool UnwrappedLineParser::tryToParseSimpleAttribute() {
+  ScopedTokenPosition AutoPosition(Tokens);
+  FormatToken *Tok = Tokens->getNextToken();
+  // We already read the first [ check for the second.
+  if (Tok && !Tok->is(tok::l_square)) {
+    return false;
+  }
+  // Double check that the attribute is just something
+  // fairly simple.
+  while (Tok) {
+    if (Tok->is(tok::r_square)) {
+      break;
+    }
+    Tok = Tokens->getNextToken();
+  }
+  Tok = Tokens->getNextToken();
+  if (Tok && !Tok->is(tok::r_square)) {
+    return false;
+  }
+  Tok = Tokens->getNextToken();
+  if (Tok && Tok->is(tok::semi)) {
+    return false;
+  }
+  return true;
+}
+
 void UnwrappedLineParser::parseJavaEnumBody() {
   // Determine whether the enum is simple, i.e. does not have a semicolon or
   // constants with class bodies. Simple enums can be formatted like braced
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 8d4118ab6dc7d..8b3aa4c84edba 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -134,6 +134,7 @@ class UnwrappedLineParser {
   bool tryToParseLambdaIntroducer();
   bool tryToParsePropertyAccessor();
   void tryToParseJSFunction();
+  bool tryToParseSimpleAttribute();
   void addUnwrappedLine();
   bool eof() const;
   // LevelDifference is the difference of levels after and before the current
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index a3b70bfd28245..eea0b364d97c5 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -16513,6 +16513,11 @@ TEST_F(FormatTest, LikelyUnlikely) {
                "  return 42;\n"
                "}\n",
                Style);
+
+  verifyFormat("if (argc > 5) [[gnu::unused]] {\n"
+               "  return 29;\n"
+               "}",
+               Style);
 }
 
 TEST_F(FormatTest, LLVMDefaultStyle) {
diff --git a/clang/unittests/Format/FormatTestObjC.cpp b/clang/unittests/Format/FormatTestObjC.cpp
index d73d090a8ba37..28d33dcdaa541 100644
--- a/clang/unittests/Format/FormatTestObjC.cpp
+++ b/clang/unittests/Format/FormatTestObjC.cpp
@@ -1434,6 +1434,25 @@ TEST_F(FormatTestObjC, BreakLineBeforeNestedBlockParam) {
       "           }]");
 }
 
+TEST_F(FormatTestObjC, IfNotUnlikely) {
+  Style = getGoogleStyle(FormatStyle::LK_ObjC);
+
+  verifyFormat("if (argc < 5) [obj func:arg];");
+  verifyFormat("if (argc < 5) [[obj1 method1:arg1] method2:arg2];");
+  verifyFormat("if (argc < 5) [[foo bar] baz:i[0]];");
+  verifyFormat("if (argc < 5) [[foo bar] baz:i[0]][1];");
+
+  verifyFormat("if (argc < 5)\n"
+               "  [obj func:arg];\n"
+               "else\n"
+               "  [obj func:arg2];");
+
+  verifyFormat("if (argc < 5) [[unlikely]]\n"
+               "  [obj func:arg];\n"
+               "else [[likely]]\n"
+               "  [obj func:arg2];");
+}
+
 } // end namespace
 } // end namespace format
 } // end namespace clang

From d70ec366c91b2a5fc6334e6f6ca9c4d9a6785c5e Mon Sep 17 00:00:00 2001
From: Adam Balogh <adam.balogh@ericsson.com>
Date: Tue, 26 May 2020 13:48:20 +0200
Subject: [PATCH 117/770] [Analyzer][NFC] Remove the SubEngine interface

The `SubEngine` interface is an interface with only one implementation,
`ExprEngine`. Adding other implementations is difficult and very
unlikely in the near future. Currently, if anything from `ExprEngine` is
to be exposed to other classes it is moved to `SubEngine`, which
restricts the alternative implementations. The virtual methods have
a slight performance impact. Furthermore, instead of the `LLVM`-style
inheritance a native inheritance is used here, which renders `LLVM`
functions like e.g. `cast<T>()` unusable here. This patch removes this
interface and allows usage of `ExprEngine` directly.

Differential Revision: https://reviews.llvm.org/D80548
---
 .../Core/PathSensitive/ConstraintManager.h    |   7 +-
 .../Core/PathSensitive/CoreEngine.h           |   6 +-
 .../Core/PathSensitive/ExprEngine.h           |  95 ++++++----
 .../Core/PathSensitive/ProgramState.h         |  10 +-
 .../PathSensitive/RangedConstraintManager.h   |   4 +-
 .../Core/PathSensitive/SMTConstraintManager.h |   5 +-
 .../PathSensitive/SimpleConstraintManager.h   |   6 +-
 .../Core/PathSensitive/SubEngine.h            | 178 ------------------
 .../Core/BugReporterVisitors.cpp              |   1 -
 clang/lib/StaticAnalyzer/Core/CMakeLists.txt  |   1 -
 clang/lib/StaticAnalyzer/Core/CallEvent.cpp   |   2 +-
 clang/lib/StaticAnalyzer/Core/CoreEngine.cpp  |  41 ++--
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  |   2 +
 .../lib/StaticAnalyzer/Core/ProgramState.cpp  |  10 +-
 .../Core/RangeConstraintManager.cpp           |   7 +-
 clang/lib/StaticAnalyzer/Core/RegionStore.cpp |   4 +-
 .../Core/SMTConstraintManager.cpp             |   2 +-
 clang/lib/StaticAnalyzer/Core/SValBuilder.cpp |   2 +-
 .../Core/SimpleConstraintManager.cpp          |   4 +-
 .../StaticAnalyzer/Core/SimpleSValBuilder.cpp |   2 +-
 clang/lib/StaticAnalyzer/Core/SubEngine.cpp   |  13 --
 21 files changed, 118 insertions(+), 284 deletions(-)
 delete mode 100644 clang/include/clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h
 delete mode 100644 clang/lib/StaticAnalyzer/Core/SubEngine.cpp

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h
index 935b2bb7b937d..335536b6a3106 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h
@@ -32,7 +32,7 @@ namespace clang {
 namespace ento {
 
 class ProgramStateManager;
-class SubEngine;
+class ExprEngine;
 class SymbolReaper;
 
 class ConditionTruthVal {
@@ -193,10 +193,11 @@ class ConstraintManager {
 
 std::unique_ptr<ConstraintManager>
 CreateRangeConstraintManager(ProgramStateManager &statemgr,
-                             SubEngine *subengine);
+                             ExprEngine *exprengine);
 
 std::unique_ptr<ConstraintManager>
-CreateZ3ConstraintManager(ProgramStateManager &statemgr, SubEngine *subengine);
+CreateZ3ConstraintManager(ProgramStateManager &statemgr,
+                          ExprEngine *exprengine);
 
 } // namespace ento
 } // namespace clang
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
index 278193ef99ede..2aca2c99ef4fd 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
@@ -41,7 +41,7 @@ class LabelDecl;
 namespace ento {
 
 class FunctionSummariesTy;
-class SubEngine;
+class ExprEngine;
 
 //===----------------------------------------------------------------------===//
 /// CoreEngine - Implements the core logic of the graph-reachability
@@ -69,7 +69,7 @@ class CoreEngine {
       std::vector<std::pair<const CFGBlock *, const ExplodedNode *>>;
 
 private:
-  SubEngine &SubEng;
+  ExprEngine &ExprEng;
 
   /// G - The simulation graph.  Each node is a (location,state) pair.
   mutable ExplodedGraph G;
@@ -129,7 +129,7 @@ class CoreEngine {
 
 public:
   /// Construct a CoreEngine object to analyze the provided CFG.
-  CoreEngine(SubEngine &subengine,
+  CoreEngine(ExprEngine &exprengine,
              FunctionSummariesTy *FS,
              AnalyzerOptions &Opts);
 
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index c66c54116a0c6..a94c847f35ee1 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -21,6 +21,7 @@
 #include "clang/Analysis/DomainSpecific/ObjCNoReturn.h"
 #include "clang/Analysis/ProgramPoint.h"
 #include "clang/Basic/LLVM.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
@@ -29,9 +30,9 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/Store.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/WorkList.h"
 #include "llvm/ADT/ArrayRef.h"
 #include <cassert>
@@ -42,6 +43,8 @@ namespace clang {
 class AnalysisDeclContextManager;
 class AnalyzerOptions;
 class ASTContext;
+class CFGBlock;
+class CFGElement;
 class ConstructionContext;
 class CXXBindTemporaryExpr;
 class CXXCatchStmt;
@@ -72,16 +75,29 @@ class CrossTranslationUnitContext;
 
 namespace ento {
 
+class AnalysisManager;
 class BasicValueFactory;
+class BlockCounter;
+class BranchNodeBuilder;
 class CallEvent;
 class CheckerManager;
 class ConstraintManager;
 class CXXTempObjectRegion;
+class EndOfFunctionNodeBuilder;
+class ExplodedNodeSet;
+class ExplodedNode;
+class IndirectGotoNodeBuilder;
 class MemRegion;
+struct NodeBuilderContext;
+class NodeBuilderWithSinks;
+class ProgramState;
+class ProgramStateManager;
 class RegionAndSymbolInvalidationTraits;
 class SymbolManager;
+class SwitchNodeBuilder;
 
-class ExprEngine : public SubEngine {
+class ExprEngine {
+  virtual void anchor();
 public:
   /// The modes of inlining, which override the default analysis-wide settings.
   enum InliningModes {
@@ -161,7 +177,7 @@ class ExprEngine : public SubEngine {
              SetOfConstDecls *VisitedCalleesIn,
              FunctionSummariesTy *FS, InliningModes HowToInlineIn);
 
-  ~ExprEngine() override = default;
+  ~ExprEngine() = default;
 
   /// Returns true if there is still simulation state on the worklist.
   bool ExecuteWorkList(const LocationContext *L, unsigned Steps = 150000) {
@@ -181,7 +197,7 @@ class ExprEngine : public SubEngine {
   /// getContext - Return the ASTContext associated with this analysis.
   ASTContext &getContext() const { return AMgr.getASTContext(); }
 
-  AnalysisManager &getAnalysisManager() override { return AMgr; }
+  AnalysisManager &getAnalysisManager() { return AMgr; }
 
   AnalysisDeclContextManager &getAnalysisDeclContextManager() {
     return AMgr.getAnalysisDeclContextManager();
@@ -196,7 +212,7 @@ class ExprEngine : public SubEngine {
   BugReporter &getBugReporter() { return BR; }
 
   cross_tu::CrossTranslationUnitContext *
-  getCrossTranslationUnitContext() override {
+  getCrossTranslationUnitContext() {
     return &CTU;
   }
 
@@ -232,7 +248,7 @@ class ExprEngine : public SubEngine {
 
   /// getInitialState - Return the initial state used for the root vertex
   ///  in the ExplodedGraph.
-  ProgramStateRef getInitialState(const LocationContext *InitLoc) override;
+  ProgramStateRef getInitialState(const LocationContext *InitLoc);
 
   ExplodedGraph &getGraph() { return G; }
   const ExplodedGraph &getGraph() const { return G; }
@@ -270,7 +286,7 @@ class ExprEngine : public SubEngine {
   /// processCFGElement - Called by CoreEngine. Used to generate new successor
   ///  nodes by processing the 'effects' of a CFG element.
   void processCFGElement(const CFGElement E, ExplodedNode *Pred,
-                         unsigned StmtIdx, NodeBuilderContext *Ctx) override;
+                         unsigned StmtIdx, NodeBuilderContext *Ctx);
 
   void ProcessStmt(const Stmt *S, ExplodedNode *Pred);
 
@@ -296,7 +312,7 @@ class ExprEngine : public SubEngine {
   /// Called by CoreEngine when processing the entrance of a CFGBlock.
   void processCFGBlockEntrance(const BlockEdge &L,
                                NodeBuilderWithSinks &nodeBuilder,
-                               ExplodedNode *Pred) override;
+                               ExplodedNode *Pred);
 
   /// ProcessBranch - Called by CoreEngine.  Used to generate successor
   ///  nodes by processing the 'effects' of a branch condition.
@@ -305,7 +321,7 @@ class ExprEngine : public SubEngine {
                      ExplodedNode *Pred,
                      ExplodedNodeSet &Dst,
                      const CFGBlock *DstT,
-                     const CFGBlock *DstF) override;
+                     const CFGBlock *DstF);
 
   /// Called by CoreEngine.
   /// Used to generate successor nodes for temporary destructors depending
@@ -314,7 +330,7 @@ class ExprEngine : public SubEngine {
                                      NodeBuilderContext &BldCtx,
                                      ExplodedNode *Pred, ExplodedNodeSet &Dst,
                                      const CFGBlock *DstT,
-                                     const CFGBlock *DstF) override;
+                                     const CFGBlock *DstF);
 
   /// Called by CoreEngine.  Used to processing branching behavior
   /// at static initializers.
@@ -323,27 +339,27 @@ class ExprEngine : public SubEngine {
                                 ExplodedNode *Pred,
                                 ExplodedNodeSet &Dst,
                                 const CFGBlock *DstT,
-                                const CFGBlock *DstF) override;
+                                const CFGBlock *DstF);
 
   /// processIndirectGoto - Called by CoreEngine.  Used to generate successor
   ///  nodes by processing the 'effects' of a computed goto jump.
-  void processIndirectGoto(IndirectGotoNodeBuilder& builder) override;
+  void processIndirectGoto(IndirectGotoNodeBuilder& builder);
 
   /// ProcessSwitch - Called by CoreEngine.  Used to generate successor
   ///  nodes by processing the 'effects' of a switch statement.
-  void processSwitch(SwitchNodeBuilder& builder) override;
+  void processSwitch(SwitchNodeBuilder& builder);
 
   /// Called by CoreEngine.  Used to notify checkers that processing a
   /// function has begun. Called for both inlined and and top-level functions.
   void processBeginOfFunction(NodeBuilderContext &BC,
                               ExplodedNode *Pred, ExplodedNodeSet &Dst,
-                              const BlockEdge &L) override;
+                              const BlockEdge &L);
 
   /// Called by CoreEngine.  Used to notify checkers that processing a
   /// function has ended. Called for both inlined and and top-level functions.
   void processEndOfFunction(NodeBuilderContext& BC,
                             ExplodedNode *Pred,
-                            const ReturnStmt *RS = nullptr) override;
+                            const ReturnStmt *RS = nullptr);
 
   /// Remove dead bindings/symbols before exiting a function.
   void removeDeadOnEndOfFunction(NodeBuilderContext& BC,
@@ -352,19 +368,19 @@ class ExprEngine : public SubEngine {
 
   /// Generate the entry node of the callee.
   void processCallEnter(NodeBuilderContext& BC, CallEnter CE,
-                        ExplodedNode *Pred) override;
+                        ExplodedNode *Pred);
 
   /// Generate the sequence of nodes that simulate the call exit and the post
   /// visit for CallExpr.
-  void processCallExit(ExplodedNode *Pred) override;
+  void processCallExit(ExplodedNode *Pred);
 
   /// Called by CoreEngine when the analysis worklist has terminated.
-  void processEndWorklist() override;
+  void processEndWorklist();
 
   /// evalAssume - Callback function invoked by the ConstraintManager when
   ///  making assumptions about state values.
   ProgramStateRef processAssume(ProgramStateRef state, SVal cond,
-                                bool assumption) override;
+                                bool assumption);
 
   /// processRegionChanges - Called by ProgramStateManager whenever a change is made
   ///  to the store. Used to update checkers that track region values.
@@ -374,14 +390,21 @@ class ExprEngine : public SubEngine {
                        ArrayRef<const MemRegion *> ExplicitRegions,
                        ArrayRef<const MemRegion *> Regions,
                        const LocationContext *LCtx,
-                       const CallEvent *Call) override;
+                       const CallEvent *Call);
+
+  inline ProgramStateRef
+  processRegionChange(ProgramStateRef state,
+                      const MemRegion* MR,
+                      const LocationContext *LCtx) {
+    return processRegionChanges(state, nullptr, MR, MR, LCtx, nullptr);
+  }
 
   /// printJson - Called by ProgramStateManager to print checker-specific data.
   void printJson(raw_ostream &Out, ProgramStateRef State,
                  const LocationContext *LCtx, const char *NL,
-                 unsigned int Space, bool IsDot) const override;
+                 unsigned int Space, bool IsDot) const;
 
-  ProgramStateManager &getStateManager() override { return StateMgr; }
+  ProgramStateManager &getStateManager() { return StateMgr; }
 
   StoreManager &getStoreManager() { return StateMgr.getStoreManager(); }
 
@@ -608,23 +631,11 @@ class ExprEngine : public SubEngine {
                              const ConstructionContextItem &Item,
                              const LocationContext *LC);
 
-protected:
-  /// evalBind - Handle the semantics of binding a value to a specific location.
-  ///  This method is used by evalStore, VisitDeclStmt, and others.
-  void evalBind(ExplodedNodeSet &Dst, const Stmt *StoreE, ExplodedNode *Pred,
-                SVal location, SVal Val, bool atDeclInit = false,
-                const ProgramPoint *PP = nullptr);
-
   /// Call PointerEscape callback when a value escapes as a result of bind.
   ProgramStateRef processPointerEscapedOnBind(
       ProgramStateRef State, ArrayRef<std::pair<SVal, SVal>> LocAndVals,
       const LocationContext *LCtx, PointerEscapeKind Kind,
-      const CallEvent *Call) override;
-
-  ProgramStateRef
-  processPointerEscapedOnBind(ProgramStateRef State,
-                              SVal Loc, SVal Val,
-                              const LocationContext *LCtx);
+      const CallEvent *Call);
 
   /// Call PointerEscape callback when a value escapes as a result of
   /// region invalidation.
@@ -634,7 +645,19 @@ class ExprEngine : public SubEngine {
                            const InvalidatedSymbols *Invalidated,
                            ArrayRef<const MemRegion *> ExplicitRegions,
                            const CallEvent *Call,
-                           RegionAndSymbolInvalidationTraits &ITraits) override;
+                           RegionAndSymbolInvalidationTraits &ITraits);
+
+private:
+  /// evalBind - Handle the semantics of binding a value to a specific location.
+  ///  This method is used by evalStore, VisitDeclStmt, and others.
+  void evalBind(ExplodedNodeSet &Dst, const Stmt *StoreE, ExplodedNode *Pred,
+                SVal location, SVal Val, bool atDeclInit = false,
+                const ProgramPoint *PP = nullptr);
+
+  ProgramStateRef
+  processPointerEscapedOnBind(ProgramStateRef State,
+                              SVal Loc, SVal Val,
+                              const LocationContext *LCtx);
 
   /// A simple wrapper when you only need to notify checkers of pointer-escape
   /// of some values.
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
index ecb61bffe3d95..a0d7db6dd860c 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
@@ -39,7 +39,7 @@ class CallEvent;
 class CallEventManager;
 
 typedef std::unique_ptr<ConstraintManager>(*ConstraintManagerCreator)(
-    ProgramStateManager &, SubEngine *);
+    ProgramStateManager &, ExprEngine *);
 typedef std::unique_ptr<StoreManager>(*StoreManagerCreator)(
     ProgramStateManager &);
 
@@ -460,8 +460,8 @@ class ProgramStateManager {
   friend class ProgramState;
   friend void ProgramStateRelease(const ProgramState *state);
 private:
-  /// Eng - The SubEngine that owns this state manager.
-  SubEngine *Eng; /* Can be null. */
+  /// Eng - The ExprEngine that owns this state manager.
+  ExprEngine *Eng; /* Can be null. */
 
   EnvironmentManager                   EnvMgr;
   std::unique_ptr<StoreManager>        StoreMgr;
@@ -493,7 +493,7 @@ class ProgramStateManager {
                  StoreManagerCreator CreateStoreManager,
                  ConstraintManagerCreator CreateConstraintManager,
                  llvm::BumpPtrAllocator& alloc,
-                 SubEngine *subeng);
+                 ExprEngine *expreng);
 
   ~ProgramStateManager();
 
@@ -534,7 +534,7 @@ class ProgramStateManager {
 
   StoreManager &getStoreManager() { return *StoreMgr; }
   ConstraintManager &getConstraintManager() { return *ConstraintMgr; }
-  SubEngine &getOwningEngine() { return *Eng; }
+  ExprEngine &getOwningEngine() { return *Eng; }
 
   ProgramStateRef
   removeDeadBindingsFromEnvironmentAndStore(ProgramStateRef St,
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
index a9ca3451d8f3e..c72f8292647dc 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
@@ -137,8 +137,8 @@ struct ProgramStateTrait<ConstraintRange>
 
 class RangedConstraintManager : public SimpleConstraintManager {
 public:
-  RangedConstraintManager(SubEngine *SE, SValBuilder &SB)
-      : SimpleConstraintManager(SE, SB) {}
+  RangedConstraintManager(ExprEngine *EE, SValBuilder &SB)
+      : SimpleConstraintManager(EE, SB) {}
 
   ~RangedConstraintManager() override;
 
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h
index 294a45b214d7e..6a0f5f10874e3 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h
@@ -31,8 +31,9 @@ class SMTConstraintManager : public clang::ento::SimpleConstraintManager {
   mutable llvm::SMTSolverRef Solver = llvm::CreateZ3Solver();
 
 public:
-  SMTConstraintManager(clang::ento::SubEngine *SE, clang::ento::SValBuilder &SB)
-      : SimpleConstraintManager(SE, SB) {}
+  SMTConstraintManager(clang::ento::ExprEngine *EE,
+                       clang::ento::SValBuilder &SB)
+      : SimpleConstraintManager(EE, SB) {}
   virtual ~SMTConstraintManager() = default;
 
   //===------------------------------------------------------------------===//
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SimpleConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SimpleConstraintManager.h
index 6bf5e94afdbb6..87e927f5b4800 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SimpleConstraintManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SimpleConstraintManager.h
@@ -21,12 +21,12 @@ namespace clang {
 namespace ento {
 
 class SimpleConstraintManager : public ConstraintManager {
-  SubEngine *SU;
+  ExprEngine *EE;
   SValBuilder &SVB;
 
 public:
-  SimpleConstraintManager(SubEngine *subengine, SValBuilder &SB)
-      : SU(subengine), SVB(SB) {}
+  SimpleConstraintManager(ExprEngine *exprengine, SValBuilder &SB)
+      : EE(exprengine), SVB(SB) {}
 
   ~SimpleConstraintManager() override;
 
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h
deleted file mode 100644
index a7f3c28d4373a..0000000000000
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h
+++ /dev/null
@@ -1,178 +0,0 @@
-//== SubEngine.h - Interface of the subengine of CoreEngine --------*- C++ -*-//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interface of a subengine of the CoreEngine.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_SUBENGINE_H
-#define LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_SUBENGINE_H
-
-#include "clang/Analysis/ProgramPoint.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/Store.h"
-#include "clang/StaticAnalyzer/Core/CheckerManager.h"
-
-namespace clang {
-
-class CFGBlock;
-class CFGElement;
-class LocationContext;
-class Stmt;
-
-namespace cross_tu {
-class CrossTranslationUnitContext;
-}
-
-namespace ento {
-
-struct NodeBuilderContext;
-class AnalysisManager;
-class ExplodedNodeSet;
-class ExplodedNode;
-class ProgramState;
-class ProgramStateManager;
-class BlockCounter;
-class BranchNodeBuilder;
-class IndirectGotoNodeBuilder;
-class SwitchNodeBuilder;
-class EndOfFunctionNodeBuilder;
-class NodeBuilderWithSinks;
-class MemRegion;
-
-class SubEngine {
-  virtual void anchor();
-public:
-  virtual ~SubEngine() {}
-
-  virtual ProgramStateRef getInitialState(const LocationContext *InitLoc) = 0;
-
-  virtual AnalysisManager &getAnalysisManager() = 0;
-
-  virtual cross_tu::CrossTranslationUnitContext *
-  getCrossTranslationUnitContext() = 0;
-
-  virtual ProgramStateManager &getStateManager() = 0;
-
-  /// Called by CoreEngine. Used to generate new successor
-  /// nodes by processing the 'effects' of a block-level statement.
-  virtual void processCFGElement(const CFGElement E, ExplodedNode* Pred,
-                                 unsigned StmtIdx, NodeBuilderContext *Ctx)=0;
-
-  /// Called by CoreEngine when it starts processing a CFGBlock.  The
-  /// SubEngine is expected to populate dstNodes with new nodes representing
-  /// updated analysis state, or generate no nodes at all if it doesn't.
-  virtual void processCFGBlockEntrance(const BlockEdge &L,
-                                       NodeBuilderWithSinks &nodeBuilder,
-                                       ExplodedNode *Pred) = 0;
-
-  /// Called by CoreEngine.  Used to generate successor
-  ///  nodes by processing the 'effects' of a branch condition.
-  virtual void processBranch(const Stmt *Condition,
-                             NodeBuilderContext& BuilderCtx,
-                             ExplodedNode *Pred,
-                             ExplodedNodeSet &Dst,
-                             const CFGBlock *DstT,
-                             const CFGBlock *DstF) = 0;
-
-  /// Called by CoreEngine.
-  /// Used to generate successor nodes for temporary destructors depending
-  /// on whether the corresponding constructor was visited.
-  virtual void processCleanupTemporaryBranch(const CXXBindTemporaryExpr *BTE,
-                                             NodeBuilderContext &BldCtx,
-                                             ExplodedNode *Pred,
-                                             ExplodedNodeSet &Dst,
-                                             const CFGBlock *DstT,
-                                             const CFGBlock *DstF) = 0;
-
-  /// Called by CoreEngine.  Used to processing branching behavior
-  /// at static initializers.
-  virtual void processStaticInitializer(const DeclStmt *DS,
-                                        NodeBuilderContext& BuilderCtx,
-                                        ExplodedNode *Pred,
-                                        ExplodedNodeSet &Dst,
-                                        const CFGBlock *DstT,
-                                        const CFGBlock *DstF) = 0;
-
-  /// Called by CoreEngine.  Used to generate successor
-  /// nodes by processing the 'effects' of a computed goto jump.
-  virtual void processIndirectGoto(IndirectGotoNodeBuilder& builder) = 0;
-
-  /// Called by CoreEngine.  Used to generate successor
-  /// nodes by processing the 'effects' of a switch statement.
-  virtual void processSwitch(SwitchNodeBuilder& builder) = 0;
-
-  /// Called by CoreEngine.  Used to notify checkers that processing a
-  /// function has begun. Called for both inlined and and top-level functions.
-  virtual void processBeginOfFunction(NodeBuilderContext &BC,
-                                      ExplodedNode *Pred,
-                                      ExplodedNodeSet &Dst,
-                                      const BlockEdge &L) = 0;
-
-  /// Called by CoreEngine.  Used to notify checkers that processing a
-  /// function has ended. Called for both inlined and and top-level functions.
-  virtual void processEndOfFunction(NodeBuilderContext& BC,
-                                    ExplodedNode *Pred,
-                                    const ReturnStmt *RS = nullptr) = 0;
-
-  // Generate the entry node of the callee.
-  virtual void processCallEnter(NodeBuilderContext& BC, CallEnter CE,
-                                ExplodedNode *Pred) = 0;
-
-  // Generate the first post callsite node.
-  virtual void processCallExit(ExplodedNode *Pred) = 0;
-
-  /// Called by ConstraintManager. Used to call checker-specific
-  /// logic for handling assumptions on symbolic values.
-  virtual ProgramStateRef processAssume(ProgramStateRef state,
-                                       SVal cond, bool assumption) = 0;
-
-  /// processRegionChanges - Called by ProgramStateManager whenever a change is
-  /// made to the store. Used to update checkers that track region values.
-  virtual ProgramStateRef
-  processRegionChanges(ProgramStateRef state,
-                       const InvalidatedSymbols *invalidated,
-                       ArrayRef<const MemRegion *> ExplicitRegions,
-                       ArrayRef<const MemRegion *> Regions,
-                       const LocationContext *LCtx,
-                       const CallEvent *Call) = 0;
-
-
-  inline ProgramStateRef
-  processRegionChange(ProgramStateRef state,
-                      const MemRegion* MR,
-                      const LocationContext *LCtx) {
-    return processRegionChanges(state, nullptr, MR, MR, LCtx, nullptr);
-  }
-
-  virtual ProgramStateRef processPointerEscapedOnBind(
-      ProgramStateRef State, ArrayRef<std::pair<SVal, SVal>> LocAndVals,
-      const LocationContext *LCtx, PointerEscapeKind Kind,
-      const CallEvent *Call) = 0;
-
-  virtual ProgramStateRef
-  notifyCheckersOfPointerEscape(ProgramStateRef State,
-                           const InvalidatedSymbols *Invalidated,
-                           ArrayRef<const MemRegion *> ExplicitRegions,
-                           const CallEvent *Call,
-                           RegionAndSymbolInvalidationTraits &HTraits) = 0;
-
-  /// printJson - Called by ProgramStateManager to print checker-specific data.
-  virtual void printJson(raw_ostream &Out, ProgramStateRef State,
-                         const LocationContext *LCtx, const char *NL,
-                         unsigned int Space, bool IsDot) const = 0;
-
-  /// Called by CoreEngine when the analysis worklist is either empty or the
-  //  maximum number of analysis steps have been reached.
-  virtual void processEndWorklist() = 0;
-};
-
-} // end GR namespace
-
-} // end clang namespace
-
-#endif
diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
index 3b2e5cd28e437..ad79f7cb9359f 100644
--- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp
@@ -45,7 +45,6 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
diff --git a/clang/lib/StaticAnalyzer/Core/CMakeLists.txt b/clang/lib/StaticAnalyzer/Core/CMakeLists.txt
index 057cdd4bb18ab..233ffaf799568 100644
--- a/clang/lib/StaticAnalyzer/Core/CMakeLists.txt
+++ b/clang/lib/StaticAnalyzer/Core/CMakeLists.txt
@@ -44,7 +44,6 @@ add_clang_library(clangStaticAnalyzerCore
   SimpleSValBuilder.cpp
   SMTConstraintManager.cpp
   Store.cpp
-  SubEngine.cpp
   SValBuilder.cpp
   SVals.cpp
   SymbolManager.cpp
diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
index 1ea7c26dc76b0..fb728ac9e4f5a 100644
--- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -564,7 +564,7 @@ RuntimeDefinition AnyFunctionCall::getRuntimeDefinition() const {
     return RuntimeDefinition(Decl);
   }
 
-  SubEngine &Engine = getState()->getStateManager().getOwningEngine();
+  ExprEngine &Engine = getState()->getStateManager().getOwningEngine();
   AnalyzerOptions &Opts = Engine.getAnalysisManager().options;
 
   // Try to get CTU definition only if CTUDir is provided.
diff --git a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp
index 5a49b18aecf12..70deb13a8e1ae 100644
--- a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp
@@ -23,8 +23,8 @@
 #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/BlockCounter.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/WorkList.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
@@ -52,8 +52,7 @@ STATISTIC(NumPathsExplored,
 // Core analysis engine.
 //===----------------------------------------------------------------------===//
 
-static std::unique_ptr<WorkList> generateWorkList(AnalyzerOptions &Opts,
-                                                  SubEngine &subengine) {
+static std::unique_ptr<WorkList> generateWorkList(AnalyzerOptions &Opts) {
   switch (Opts.getExplorationStrategy()) {
     case ExplorationStrategyKind::DFS:
       return WorkList::makeDFS();
@@ -71,9 +70,9 @@ static std::unique_ptr<WorkList> generateWorkList(AnalyzerOptions &Opts,
   llvm_unreachable("Unknown AnalyzerOptions::ExplorationStrategyKind");
 }
 
-CoreEngine::CoreEngine(SubEngine &subengine, FunctionSummariesTy *FS,
+CoreEngine::CoreEngine(ExprEngine &exprengine, FunctionSummariesTy *FS,
                        AnalyzerOptions &Opts)
-    : SubEng(subengine), WList(generateWorkList(Opts, subengine)),
+    : ExprEng(exprengine), WList(generateWorkList(Opts)),
       BCounterFactory(G.getAllocator()), FunctionSummaries(FS) {}
 
 /// ExecuteWorkList - Run the worklist algorithm for a maximum number of steps.
@@ -104,7 +103,7 @@ bool CoreEngine::ExecuteWorkList(const LocationContext *L, unsigned Steps,
     WList->setBlockCounter(BCounterFactory.GetEmptyCounter());
 
     if (!InitState)
-      InitState = SubEng.getInitialState(L);
+      InitState = ExprEng.getInitialState(L);
 
     bool IsNew;
     ExplodedNode *Node = G.getNode(StartLoc, InitState, false, &IsNew);
@@ -113,7 +112,7 @@ bool CoreEngine::ExecuteWorkList(const LocationContext *L, unsigned Steps,
 
     NodeBuilderContext BuilderCtx(*this, StartLoc.getDst(), Node);
     ExplodedNodeSet DstBegin;
-    SubEng.processBeginOfFunction(BuilderCtx, Node, DstBegin, StartLoc);
+    ExprEng.processBeginOfFunction(BuilderCtx, Node, DstBegin, StartLoc);
 
     enqueue(DstBegin);
   }
@@ -147,7 +146,7 @@ bool CoreEngine::ExecuteWorkList(const LocationContext *L, unsigned Steps,
 
     dispatchWorkItem(Node, Node->getLocation(), WU);
   }
-  SubEng.processEndWorklist();
+  ExprEng.processEndWorklist();
   return WList->hasWork();
 }
 
@@ -172,7 +171,7 @@ void CoreEngine::dispatchWorkItem(ExplodedNode* Pred, ProgramPoint Loc,
       break;
 
     case ProgramPoint::CallExitBeginKind:
-      SubEng.processCallExit(Pred);
+      ExprEng.processCallExit(Pred);
       break;
 
     case ProgramPoint::EpsilonKind: {
@@ -253,17 +252,17 @@ void CoreEngine::HandleBlockEdge(const BlockEdge &L, ExplodedNode *Pred) {
     }
 
     // Process the final state transition.
-    SubEng.processEndOfFunction(BuilderCtx, Pred, RS);
+    ExprEng.processEndOfFunction(BuilderCtx, Pred, RS);
 
     // This path is done. Don't enqueue any more nodes.
     return;
   }
 
-  // Call into the SubEngine to process entering the CFGBlock.
+  // Call into the ExprEngine to process entering the CFGBlock.
   ExplodedNodeSet dstNodes;
   BlockEntrance BE(Blk, Pred->getLocationContext());
   NodeBuilderWithSinks nodeBuilder(Pred, dstNodes, BuilderCtx, BE);
-  SubEng.processCFGBlockEntrance(L, nodeBuilder, Pred);
+  ExprEng.processCFGBlockEntrance(L, nodeBuilder, Pred);
 
   // Auto-generate a node.
   if (!nodeBuilder.hasGeneratedNodes()) {
@@ -287,7 +286,7 @@ void CoreEngine::HandleBlockEntrance(const BlockEntrance &L,
   // Process the entrance of the block.
   if (Optional<CFGElement> E = L.getFirstElement()) {
     NodeBuilderContext Ctx(*this, L.getBlock(), Pred);
-    SubEng.processCFGElement(*E, Pred, 0, &Ctx);
+    ExprEng.processCFGElement(*E, Pred, 0, &Ctx);
   }
   else
     HandleBlockExit(L.getBlock(), Pred);
@@ -367,7 +366,7 @@ void CoreEngine::HandleBlockExit(const CFGBlock * B, ExplodedNode *Pred) {
            builder(Pred, B, cast<IndirectGotoStmt>(Term)->getTarget(),
                    *(B->succ_begin()), this);
 
-        SubEng.processIndirectGoto(builder);
+        ExprEng.processIndirectGoto(builder);
         return;
       }
 
@@ -378,7 +377,7 @@ void CoreEngine::HandleBlockExit(const CFGBlock * B, ExplodedNode *Pred) {
         //      'element' variable to a value.
         //  (2) in a terminator, which represents the branch.
         //
-        // For (1), subengines will bind a value (i.e., 0 or 1) indicating
+        // For (1), ExprEngine will bind a value (i.e., 0 or 1) indicating
         // whether or not collection contains any more elements.  We cannot
         // just test to see if the element is nil because a container can
         // contain nil elements.
@@ -389,7 +388,7 @@ void CoreEngine::HandleBlockExit(const CFGBlock * B, ExplodedNode *Pred) {
         SwitchNodeBuilder builder(Pred, B, cast<SwitchStmt>(Term)->getCond(),
                                     this);
 
-        SubEng.processSwitch(builder);
+        ExprEng.processSwitch(builder);
         return;
       }
 
@@ -418,7 +417,7 @@ void CoreEngine::HandleBlockExit(const CFGBlock * B, ExplodedNode *Pred) {
 
 void CoreEngine::HandleCallEnter(const CallEnter &CE, ExplodedNode *Pred) {
   NodeBuilderContext BuilderCtx(*this, CE.getEntry(), Pred);
-  SubEng.processCallEnter(BuilderCtx, CE, Pred);
+  ExprEng.processCallEnter(BuilderCtx, CE, Pred);
 }
 
 void CoreEngine::HandleBranch(const Stmt *Cond, const Stmt *Term,
@@ -426,7 +425,7 @@ void CoreEngine::HandleBranch(const Stmt *Cond, const Stmt *Term,
   assert(B->succ_size() == 2);
   NodeBuilderContext Ctx(*this, B, Pred);
   ExplodedNodeSet Dst;
-  SubEng.processBranch(Cond, Ctx, Pred, Dst, *(B->succ_begin()),
+  ExprEng.processBranch(Cond, Ctx, Pred, Dst, *(B->succ_begin()),
                        *(B->succ_begin() + 1));
   // Enqueue the new frontier onto the worklist.
   enqueue(Dst);
@@ -438,7 +437,7 @@ void CoreEngine::HandleCleanupTemporaryBranch(const CXXBindTemporaryExpr *BTE,
   assert(B->succ_size() == 2);
   NodeBuilderContext Ctx(*this, B, Pred);
   ExplodedNodeSet Dst;
-  SubEng.processCleanupTemporaryBranch(BTE, Ctx, Pred, Dst, *(B->succ_begin()),
+  ExprEng.processCleanupTemporaryBranch(BTE, Ctx, Pred, Dst, *(B->succ_begin()),
                                        *(B->succ_begin() + 1));
   // Enqueue the new frontier onto the worklist.
   enqueue(Dst);
@@ -449,7 +448,7 @@ void CoreEngine::HandleStaticInit(const DeclStmt *DS, const CFGBlock *B,
   assert(B->succ_size() == 2);
   NodeBuilderContext Ctx(*this, B, Pred);
   ExplodedNodeSet Dst;
-  SubEng.processStaticInitializer(DS, Ctx, Pred, Dst,
+  ExprEng.processStaticInitializer(DS, Ctx, Pred, Dst,
                                   *(B->succ_begin()), *(B->succ_begin()+1));
   // Enqueue the new frontier onto the worklist.
   enqueue(Dst);
@@ -464,7 +463,7 @@ void CoreEngine::HandlePostStmt(const CFGBlock *B, unsigned StmtIdx,
     HandleBlockExit(B, Pred);
   else {
     NodeBuilderContext Ctx(*this, B, Pred);
-    SubEng.processCFGElement((*B)[StmtIdx], Pred, StmtIdx, &Ctx);
+    ExprEng.processCFGElement((*B)[StmtIdx], Pred, StmtIdx, &Ctx);
   }
 }
 
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 36b930faf2d02..6fce27bc95569 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -3206,3 +3206,5 @@ void *ProgramStateTrait<ReplayWithoutInlining>::GDMIndex() {
   static int index = 0;
   return &index;
 }
+
+void ExprEngine::anchor() { }
diff --git a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp
index 3ecee758c676c..006a4006b7fc9 100644
--- a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -16,8 +16,8 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicType.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -76,12 +76,12 @@ ProgramStateManager::ProgramStateManager(ASTContext &Ctx,
                                          StoreManagerCreator CreateSMgr,
                                          ConstraintManagerCreator CreateCMgr,
                                          llvm::BumpPtrAllocator &alloc,
-                                         SubEngine *SubEng)
-  : Eng(SubEng), EnvMgr(alloc), GDMFactory(alloc),
+                                         ExprEngine *ExprEng)
+  : Eng(ExprEng), EnvMgr(alloc), GDMFactory(alloc),
     svalBuilder(createSimpleSValBuilder(alloc, Ctx, *this)),
     CallEventMgr(new CallEventManager(alloc)), Alloc(alloc) {
   StoreMgr = (*CreateSMgr)(*this);
-  ConstraintMgr = (*CreateCMgr)(*this, SubEng);
+  ConstraintMgr = (*CreateCMgr)(*this, ExprEng);
 }
 
 
@@ -189,7 +189,7 @@ ProgramState::invalidateRegionsImpl(ValueList Values,
                                     RegionAndSymbolInvalidationTraits *ITraits,
                                     const CallEvent *Call) const {
   ProgramStateManager &Mgr = getStateManager();
-  SubEngine &Eng = Mgr.getOwningEngine();
+  ExprEngine &Eng = Mgr.getOwningEngine();
 
   InvalidatedSymbols InvalidatedSyms;
   if (!IS)
diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 137e2cefe5a04..a3ea7d4c013b9 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -285,8 +285,8 @@ void RangeSet::print(raw_ostream &os) const {
 namespace {
 class RangeConstraintManager : public RangedConstraintManager {
 public:
-  RangeConstraintManager(SubEngine *SE, SValBuilder &SVB)
-      : RangedConstraintManager(SE, SVB) {}
+  RangeConstraintManager(ExprEngine *EE, SValBuilder &SVB)
+      : RangedConstraintManager(EE, SVB) {}
 
   //===------------------------------------------------------------------===//
   // Implementation for interface from ConstraintManager.
@@ -374,7 +374,8 @@ class RangeConstraintManager : public RangedConstraintManager {
 } // end anonymous namespace
 
 std::unique_ptr<ConstraintManager>
-ento::CreateRangeConstraintManager(ProgramStateManager &StMgr, SubEngine *Eng) {
+ento::CreateRangeConstraintManager(ProgramStateManager &StMgr,
+                                   ExprEngine *Eng) {
   return std::make_unique<RangeConstraintManager>(Eng, StMgr.getSValBuilder());
 }
 
diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
index 6cca0f5f57d10..2a55c99647124 100644
--- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -24,10 +24,10 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicSize.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
 #include "llvm/ADT/ImmutableMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/raw_ostream.h"
@@ -382,7 +382,7 @@ class RegionStoreManager : public StoreManager {
     : StoreManager(mgr), Features(f),
       RBFactory(mgr.getAllocator()), CBFactory(mgr.getAllocator()),
       SmallStructLimit(0) {
-    SubEngine &Eng = StateMgr.getOwningEngine();
+    ExprEngine &Eng = StateMgr.getOwningEngine();
     AnalyzerOptions &Options = Eng.getAnalysisManager().options;
     SmallStructLimit = Options.RegionStoreSmallStructLimit;
   }
diff --git a/clang/lib/StaticAnalyzer/Core/SMTConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/SMTConstraintManager.cpp
index 6ad12ca0a688f..7395622a659ca 100644
--- a/clang/lib/StaticAnalyzer/Core/SMTConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SMTConstraintManager.cpp
@@ -13,6 +13,6 @@ using namespace clang;
 using namespace ento;
 
 std::unique_ptr<ConstraintManager>
-ento::CreateZ3ConstraintManager(ProgramStateManager &StMgr, SubEngine *Eng) {
+ento::CreateZ3ConstraintManager(ProgramStateManager &StMgr, ExprEngine *Eng) {
   return std::make_unique<SMTConstraintManager>(Eng, StMgr.getSValBuilder());
 }
diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
index 3a5841137e1a7..c00a2c8ba8a2c 100644
--- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
@@ -24,12 +24,12 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/Store.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h"
 #include "llvm/ADT/APSInt.h"
diff --git a/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp
index 85f60231a2769..3709106ad44ce 100644
--- a/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp
@@ -44,8 +44,8 @@ ProgramStateRef SimpleConstraintManager::assume(ProgramStateRef State,
 ProgramStateRef SimpleConstraintManager::assume(ProgramStateRef State,
                                                 NonLoc Cond, bool Assumption) {
   State = assumeAux(State, Cond, Assumption);
-  if (NotifyAssumeClients && SU)
-    return SU->processAssume(State, Cond, Assumption);
+  if (NotifyAssumeClients && EE)
+    return EE->processAssume(State, Cond, Assumption);
   return State;
 }
 
diff --git a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
index 84c52f53ca5e7..d9fe3af3c0000 100644
--- a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
@@ -13,8 +13,8 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/SValVisitor.h"
 
 using namespace clang;
diff --git a/clang/lib/StaticAnalyzer/Core/SubEngine.cpp b/clang/lib/StaticAnalyzer/Core/SubEngine.cpp
deleted file mode 100644
index d7ddd9cf46105..0000000000000
--- a/clang/lib/StaticAnalyzer/Core/SubEngine.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-//== SubEngine.cpp - Interface of the subengine of CoreEngine ------*- C++ -*-//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/StaticAnalyzer/Core/PathSensitive/SubEngine.h"
-
-using namespace clang::ento;
-
-void SubEngine::anchor() { }

From 10f0b18ed950545d10574f5b30d234bd3789d7b2 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Tue, 26 May 2020 17:56:17 +0000
Subject: [PATCH 118/770] [gn build] Port d70ec366c91

---
 llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn
index f37cc42c481e8..4c3d175a31808 100644
--- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn
@@ -55,7 +55,6 @@ static_library("Core") {
     "SimpleConstraintManager.cpp",
     "SimpleSValBuilder.cpp",
     "Store.cpp",
-    "SubEngine.cpp",
     "SymbolManager.cpp",
     "TextDiagnostics.cpp",
     "WorkList.cpp",

From d1f0a76b21975ba66ec2427c2d3ddb7ed1e63949 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 10:54:12 -0700
Subject: [PATCH 119/770] [YAMLTraits] Remove char trait and serialize as
 uint8_t in lldb.

As discussed in https://reviews.llvm.org/D79745
---
 lldb/include/lldb/Utility/Args.h       |  2 +-
 llvm/include/llvm/Support/YAMLTraits.h |  6 ------
 llvm/lib/Support/YAMLTraits.cpp        | 11 -----------
 llvm/unittests/Support/YAMLIOTest.cpp  |  8 +-------
 4 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/lldb/include/lldb/Utility/Args.h b/lldb/include/lldb/Utility/Args.h
index 560f25795d3b7..2cce7d0c697c7 100644
--- a/lldb/include/lldb/Utility/Args.h
+++ b/lldb/include/lldb/Utility/Args.h
@@ -391,7 +391,7 @@ template <> struct MappingTraits<lldb_private::Args::ArgEntry> {
       return lldb_private::Args::ArgEntry(value, quote);
     }
     StringRef value;
-    char quote;
+    uint8_t quote;
   };
   static void mapping(IO &io, lldb_private::Args::ArgEntry &v);
 };
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 9aa96401ae179..f93f36037679a 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -1159,12 +1159,6 @@ struct ScalarTraits<bool> {
   static QuotingType mustQuote(StringRef) { return QuotingType::None; }
 };
 
-template <> struct ScalarTraits<char> {
-  static void output(const char &, void *, raw_ostream &);
-  static StringRef input(StringRef, void *, char &);
-  static QuotingType mustQuote(StringRef S) { return needsQuotes(S); }
-};
-
 template<>
 struct ScalarTraits<StringRef> {
   static void output(const StringRef &, void *, raw_ostream &);
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index a4b782db0a96e..f27be3e974306 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -864,17 +864,6 @@ StringRef ScalarTraits<bool>::input(StringRef Scalar, void *, bool &Val) {
   return "invalid boolean";
 }
 
-void ScalarTraits<char>::output(const char &Val, void *, raw_ostream &Out) {
-  Out << Val;
-}
-
-StringRef ScalarTraits<char>::input(StringRef Scalar, void *, char &Val) {
-  if (Scalar.size() != 1)
-    return "invalid character";
-  Val = Scalar[0];
-  return StringRef();
-}
-
 void ScalarTraits<StringRef>::output(const StringRef &Val, void *,
                                      raw_ostream &Out) {
   Out << Val;
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index b2ea5aab59356..d86489cf75604 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -333,7 +333,6 @@ struct BuiltInTypes {
   uint16_t        u16;
   uint8_t         u8;
   bool            b;
-  char            c;
   int64_t         s64;
   int32_t         s32;
   int16_t         s16;
@@ -358,7 +357,6 @@ namespace yaml {
       io.mapRequired("u16",      bt.u16);
       io.mapRequired("u8",       bt.u8);
       io.mapRequired("b",        bt.b);
-      io.mapRequired("c",        bt.c);
       io.mapRequired("s64",      bt.s64);
       io.mapRequired("s32",      bt.s32);
       io.mapRequired("s16",      bt.s16);
@@ -388,7 +386,6 @@ TEST(YAMLIO, TestReadBuiltInTypes) {
             "u16:      65000\n"
             "u8:       255\n"
             "b:        false\n"
-            "c:        'c'\n"
             "s64:      -5000000000\n"
             "s32:      -2000000000\n"
             "s16:      -32000\n"
@@ -399,7 +396,7 @@ TEST(YAMLIO, TestReadBuiltInTypes) {
             "h16:      0x8765\n"
             "h32:      0xFEDCBA98\n"
             "h64:      0xFEDCBA9876543210\n"
-            "...\n");
+           "...\n");
   yin >> map;
 
   EXPECT_FALSE(yin.error());
@@ -410,7 +407,6 @@ TEST(YAMLIO, TestReadBuiltInTypes) {
   EXPECT_EQ(map.u16, 65000);
   EXPECT_EQ(map.u8,  255);
   EXPECT_EQ(map.b,   false);
-  EXPECT_EQ(map.c,   'c');
   EXPECT_EQ(map.s64, -5000000000LL);
   EXPECT_EQ(map.s32, -2000000000L);
   EXPECT_EQ(map.s16, -32000);
@@ -438,7 +434,6 @@ TEST(YAMLIO, TestReadWriteBuiltInTypes) {
     map.u16 = 50000;
     map.u8  = 254;
     map.b   = true;
-    map.c   = 'd';
     map.s64 = -6000000000LL;
     map.s32 = -2000000000;
     map.s16 = -32000;
@@ -468,7 +463,6 @@ TEST(YAMLIO, TestReadWriteBuiltInTypes) {
     EXPECT_EQ(map.u16,      50000);
     EXPECT_EQ(map.u8,       254);
     EXPECT_EQ(map.b,        true);
-    EXPECT_EQ(map.c,        'd');
     EXPECT_EQ(map.s64,      -6000000000LL);
     EXPECT_EQ(map.s32,      -2000000000L);
     EXPECT_EQ(map.s16,      -32000);

From b8a3c618d6c5df081cad69b5ffb386a7a7b0361f Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 26 May 2020 11:06:07 -0700
Subject: [PATCH 120/770] [ELF] Allow misaligned SHT_GNU_verneed

Bazel created interface shared objects (.ifso) may be misaligned.  We use
llvm::support::detail::packed_endian_specific_integral under the hood
which allows reading of misaligned values, so there is not a need to
diagnose (in LLD we don't intend to support sophisticated parsing for
SHT_GNU_*).
---
 lld/ELF/InputFiles.cpp                                 |  6 ++----
 .../{verneed-shared.yaml => verneed-shared.test}       | 10 ++++++----
 2 files changed, 8 insertions(+), 8 deletions(-)
 rename lld/test/ELF/invalid/{verneed-shared.yaml => verneed-shared.test} (89%)

diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 9fdd0547ddca4..c451aee1f921a 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -1224,14 +1224,12 @@ std::vector<uint32_t> SharedFile::parseVerneed(const ELFFile<ELFT> &obj,
   ArrayRef<uint8_t> data = CHECK(obj.getSectionContents(sec), this);
   const uint8_t *verneedBuf = data.begin();
   for (unsigned i = 0; i != sec->sh_info; ++i) {
-    if (verneedBuf + sizeof(typename ELFT::Verneed) > data.end() ||
-        uintptr_t(verneedBuf) % sizeof(uint32_t) != 0)
+    if (verneedBuf + sizeof(typename ELFT::Verneed) > data.end())
       fatal(toString(this) + " has an invalid Verneed");
     auto *vn = reinterpret_cast<const typename ELFT::Verneed *>(verneedBuf);
     const uint8_t *vernauxBuf = verneedBuf + vn->vn_aux;
     for (unsigned j = 0; j != vn->vn_cnt; ++j) {
-      if (vernauxBuf + sizeof(typename ELFT::Vernaux) > data.end() ||
-          uintptr_t(vernauxBuf) % sizeof(uint32_t) != 0)
+      if (vernauxBuf + sizeof(typename ELFT::Vernaux) > data.end())
         fatal(toString(this) + " has an invalid Vernaux");
       auto *aux = reinterpret_cast<const typename ELFT::Vernaux *>(vernauxBuf);
       if (aux->vna_name >= this->stringTable.size())
diff --git a/lld/test/ELF/invalid/verneed-shared.yaml b/lld/test/ELF/invalid/verneed-shared.test
similarity index 89%
rename from lld/test/ELF/invalid/verneed-shared.yaml
rename to lld/test/ELF/invalid/verneed-shared.test
index 18315fe8a2df9..916b8c1a5d950 100644
--- a/lld/test/ELF/invalid/verneed-shared.yaml
+++ b/lld/test/ELF/invalid/verneed-shared.test
@@ -6,7 +6,7 @@
 ## sh_offset(SHT_GNU_verneed) is out of bounds.
 # RUN: yaml2obj --docnum=1 %s -o %t1.so
 # RUN: not ld.lld %t.o %t1.so -o /dev/null 2>&1 | FileCheck --check-prefix=SHOFFSET %s
-# SHOFFSET: error: {{.*}}.so: section [index 1] has a sh_offset (0xffffffff) + sh_size (0x0) that is greater than the file size (0x228)
+# SHOFFSET: error: {{.*}}.so: section [index 1] has a sh_offset (0xffffffff) + sh_size (0x0) that is greater than the file size (0x168)
 --- !ELF
 FileHeader:
   Class:   ELFCLASS64
@@ -17,12 +17,14 @@ Sections:
   - Name:  .gnu.version_r
     Type:  SHT_GNU_verneed
     Flags: [ SHF_ALLOC ]
+    Info:  1
     ShOffset: 0xFFFFFFFF
 
-## A Verneed entry is misaligned (not a multiple of 4).
+## A Verneed entry is misaligned (not a multiple of 4). This may happen
+## in some interface shared objects. We use memcpy to read the fields, so
+## misalignment isn't a problem and there is no need to diagnose.
 # RUN: yaml2obj --docnum=2 %s -o %t2.so
-# RUN: not ld.lld %t.o %t2.so -o /dev/null 2>&1 | FileCheck --check-prefix=VN-MISALIGNED %s
-# VN-MISALIGNED: {{.*}}.so has an invalid Verneed
+# RUN: ld.lld %t.o %t2.so -o /dev/null
 --- !ELF
 FileHeader:
   Class:   ELFCLASS64

From 50db8402fc6652559d9ba3dc97bb787c4160ef5b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 26 May 2020 18:46:37 +0100
Subject: [PATCH 121/770] ResourcePriorityQueue.h - reduce unnecessary includes
 to forward declarations. NFC.

Move includes to ResourcePriorityQueue.cpp
---
 llvm/include/llvm/CodeGen/ResourcePriorityQueue.h      | 10 +++++-----
 .../lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp |  4 ++++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
index 81587a3170ce1..b38cd49241742 100644
--- a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
+++ b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
@@ -16,15 +16,15 @@
 #ifndef LLVM_CODEGEN_RESOURCEPRIORITYQUEUE_H
 #define LLVM_CODEGEN_RESOURCEPRIORITYQUEUE_H
 
-#include "llvm/CodeGen/DFAPacketizer.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/MC/MCInstrItineraries.h"
 
 namespace llvm {
+  class DFAPacketizer;
+  class InstrItineraryData;
   class ResourcePriorityQueue;
+  class SelectionDAGISel;
+  class TargetInstrInfo;
+  class TargetRegisterInfo;
 
   /// Sorting functions for the Available queue.
   struct resource_sort {
diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 34660e3a48ec5..55fe26eb64cda 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -19,9 +19,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"

From 0165cf701156db4d399cb31d31ecb154372e2562 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 26 May 2020 19:09:36 +0100
Subject: [PATCH 122/770] ObjCARCAnalysisUtils.h - remove unused includes. NFC.

We just need to include Passes.h in ObjCARCAliasAnalysis.cpp to compensate
---
 llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h | 4 ----
 llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp        | 1 +
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h b/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
index 3edcc9894cf75..d120c6a4fd592 100644
--- a/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
+++ b/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h
@@ -23,17 +23,13 @@
 #define LLVM_LIB_ANALYSIS_OBJCARCANALYSISUTILS_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueHandle.h"
-#include "llvm/Pass.h"
 
 namespace llvm {
 namespace objcarc {
diff --git a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp
index abe3cde57a25c..80e019f5fc921 100644
--- a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp
@@ -24,6 +24,7 @@
 
 #include "llvm/Analysis/ObjCARCAliasAnalysis.h"
 #include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Analysis/Passes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Value.h"

From 8d31dd23ec2368d00b0668c3d01b1fd2ce4d621b Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 11:22:48 -0700
Subject: [PATCH 123/770] [lldb/Reproducers] Skip remaining failing test in
 python_api subdir

Skip the remaining two failing tests in the python_api subdirectory. See
inline comments for the reason why.
---
 lldb/test/API/python_api/hello_world/TestHelloWorld.py | 1 +
 lldb/test/API/python_api/sbdata/TestSBData.py          | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/lldb/test/API/python_api/hello_world/TestHelloWorld.py b/lldb/test/API/python_api/hello_world/TestHelloWorld.py
index 5b189b36d7f92..2d38043bb4504 100644
--- a/lldb/test/API/python_api/hello_world/TestHelloWorld.py
+++ b/lldb/test/API/python_api/hello_world/TestHelloWorld.py
@@ -75,6 +75,7 @@ def test_with_process_launch_api(self):
     @add_test_categories(['pyapi'])
     @skipIfiOSSimulator
     @expectedFailureNetBSD
+    @skipIfReproducer # File synchronization is not supported during replay.
     def test_with_attach_to_process_with_id_api(self):
         """Create target, spawn a process, and attach to it with process id."""
         exe = '%s_%d'%(self.testMethodName, os.getpid())
diff --git a/lldb/test/API/python_api/sbdata/TestSBData.py b/lldb/test/API/python_api/sbdata/TestSBData.py
index a12f683d60139..ee04968042419 100644
--- a/lldb/test/API/python_api/sbdata/TestSBData.py
+++ b/lldb/test/API/python_api/sbdata/TestSBData.py
@@ -21,6 +21,7 @@ def setUp(self):
         self.line = line_number('main.cpp', '// set breakpoint here')
 
     @add_test_categories(['pyapi'])
+    @skipIfReproducer # SBData::SetData is not instrumented.
     def test_byte_order_and_address_byte_size(self):
         """Test the SBData::SetData() to ensure the byte order and address
         byte size are obeyed"""
@@ -41,6 +42,7 @@ def test_byte_order_and_address_byte_size(self):
         self.assertTrue(addr == 0x8877665544332211);
 
     @add_test_categories(['pyapi'])
+    @skipIfReproducer # SBData::SetData is not instrumented.
     def test_with_run_command(self):
         """Test the SBData APIs."""
         self.build()

From a94e08d2e840a0e7ce032f59e9344bc49b5a54a1 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 11:32:02 -0700
Subject: [PATCH 124/770] [StaticAnalyzer] Fix non-virtual destructor warning

Fixed warning: 'clang::ento::ExprEngine' has virtual functions but non-virtual
destructor [-Wnon-virtual-dtor]
  ~ExprEngine() = default;
---
 .../clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index a94c847f35ee1..b32302cfc3378 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -177,7 +177,7 @@ class ExprEngine {
              SetOfConstDecls *VisitedCalleesIn,
              FunctionSummariesTy *FS, InliningModes HowToInlineIn);
 
-  ~ExprEngine() = default;
+  virtual ~ExprEngine() = default;
 
   /// Returns true if there is still simulation state on the worklist.
   bool ExecuteWorkList(const LocationContext *L, unsigned Steps = 150000) {

From 2e824925402f011c2a4d3a0b51cce388b6d14d16 Mon Sep 17 00:00:00 2001
From: Kostya Serebryany <kcc@google.com>
Date: Tue, 26 May 2020 11:31:24 -0700
Subject: [PATCH 125/770] [fuzzer][afl] Fix build with GCC

Summary:
Fixes this build error with GCC 9.3.0:

```
../lib/fuzzer/afl/afl_driver.cpp:114:30: error: expected unqualified-id before string constant
  114 | __attribute__((weak)) extern "C" void __sanitizer_set_report_fd(void *);
      |                              ^~~
```

Reviewers: metzman, kcc

Reviewed By: kcc

Subscribers: #sanitizers

Tags: #sanitizers

Differential Revision: https://reviews.llvm.org/D80479
---
 compiler-rt/lib/fuzzer/afl/afl_driver.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/fuzzer/afl/afl_driver.cpp b/compiler-rt/lib/fuzzer/afl/afl_driver.cpp
index bb3b48f367289..457f180ecc825 100644
--- a/compiler-rt/lib/fuzzer/afl/afl_driver.cpp
+++ b/compiler-rt/lib/fuzzer/afl/afl_driver.cpp
@@ -111,7 +111,7 @@ static uint8_t AflInputBuf[kMaxAflInputSize];
 
 // Use this optionally defined function to output sanitizer messages even if
 // user asks to close stderr.
-__attribute__((weak)) extern "C" void __sanitizer_set_report_fd(void *);
+extern "C" __attribute__((weak)) void __sanitizer_set_report_fd(void *);
 
 // Keep track of where stderr content is being written to, so that
 // dup_and_close_stderr can use the correct one.

From 6e9223a2c65835444c5c1328d52daf9f85f9618c Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Tue, 26 May 2020 13:34:52 -0500
Subject: [PATCH 126/770] [PowerPC][NFC] Update test to prevent DCE from
 causing failures

The test case provided in PR45709 can be simplified by DCE to an
empty function. To prevent this from happening if DCE is run prior
to ISEL in the back end, just add optnone to the function. The
behaviour it is testing for is in the SDAG legalization and is
not sensitive to optnone so the test case still achieves its desired
objective.
---
 llvm/test/CodeGen/PowerPC/pr45709.ll | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/pr45709.ll b/llvm/test/CodeGen/PowerPC/pr45709.ll
index bc295fafd2105..3a26173965467 100644
--- a/llvm/test/CodeGen/PowerPC/pr45709.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45709.ll
@@ -10,30 +10,37 @@
 define dso_local void @_ZN1a1bEv(<4 x float> %in) local_unnamed_addr #0 align 2 {
 ; CHECK-LABEL: _ZN1a1bEv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    bclr 12, 4*cr5+lt, 0
-; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_6
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_1: # %.preheader
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
-; CHECK-NEXT:    vxor v3, v3, v3
 ; CHECK-NEXT:    addi r3, r3, .LCPI0_0@toc@l
-; CHECK-NEXT:    lvx v4, 0, r3
+; CHECK-NEXT:    lvx v3, 0, r3
+; CHECK-NEXT:    vperm v2, v2, v2, v3
+; CHECK-NEXT:    vxor v3, v3, v3
 ; CHECK-NEXT:    addi r3, r1, -48
 ; CHECK-NEXT:    stvx v3, 0, r3
 ; CHECK-NEXT:    addi r3, r1, -32
-; CHECK-NEXT:    vperm v2, v2, v2, v4
 ; CHECK-NEXT:    stvx v2, 0, r3
 ; CHECK-NEXT:    lwz r3, -48(r1)
 ; CHECK-NEXT:    lwz r4, -32(r1)
 ; CHECK-NEXT:    cmpw r4, r3
-; CHECK-NEXT:    bc 12, gt, .LBB0_2
-; CHECK-NEXT:    b .LBB0_3
-; CHECK-NEXT:  .LBB0_2: # %.preheader
+; CHECK-NEXT:    bc 12, gt, .LBB0_4
+; CHECK-NEXT:    b .LBB0_5
+; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    addi r3, r4, 0
-; CHECK-NEXT:  .LBB0_3: # %.preheader
+; CHECK-NEXT:  .LBB0_5:
+; CHECK-NEXT:    cmpw r3, r3
 ; CHECK-NEXT:    stw r3, -64(r1)
 ; CHECK-NEXT:    addi r3, r1, -64
 ; CHECK-NEXT:    lvx v2, 0, r3
 ; CHECK-NEXT:    addi r3, r1, -16
 ; CHECK-NEXT:    stvx v2, 0, r3
+; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    blr
   br i1 undef, label %7, label %1
 
@@ -55,4 +62,4 @@ define dso_local void @_ZN1a1bEv(<4 x float> %in) local_unnamed_addr #0 align 2
 
 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind optnone noinline }

From 12dbdc2a6b68162f7370e9754bdb0e1edd65bf3c Mon Sep 17 00:00:00 2001
From: Adam Balogh <adam.balogh@ericsson.com>
Date: Tue, 26 May 2020 20:43:37 +0200
Subject: [PATCH 127/770] [Analyzer] Fix buildbot failure of commit
 rGd70ec366c91b

---
 .../clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index b32302cfc3378..3611979c61911 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -97,7 +97,7 @@ class SymbolManager;
 class SwitchNodeBuilder;
 
 class ExprEngine {
-  virtual void anchor();
+  void anchor();
 public:
   /// The modes of inlining, which override the default analysis-wide settings.
   enum InliningModes {

From 7eb666b1556b86503f2f386bf921186cdbb2d22a Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Fri, 15 May 2020 12:30:07 -0500
Subject: [PATCH 128/770] [PowerPC] Add support for -mcpu=pwr10 in both clang
 and llvm

Summary:
This patch simply adds support for the new CPU in anticipation of
Power10. There isn't really any functionality added so there are no
associated test cases at this time.

Reviewers: stefanp, nemanjai, amyk, hfinkel, power-llvm-team, #powerpc

Reviewed By: stefanp, nemanjai, amyk, #powerpc

Subscribers: NeHuang, steven.zhang, hiraditya, llvm-commits, wuzish, shchenz, cfe-commits, kbarton, echristo

Tags: #clang, #powerpc, #llvm

Differential Revision: https://reviews.llvm.org/D80020
---
 clang/lib/Basic/Targets/PPC.cpp               | 41 +++++++++-----
 clang/lib/Basic/Targets/PPC.h                 | 43 ++++++++-------
 clang/lib/Driver/ToolChains/Arch/PPC.cpp      | 20 ++++---
 clang/test/Misc/target-invalid-cpu-note.c     |  2 +-
 clang/test/Preprocessor/init-ppc64.c          | 18 ++++++
 llvm/lib/Support/Host.cpp                     |  1 +
 llvm/lib/Target/PowerPC/PPC.td                | 27 +++++++--
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  3 +
 llvm/lib/Target/PowerPC/PPCSubtarget.cpp      |  1 +
 llvm/lib/Target/PowerPC/PPCSubtarget.h        | 55 ++++++++++---------
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |  9 ++-
 llvm/test/CodeGen/PowerPC/check-cpu.ll        |  6 +-
 12 files changed, 148 insertions(+), 78 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index 81c13a8104e8a..231f94b66f5fd 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -151,6 +151,8 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("_ARCH_PWR8");
   if (ArchDefs & ArchDefinePwr9)
     Builder.defineMacro("_ARCH_PWR9");
+  if (ArchDefs & ArchDefinePwr10)
+    Builder.defineMacro("_ARCH_PWR10");
   if (ArchDefs & ArchDefineA2)
     Builder.defineMacro("_ARCH_A2");
   if (ArchDefs & ArchDefineA2q) {
@@ -263,41 +265,51 @@ bool PPCTargetInfo::initFeatureMap(
                             .Case("pwr7", true)
                             .Case("pwr8", true)
                             .Case("pwr9", true)
+                            .Case("pwr10", true)
                             .Case("ppc64", true)
                             .Case("ppc64le", true)
                             .Default(false);
 
   Features["qpx"] = (CPU == "a2q");
-  Features["power9-vector"] = (CPU == "pwr9");
+  Features["power9-vector"] = llvm::StringSwitch<bool>(CPU)
+                                  .Case("pwr10", true)
+                                  .Case("pwr9", true)
+                                  .Default(false);
   Features["crypto"] = llvm::StringSwitch<bool>(CPU)
                            .Case("ppc64le", true)
+                           .Case("pwr10", true)
                            .Case("pwr9", true)
                            .Case("pwr8", true)
                            .Default(false);
   Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
                                   .Case("ppc64le", true)
+                                  .Case("pwr10", true)
                                   .Case("pwr9", true)
                                   .Case("pwr8", true)
                                   .Default(false);
   Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
                            .Case("ppc64le", true)
+                           .Case("pwr10", true)
                            .Case("pwr9", true)
                            .Case("pwr8", true)
                            .Case("pwr7", true)
                            .Default(false);
   Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
                            .Case("ppc64le", true)
+                           .Case("pwr10", true)
                            .Case("pwr9", true)
                            .Case("pwr8", true)
                            .Case("pwr7", true)
                            .Default(false);
   Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
                                 .Case("ppc64le", true)
+                                .Case("pwr10", true)
                                 .Case("pwr9", true)
                                 .Case("pwr8", true)
                                 .Default(false);
   Features["vsx"] = llvm::StringSwitch<bool>(CPU)
                         .Case("ppc64le", true)
+                        .Case("pwr10", true)
                         .Case("pwr9", true)
                         .Case("pwr8", true)
                         .Case("pwr7", true)
@@ -313,10 +325,10 @@ bool PPCTargetInfo::initFeatureMap(
                         .Case("e500", true)
                         .Default(false);
 
-  // Future CPU should include all of the features of Power 9 as well as any
+  // Future CPU should include all of the features of Power 10 as well as any
   // additional features (yet to be determined) specific to it.
   if (CPU == "future") {
-    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
+    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
     addFutureSpecificFeatures(Features);
   }
 
@@ -463,18 +475,17 @@ ArrayRef<TargetInfo::AddlRegName> PPCTargetInfo::getGCCAddlRegNames() const {
 }
 
 static constexpr llvm::StringLiteral ValidCPUNames[] = {
-    {"generic"},   {"440"},       {"450"},         {"601"},         {"602"},
-    {"603"},       {"603e"},      {"603ev"},       {"604"},         {"604e"},
-    {"620"},       {"630"},       {"g3"},          {"7400"},        {"g4"},
-    {"7450"},      {"g4+"},       {"750"},         {"8548"},        {"970"},
-    {"g5"},        {"a2"},        {"a2q"},         {"e500"},        {"e500mc"},
-    {"e5500"},     {"power3"},    {"pwr3"},        {"power4"},      {"pwr4"},
-    {"power5"},    {"pwr5"},      {"power5x"},     {"pwr5x"},       {"power6"},
-    {"pwr6"},      {"power6x"},   {"pwr6x"},       {"power7"},      {"pwr7"},
-    {"power8"},    {"pwr8"},      {"power9"},      {"pwr9"},        {"powerpc"},
-    {"ppc"},       {"powerpc64"}, {"ppc64"},       {"powerpc64le"}, {"ppc64le"},
-    {"future"}
-};
+    {"generic"},     {"440"},     {"450"},     {"601"},       {"602"},
+    {"603"},         {"603e"},    {"603ev"},   {"604"},       {"604e"},
+    {"620"},         {"630"},     {"g3"},      {"7400"},      {"g4"},
+    {"7450"},        {"g4+"},     {"750"},     {"8548"},      {"970"},
+    {"g5"},          {"a2"},      {"a2q"},     {"e500"},      {"e500mc"},
+    {"e5500"},       {"power3"},  {"pwr3"},    {"power4"},    {"pwr4"},
+    {"power5"},      {"pwr5"},    {"power5x"}, {"pwr5x"},     {"power6"},
+    {"pwr6"},        {"power6x"}, {"pwr6x"},   {"power7"},    {"pwr7"},
+    {"power8"},      {"pwr8"},    {"power9"},  {"pwr9"},      {"power10"},
+    {"pwr10"},       {"powerpc"}, {"ppc"},     {"powerpc64"}, {"ppc64"},
+    {"powerpc64le"}, {"ppc64le"}, {"future"}};
 
 bool PPCTargetInfo::isValidCPUName(StringRef Name) const {
   return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames);
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 7c19a96a99c74..3feda1853547f 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -43,13 +43,13 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
     ArchDefinePwr7 = 1 << 11,
     ArchDefinePwr8 = 1 << 12,
     ArchDefinePwr9 = 1 << 13,
-    ArchDefineFuture = 1 << 14,
-    ArchDefineA2 = 1 << 15,
-    ArchDefineA2q = 1 << 16,
-    ArchDefineE500 = 1 << 17
+    ArchDefinePwr10 = 1 << 14,
+    ArchDefineFuture = 1 << 15,
+    ArchDefineA2 = 1 << 16,
+    ArchDefineA2q = 1 << 17,
+    ArchDefineE500 = 1 << 18
   } ArchDefineTypes;
 
-
   ArchDefineTypes ArchDefs = ArchDefineNone;
   static const Builtin::Info BuiltinInfo[];
   static const char *const GCCRegNames[];
@@ -119,20 +119,20 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
               .Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q)
               .Cases("power3", "pwr3", ArchDefinePpcgr)
               .Cases("power4", "pwr4",
-                    ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+                     ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power5", "pwr5",
-                    ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                        ArchDefinePpcsq)
+                     ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                         ArchDefinePpcsq)
               .Cases("power5x", "pwr5x",
-                    ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
-                        ArchDefinePpcgr | ArchDefinePpcsq)
+                     ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
+                         ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power6", "pwr6",
-                    ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
-                        ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+                     ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
+                         ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power6x", "pwr6x",
-                    ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
-                        ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                        ArchDefinePpcsq)
+                     ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
+                         ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                         ArchDefinePpcsq)
               .Cases("power7", "pwr7",
                      ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
                          ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
@@ -146,11 +146,16 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                      ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
                          ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
                          ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+              .Cases("power10", "pwr10",
+                     ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 |
+                         ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
+                         ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                         ArchDefinePpcsq)
               .Case("future",
-                    ArchDefineFuture | ArchDefinePwr9 | ArchDefinePwr8 |
-                        ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
-                        ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                        ArchDefinePpcsq)
+                    ArchDefineFuture | ArchDefinePwr10 | ArchDefinePwr9 |
+                        ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
+                        ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
+                        ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("8548", "e500", ArchDefineE500)
               .Default(ArchDefineNone);
     }
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
index e5130a9485de7..144e276a6bd87 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
@@ -70,6 +70,7 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
         .Case("power7", "pwr7")
         .Case("power8", "pwr8")
         .Case("power9", "pwr9")
+        .Case("power10", "pwr10")
         .Case("future", "future")
         .Case("pwr3", "pwr3")
         .Case("pwr4", "pwr4")
@@ -80,6 +81,7 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
         .Case("pwr7", "pwr7")
         .Case("pwr8", "pwr8")
         .Case("pwr9", "pwr9")
+        .Case("pwr10", "pwr10")
         .Case("powerpc", "ppc")
         .Case("powerpc64", "ppc64")
         .Case("powerpc64le", "ppc64le")
@@ -91,14 +93,16 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
 
 const char *ppc::getPPCAsmModeForCPU(StringRef Name) {
   return llvm::StringSwitch<const char *>(Name)
-        .Case("pwr7", "-mpower7")
-        .Case("power7", "-mpower7")
-        .Case("pwr8", "-mpower8")
-        .Case("power8", "-mpower8")
-        .Case("ppc64le", "-mpower8")
-        .Case("pwr9", "-mpower9")
-        .Case("power9", "-mpower9")
-        .Default("-many");
+      .Case("pwr7", "-mpower7")
+      .Case("power7", "-mpower7")
+      .Case("pwr8", "-mpower8")
+      .Case("power8", "-mpower8")
+      .Case("ppc64le", "-mpower8")
+      .Case("pwr9", "-mpower9")
+      .Case("power9", "-mpower9")
+      .Case("pwr10", "-mpower10")
+      .Case("power10", "-mpower10")
+      .Default("-many");
 }
 
 void ppc::getPPCTargetFeatures(const Driver &D, const llvm::Triple &Triple,
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 9f036c94c3f8e..5c571fb458ec5 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -81,7 +81,7 @@
 // PPC-SAME: 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750,
 // PPC-SAME: 8548, 970, g5, a2, a2q, e500, e500mc, e5500, power3, pwr3, power4,
 // PPC-SAME: pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x,
-// PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, powerpc, ppc, powerpc64,
+// PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, powerpc64,
 // PPC-SAME: ppc64, powerpc64le, ppc64le, future
 
 // RUN: not %clang_cc1 -triple mips--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix MIPS
diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c
index b24f8eb7050be..ed8601636554e 100644
--- a/clang/test/Preprocessor/init-ppc64.c
+++ b/clang/test/Preprocessor/init-ppc64.c
@@ -627,12 +627,30 @@
 // PPCPOWER9:#define _ARCH_PWR7 1
 // PPCPOWER9:#define _ARCH_PWR9 1
 //
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr10 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER10 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power10 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER10 %s
+//
+// PPCPOWER10:#define _ARCH_PPC 1
+// PPCPOWER10:#define _ARCH_PPC64 1
+// PPCPOWER10:#define _ARCH_PPCGR 1
+// PPCPOWER10:#define _ARCH_PPCSQ 1
+// PPCPOWER10:#define _ARCH_PWR10 1
+// PPCPOWER10:#define _ARCH_PWR4 1
+// PPCPOWER10:#define _ARCH_PWR5 1
+// PPCPOWER10:#define _ARCH_PWR5X 1
+// PPCPOWER10:#define _ARCH_PWR6 1
+// PPCPOWER10-NOT:#define _ARCH_PWR6X 1
+// PPCPOWER10:#define _ARCH_PWR7 1
+// PPCPOWER10:#define _ARCH_PWR8 1
+// PPCPOWER10:#define _ARCH_PWR9 1
+//
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu future -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCFUTURE %s
 //
 // PPCFUTURE:#define _ARCH_PPC 1
 // PPCFUTURE:#define _ARCH_PPC64 1
 // PPCFUTURE:#define _ARCH_PPCGR 1
 // PPCFUTURE:#define _ARCH_PPCSQ 1
+// PPCFUTURE:#define _ARCH_PWR10 1
 // PPCFUTURE:#define _ARCH_PWR4 1
 // PPCFUTURE:#define _ARCH_PWR5 1
 // PPCFUTURE:#define _ARCH_PWR5X 1
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index d9b3cac5e8dc0..da68464c4a3d9 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -142,6 +142,7 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
       .Case("POWER8E", "pwr8")
       .Case("POWER8NVL", "pwr8")
       .Case("POWER9", "pwr9")
+      .Case("POWER10", "pwr10")
       // FIXME: If we get a simulator or machine with the capabilities of
       // mcpu=future, we should revisit this and add the name reported by the
       // simulator/machine.
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 1d1f11e498c20..a6c7868f6ac25 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -51,6 +51,7 @@ def DirectivePwr6x
 def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">;
 def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">;
 def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">;
+def DirectivePwr10: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR10", "">;
 def DirectivePwrFuture
     : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">;
 
@@ -205,6 +206,9 @@ def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
 def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
                                      "true",
                                      "Enable instructions added in ISA 3.0.">;
+def FeatureISA3_1 : SubtargetFeature<"isa-v31-instructions", "IsISA3_1",
+                                     "true",
+                                     "Enable instructions added in ISA 3.1.">;
 def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
                                         "Enable POWER9 Altivec instructions",
                                         [FeatureISA3_0, FeatureP8Altivec]>;
@@ -328,14 +332,25 @@ def ProcessorFeatures {
   list<SubtargetFeature> P9Features =
     !listconcat(P9InheritableFeatures, P9SpecificFeatures);
 
+  // Power10
+  // For P10 CPU we assume that all of the existing features from Power9
+  // still exist with the exception of those we know are Power9 specific.
+  list<SubtargetFeature> P10AdditionalFeatures =
+    [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
+     FeaturePCRelativeMemops];
+  list<SubtargetFeature> P10SpecificFeatures = [];
+  list<SubtargetFeature> P10InheritableFeatures =
+    !listconcat(P9InheritableFeatures, P10AdditionalFeatures);
+  list<SubtargetFeature> P10Features =
+    !listconcat(P10InheritableFeatures, P10SpecificFeatures);
+
   // Future
-  // For future CPU we assume that all of the existing features from Power 9
-  // still exist with the exception of those we know are Power 9 specific.
+  // For future CPU we assume that all of the existing features from Power10
+  // still exist with the exception of those we know are Power10 specific.
   list<SubtargetFeature> FutureAdditionalFeatures = [];
-  list<SubtargetFeature> FutureSpecificFeatures =
-    [FeaturePrefixInstrs, FeaturePCRelativeMemops];
+  list<SubtargetFeature> FutureSpecificFeatures = [];
   list<SubtargetFeature> FutureInheritableFeatures =
-    !listconcat(P9InheritableFeatures, FutureAdditionalFeatures);
+    !listconcat(P10InheritableFeatures, FutureAdditionalFeatures);
   list<SubtargetFeature> FutureFeatures =
     !listconcat(FutureInheritableFeatures, FutureSpecificFeatures);
 }
@@ -540,6 +555,8 @@ def : ProcessorModel<"pwr6x", G5Model,
 def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
 def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
 def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
+// No scheduler model yet.
+def : ProcessorModel<"pwr10", NoSchedModel, ProcessorFeatures.P10Features>;
 // No scheduler model for future CPU.
 def : ProcessorModel<"future", NoSchedModel,
                   ProcessorFeatures.FutureFeatures>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 42df83831113a..53f9ac678c7b7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1306,6 +1306,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   case PPC::DIR_PWR7:
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
+  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE:
     setPrefLoopAlignment(Align(16));
     setPrefFunctionAlignment(Align(16));
@@ -14913,6 +14914,7 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
   case PPC::DIR_PWR7:
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
+  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE: {
     if (!ML)
       break;
@@ -16103,6 +16105,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
       // vector        7       2      2
       return true;
     case PPC::DIR_PWR9:
+    case PPC::DIR_PWR10:
     case PPC::DIR_PWR_FUTURE:
       //  type        mul     add    shl
       // scalar        5       2      2
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index cfc54df13f792..2f332715d8cac 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -115,6 +115,7 @@ void PPCSubtarget::initializeEnvironment() {
   HasAddiLoadFusion = false;
   HasAddisLoadFusion = false;
   IsISA3_0 = false;
+  IsISA3_1 = false;
   UseLongCalls = false;
   SecurePlt = false;
   VectorsUseTwoUnits = false;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index be1143f903e8b..bfe39814e4cc8 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -34,32 +34,33 @@ class StringRef;
 
 namespace PPC {
   // -m directive values.
-  enum {
-    DIR_NONE,
-    DIR_32,
-    DIR_440,
-    DIR_601,
-    DIR_602,
-    DIR_603,
-    DIR_7400,
-    DIR_750,
-    DIR_970,
-    DIR_A2,
-    DIR_E500,
-    DIR_E500mc,
-    DIR_E5500,
-    DIR_PWR3,
-    DIR_PWR4,
-    DIR_PWR5,
-    DIR_PWR5X,
-    DIR_PWR6,
-    DIR_PWR6X,
-    DIR_PWR7,
-    DIR_PWR8,
-    DIR_PWR9,
-    DIR_PWR_FUTURE,
-    DIR_64
-  };
+enum {
+  DIR_NONE,
+  DIR_32,
+  DIR_440,
+  DIR_601,
+  DIR_602,
+  DIR_603,
+  DIR_7400,
+  DIR_750,
+  DIR_970,
+  DIR_A2,
+  DIR_E500,
+  DIR_E500mc,
+  DIR_E5500,
+  DIR_PWR3,
+  DIR_PWR4,
+  DIR_PWR5,
+  DIR_PWR5X,
+  DIR_PWR6,
+  DIR_PWR6X,
+  DIR_PWR7,
+  DIR_PWR8,
+  DIR_PWR9,
+  DIR_PWR10,
+  DIR_PWR_FUTURE,
+  DIR_64
+};
 }
 
 class GlobalValue;
@@ -138,6 +139,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool HasAddiLoadFusion;
   bool HasAddisLoadFusion;
   bool IsISA3_0;
+  bool IsISA3_1;
   bool UseLongCalls;
   bool SecurePlt;
   bool VectorsUseTwoUnits;
@@ -308,6 +310,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool hasHTM() const { return HasHTM; }
   bool hasFloat128() const { return HasFloat128; }
   bool isISA3_0() const { return IsISA3_0; }
+  bool isISA3_1() const { return IsISA3_1; }
   bool useLongCalls() const { return UseLongCalls; }
   bool hasFusion() const { return HasFusion; }
   bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index a41c6b41a991b..46c5335a558f4 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -651,11 +651,12 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
   if (CacheLineSize.getNumOccurrences() > 0)
     return CacheLineSize;
 
-  // On P7, P8 or P9 we have a cache line size of 128.
+  // Starting with P7 we have a cache line size of 128.
   unsigned Directive = ST->getCPUDirective();
   // Assume that Future CPU has the same cache line size as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
-      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+      Directive == PPC::DIR_PWR_FUTURE)
     return 128;
 
   // On other processors return a default of 64 bytes.
@@ -687,9 +688,11 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // For P7 and P8, floating-point instructions have a 6-cycle latency and
   // there are two execution units, so unroll by 12x for latency hiding.
   // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
   // Assume that future is the same as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
-      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+      Directive == PPC::DIR_PWR_FUTURE)
     return 12;
 
   // For most things, modern systems have two execution units (and
diff --git a/llvm/test/CodeGen/PowerPC/check-cpu.ll b/llvm/test/CodeGen/PowerPC/check-cpu.ll
index baa39024ebe8d..132be3058216b 100644
--- a/llvm/test/CodeGen/PowerPC/check-cpu.ll
+++ b/llvm/test/CodeGen/PowerPC/check-cpu.ll
@@ -2,9 +2,13 @@
 ; RUN:     -mcpu=future < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:     -mcpu=future < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=power10 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr10 < %s | FileCheck %s
 
 
-; Test mcpu=future that should be recognized on PowerPC.
+; Test -mcpu=[pwr10|future] is recognized on PowerPC.
 
 ; CHECK-NOT: is not a recognized processor for this target
 ; CHECK:     .text

From 0788392637f414c312a995f3202177a2919eba2f Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 26 May 2020 13:27:16 -0400
Subject: [PATCH 129/770] [InstCombine] add tests for reassociative sub/add
 expressions; NFC

---
 llvm/test/Transforms/InstCombine/sub.ll | 83 +++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index f6fa797eb0c82..52c51c70f02cf 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -1538,3 +1538,86 @@ define i8 @test75(i8 %x) {
   %t1 = sub i8 %x, %t0
   ret i8 %t1
 }
+
+; ((w-x) + y) - z --> (w+y) - (x+z)
+
+define i8 @sub_add_sub_reassoc(i8 %w, i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sub_add_sub_reassoc(
+; CHECK-NEXT:    [[S1:%.*]] = sub i8 [[W:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = add i8 [[S1]], [[Y:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = sub i8 [[A]], [[Z:%.*]]
+; CHECK-NEXT:    ret i8 [[S2]]
+;
+  %s1 = sub i8 %w, %x
+  %a = add i8 %s1, %y
+  %s2 = sub i8 %a, %z
+  ret i8 %s2
+}
+
+; vectors work too.
+
+define <2 x i8> @sub_add_sub_reassoc_commute(<2 x i8> %w, <2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
+; CHECK-LABEL: @sub_add_sub_reassoc_commute(
+; CHECK-NEXT:    [[D:%.*]] = sdiv <2 x i8> [[Y:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[S1:%.*]] = sub <2 x i8> [[W:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[D]], [[S1]]
+; CHECK-NEXT:    [[S2:%.*]] = sub <2 x i8> [[A]], [[Z:%.*]]
+; CHECK-NEXT:    ret <2 x i8> [[S2]]
+;
+  %d = sdiv <2 x i8> %y, <i8 42, i8 -42> ; thwart complexity-based canonicalization
+  %s1 = sub <2 x i8> %w, %x
+  %a = add <2 x i8> %d, %s1
+  %s2 = sub <2 x i8> %a, %z
+  ret <2 x i8> %s2
+}
+
+; (v-w) + (x-y) - z --> (v+x) - (w+y+z)
+
+define i8 @sub_add_sub_reassoc_twice(i8 %v, i8 %w, i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sub_add_sub_reassoc_twice(
+; CHECK-NEXT:    [[S1:%.*]] = sub i8 [[V:%.*]], [[W:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = add i8 [[S1]], [[S2]]
+; CHECK-NEXT:    [[S3:%.*]] = sub i8 [[A]], [[Z:%.*]]
+; CHECK-NEXT:    ret i8 [[S3]]
+;
+  %s1 = sub i8 %v, %w
+  %s2 = sub i8 %x, %y
+  %a = add i8 %s1, %s2
+  %s3 = sub i8 %a, %z
+  ret i8 %s3
+}
+
+; negative test - uses
+
+define i8 @sub_add_sub_reassoc_use1(i8 %w, i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sub_add_sub_reassoc_use1(
+; CHECK-NEXT:    [[S1:%.*]] = sub i8 [[W:%.*]], [[X:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[S1]])
+; CHECK-NEXT:    [[A:%.*]] = add i8 [[S1]], [[Y:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = sub i8 [[A]], [[Z:%.*]]
+; CHECK-NEXT:    ret i8 [[S2]]
+;
+  %s1 = sub i8 %w, %x
+  call void @use8(i8 %s1)
+  %a = add i8 %s1, %y
+  %s2 = sub i8 %a, %z
+  ret i8 %s2
+}
+
+; negative test - uses
+
+define i8 @sub_add_sub_reassoc_use2(i8 %w, i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sub_add_sub_reassoc_use2(
+; CHECK-NEXT:    [[S1:%.*]] = sub i8 [[W:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = add i8 [[S1]], [[Y:%.*]]
+; CHECK-NEXT:    call void @use8(i8 [[A]])
+; CHECK-NEXT:    [[S2:%.*]] = sub i8 [[A]], [[Z:%.*]]
+; CHECK-NEXT:    ret i8 [[S2]]
+;
+  %s1 = sub i8 %w, %x
+  %a = add i8 %s1, %y
+  call void @use8(i8 %a)
+  %s2 = sub i8 %a, %z
+  ret i8 %s2
+}

From f5cfcc4b0638eaca9194776309d16cd59c1f961b Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 26 May 2020 14:30:48 -0400
Subject: [PATCH 130/770] [LoopVectorize] regenerate full test checks; NFC

---
 .../LoopVectorize/interleaved-accesses.ll     | 52 ++++++++++++++-----
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 5e5dc5e74f1c1..b82d47fced554 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -133,20 +133,46 @@ for.end:                                          ; preds = %for.body
 ;   return r;
 ; }
 
-; CHECK-LABEL: @test_struct_load4(
-; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
-; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK: add <4 x i32>
-; CHECK: sub <4 x i32>
-; CHECK: add <4 x i32>
-; CHECK: sub <4 x i32>
-
 %struct.ST4 = type { i32, i32, i32, i32 }
 
 define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
+; CHECK-LABEL: @test_struct_load4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[STRIDED_VEC2]]
+; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP4]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !7
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
+;
 entry:
   br label %for.body
 
@@ -187,7 +213,7 @@ for.end:                                          ; preds = %for.body
 ; }
 
 ; CHECK-LABEL: @test_struct_store4(
-; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* 
+; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
 ; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
@@ -509,7 +535,7 @@ for.body:                                         ; preds = %for.body, %entry
 ;   int a;
 ;   float b;
 ; };
-; 
+;
 ; int SA;
 ; float SB;
 ;

From 1a2bffaf8b4567663f3001bd9c7532322e89f990 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 26 May 2020 14:32:57 -0400
Subject: [PATCH 131/770] [InstCombine] reassociate sub+add to increase adds
 and throughput

The -reassociate pass tends to transform this kind of pattern into
something that is worse for vectorization and codegen. See PR43953:
https://bugs.llvm.org/show_bug.cgi?id=43953

Follows up the FP version of the same transform:
rGa0ce2338a083
---
 .../InstCombine/InstCombineAddSub.cpp         | 11 ++++++++++
 llvm/test/Transforms/InstCombine/sub.ll       | 20 +++++++++----------
 .../LoopVectorize/interleaved-accesses.ll     |  6 +++---
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 233e0c7b5de72..288d0d148689e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1765,6 +1765,17 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
   if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
     return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
 
+  // Reassociate sub/add sequences to create more add instructions and
+  // reduce dependency chains:
+  // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+  Value *Z;
+  if (match(Op0, m_OneUse(m_c_Add(m_OneUse(m_Sub(m_Value(X), m_Value(Y))),
+                                  m_Value(Z))))) {
+    Value *XZ = Builder.CreateAdd(X, Z);
+    Value *YW = Builder.CreateAdd(Y, Op1);
+    return BinaryOperator::CreateSub(XZ, YW);
+  }
+
   if (Constant *C = dyn_cast<Constant>(Op0)) {
     Value *X;
     if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index 52c51c70f02cf..9463cea877b92 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -1543,9 +1543,9 @@ define i8 @test75(i8 %x) {
 
 define i8 @sub_add_sub_reassoc(i8 %w, i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @sub_add_sub_reassoc(
-; CHECK-NEXT:    [[S1:%.*]] = sub i8 [[W:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = add i8 [[S1]], [[Y:%.*]]
-; CHECK-NEXT:    [[S2:%.*]] = sub i8 [[A]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[W:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = sub i8 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i8 [[S2]]
 ;
   %s1 = sub i8 %w, %x
@@ -1559,9 +1559,9 @@ define i8 @sub_add_sub_reassoc(i8 %w, i8 %x, i8 %y, i8 %z) {
 define <2 x i8> @sub_add_sub_reassoc_commute(<2 x i8> %w, <2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
 ; CHECK-LABEL: @sub_add_sub_reassoc_commute(
 ; CHECK-NEXT:    [[D:%.*]] = sdiv <2 x i8> [[Y:%.*]], <i8 42, i8 -42>
-; CHECK-NEXT:    [[S1:%.*]] = sub <2 x i8> [[W:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[D]], [[S1]]
-; CHECK-NEXT:    [[S2:%.*]] = sub <2 x i8> [[A]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i8> [[D]], [[W:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = sub <2 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <2 x i8> [[S2]]
 ;
   %d = sdiv <2 x i8> %y, <i8 42, i8 -42> ; thwart complexity-based canonicalization
@@ -1575,10 +1575,10 @@ define <2 x i8> @sub_add_sub_reassoc_commute(<2 x i8> %w, <2 x i8> %x, <2 x i8>
 
 define i8 @sub_add_sub_reassoc_twice(i8 %v, i8 %w, i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @sub_add_sub_reassoc_twice(
-; CHECK-NEXT:    [[S1:%.*]] = sub i8 [[V:%.*]], [[W:%.*]]
-; CHECK-NEXT:    [[S2:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = add i8 [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = sub i8 [[A]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[W:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X:%.*]], [[V:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i8 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    [[S3:%.*]] = sub i8 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret i8 [[S3]]
 ;
   %s1 = sub i8 %v, %w
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index b82d47fced554..f7a02d613af1b 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -152,9 +152,9 @@ define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[STRIDED_VEC2]]
-; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP4]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6

From 713538b629e45e6236b5d60fd6b64d7b8669cd00 Mon Sep 17 00:00:00 2001
From: Eric Christopher <echristo@gmail.com>
Date: Tue, 26 May 2020 11:58:31 -0700
Subject: [PATCH 132/770] Be more specific about auto * vs auto for po alias.

---
 lldb/source/Interpreter/CommandInterpreter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index 1cd71b07eaeb7..61288fc42131a 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -356,7 +356,7 @@ void CommandInterpreter::Initialize() {
     AddAlias("p", cmd_obj_sp, "--")->SetHelpLong("");
     AddAlias("print", cmd_obj_sp, "--")->SetHelpLong("");
     AddAlias("call", cmd_obj_sp, "--")->SetHelpLong("");
-    if (auto po = AddAlias("po", cmd_obj_sp, "-O --")) {
+    if (auto *po = AddAlias("po", cmd_obj_sp, "-O --")) {
       po->SetHelp("Evaluate an expression on the current thread.  Displays any "
                   "returned value with formatting "
                   "controlled by the type's author.");

From fca76b79456c916fd2ce193ef76d6e795bd9c105 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Tue, 26 May 2020 12:04:14 -0700
Subject: [PATCH 133/770] Roll variables into an LLVM_DEBUG block to address
 -Wunused-but-set-variable

---
 llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
index cc7a7e2ca9cc8..ee219724ee469 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
@@ -83,8 +83,6 @@ class ELFLinkGraphBuilder_x86_64 {
 
       for (auto SymRef : *Symbols) {
         Optional<StringRef> Name;
-        unsigned char Binding;
-        uint64_t Value;
         uint64_t Size = 0;
 
         // FIXME: Read size.
@@ -95,17 +93,15 @@ class ELFLinkGraphBuilder_x86_64 {
         } else {
           return NameOrErr.takeError();
         }
-        Binding = SymRef.getBinding();
-        Value = SymRef.getValue();
         LLVM_DEBUG({
           dbgs() << "  ";
           if (!Name)
             dbgs() << "<anonymous symbol>";
           else
             dbgs() << *Name;
-          dbgs() << ": value = " << formatv("{0:x16}", Value)
+          dbgs() << ": value = " << formatv("{0:x16}", SymRef.getValue())
                  << ", type = " << formatv("{0:x2}", SymRef.getType())
-                 << ", binding = " << Binding
+                 << ", binding = " << SymRef.getBinding()
                  << ", size =" << Size;
           dbgs() << "\n";
         });

From ae903f0313e481520eff8a13044070aca4d0b75d Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 12:14:32 -0700
Subject: [PATCH 134/770] [lldb/Test] Reinstate FoundationSymtabTestCase

---
 .../lang/objc/foundation/TestSymbolTable.py   | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
index b77a8dfc0ed90..f3331e829c27e 100644
--- a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
+++ b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
@@ -40,13 +40,25 @@ def test_with_python_api(self):
         process = target.LaunchSimple(
             None, None, self.get_process_working_directory())
 
-        #
-        # Exercise Python APIs to access the symbol table entries.
-        #
-
         # Create the filespec by which to locate our a.out module.
         filespec = lldb.SBFileSpec(exe, False)
 
         module = target.FindModule(filespec)
         self.assertTrue(module, VALID_MODULE)
 
+        # Create the set of known symbols.  As we iterate through the symbol
+        # table, remove the symbol from the set if it is a known symbol.
+        expected_symbols = set(self.symbols_list)
+        for symbol in module:
+            self.assertTrue(symbol, VALID_SYMBOL)
+            #print("symbol:", symbol)
+            name = symbol.GetName()
+            if name in expected_symbols:
+                #print("Removing %s from known_symbols %s" % (name, expected_symbols))
+                expected_symbols.remove(name)
+
+        # At this point, the expected_symbols set should have become an empty set.
+        # If not, raise an error.
+        #print("symbols unaccounted for:", expected_symbols)
+        self.assertTrue(len(expected_symbols) == 0,
+                        "All the known symbols are accounted for")

From ef94f60ff7954521e6ff1be044a4a5d0599ce4ef Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 26 May 2020 22:16:13 +0300
Subject: [PATCH 135/770] [MSSA][Doc] Fix typo

---
 llvm/docs/MemorySSA.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/MemorySSA.rst b/llvm/docs/MemorySSA.rst
index 4f96c83a032d2..99adc8b11911b 100644
--- a/llvm/docs/MemorySSA.rst
+++ b/llvm/docs/MemorySSA.rst
@@ -64,7 +64,7 @@ or indireclty. For example in:
 
 ``d`` is connected directly with ``c`` and indirectly with ``b``.
 This means that ``d`` potentially clobbers (see below) ``c`` *or*
-``b`` *or* both. This in turn implies that without the use of `The walker_`,
+``b`` *or* both. This in turn implies that without the use of `The walker`_,
 initially every ``MemoryDef`` clobbers every other ``MemoryDef``.
 
 ``MemoryPhi``\ s are ``PhiNode``\ s, but for memory operations. If at any

From c4dbe59ae8253d73b63e5fcce0bc8bc44b4d07b5 Mon Sep 17 00:00:00 2001
From: Pete Steinfeld <psteinfeld@nvidia.com>
Date: Fri, 22 May 2020 12:08:56 -0700
Subject: [PATCH 136/770] [flang] Fixes for problems with declaring procedure
 entities

Summary:
We were not detecting the declaration of multiple interfaces for the same
procedure.  Also, we were not handling the initialization of entities where
the associated Symbol had previously had errors.

I added the function `IsInterfaceSet()` to ProcEntityDetails to see if
the value of `interface_` had been previously set.  I then checked this
function before calling `set_interface()` and emitted an error message if
the interface was already set.

Also, in situations where we were emitting error messages for symbols, I
set the Error flag on the Symbol.  Then when performing initialization
on the Symbol, I first check to see if the Symbol had an error.

Reviewers: tskeith, klausler, DavidTruby

Subscribers: llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80453
---
 flang/include/flang/Semantics/symbol.h |  8 ++-
 flang/lib/Semantics/resolve-names.cpp  | 96 ++++++++++++++++----------
 flang/test/Semantics/resolve91.f90     | 46 ++++++++++++
 3 files changed, 111 insertions(+), 39 deletions(-)
 create mode 100644 flang/test/Semantics/resolve91.f90

diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 2a95f483a173e..34e4ea95eb4af 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -218,7 +218,13 @@ class ProcEntityDetails : public EntityDetails, public WithPassArg {
 
   const ProcInterface &interface() const { return interface_; }
   ProcInterface &interface() { return interface_; }
-  void set_interface(const ProcInterface &interface) { interface_ = interface; }
+  void set_interface(const ProcInterface &interface) {
+    CHECK(!IsInterfaceSet());
+    interface_ = interface;
+  }
+  bool IsInterfaceSet() {
+    return interface_.symbol() != nullptr || interface_.type() != nullptr;
+  }
   inline bool HasExplicitInterface() const;
 
   // Be advised: !init().has_value() => uninitialized pointer,
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 175d02597dfa2..3b60969b122a7 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -3435,18 +3435,25 @@ Symbol &DeclarationVisitor::DeclareProcEntity(
     const parser::Name &name, Attrs attrs, const ProcInterface &interface) {
   Symbol &symbol{DeclareEntity<ProcEntityDetails>(name, attrs)};
   if (auto *details{symbol.detailsIf<ProcEntityDetails>()}) {
-    if (interface.type()) {
-      symbol.set(Symbol::Flag::Function);
-    } else if (interface.symbol()) {
-      if (interface.symbol()->test(Symbol::Flag::Function)) {
+    if (details->IsInterfaceSet()) {
+      SayWithDecl(name, symbol,
+          "The interface for procedure '%s' has already been "
+          "declared"_err_en_US);
+      context().SetError(symbol);
+    } else {
+      if (interface.type()) {
         symbol.set(Symbol::Flag::Function);
-      } else if (interface.symbol()->test(Symbol::Flag::Subroutine)) {
-        symbol.set(Symbol::Flag::Subroutine);
+      } else if (interface.symbol()) {
+        if (interface.symbol()->test(Symbol::Flag::Function)) {
+          symbol.set(Symbol::Flag::Function);
+        } else if (interface.symbol()->test(Symbol::Flag::Subroutine)) {
+          symbol.set(Symbol::Flag::Subroutine);
+        }
       }
+      details->set_interface(interface);
+      SetBindNameOn(symbol);
+      SetPassNameOn(symbol);
     }
-    details->set_interface(interface);
-    SetBindNameOn(symbol);
-    SetPassNameOn(symbol);
   }
   return symbol;
 }
@@ -3460,18 +3467,22 @@ Symbol &DeclarationVisitor::DeclareObjectEntity(
     }
     if (!arraySpec().empty()) {
       if (details->IsArray()) {
-        Say(name,
-            "The dimensions of '%s' have already been declared"_err_en_US);
-        context().SetError(symbol);
+        if (!context().HasError(symbol)) {
+          Say(name,
+              "The dimensions of '%s' have already been declared"_err_en_US);
+          context().SetError(symbol);
+        }
       } else {
         details->set_shape(arraySpec());
       }
     }
     if (!coarraySpec().empty()) {
       if (details->IsCoarray()) {
-        Say(name,
-            "The codimensions of '%s' have already been declared"_err_en_US);
-        context().SetError(symbol);
+        if (!context().HasError(symbol)) {
+          Say(name,
+              "The codimensions of '%s' have already been declared"_err_en_US);
+          context().SetError(symbol);
+        }
       } else {
         details->set_coshape(coarraySpec());
       }
@@ -3913,7 +3924,7 @@ bool DeclarationVisitor::Pre(const parser::ProcComponentDefStmt &) {
   CHECK(!interfaceName_);
   return true;
 }
-void DeclarationVisitor::Post(const parser::ProcComponentDefStmt &stmt) {
+void DeclarationVisitor::Post(const parser::ProcComponentDefStmt &) {
   interfaceName_ = nullptr;
 }
 bool DeclarationVisitor::Pre(const parser::ProcPointerInit &x) {
@@ -4702,9 +4713,11 @@ void DeclarationVisitor::SetType(
   } else if (!symbol.test(Symbol::Flag::Implicit)) {
     SayWithDecl(
         name, symbol, "The type of '%s' has already been declared"_err_en_US);
+    context().SetError(symbol);
   } else if (type != *prevType) {
     SayWithDecl(name, symbol,
         "The type of '%s' has already been implicitly declared"_err_en_US);
+    context().SetError(symbol);
   } else {
     symbol.set(Symbol::Flag::Implicit, false);
   }
@@ -5697,17 +5710,21 @@ void DeclarationVisitor::PointerInitialization(
     const parser::Name &name, const parser::InitialDataTarget &target) {
   if (name.symbol) {
     Symbol &ultimate{name.symbol->GetUltimate()};
-    if (IsPointer(ultimate)) {
-      if (auto *details{ultimate.detailsIf<ObjectEntityDetails>()}) {
-        CHECK(!details->init());
-        Walk(target);
-        if (MaybeExpr expr{EvaluateExpr(target)}) {
-          CheckInitialDataTarget(ultimate, *expr, target.value().source);
-          details->set_init(std::move(*expr));
+    if (!context().HasError(ultimate)) {
+      if (IsPointer(ultimate)) {
+        if (auto *details{ultimate.detailsIf<ObjectEntityDetails>()}) {
+          CHECK(!details->init());
+          Walk(target);
+          if (MaybeExpr expr{EvaluateExpr(target)}) {
+            CheckInitialDataTarget(ultimate, *expr, target.value().source);
+            details->set_init(std::move(*expr));
+          }
         }
+      } else {
+        Say(name,
+            "'%s' is not a pointer but is initialized like one"_err_en_US);
+        context().SetError(ultimate);
       }
-    } else {
-      Say(name, "'%s' is not a pointer but is initialized like one"_err_en_US);
     }
   }
 }
@@ -5715,22 +5732,25 @@ void DeclarationVisitor::PointerInitialization(
     const parser::Name &name, const parser::ProcPointerInit &target) {
   if (name.symbol) {
     Symbol &ultimate{name.symbol->GetUltimate()};
-    if (IsProcedurePointer(ultimate)) {
-      auto &details{ultimate.get<ProcEntityDetails>()};
-      CHECK(!details.init());
-      Walk(target);
-      if (const auto *targetName{std::get_if<parser::Name>(&target.u)}) {
-        CheckInitialProcTarget(ultimate, *targetName, name.source);
-        if (targetName->symbol) {
-          details.set_init(*targetName->symbol);
+    if (!context().HasError(ultimate)) {
+      if (IsProcedurePointer(ultimate)) {
+        auto &details{ultimate.get<ProcEntityDetails>()};
+        CHECK(!details.init());
+        Walk(target);
+        if (const auto *targetName{std::get_if<parser::Name>(&target.u)}) {
+          CheckInitialProcTarget(ultimate, *targetName, name.source);
+          if (targetName->symbol) {
+            details.set_init(*targetName->symbol);
+          }
+        } else {
+          details.set_init(nullptr); // explicit NULL()
         }
       } else {
-        details.set_init(nullptr); // explicit NULL()
+        Say(name,
+            "'%s' is not a procedure pointer but is initialized "
+            "like one"_err_en_US);
+        context().SetError(ultimate);
       }
-    } else {
-      Say(name,
-          "'%s' is not a procedure pointer but is initialized "
-          "like one"_err_en_US);
     }
   }
 }
diff --git a/flang/test/Semantics/resolve91.f90 b/flang/test/Semantics/resolve91.f90
new file mode 100644
index 0000000000000..f55ca865cf3c0
--- /dev/null
+++ b/flang/test/Semantics/resolve91.f90
@@ -0,0 +1,46 @@
+! RUN: %S/test_errors.sh %s %t %f18
+! Tests for duplicate definitions and initializations, mostly of procedures
+module m
+  procedure(real), pointer :: p
+  !ERROR: The interface for procedure 'p' has already been declared
+  procedure(integer), pointer :: p
+end
+
+module m1
+    real, dimension(:), pointer :: realArray => null()
+    !ERROR: The type of 'realarray' has already been declared
+    real, dimension(:), pointer :: realArray => localArray
+end module m1
+
+module m2
+  interface
+    subroutine sub()
+    end subroutine sub
+  end interface
+
+  procedure(sub), pointer :: p1 => null()
+  !ERROR: The interface for procedure 'p1' has already been declared
+  procedure(sub), pointer :: p1 => null()
+
+end module m2
+
+module m3
+  interface
+    real function fun()
+    end function fun
+  end interface
+
+  procedure(fun), pointer :: f1 => null()
+  !ERROR: The interface for procedure 'f1' has already been declared
+  procedure(fun), pointer :: f1 => null()
+
+end module m3
+
+module m4
+  real, dimension(:), pointer :: localArray => null()
+  type :: t2
+    real, dimension(:), pointer :: realArray => null()
+    !ERROR: Component 'realarray' is already declared in this derived type
+    real, dimension(:), pointer :: realArray => localArray
+  end type
+end module m4

From e09064e97f293491e59b30569033c8962129bdeb Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 24 May 2020 18:06:56 -0400
Subject: [PATCH 137/770] AMDGPU: Update store node checks for atomics

Prepare to switch to using StoreSDNode for atomic stores.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 50ee0856377a2..52823c16d72d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2730,7 +2730,7 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
           (
             Subtarget->getScalarizeGlobalBehavior() &&
             Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
-            !Ld->isVolatile() &&
+            Ld->isSimple() &&
             !N->isDivergent() &&
             static_cast<const SITargetLowering *>(
               getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1edc42adf9de8..d1891e25e5f29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2920,7 +2920,7 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
     return SDValue();
 
   LoadSDNode *LN = cast<LoadSDNode>(N);
-  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
     return SDValue();
 
   SDLoc SL(N);
@@ -2974,7 +2974,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
     return SDValue();
 
   StoreSDNode *SN = cast<StoreSDNode>(N);
-  if (SN->isVolatile() || !ISD::isNormalStore(SN))
+  if (!SN->isSimple() || !ISD::isNormalStore(SN))
     return SDValue();
 
   EVT VT = SN->getMemoryVT();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2c147fa8947c1..a2f5b6cdeec25 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7625,7 +7625,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       AS == AMDGPUAS::GLOBAL_ADDRESS) {
     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
-        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+        Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
         Alignment >= 4 && NumElements < 32) {
       if (MemVT.isPow2VectorType())
         return SDValue();

From e99d50d8440efe8fa3515db4dae873ba39810dfd Mon Sep 17 00:00:00 2001
From: Shoaib Meenai <smeenai@fb.com>
Date: Tue, 26 May 2020 12:22:03 -0700
Subject: [PATCH 138/770] [Support] Remove stale comment

Clang has supported __builtin_assume_aligned since r217349 back in 2014,
so the comment is very out of date.
---
 llvm/include/llvm/Support/Compiler.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h
index 79af6f5d3c686..80ea76240d6cf 100644
--- a/llvm/include/llvm/Support/Compiler.h
+++ b/llvm/include/llvm/Support/Compiler.h
@@ -373,7 +373,6 @@
 #if __has_builtin(__builtin_assume_aligned) || LLVM_GNUC_PREREQ(4, 7, 0)
 # define LLVM_ASSUME_ALIGNED(p, a) __builtin_assume_aligned(p, a)
 #elif defined(LLVM_BUILTIN_UNREACHABLE)
-// As of today, clang does not support __builtin_assume_aligned.
 # define LLVM_ASSUME_ALIGNED(p, a) \
            (((uintptr_t(p) % (a)) == 0) ? (p) : (LLVM_BUILTIN_UNREACHABLE, (p)))
 #else

From ba10daa820fa868816eed2b85e70197d354ebfe6 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Tue, 26 May 2020 15:34:57 -0400
Subject: [PATCH 139/770] [mlir][Vector] Add more vector.contract ->
 outerproduct lowerings and fix vector.contract type inference.

This revision expands the types of vector contractions that can be lowered to vector.outerproduct.
All 8 permutation cases are supported.
The idiomatic manipulation of AffineMap written declaratively makes this straightforward.

In the process a bug with the vector.contract verifier was uncovered.
The vector shape verification part of the contract op is rewritten to use AffineMap composition.
One bug in the vector `ops.mlir` test is fixed and a new case not yet captured is added
to the vector `invalid.mlir` test.

Differential Revision: https://reviews.llvm.org/D80393
---
 .../mlir/Dialect/Utils/StructuredOpsUtils.h   |  12 +
 mlir/lib/Dialect/Vector/VectorOps.cpp         |  67 ++++--
 mlir/lib/Dialect/Vector/VectorTransforms.cpp  | 133 ++++++++---
 mlir/test/Dialect/Vector/invalid.mlir         |  20 ++
 mlir/test/Dialect/Vector/ops.mlir             |  14 +-
 .../Vector/vector-contract-transforms.mlir    | 216 ++++++++++++++++++
 6 files changed, 408 insertions(+), 54 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
index 5a36aabfab75e..02d2762560767 100644
--- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
@@ -81,12 +81,24 @@ constexpr StringRef getPaddingAttrName() { return "padding"; }
 
 /// Use to encode that a particular iterator type has parallel semantics.
 constexpr StringRef getParallelIteratorTypeName() { return "parallel"; }
+constexpr bool isParallelIterator(Attribute attr) {
+  auto strAttr = attr.dyn_cast_or_null<StringAttr>();
+  return strAttr && strAttr.getValue() == getParallelIteratorTypeName();
+}
 
 /// Use to encode that a particular iterator type has reduction semantics.
 constexpr StringRef getReductionIteratorTypeName() { return "reduction"; }
+constexpr bool isReductionIterator(Attribute attr) {
+  auto strAttr = attr.dyn_cast_or_null<StringAttr>();
+  return strAttr && strAttr.getValue() == getReductionIteratorTypeName();
+}
 
 /// Use to encode that a particular iterator type has window semantics.
 constexpr StringRef getWindowIteratorTypeName() { return "window"; }
+constexpr bool isWindowIterator(Attribute attr) {
+  auto strAttr = attr.dyn_cast_or_null<StringAttr>();
+  return strAttr && strAttr.getValue() == getWindowIteratorTypeName();
+}
 
 /// Use to encode that a particular iterator type has window semantics.
 inline ArrayRef<StringRef> getAllIteratorTypeNames() {
diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp
index 1574edb344941..63891d1004d4e 100644
--- a/mlir/lib/Dialect/Vector/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/VectorOps.cpp
@@ -223,8 +223,9 @@ static bool verifyDimMap(VectorType lhsType, VectorType rhsType,
   return true;
 }
 
-static bool verifyOutputShape(
-    VectorType lhsType, VectorType rhsType, Type accType, Type resType,
+static LogicalResult verifyOutputShape(
+    ContractionOp op, VectorType lhsType, VectorType rhsType, Type accType,
+    Type resType,
     const std::vector<std::pair<int64_t, int64_t>> &contractingDimMap,
     const std::vector<std::pair<int64_t, int64_t>> &batchDimMap) {
   DenseSet<int64_t> lhsContractingDimSet;
@@ -256,26 +257,56 @@ static bool verifyOutputShape(
   if (expectedResultDims.size() == 0) {
     // No batch or free dimension implies a scalar result.
     if (resType.isa<VectorType>() || accType.isa<VectorType>())
-      return false;
-
+      return op.emitOpError("invalid accumulator/result vector shape");
   } else {
     // At least one batch or free dimension implies a vector result.
     auto resVectorType = resType.dyn_cast<VectorType>();
     auto accVectorType = accType.dyn_cast<VectorType>();
     if (!resVectorType || !accVectorType)
-      return false;
-
-    // Verify dimension from 'resType' against 'expectedResultDims'.
-    if (resVectorType.getShape().size() != expectedResultDims.size() ||
-        accVectorType.getShape().size() != expectedResultDims.size())
-      return false;
-    for (int64_t i = 0, e = resVectorType.getRank(); i < e; ++i) {
-      if (resVectorType.getDimSize(i) != expectedResultDims[i] ||
-          accVectorType.getDimSize(i) != expectedResultDims[i])
-        return false;
+      return op.emitOpError("invalid accumulator/result vector shape");
+
+    // Infer expected result vector type. Lhs + rhs map and lhs + rhs vector
+    // types fully define the result vector type. This assumes the affine maps
+    // are well-formed, which must have been verified already.
+    MLIRContext *ctx = op.getContext();
+    AffineMap lhsMap = op.getIndexingMaps()[0];
+    AffineMap rhsMap = op.getIndexingMaps()[1];
+    SmallVector<AffineExpr, 4> extents(lhsMap.getNumInputs());
+    for (auto pair :
+         {std::make_pair(lhsType, lhsMap), std::make_pair(rhsType, rhsMap)}) {
+      VectorType v = pair.first;
+      auto map = pair.second;
+      for (unsigned idx = 0, e = v.getRank(); idx < e; ++idx) {
+        unsigned pos = map.getResult(idx).cast<AffineDimExpr>().getPosition();
+        if (!extents[pos])
+          extents[pos] = getAffineConstantExpr(v.getShape()[idx], ctx);
+      }
     }
+    assert(llvm::all_of(extents, [](AffineExpr e) { return e; }) &&
+           "expected extent along all dimensions.");
+
+    AffineMap resMap = op.getIndexingMaps()[2];
+    auto extentsMap = AffineMap::get(/*dimCount=*/extents.size(),
+                                     /*symCount=*/0, extents, ctx);
+    // Compose the resMap with the extentsMap, which is a constant map.
+    AffineMap expectedMap = simplifyAffineMap(resMap.compose(extentsMap));
+    assert(llvm::all_of(
+               expectedMap.getResults(),
+               [](AffineExpr e) { return e.isa<AffineConstantExpr>(); }) &&
+           "expected constant extent along all dimensions.");
+    // Extract the expected shape and build the type.
+    auto expectedShape = llvm::to_vector<4>(
+        llvm::map_range(expectedMap.getResults(), [](AffineExpr e) {
+          return e.cast<AffineConstantExpr>().getValue();
+        }));
+    auto expected =
+        VectorType::get(expectedShape, resVectorType.getElementType());
+    if (resVectorType != expected || accVectorType != expected)
+      return op.emitOpError(
+                 "invalid accumulator/result vector shape, expected: ")
+             << expected;
   }
-  return true;
+  return success();
 }
 
 static LogicalResult verify(ContractionOp op) {
@@ -329,9 +360,9 @@ static LogicalResult verify(ContractionOp op) {
     return op.emitOpError("invalid batch dimension map");
 
   // Verify 'accType' and 'resType' shape.
-  if (!verifyOutputShape(lhsType, rhsType, accType, resType, contractingDimMap,
-                         batchDimMap))
-    return op.emitOpError("invalid accumulator/result vector shape");
+  if (failed(verifyOutputShape(op, lhsType, rhsType, accType, resType,
+                               contractingDimMap, batchDimMap)))
+    return failure();
 
   // Verify that either two vector masks are set or none are set.
   auto lhsMaskType = op.getLHSVectorMaskType();
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
index 44ff03a04f223..491ad62affcbd 100644
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -1454,10 +1454,17 @@ ContractionOpToMatmulOpLowering::match(vector::ContractionOp op) const {
   if (llvm::size(op.masks()) != 0)
     return failure();
 
+  auto iteratorTypes = op.iterator_types().getValue();
+  if (!isParallelIterator(iteratorTypes[0]) ||
+      !isParallelIterator(iteratorTypes[1]) ||
+      !isReductionIterator(iteratorTypes[2]))
+    return failure();
+
   if (vectorTransformsOptions.vectorContractLowering !=
           vector::VectorContractLowering::Matmul ||
       !isRowMajorMatmul(op.indexing_maps()))
     return failure();
+
   return success();
 }
 
@@ -1503,34 +1510,8 @@ void ContractionOpToMatmulOpLowering::rewrite(vector::ContractionOp op,
 ///    %cK = vector.outerproduct %atRowK, %bRowK, %cK-1
 /// ```
 ///
-/// This only kicks in when VectorTransformsOptions is set to OuterProduct and
-/// the vector.contract op is a row-major matrix multiply.
-void ContractionOpToOuterProductOpLowering::rewrite(
-    vector::ContractionOp op, PatternRewriter &rewriter) const {
-  VectorType lhsType = op.getLhsType();
-  // TODO(ntv) other modes.
-  // We know we are in row-major.
-  bool transposeLhs = false;
-  unsigned reductionSize =
-      transposeLhs ? lhsType.getShape()[0] : lhsType.getShape()[1];
-
-  // If transposeLhs == false (i.e. lhs(m, reductionSize)), we need to
-  // transpose it to extract the proper vector<m x f32>. Otherwise, just take
-  // the lhs.
-  Value lhs = transposeLhs
-                  ? op.lhs()
-                  : rewriter.create<vector::TransposeOp>(
-                        op.getLoc(), op.lhs(), ArrayRef<int64_t>{1, 0});
-  Value res = op.acc();
-  // ExtractOp does not allow dynamic indexing, we must unroll explicitly.
-  for (unsigned k = 0; k < reductionSize; ++k) {
-    Value a = rewriter.create<vector::ExtractOp>(op.getLoc(), lhs, k);
-    Value b = rewriter.create<vector::ExtractOp>(op.getLoc(), op.rhs(), k);
-    res = rewriter.create<vector::OuterProductOp>(op.getLoc(), a, b, res);
-  }
-  rewriter.replaceOp(op, res);
-}
-
+/// This only kicks in when VectorTransformsOptions is set to OuterProduct but
+/// otherwise supports any layout permutation of the matrix-multiply.
 LogicalResult
 ContractionOpToOuterProductOpLowering ::match(vector::ContractionOp op) const {
   // TODO(ajcbik): implement masks
@@ -1538,12 +1519,104 @@ ContractionOpToOuterProductOpLowering ::match(vector::ContractionOp op) const {
     return failure();
 
   if (vectorTransformsOptions.vectorContractLowering !=
-          vector::VectorContractLowering::OuterProduct ||
-      !isRowMajorMatmul(op.indexing_maps()))
+      vector::VectorContractLowering::OuterProduct)
+    return failure();
+
+  // Transpose arguments to make them ready for lowering to OuterProduct. The
+  // constraint to match is that we must load full rows at a time with
+  // vector::ExtractOp.
+  using MapList = ArrayRef<ArrayRef<AffineExpr>>;
+  auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+  AffineExpr m, n, k;
+  bindDims(op.getContext(), m, n, k);
+  auto iteratorTypes = op.iterator_types().getValue();
+  if (!isParallelIterator(iteratorTypes[0]) ||
+      !isParallelIterator(iteratorTypes[1]) ||
+      !isReductionIterator(iteratorTypes[2]))
+    return failure();
+  SmallVector<AffineMap, 4> maps = op.getIndexingMaps();
+  // When lowering to outerproduct we can support all permutations.
+  if (maps != infer({{m, k}, {k, n}, {m, n}}) &&
+      maps != infer({{m, k}, {n, k}, {m, n}}) &&
+      maps != infer({{k, m}, {k, n}, {m, n}}) &&
+      maps != infer({{k, m}, {n, k}, {m, n}}) &&
+      maps != infer({{m, k}, {k, n}, {n, m}}) &&
+      maps != infer({{m, k}, {n, k}, {n, m}}) &&
+      maps != infer({{k, m}, {k, n}, {n, m}}) &&
+      maps != infer({{k, m}, {n, k}, {n, m}}))
     return failure();
   return success();
 }
 
+void ContractionOpToOuterProductOpLowering::rewrite(
+    vector::ContractionOp op, PatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  unsigned reductionSize = 0;
+  VectorType lhsType = op.getLhsType();
+  Value lhs = op.lhs(), rhs = op.rhs(), res = op.acc();
+
+  // Transpose arguments to make them ready for lowering to OuterProduct. The
+  // constraint to match is that we must load full rows at a time with
+  // vector::ExtractOp.
+  using MapList = ArrayRef<ArrayRef<AffineExpr>>;
+  auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
+  AffineExpr m, n, k;
+  bindDims(rewriter.getContext(), m, n, k);
+  SmallVector<int64_t, 2> perm{1, 0};
+  SmallVector<AffineMap, 4> maps = op.getIndexingMaps();
+  // First batch of cases, no need to output permute.
+  if (maps == infer({{m, k}, {k, n}, {m, n}})) {
+    // This is the classical row-major matmul. Just permute the lhs.
+    reductionSize = lhsType.getShape()[1];
+    lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
+  } else if (maps == infer({{m, k}, {n, k}, {m, n}})) {
+    // TODO: may be better to fail and use some vector<k> -> scalar reduction.
+    reductionSize = lhsType.getShape()[1];
+    lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
+    rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
+  } else if (maps == infer({{k, m}, {k, n}, {m, n}})) {
+    // No need to permute anything.
+    reductionSize = lhsType.getShape()[0];
+  } else if (maps == infer({{k, m}, {n, k}, {m, n}})) {
+    // Just permute the rhs.
+    reductionSize = lhsType.getShape()[0];
+    rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
+  }
+  // Second batch of cases, reshuffle to avoid output permute.
+  else if (maps == infer({{m, k}, {k, n}, {n, m}})) {
+    // Row-major matmul, transposed result: swap operands, permute the old lhs.
+    reductionSize = lhsType.getShape()[1];
+    Value tmp = rhs;
+    rhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
+    lhs = tmp;
+  } else if (maps == infer({{m, k}, {n, k}, {n, m}})) {
+    // TODO: may be better to fail and use some vector<k> -> scalar reduction.
+    reductionSize = lhsType.getShape()[1];
+    Value tmp = rhs;
+    rhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
+    lhs = rewriter.create<vector::TransposeOp>(loc, tmp, perm);
+  } else if (maps == infer({{k, m}, {k, n}, {n, m}})) {
+    // No need to permute anything, but still swap lhs and rhs.
+    reductionSize = lhsType.getShape()[0];
+    std::swap(lhs, rhs);
+  } else if (maps == infer({{k, m}, {n, k}, {n, m}})) {
+    // Swap lhs and rhs, permuting the original rhs.
+    reductionSize = lhsType.getShape()[0];
+    Value tmp = lhs;
+    lhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
+    rhs = tmp;
+  }
+  assert(reductionSize > 0);
+
+  // ExtractOp does not allow dynamic indexing, we must unroll explicitly.
+  for (unsigned k = 0; k < reductionSize; ++k) {
+    Value a = rewriter.create<vector::ExtractOp>(op.getLoc(), lhs, k);
+    Value b = rewriter.create<vector::ExtractOp>(op.getLoc(), rhs, k);
+    res = rewriter.create<vector::OuterProductOp>(op.getLoc(), a, b, res);
+  }
+  rewriter.replaceOp(op, res);
+}
+
 /// Progressive lowering of ContractionOp.
 /// One:
 ///   %x = vector.contract with at least one free/batch dimension
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index c18cf38edfc90..cc72511a6e782 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -767,6 +767,26 @@ func @contraction(%arg0: vector<4x3xi32>,
 
 // -----
 
+#contraction_accesses = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (n, m)>
+]
+#contraction_trait = {
+  indexing_maps = #contraction_accesses,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+func @contraction(%arg0: vector<2x1xf32>, %arg1: vector<1x3xf32>, %arg2: vector<2x3xf32>)
+-> vector<3x2xf32>
+{
+// expected-error@+1 {{invalid accumulator/result vector shape, expected: 'vector<3x2xf32>'}}
+  %0 = vector.contract #contraction_trait %arg0, %arg1, %arg2
+    : vector<2x1xf32>, vector<1x3xf32> into vector<2x3xf32>
+  return %0 : vector<2x3xf32>
+}
+
+// -----
+
 func @create_mask() {
   %c2 = constant 2 : index
   %c3 = constant 3 : index
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index c194cbe238117..57c03c903fe89 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -160,9 +160,11 @@ func @contraction_to_scalar(%arg0: vector<10xf32>, %arg1: vector<10xf32>) -> f32
   indexing_maps = #contraction_accesses0,
   iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]
 }
-#contraction_accesses1 = [
+#contraction_accesses1 = [              // 7,  8, 16, 15
   affine_map<(f0, f1, f2, f3, c0, c1) -> (c0, f0, c1, f2)>,
+                                        // 8, 16,  7,  5
   affine_map<(f0, f1, f2, f3, c0, c1) -> (f1, c1, c0, f3)>,
+                                        // 8,  8, 15,  5
   affine_map<(f0, f1, f2, f3, c0, c1) -> (f0, f1, f2, f3)>
 ]
 #contraction_trait1 = {
@@ -172,7 +174,7 @@ func @contraction_to_scalar(%arg0: vector<10xf32>, %arg1: vector<10xf32>) -> f32
 }
 // CHECK-LABEL: contraction
 func @contraction(%arg0 : vector<7x8x16x15xf32>, %arg1 : vector<8x16x7x5xf32>,
-                  %arg2 : vector<8x15x5xf32>, %arg3 : vector<8x15x8x5xf32>,
+                  %arg2 : vector<8x15x5xf32>, %arg3 : vector<8x8x15x5xf32>,
                   %arg4 : index) {
   // Test contraction with batch and contracting dims.
   // CHECK: vector.contract {indexing_maps = [#{{.*}}, #{{.*}}, #{{.*}}], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} {{.*}}, {{.*}}, {{.*}} : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x5xf32>
@@ -181,16 +183,16 @@ func @contraction(%arg0 : vector<7x8x16x15xf32>, %arg1 : vector<8x16x7x5xf32>,
   // Test contraction with only contracting dims. In this case the lhs/rhs
   // dimension of size 8 will be considered a parallel dim for lhs/rhs and will
   // appear twice in the output.
-  // CHECK: vector.contract {indexing_maps = [#{{.*}}, #{{.*}}, #{{.*}}], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} {{.*}}, {{.*}}, {{.*}} : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32>
+  // CHECK: vector.contract {indexing_maps = [#{{.*}}, #{{.*}}, #{{.*}}], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} {{.*}}, {{.*}}, {{.*}} : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x8x15x5xf32>
   %1 = vector.contract #contraction_trait1 %arg0, %arg1, %arg3
-      : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32>
+      : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x8x15x5xf32>
   // Test contraction with optional vector mask arguments.
   %lhs_mask = vector.constant_mask [7, 8, 16, 15] : vector<7x8x16x15xi1>
   %rhs_mask = vector.constant_mask [8, 16, 7, 5] : vector<8x16x7x5xi1>
-  // CHECK: vector.contract {indexing_maps = [#{{.*}}, #{{.*}}, #{{.*}}], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}} : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32>
+  // CHECK: vector.contract {indexing_maps = [#{{.*}}, #{{.*}}, #{{.*}}], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}} : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x8x15x5xf32>
   %2 = vector.contract #contraction_trait1 %arg0, %arg1, %arg3, %lhs_mask,
                                            %rhs_mask
-      : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32>
+      : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x8x15x5xf32>
   return
 }
 
diff --git a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir
index 7eea3baa8d87c..1dd2f377a29c0 100644
--- a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir
@@ -681,3 +681,219 @@ func @genbool_var_2d(%arg0: index, %arg1: index) -> vector<2x3xi1> {
   %0 = vector.create_mask %arg0, %arg1 : vector<2x3xi1>
   return %0 : vector<2x3xi1>
 }
+
+#matmat_accesses_0 = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+#matmat_trait_0 = {
+  indexing_maps = #matmat_accesses_0,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_0
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<2x3xf32>
+//      OUTERPRODUCT: %[[At:.*]] = vector.transpose %[[A]], [1, 0]
+//      OUTERPRODUCT: %[[a0:.*]] = vector.extract %[[At]][0] : vector<1x2xf32>
+//      OUTERPRODUCT: %[[b0:.*]] = vector.extract %[[B]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<2x3xf32>
+func @matmul_0(%arg0: vector<2x1xf32>, %arg1: vector<1x3xf32>, %arg2: vector<2x3xf32>)
+-> vector<2x3xf32>
+{
+  %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2
+    : vector<2x1xf32>, vector<1x3xf32> into vector<2x3xf32>
+  return %0 : vector<2x3xf32>
+}
+
+#matmat_accesses_1 = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (n, k)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+#matmat_trait_1 = {
+  indexing_maps = #matmat_accesses_1,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_1
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<3x1xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<2x3xf32>
+//      OUTERPRODUCT: %[[At:.*]] = vector.transpose %[[A]], [1, 0]
+//      OUTERPRODUCT: %[[Bt:.*]] = vector.transpose %[[B]], [1, 0]
+//      OUTERPRODUCT: %[[a0:.*]] = vector.extract %[[At]][0] : vector<1x2xf32>
+//      OUTERPRODUCT: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<2x3xf32>
+func @matmul_1(%arg0: vector<2x1xf32>, %arg1: vector<3x1xf32>, %arg2: vector<2x3xf32>)
+-> vector<2x3xf32>
+{
+  %0 = vector.contract #matmat_trait_1 %arg0, %arg1, %arg2
+    : vector<2x1xf32>, vector<3x1xf32> into vector<2x3xf32>
+  return %0 : vector<2x3xf32>
+}
+
+#matmat_accesses_2 = [
+  affine_map<(m, n, k) -> (k, m)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+#matmat_trait_2 = {
+  indexing_maps = #matmat_accesses_2,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_2
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<1x2xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<2x3xf32>
+//      OUTERPRODUCT: %[[a0:.*]] = vector.extract %[[A]][0] : vector<1x2xf32>
+//      OUTERPRODUCT: %[[b0:.*]] = vector.extract %[[B]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<2x3xf32>
+func @matmul_2(%arg0: vector<1x2xf32>, %arg1: vector<1x3xf32>, %arg2: vector<2x3xf32>)
+-> vector<2x3xf32>
+{
+  %0 = vector.contract #matmat_trait_2 %arg0, %arg1, %arg2
+    : vector<1x2xf32>, vector<1x3xf32> into vector<2x3xf32>
+  return %0 : vector<2x3xf32>
+}
+
+#matmat_accesses_3 = [
+  affine_map<(m, n, k) -> (k, m)>,
+  affine_map<(m, n, k) -> (n, k)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+#matmat_trait_3 = {
+  indexing_maps = #matmat_accesses_3,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_3
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<1x2xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<3x1xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<2x3xf32>
+//      OUTERPRODUCT: %[[Bt:.*]] = vector.transpose %[[B]], [1, 0]
+//      OUTERPRODUCT: %[[a0:.*]] = vector.extract %[[A]][0] : vector<1x2xf32>
+//      OUTERPRODUCT: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<2x3xf32>
+func @matmul_3(%arg0: vector<1x2xf32>, %arg1: vector<3x1xf32>, %arg2: vector<2x3xf32>)
+-> vector<2x3xf32>
+{
+  %0 = vector.contract #matmat_trait_3 %arg0, %arg1, %arg2
+    : vector<1x2xf32>, vector<3x1xf32> into vector<2x3xf32>
+  return %0 : vector<2x3xf32>
+}
+
+#matmat_accesses_4 = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (n, m)>
+]
+#matmat_trait_4 = {
+  indexing_maps = #matmat_accesses_4,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_4
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<3x2xf32>
+//      OUTERPRODUCT: %[[At:.*]] = vector.transpose %[[A]], [1, 0]
+//      OUTERPRODUCT: %[[b0:.*]] = vector.extract %[[B]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[a0:.*]] = vector.extract %[[At]][0] : vector<1x2xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<3x2xf32>
+func @matmul_4(%arg0: vector<2x1xf32>, %arg1: vector<1x3xf32>, %arg2: vector<3x2xf32>)
+-> vector<3x2xf32>
+{
+  %0 = vector.contract #matmat_trait_4 %arg0, %arg1, %arg2
+    : vector<2x1xf32>, vector<1x3xf32> into vector<3x2xf32>
+  return %0 : vector<3x2xf32>
+}
+
+#matmat_accesses_5 = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (n, m)>
+]
+#matmat_trait_5 = {
+  indexing_maps = #matmat_accesses_5,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_5
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<3x2xf32>
+//      OUTERPRODUCT: %[[At:.*]] = vector.transpose %[[A]], [1, 0]
+//      OUTERPRODUCT-DAG: %[[a0:.*]] = vector.extract %[[At]][0] : vector<1x2xf32>
+//      OUTERPRODUCT-DAG: %[[b0:.*]] = vector.extract %[[B]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<3x2xf32>
+func @matmul_5(%arg0: vector<2x1xf32>, %arg1: vector<1x3xf32>, %arg2: vector<3x2xf32>)
+-> vector<3x2xf32>
+{
+  %0 = vector.contract #matmat_trait_5 %arg0, %arg1, %arg2
+    : vector<2x1xf32>, vector<1x3xf32> into vector<3x2xf32>
+  return %0 : vector<3x2xf32>
+}
+
+#matmat_accesses_6 = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (n, m)>
+]
+#matmat_trait_6 = {
+  indexing_maps = #matmat_accesses_6,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_6
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<3x2xf32>
+//      OUTERPRODUCT: %[[At:.*]] = vector.transpose %[[A]], [1, 0]
+//      OUTERPRODUCT-DAG: %[[a0:.*]] = vector.extract %[[At]][0] : vector<1x2xf32>
+//      OUTERPRODUCT-DAG: %[[b0:.*]] = vector.extract %[[B]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<3x2xf32>
+func @matmul_6(%arg0: vector<2x1xf32>, %arg1: vector<1x3xf32>, %arg2: vector<3x2xf32>)
+-> vector<3x2xf32>
+{
+  %0 = vector.contract #matmat_trait_6 %arg0, %arg1, %arg2
+    : vector<2x1xf32>, vector<1x3xf32> into vector<3x2xf32>
+  return %0 : vector<3x2xf32>
+}
+
+#matmat_accesses_7 = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (n, m)>
+]
+#matmat_trait_7 = {
+  indexing_maps = #matmat_accesses_7,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// OUTERPRODUCT-LABEL: func @matmul_7
+// OUTERPRODUCT-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
+// OUTERPRODUCT-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
+// OUTERPRODUCT-SAME: %[[C:[a-zA-Z0-9]*]]: vector<3x2xf32>
+//      OUTERPRODUCT: %[[At:.*]] = vector.transpose %[[A]], [1, 0]
+//      OUTERPRODUCT-DAG: %[[a0:.*]] = vector.extract %[[At]][0] : vector<1x2xf32>
+//      OUTERPRODUCT-DAG: %[[b0:.*]] = vector.extract %[[B]][0] : vector<1x3xf32>
+//      OUTERPRODUCT: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]]
+//      OUTERPRODUCT: return %[[c0]] : vector<3x2xf32>
+func @matmul_7(%arg0: vector<2x1xf32>, %arg1: vector<1x3xf32>, %arg2: vector<3x2xf32>)
+-> vector<3x2xf32>
+{
+  %0 = vector.contract #matmat_trait_7 %arg0, %arg1, %arg2
+    : vector<2x1xf32>, vector<1x3xf32> into vector<3x2xf32>
+  return %0 : vector<3x2xf32>
+}

From 42725aeed8cbabc15e351e2854ae549df2c5dcde Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Thu, 30 Apr 2020 16:42:27 -0700
Subject: [PATCH 140/770] Process gep (select ptr1, ptr2) in SROA

Differential Revision: https://reviews.llvm.org/D79217
---
 llvm/lib/Transforms/Scalar/SROA.cpp     |  51 ++++++++
 llvm/test/Transforms/SROA/select-gep.ll | 149 ++++++++++++++++++++++++
 2 files changed, 200 insertions(+)
 create mode 100644 llvm/test/Transforms/SROA/select-gep.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 016de041b2f8e..1d486a3e74fd1 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3441,7 +3441,58 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
     return false;
   }
 
+  // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
+  bool foldGEPSelect(GetElementPtrInst &GEPI) {
+    if (!GEPI.hasAllConstantIndices())
+      return false;
+
+    SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
+
+    LLVM_DEBUG(dbgs() << "  Rewriting gep(select) -> select(gep):"
+                      << "\n    original: " << *Sel
+                      << "\n              " << GEPI);
+
+    IRBuilderTy Builder(&GEPI);
+    SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
+    bool IsInBounds = GEPI.isInBounds();
+
+    Value *True = Sel->getTrueValue();
+    Value *NTrue =
+        IsInBounds
+            ? Builder.CreateInBoundsGEP(True, Index,
+                                        True->getName() + ".sroa.gep")
+            : Builder.CreateGEP(True, Index, True->getName() + ".sroa.gep");
+
+    Value *False = Sel->getFalseValue();
+
+    Value *NFalse =
+        IsInBounds
+            ? Builder.CreateInBoundsGEP(False, Index,
+                                        False->getName() + ".sroa.gep")
+            : Builder.CreateGEP(False, Index, False->getName() + ".sroa.gep");
+
+    Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
+                                       Sel->getName() + ".sroa.sel");
+    GEPI.replaceAllUsesWith(NSel);
+    GEPI.eraseFromParent();
+
+    LLVM_DEBUG(dbgs() << "\n          to: " << *NTrue
+                      << "\n              " << *NFalse
+                      << "\n              " << *NSel << '\n');
+
+    if (isa<Instruction>(NTrue))
+      visit(cast<Instruction>(NTrue));
+    if (isa<Instruction>(NFalse))
+      visit(cast<Instruction>(NFalse));
+
+    return true;
+  }
+
   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    if (isa<SelectInst>(GEPI.getPointerOperand()) &&
+        foldGEPSelect(GEPI))
+      return true;
+
     enqueueUsers(GEPI);
     return false;
   }
diff --git a/llvm/test/Transforms/SROA/select-gep.ll b/llvm/test/Transforms/SROA/select-gep.ll
new file mode 100644
index 0000000000000..93cb3420d0af7
--- /dev/null
+++ b/llvm/test/Transforms/SROA/select-gep.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -sroa < %s | FileCheck %s
+
+%pair = type { i32, i32 }
+
+define i32 @test_sroa_select_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_select_gep(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[LOAD_SROA_SPECULATED:%.*]] = select i1 [[COND:%.*]], i32 1, i32 2
+; CHECK-NEXT:    ret i32 [[LOAD_SROA_SPECULATED]]
+;
+bb:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  %select = select i1 %cond, %pair* %a, %pair* %b
+  %gep = getelementptr inbounds %pair, %pair* %select, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_select_gep_non_inbound(i1 %cond) {
+; CHECK-LABEL: @test_sroa_select_gep_non_inbound(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[LOAD_SROA_SPECULATED:%.*]] = select i1 [[COND:%.*]], i32 1, i32 2
+; CHECK-NEXT:    ret i32 [[LOAD_SROA_SPECULATED]]
+;
+bb:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  %select = select i1 %cond, %pair* %a, %pair* %b
+  %gep = getelementptr %pair, %pair* %select, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_select_gep_volatile_load(i1 %cond) {
+; CHECK-LABEL: @test_sroa_select_gep_volatile_load(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_SROA_2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B_SROA_2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 11, i32* [[A_SROA_0]], align 4
+; CHECK-NEXT:    store i32 12, i32* [[B_SROA_0]], align 4
+; CHECK-NEXT:    store i32 21, i32* [[A_SROA_2]], align 4
+; CHECK-NEXT:    store i32 22, i32* [[B_SROA_2]], align 4
+; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 [[COND:%.*]], i32* [[A_SROA_0]], i32* [[B_SROA_0]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load volatile i32, i32* [[SELECT_SROA_SEL]], align 4
+; CHECK-NEXT:    [[SELECT_SROA_SEL3:%.*]] = select i1 [[COND]], i32* [[A_SROA_2]], i32* [[B_SROA_2]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load volatile i32, i32* [[SELECT_SROA_SEL3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+bb:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a0 = getelementptr inbounds %pair, %pair* %a, i32 0, i32 0
+  %gep_b0 = getelementptr inbounds %pair, %pair* %b, i32 0, i32 0
+  store i32 11, i32* %gep_a0, align 4
+  store i32 12, i32* %gep_b0, align 4
+  %gep_a1 = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b1 = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 21, i32* %gep_a1, align 4
+  store i32 22, i32* %gep_b1, align 4
+  %select = select i1 %cond, %pair* %a, %pair* %b
+  %gep1 = getelementptr inbounds %pair, %pair* %select, i32 0, i32 0
+  %load1 = load volatile i32, i32* %gep1, align 4
+  %gep2 = getelementptr inbounds %pair, %pair* %select, i32 0, i32 1
+  %load2 = load volatile i32, i32* %gep2, align 4
+  %add = add i32 %load1, %load2
+  ret i32 %add
+}
+
+define i32 @test_sroa_select_gep_undef(i1 %cond) {
+; CHECK-LABEL: @test_sroa_select_gep_undef(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 [[COND:%.*]], i32* [[A_SROA_0]], i32* undef
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[SELECT_SROA_SEL]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+bb:
+  %a = alloca %pair, align 4
+  %select = select i1 %cond, %pair* %a, %pair* undef
+  %gep = getelementptr inbounds %pair, %pair* %select, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_gep_select_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_gep_select_gep(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[A_SROA_0]], align 4
+; CHECK-NEXT:    store i32 2, i32* [[B_SROA_0]], align 4
+; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 [[COND:%.*]], i32* [[A_SROA_0]], i32* [[B_SROA_0]]
+; CHECK-NEXT:    [[SELECT2:%.*]] = select i1 [[COND]], i32* [[SELECT_SROA_SEL]], i32* [[A_SROA_0]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[SELECT2]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+bb:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  %select = select i1 %cond, i32* %gep_a, i32* %gep_b
+  %gep = getelementptr inbounds i32, i32* %select, i32 0
+  %select2 = select i1 %cond, i32* %gep, i32* %gep_a
+  %load = load i32, i32* %select2, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_gep_select_gep_nonconst_idx(i1 %cond, i32 %idx) {
+; CHECK-LABEL: @test_sroa_gep_select_gep_nonconst_idx(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca [[PAIR]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[B]], i32 0, i32 1
+; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    store i32 2, i32* [[GEP_B]], align 4
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], %pair* [[A]], %pair* [[B]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[SELECT]], i32 [[IDX:%.*]], i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+bb:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  %select = select i1 %cond, %pair* %a, %pair* %b
+  %gep = getelementptr inbounds %pair, %pair* %select, i32 %idx, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}

From bd7ff5d94f0f591206188267a0e1529fa13d6c2e Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson@sony.com>
Date: Tue, 26 May 2020 19:28:34 +0100
Subject: [PATCH 141/770] [DebugInfo] Correct debuginfo for post-ra hoist and
 sink in Machine LICM

Reviewers: vsk, aprantl

Differential Revision: https://reviews.llvm.org/D79868
---
 llvm/lib/CodeGen/MachineLICM.cpp              |  12 +
 .../MIR/X86/mlicm-hoist-post-regalloc.mir     |  93 ++++++++
 ...hoist.mir => mlicm-hoist-pre-regalloc.mir} |  33 +--
 llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir    | 216 ++++++++++++++++++
 4 files changed, 324 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
 rename llvm/test/DebugInfo/MIR/X86/{mlicm-hoist.mir => mlicm-hoist-pre-regalloc.mir} (79%)
 create mode 100644 llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 0a3e9dcd3af7e..2a60858b6de21 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -635,6 +635,11 @@ void MachineLICMBase::HoistPostRA(MachineInstr *MI, unsigned Def) {
   MachineBasicBlock *MBB = MI->getParent();
   Preheader->splice(Preheader->getFirstTerminator(), MBB, MI);
 
+  // Since we are moving the instruction out of its basic block, we do not
+  // retain its debug location. Doing so would degrade the debugging
+  // experience and adversely affect the accuracy of profiling information.
+  MI->setDebugLoc(DebugLoc());
+
   // Add register to livein list to all the BBs in the current loop since a
   // loop invariant must be kept live throughout the whole loop. This is
   // important to ensure later passes do not scavenge the def register.
@@ -829,7 +834,14 @@ void MachineLICMBase::SinkIntoLoop() {
     }
     if (!CanSink || !B || B == Preheader)
       continue;
+
+    LLVM_DEBUG(dbgs() << "Sinking to " << printMBBReference(*B) << " from "
+                      << printMBBReference(*I->getParent()) << ": " << *I);
     B->splice(B->getFirstNonPHI(), Preheader, I);
+
+    // The instruction is moved from its basic block, so do not retain the
+    // debug information.
+    I->setDebugLoc(DebugLoc());
   }
 }
 
diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
new file mode 100644
index 0000000000000..91f77d331f184
--- /dev/null
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
@@ -0,0 +1,93 @@
+--- |
+  ; RUN: llc -start-before=phi-node-elimination -stop-after=machinelicm -debug-only=machinelicm -o - %s  | FileCheck %s
+  ; Ensure we execute machinelicm post register allocation.
+  ; Line numbers should not be retained when loop invariant instructions are hoisted.
+  ;
+  ; CHECK-LABEL:  bb.0.entry:
+  ; CHECK:        MOV64rm $rip, 1, $noreg, target-flags(x86-gotpcrel) @x, $noreg :: (load 8 from got)
+  ; CHECK-LABEL:  bb.1.while.body:
+  ;
+
+  @x = common local_unnamed_addr global i32 0, align 4, !dbg !0
+
+  define void @Process(i32* nocapture readonly %p) !dbg !10 {
+  entry:
+    call void @llvm.dbg.value(metadata i32* %p, metadata !17, metadata !DIExpression()), !dbg !18
+    br label %while.body, !dbg !19
+
+  while.body:                                       ; preds = %while.body, %entry
+    %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %while.body ]
+    call void @llvm.dbg.value(metadata i32* %p.addr.0, metadata !17, metadata !DIExpression()), !dbg !18
+    %incdec.ptr = getelementptr inbounds i32, i32* %p.addr.0, i64 1, !dbg !20
+    call void @llvm.dbg.value(metadata i32* %incdec.ptr, metadata !17, metadata !DIExpression()), !dbg !18
+    %0 = load i32, i32* %p.addr.0, align 4, !dbg !21
+    store i32 %0, i32* @x, align 4, !dbg !22
+    br label %while.body, !dbg !23, !llvm.loop !25
+  }
+
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+
+  !llvm.dbg.cu = !{!2}
+  !llvm.module.flags = !{!7, !8}
+  !llvm.ident = !{!9}
+
+  !0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+  !1 = !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+  !2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 10.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+  !3 = !DIFile(filename: "t.ll", directory: "/tmp/")
+  !4 = !{}
+  !5 = !{!0}
+  !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !7 = !{i32 2, !"Dwarf Version", i32 4}
+  !8 = !{i32 2, !"Debug Info Version", i32 3}
+  !9 = !{!"clang version 10.0.0 "}
+  !10 = distinct !DISubprogram(name: "Process", scope: !3, file: !3, line: 2, type: !11, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !16)
+  !11 = !DISubroutineType(types: !12)
+  !12 = !{null, !13}
+  !13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+  !14 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !15)
+  !15 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  !16 = !{!17}
+  !17 = !DILocalVariable(name: "p", arg: 1, scope: !10, file: !3, line: 2, type: !13)
+  !18 = !DILocation(line: 2, column: 34, scope: !10)
+  !19 = !DILocation(line: 4, column: 3, scope: !10)
+  !20 = !DILocation(line: 5, column: 11, scope: !10)
+  !21 = !DILocation(line: 5, column: 9, scope: !10)
+  !22 = !DILocation(line: 5, column: 7, scope: !10)
+  !23 = !DILocation(line: 4, column: 3, scope: !24)
+  !24 = !DILexicalBlockFile(scope: !10, file: !3, discriminator: 1)
+  !25 = distinct !{!25, !19, !20}
+
+...
+---
+name:            Process
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr64 }
+  - { id: 1, class: gr64 }
+  - { id: 2, class: gr64 }
+  - { id: 3, class: gr32 }
+  - { id: 4, class: gr64 }
+body:             |
+  bb.0.entry:
+    successors: %bb.1.while.body(0x80000000)
+    liveins: $rdi
+
+    DBG_VALUE $rdi, _, !17, !DIExpression(), debug-location !18
+    %2 = COPY $rdi
+    DBG_VALUE %2, _, !17, !DIExpression(), debug-location !18
+
+  bb.1.while.body:
+    successors: %bb.1.while.body(0x80000000)
+
+    %0 = PHI %2, %bb.0.entry, %1, %bb.1.while.body
+    DBG_VALUE %0, _, !17, !DIExpression(), debug-location !18
+    %1 = ADD64ri8 %0, 4, implicit-def dead $eflags, debug-location !20
+    DBG_VALUE %1, _, !17, !DIExpression(), debug-location !18
+    %3 = MOV32rm %0, 1, _, 0, _, debug-location !21 :: (load 4 from %ir.p.addr.0)
+    %4 = MOV64rm $rip, 1, _, target-flags(x86-gotpcrel) @x, _, debug-location !22 :: (load 8 from got)
+    MOV32mr killed %4, 1, _, 0, _, killed %3, debug-location !22 :: (store 4 into @x)
+    JMP_1 %bb.1.while.body, debug-location !23
+
+...
diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
similarity index 79%
rename from llvm/test/DebugInfo/MIR/X86/mlicm-hoist.mir
rename to llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
index 61dabf8910b67..8c0eb376eb408 100644
--- a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist.mir
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
@@ -23,13 +23,11 @@
   ;
   ; ModuleID = 'tx.ll'
   source_filename = "t.c"
-  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-unknown-linux-gnu"
 
   @x = common local_unnamed_addr global i32 0, align 4, !dbg !0
 
   ; Function Attrs: noreturn nounwind uwtable
-  define void @Process(i32* nocapture readonly %p) local_unnamed_addr #0 !dbg !9 {
+  define void @Process(i32* nocapture readonly %p) local_unnamed_addr  !dbg !9 {
   entry:
     tail call void @llvm.dbg.value(metadata i32* %p, i64 0, metadata !16, metadata !17), !dbg !18
     br label %while.body, !dbg !19
@@ -45,11 +43,7 @@
   }
 
   ; Function Attrs: nounwind readnone
-  declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-  attributes #0 = { noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { nounwind readnone }
-  attributes #2 = { nounwind }
+  declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
   !llvm.dbg.cu = !{!1}
   !llvm.module.flags = !{!6, !7}
@@ -80,7 +74,7 @@
   !22 = !{!23, !23, i64 0}
   !23 = !{!"int", !24, i64 0}
   !24 = !{!"omnipotent char", !25, i64 0}
-  !25 = !{!"Simple C/C++ TBAA"}
+  !25 = !{!"C++"}
   !26 = !DILocation(line: 5, column: 7, scope: !9)
   !27 = !DILocation(line: 4, column: 3, scope: !28)
   !28 = !DILexicalBlockFile(scope: !9, file: !2, discriminator: 1)
@@ -89,11 +83,6 @@
 ...
 ---
 name:            Process
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
 tracksRegLiveness: true
 registers:
   - { id: 0, class: gr64 }
@@ -101,22 +90,6 @@ registers:
   - { id: 2, class: gr64 }
   - { id: 3, class: gr32 }
   - { id: 4, class: gr64 }
-liveins:
-  - { reg: '$rdi', virtual-reg: '%2' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
 body:             |
   bb.0.entry:
     successors: %bb.1.while.body(0x80000000)
diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir
new file mode 100644
index 0000000000000..7b5a19ffa9e7f
--- /dev/null
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir
@@ -0,0 +1,216 @@
+--- |
+  ; RUN: llc --run-pass=machinelicm -sink-insts-to-avoid-spills %s -o - | FileCheck %s --match-full-lines
+  ; CHECK-LABEL: bb.4 (%ir-block.9):
+  ; CHECK: %0:gr64 = nuw ADD64ri8 %9, 4, implicit-def dead $eflags
+  ; 
+  ; When instructions are sunk to prevent register spills, line numbers should not be retained.
+
+  %struct.A = type { i32, i32, i32, i32, i32, i32 }
+  
+  define void @p(i8* nocapture readonly %input, %struct.A* %a) !dbg !10 {
+    %1 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 1, !dbg !18
+    %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 2
+    %3 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 3
+    %4 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 4
+    %5 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 5
+    %scevgep = getelementptr i8, i8* %input, i64 1
+    br label %.backedge
+  
+  .backedge:                                        ; preds = %.backedge.backedge, %0
+    %lsr.iv = phi i8* [ %scevgep1, %.backedge.backedge ], [ %scevgep, %0 ]
+    %6 = load i8, i8* %lsr.iv, align 1
+    switch i8 %6, label %.backedge.backedge [
+      i8 0, label %7
+      i8 10, label %9
+      i8 20, label %10
+      i8 30, label %11
+      i8 40, label %12
+      i8 50, label %13
+    ]
+  
+  7:                                                ; preds = %.backedge
+    %8 = bitcast %struct.A* %a to i32*
+    tail call void @f(i32* %8)
+    br label %.backedge.backedge
+  
+  9:                                                ; preds = %.backedge
+    tail call void @f(i32* %1)
+    br label %.backedge.backedge
+  
+  .backedge.backedge:                               ; preds = %13, %12, %11, %10, %9, %7, %.backedge
+    %scevgep1 = getelementptr i8, i8* %lsr.iv, i64 1
+    br label %.backedge
+  
+  10:                                               ; preds = %.backedge
+    tail call void @f(i32* %2)
+    br label %.backedge.backedge
+  
+  11:                                               ; preds = %.backedge
+    tail call void @f(i32* %3)
+    br label %.backedge.backedge
+  
+  12:                                               ; preds = %.backedge
+    tail call void @f(i32* %4)
+    br label %.backedge.backedge
+  
+  13:                                               ; preds = %.backedge
+    tail call void @f(i32* %5)
+    br label %.backedge.backedge
+  }
+  
+  declare void @f(i32*)
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**)
+
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!7, !8}
+  !llvm.ident = !{!9}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3)
+  !1 = !DIFile(filename: "t.ll", directory: "tmp/X86")
+  !2 = !{}
+  !3 = !{!4}
+  !4 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+  !5 = !DIGlobalVariable(name: "x", scope: !0, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true)
+  !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !7 = !{i32 2, !"Dwarf Version", i32 4}
+  !8 = !{i32 2, !"Debug Info Version", i32 3}
+  !9 = !{!"clang version 10.0.0 "}
+  !10 = distinct !DISubprogram(name: "p", scope: !1, file: !1, line: 2, type: !11, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+  !11 = !DISubroutineType(types: !12)
+  !12 = !{null, !13}
+  !13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+  !14 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !15)
+  !15 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  !16 = !{!17}
+  !17 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !1, line: 2, type: !15)
+  !18 = !DILocation(line: 4, column: 3, scope: !10)
+
+
+...
+---
+name:            p
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr64, preferred-register: '' }
+  - { id: 1, class: gr64, preferred-register: '' }
+  - { id: 2, class: gr64, preferred-register: '' }
+  - { id: 3, class: gr64, preferred-register: '' }
+  - { id: 4, class: gr64, preferred-register: '' }
+  - { id: 5, class: gr64, preferred-register: '' }
+  - { id: 6, class: gr64, preferred-register: '' }
+  - { id: 7, class: gr64, preferred-register: '' }
+  - { id: 8, class: gr64, preferred-register: '' }
+  - { id: 9, class: gr64, preferred-register: '' }
+  - { id: 10, class: gr64_nosp, preferred-register: '' }
+  - { id: 11, class: gr32, preferred-register: '' }
+  - { id: 12, class: gr64, preferred-register: '' }
+  - { id: 13, class: gr64, preferred-register: '' }
+  - { id: 14, class: gr64, preferred-register: '' }
+  - { id: 15, class: gr64, preferred-register: '' }
+jumpTable:
+  kind:            label-difference32
+  entries:
+    - id:              0
+      blocks:          [ '%bb.2', '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', 
+                         '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.3', '%bb.4', 
+                         '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', 
+                         '%bb.4', '%bb.4', '%bb.5', '%bb.4', '%bb.4', '%bb.4', 
+                         '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', 
+                         '%bb.6', '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', 
+                         '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.7', '%bb.4', 
+                         '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', '%bb.4', 
+                         '%bb.4', '%bb.4', '%bb.8' ]
+body:             |
+  bb.0 (%ir-block.0):
+    successors: %bb.1(0x80000000)
+    liveins: $rdi, $rsi
+  
+    %9:gr64 = COPY $rsi
+    %8:gr64 = COPY $rdi
+    %0:gr64 = nuw ADD64ri8 %9, 4, implicit-def dead $eflags, debug-location !18
+    %1:gr64 = nuw ADD64ri8 %9, 8, implicit-def dead $eflags
+    %2:gr64 = nuw ADD64ri8 %9, 12, implicit-def dead $eflags
+    %3:gr64 = nuw ADD64ri8 %9, 16, implicit-def dead $eflags
+    %4:gr64 = nuw ADD64ri8 %9, 20, implicit-def dead $eflags
+    %5:gr64 = INC64r %8, implicit-def dead $eflags
+  
+  bb.1..backedge:
+    successors: %bb.4(0x09249249), %bb.9(0x76db6db7)
+  
+    %6:gr64 = PHI %5, %bb.0, %7, %bb.4
+    %11:gr32 = MOVZX32rm8 %6, 1, $noreg, 0, $noreg :: (load 1 from %ir.lsr.iv)
+    %10:gr64_nosp = SUBREG_TO_REG 0, killed %11, %subreg.sub_32bit
+    %12:gr64 = SUB64ri8 %10, 50, implicit-def $eflags
+    JCC_1 %bb.4, 7, implicit $eflags
+  
+  bb.9..backedge:
+    successors: %bb.2(0x13b13b14), %bb.4(0x09d89d8a), %bb.3(0x13b13b14), %bb.5(0x13b13b14), %bb.6(0x13b13b14), %bb.7(0x13b13b14), %bb.8(0x13b13b14)
+  
+    %13:gr64 = LEA64r $rip, 1, $noreg, %jump-table.0, $noreg
+    %14:gr64 = MOVSX64rm32 %13, 4, %10, 0, $noreg :: (load 4 from jump-table)
+    %15:gr64 = ADD64rr %14, %13, implicit-def dead $eflags
+    JMP64r killed %15
+  
+  bb.2 (%ir-block.7):
+    successors: %bb.4(0x80000000)
+  
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %9
+    CALL64pcrel32 target-flags(x86-plt) @f, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    JMP_1 %bb.4
+  
+  bb.3 (%ir-block.9):
+    successors: %bb.4(0x80000000)
+  
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %0
+    CALL64pcrel32 target-flags(x86-plt) @f, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  
+  bb.4..backedge.backedge:
+    successors: %bb.1(0x80000000)
+  
+    %7:gr64 = INC64r %6, implicit-def dead $eflags
+    JMP_1 %bb.1
+  
+  bb.5 (%ir-block.10):
+    successors: %bb.4(0x80000000)
+  
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %1
+    CALL64pcrel32 target-flags(x86-plt) @f, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    JMP_1 %bb.4
+  
+  bb.6 (%ir-block.11):
+    successors: %bb.4(0x80000000)
+  
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %2
+    CALL64pcrel32 target-flags(x86-plt) @f, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    JMP_1 %bb.4
+  
+  bb.7 (%ir-block.12):
+    successors: %bb.4(0x80000000)
+  
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %3
+    CALL64pcrel32 target-flags(x86-plt) @f, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    JMP_1 %bb.4
+  
+  bb.8 (%ir-block.13):
+    successors: %bb.4(0x80000000)
+  
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %4
+    CALL64pcrel32 target-flags(x86-plt) @f, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    JMP_1 %bb.4
+
+...

From e1d2cecec5197af7104e4c50e6aed4313d512cda Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 13:15:43 -0700
Subject: [PATCH 142/770] [lldb/Test] Cleanup TestSymbolTable.py (NFC)

---
 .../test/API/lang/objc/foundation/TestSymbolTable.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
index f3331e829c27e..df4860f148260 100644
--- a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
+++ b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
@@ -17,8 +17,7 @@ class FoundationSymtabTestCase(TestBase):
                     '-[MyString dealloc]',
                     '-[MyString description]',
                     '-[MyString descriptionPauses]',     # synthesized property
-                    # synthesized property
-                    '-[MyString setDescriptionPauses:]',
+                    '-[MyString setDescriptionPauses:]', # synthesized property
                     'Test_Selector',
                     'Test_NSString',
                     'Test_MyString',
@@ -31,14 +30,13 @@ def test_with_python_api(self):
         """Test symbol table access with Python APIs."""
         self.build()
         exe = self.getBuildArtifact("a.out")
-        self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET)
-
         target = self.dbg.CreateTarget(exe)
         self.assertTrue(target, VALID_TARGET)
 
         # Launch the process, and do not stop at the entry point.
         process = target.LaunchSimple(
             None, None, self.get_process_working_directory())
+        self.assertTrue(process, PROCESS_IS_VALID)
 
         # Create the filespec by which to locate our a.out module.
         filespec = lldb.SBFileSpec(exe, False)
@@ -51,14 +49,14 @@ def test_with_python_api(self):
         expected_symbols = set(self.symbols_list)
         for symbol in module:
             self.assertTrue(symbol, VALID_SYMBOL)
-            #print("symbol:", symbol)
+            self.trace("symbol:", symbol)
             name = symbol.GetName()
             if name in expected_symbols:
-                #print("Removing %s from known_symbols %s" % (name, expected_symbols))
+                self.trace("Removing %s from known_symbols %s" % (name, expected_symbols))
                 expected_symbols.remove(name)
 
         # At this point, the known_symbols set should have become an empty set.
         # If not, raise an error.
-        #print("symbols unaccounted for:", expected_symbols)
+        self.trace("symbols unaccounted for:", expected_symbols)
         self.assertTrue(len(expected_symbols) == 0,
                         "All the known symbols are accounted for")

From e9003207591e4830bcce2de1631db901f8c4f2b8 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Tue, 26 May 2020 16:21:15 -0400
Subject: [PATCH 143/770] [mlir] Hotfix - Drop spurious constexpr that breaks
 build

---
 mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
index 02d2762560767..13db3a2a88d2a 100644
--- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
@@ -81,21 +81,21 @@ constexpr StringRef getPaddingAttrName() { return "padding"; }
 
 /// Use to encode that a particular iterator type has parallel semantics.
 constexpr StringRef getParallelIteratorTypeName() { return "parallel"; }
-constexpr bool isParallelIterator(Attribute attr) {
+bool isParallelIterator(Attribute attr) {
   auto strAttr = attr.dyn_cast_or_null<StringAttr>();
   return strAttr && strAttr.getValue() == getParallelIteratorTypeName();
 }
 
 /// Use to encode that a particular iterator type has reduction semantics.
 constexpr StringRef getReductionIteratorTypeName() { return "reduction"; }
-constexpr bool isReductionIterator(Attribute attr) {
+bool isReductionIterator(Attribute attr) {
   auto strAttr = attr.dyn_cast_or_null<StringAttr>();
   return strAttr && strAttr.getValue() == getReductionIteratorTypeName();
 }
 
 /// Use to encode that a particular iterator type has window semantics.
 constexpr StringRef getWindowIteratorTypeName() { return "window"; }
-constexpr bool isWindowIterator(Attribute attr) {
+bool isWindowIterator(Attribute attr) {
   auto strAttr = attr.dyn_cast_or_null<StringAttr>();
   return strAttr && strAttr.getValue() == getWindowIteratorTypeName();
 }

From c990bdf7f8761f047fac85615377835edf015698 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Tue, 26 May 2020 16:24:56 -0400
Subject: [PATCH 144/770] [mlir] Hotfix - Add inline to avoid multiple symbols
 on trivial functions

---
 mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
index 13db3a2a88d2a..168e877e50561 100644
--- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
@@ -81,21 +81,21 @@ constexpr StringRef getPaddingAttrName() { return "padding"; }
 
 /// Use to encode that a particular iterator type has parallel semantics.
 constexpr StringRef getParallelIteratorTypeName() { return "parallel"; }
-bool isParallelIterator(Attribute attr) {
+inline bool isParallelIterator(Attribute attr) {
   auto strAttr = attr.dyn_cast_or_null<StringAttr>();
   return strAttr && strAttr.getValue() == getParallelIteratorTypeName();
 }
 
 /// Use to encode that a particular iterator type has reduction semantics.
 constexpr StringRef getReductionIteratorTypeName() { return "reduction"; }
-bool isReductionIterator(Attribute attr) {
+inline bool isReductionIterator(Attribute attr) {
   auto strAttr = attr.dyn_cast_or_null<StringAttr>();
   return strAttr && strAttr.getValue() == getReductionIteratorTypeName();
 }
 
 /// Use to encode that a particular iterator type has window semantics.
 constexpr StringRef getWindowIteratorTypeName() { return "window"; }
-bool isWindowIterator(Attribute attr) {
+inline bool isWindowIterator(Attribute attr) {
   auto strAttr = attr.dyn_cast_or_null<StringAttr>();
   return strAttr && strAttr.getValue() == getWindowIteratorTypeName();
 }

From bb10fa3a53f928e2e24ad3eaf8e57508fe9d4320 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 24 May 2020 12:28:59 -0400
Subject: [PATCH 145/770] AMDGPU: Fix wrong null value for private address
 space

I'm guessing this was a holdover from when 0 was an invalid stack
pointer, but I'm surprised nobody has discovered this before.

Also don't allow offset folding for -1 pointers, since it looks weird
to partially fold this.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 36 +++++++-------
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  5 +-
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h  |  3 +-
 .../GlobalISel/inst-select-load-private.mir   | 28 +++++++++++
 .../GlobalISel/legalize-addrspacecast.mir     | 26 +++++-----
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     | 47 +++++++++++++++----
 llvm/test/CodeGen/AMDGPU/nullptr.ll           |  4 +-
 7 files changed, 107 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 52823c16d72d9..edd8ea39e0f42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1482,22 +1482,26 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
 
   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
-    unsigned Imm = CAddr->getZExtValue();
-
-    SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
-    MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                                        DL, MVT::i32, HighBits);
-    VAddr = SDValue(MovHighBits, 0);
-
-    // In a call sequence, stores to the argument stack area are relative to the
-    // stack pointer.
-    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-
-    SOffset = isStackPtrRelative(PtrInfo)
-                  ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
-                  : CurDAG->getTargetConstant(0, DL, MVT::i32);
-    ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
-    return true;
+    int64_t Imm = CAddr->getSExtValue();
+    const int64_t NullPtr =
+        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
+    // Don't fold null pointer.
+    if (Imm != NullPtr) {
+      SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
+      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
+        AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+      VAddr = SDValue(MovHighBits, 0);
+
+      // In a call sequence, stores to the argument stack area are relative to the
+      // stack pointer.
+      const MachinePointerInfo &PtrInfo
+        = cast<MemSDNode>(Parent)->getPointerInfo();
+      SOffset = isStackPtrRelative(PtrInfo)
+        ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+        : CurDAG->getTargetConstant(0, DL, MVT::i32);
+      ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
+      return true;
+    }
   }
 
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index aee6c0dd8a8e0..5afec2188d66b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3062,7 +3062,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
 
   int64_t Offset = 0;
-  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
+  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
+      Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
     // TODO: Should this be inside the render function? The iterator seems to
@@ -3091,7 +3092,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
              }}};
   }
 
-  assert(Offset == 0);
+  assert(Offset == 0 || Offset == -1);
 
   // Try to fold a frame index directly into the MUBUF vaddr field, and any
   // offsets.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 2ef6cd5b3e338..e223fecc88195 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -57,8 +57,9 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
   void adjustPassManager(PassManagerBuilder &) override;
 
   /// Get the integer value of a null pointer in the given address space.
-  uint64_t getNullPointerValue(unsigned AddrSpace) const {
+  static int64_t getNullPointerValue(unsigned AddrSpace) {
     return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+            AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
             AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0;
   }
 };
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 13e4035a48828..79284fdfd05f7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -843,3 +843,31 @@ body: |
     $vgpr0 = COPY %3
 
 ...
+
+# Should not fold offset if this is a null dereference.
+---
+
+name: load_private_s32_from_neg1
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+
+    ; GFX6-LABEL: name: load_private_s32_from_neg1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GFX9-LABEL: name: load_private_s32_from_neg1
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    %0:vgpr(p5) = G_CONSTANT i32 -1
+    %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
+    $vgpr0 = COPY %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
index 643bdd3b7d582..395d34a00081d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
@@ -173,12 +173,12 @@ body: |
     ; VI-LABEL: name: test_addrspacecast_p5_to_p0
     ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
     ; VI: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; VI: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 0
+    ; VI: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
     ; VI: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
     ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
-    ; VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
-    ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4, addrspace 4)
+    ; VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+    ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 4, addrspace 4)
     ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p5), [[C]]
     ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5)
     ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
@@ -186,7 +186,7 @@ body: |
     ; VI: $vgpr0_vgpr1 = COPY [[SELECT]](p0)
     ; GFX9-LABEL: name: test_addrspacecast_p5_to_p0
     ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 0
+    ; GFX9: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
     ; GFX9: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; GFX9: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735
     ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
@@ -216,7 +216,7 @@ body: |
 
     ; VI-LABEL: name: test_addrspacecast_p0_to_p5
     ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; VI: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 0
+    ; VI: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
     ; VI: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; VI: [[EXTRACT:%[0-9]+]]:_(p5) = G_EXTRACT [[COPY]](p0), 0
     ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p0), [[C1]]
@@ -224,7 +224,7 @@ body: |
     ; VI: $vgpr0 = COPY [[SELECT]](p5)
     ; GFX9-LABEL: name: test_addrspacecast_p0_to_p5
     ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 0
+    ; GFX9: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
     ; GFX9: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; GFX9: [[EXTRACT:%[0-9]+]]:_(p5) = G_EXTRACT [[COPY]](p0), 0
     ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p0), [[C1]]
@@ -232,7 +232,7 @@ body: |
     ; GFX9: $vgpr0 = COPY [[SELECT]](p5)
     ; SI-LABEL: name: test_addrspacecast_p0_to_p5
     ; SI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; SI: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 0
+    ; SI: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
     ; SI: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; SI: [[EXTRACT:%[0-9]+]]:_(p5) = G_EXTRACT [[COPY]](p0), 0
     ; SI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p0), [[C1]]
@@ -260,8 +260,8 @@ body: |
     ; VI: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
     ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
-    ; VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
-    ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4)
+    ; VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+    ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4)
     ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p3), [[C]]
     ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3)
     ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
@@ -466,15 +466,15 @@ body: |
     ; VI: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
     ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
     ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
-    ; VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
-    ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4)
+    ; VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+    ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4)
     ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]]
     ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
     ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
     ; VI: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
     ; VI: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
-    ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY3]], [[C2]](s64)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4)
+    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY3]], [[C2]](s64)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load 4, align 64, addrspace 4)
     ; VI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]]
     ; VI: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
     ; VI: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 764f935097642..d16edbac75fe2 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -76,7 +76,7 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
 
 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
+; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -89,7 +89,7 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
 
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
+; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
 ; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
 ; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -167,7 +167,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
 ; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
-; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
@@ -252,12 +252,16 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 
 ; FIXME: Shouldn't need to enable queue ptr
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; CI: enable_sgpr_queue_ptr = 1
-; GFX9: enable_sgpr_queue_ptr = 0
+; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
+; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base
 
 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(5)* null to i32*
@@ -266,14 +270,41 @@ define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
 }
 
 ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
-; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
 define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
   %cast = addrspacecast i32* null to i32 addrspace(5)*
   store volatile i32 7, i32 addrspace(5)* %cast
   ret void
 }
 
+
+; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
+; CI: enable_sgpr_queue_ptr = 1
+; GFX9: enable_sgpr_queue_ptr = 0
+
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 {
+  %cast = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32*
+  store volatile i32 7, i32* %cast
+  ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_flat_to_private_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
+define amdgpu_kernel void @cast_neg1_flat_to_private_addrspacecast() #0 {
+  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(5)*
+  store volatile i32 7, i32 addrspace(5)* %cast
+  ret void
+}
+
+
 ; Disable optimizations in case there are optimizations added that
 ; specialize away generic pointer accesses.
 
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 4eaf9836bb9d3..16292f0ebee08 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -4,7 +4,7 @@
 %struct.S = type { i32 addrspace(5)*, i32 addrspace(1)*, i32 addrspace(4)*, i32 addrspace(3)*, i32*, i32 addrspace(2)*}
 
 ; CHECK-LABEL: nullptr_priv:
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -1
 @nullptr_priv = global i32 addrspace(5)* addrspacecast (i32* null to i32 addrspace(5)*)
 
 ; CHECK-LABEL: nullptr_glob:
@@ -98,7 +98,7 @@
 @nullptr23 = global i32 addrspace(23)* addrspacecast (i32* null to i32 addrspace(23)*)
 
 ; CHECK-LABEL: structWithPointers:
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -1
 ; GCN-NEXT:   .zero 4
 ; GCN-NEXT:   .quad 0
 ; R600-NEXT:  .long 0

From 14de6e29b1315e9abe61d71e3e13f75bff80e1be Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 26 May 2020 12:51:46 -0700
Subject: [PATCH 146/770] [Clang][Driver] Add Bounds and Thread to
 SupportsCoverage list

Summary:
This permits combining -fsanitize-coverage with -fsanitize=bounds or
-fsanitize=thread. Note that GCC already supports combining these.

Tested:
- Add Clang end-to-end test checking IR is generated for both combinations
of sanitizers.
- Several previously failing TSAN tests now pass.

Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=45831

Reviewers: vitalybuka

Reviewed By: vitalybuka

Subscribers: #sanitizers, dvyukov, nickdesaulniers, cfe-commits

Tags: #clang, #sanitizers

Differential Revision: https://reviews.llvm.org/D79628
---
 clang/lib/Driver/SanitizerArgs.cpp            |  5 +++--
 clang/test/CodeGen/sanitize-coverage.c        | 22 +++++++++++++++++++
 clang/test/Driver/fsanitize-coverage.c        |  2 ++
 .../sanitizer_coverage_inline8bit_counter.cpp |  1 -
 .../sanitizer_coverage_inline_bool_flag.cpp   |  1 -
 .../TestCases/sanitizer_coverage_no_prune.cpp |  2 +-
 .../sanitizer_coverage_stack_depth.cpp        |  2 --
 ...sanitizer_coverage_trace_pc_guard-init.cpp |  1 -
 8 files changed, 28 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/CodeGen/sanitize-coverage.c

diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index bc186fa5a5982..35e982a502ef6 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -43,11 +43,12 @@ static const SanitizerMask SupportsCoverage =
     SanitizerKind::KernelAddress | SanitizerKind::KernelHWAddress |
     SanitizerKind::MemTag | SanitizerKind::Memory |
     SanitizerKind::KernelMemory | SanitizerKind::Leak |
-    SanitizerKind::Undefined | SanitizerKind::Integer |
+    SanitizerKind::Undefined | SanitizerKind::Integer | SanitizerKind::Bounds |
     SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
     SanitizerKind::DataFlow | SanitizerKind::Fuzzer |
     SanitizerKind::FuzzerNoLink | SanitizerKind::FloatDivideByZero |
-    SanitizerKind::SafeStack | SanitizerKind::ShadowCallStack;
+    SanitizerKind::SafeStack | SanitizerKind::ShadowCallStack |
+    SanitizerKind::Thread;
 static const SanitizerMask RecoverableByDefault =
     SanitizerKind::Undefined | SanitizerKind::Integer |
     SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
diff --git a/clang/test/CodeGen/sanitize-coverage.c b/clang/test/CodeGen/sanitize-coverage.c
new file mode 100644
index 0000000000000..6fc8e39354d4f
--- /dev/null
+++ b/clang/test/CodeGen/sanitize-coverage.c
@@ -0,0 +1,22 @@
+// RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S                       -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=address    -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,ASAN
+// RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=bounds     -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,BOUNDS
+// RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=memory     -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,MSAN
+// RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=thread     -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,TSAN
+// RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=undefined  -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,UBSAN
+
+int x[10];
+
+// CHECK-LABEL: define dso_local void @foo(
+void foo(int n) {
+  // CHECK-DAG: call void @__sanitizer_cov_trace_pc
+  // CHECK-DAG: call void @__sanitizer_cov_trace_const_cmp
+  // ASAN-DAG: call void @__asan_report_store
+  // MSAN-DAG: call void @__msan_warning
+  // BOUNDS-DAG: call void @__ubsan_handle_out_of_bounds
+  // TSAN-DAG: call void @__tsan_func_entry
+  // UBSAN-DAG: call void @__ubsan_handle
+  if (n)
+    x[n] = 42;
+}
+// CHECK-LABEL: declare void
diff --git a/clang/test/Driver/fsanitize-coverage.c b/clang/test/Driver/fsanitize-coverage.c
index b10fc86bb3911..02078d847512e 100644
--- a/clang/test/Driver/fsanitize-coverage.c
+++ b/clang/test/Driver/fsanitize-coverage.c
@@ -12,8 +12,10 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=bounds -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=thread -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
 // RUN: %clang -target %itanium_abi_triple -fsanitize=float-divide-by-zero -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
 // RUN: %clang -target x86_64-linux-gnu                     -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
 // CHECK-SANITIZE-COVERAGE-FUNC: fsanitize-coverage-type=1
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter.cpp
index 58a64d1a92dc8..68eca85eb4d42 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter.cpp
@@ -5,7 +5,6 @@
 //
 // RUN: %clangxx -O0 %s -fsanitize-coverage=inline-8bit-counters,pc-table -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
-// XFAIL: tsan
 
 #include <stdio.h>
 #include <stdint.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline_bool_flag.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline_bool_flag.cpp
index c3783e80f6237..d62ffe613b5b0 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline_bool_flag.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline_bool_flag.cpp
@@ -5,7 +5,6 @@
 //
 // RUN: %clangxx -O0 %s -fsanitize-coverage=inline-bool-flag,pc-table -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
-// XFAIL: tsan
 
 #include <assert.h>
 #include <stdint.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_no_prune.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_no_prune.cpp
index 9604da222f8e9..6a7bb0dda0a82 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_no_prune.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_no_prune.cpp
@@ -2,7 +2,7 @@
 
 // REQUIRES: has_sancovcc,stable-runtime
 // UNSUPPORTED: i386-darwin
-// XFAIL: ubsan,tsan
+// XFAIL: ubsan
 // XFAIL: android && asan
 
 // RUN: %clangxx -O0 %s -S -o - -emit-llvm -fsanitize-coverage=trace-pc,bb,no-prune 2>&1 | grep "call void @__sanitizer_cov_trace_pc" | count 3
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_stack_depth.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_stack_depth.cpp
index 90959ef5b0287..29a63c0a92f32 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_stack_depth.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_stack_depth.cpp
@@ -1,7 +1,5 @@
 // Tests -fsanitize-coverage=stack-depth
 //
-// XFAIL: tsan
-//
 // RUN: %clangxx -O0 -std=c++11 -fsanitize-coverage=stack-depth %s -o %t
 // RUN: %run %t 2>&1 | FileCheck %s --implicit-check-not Assertion{{.*}}failed
 // RUN: %clangxx -O0 -std=c++11 -fsanitize-coverage=trace-pc-guard,stack-depth \
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard-init.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard-init.cpp
index b92a513b6d65f..0b2da9aebac8e 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard-init.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard-init.cpp
@@ -1,7 +1,6 @@
 // Tests trace pc guard coverage collection.
 //
 // REQUIRES: has_sancovcc,stable-runtime,x86_64-linux
-// XFAIL: tsan
 //
 // RUN: DIR=%t_workdir
 // RUN: CLANG_ARGS="-O0 -fsanitize-coverage=trace-pc-guard"

From 09de6e0fbd0b6ca7fa8760ac3513be6bbbba5a81 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Tue, 26 May 2020 13:49:23 -0700
Subject: [PATCH 147/770] Let @skipUnlessAddressSanitizer imply @skipIfAsan

Don't run tests that use address sanitizer inside an address-sanitized
LLDB. The tests don't support that configuration. Incidentally they
were skipped on green dragon for a different reason, so this hasn't
come up there before.
---
 .../Python/lldbsuite/test/decorators.py       | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 65b63b4b40a8e..b94b672e44999 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -785,10 +785,21 @@ def is_compiler_clang_with_ubsan(self):
 
     return skipTestIfFn(is_compiler_clang_with_ubsan)(func)
 
+def is_running_under_asan():
+    if ('ASAN_OPTIONS' in os.environ):
+        return "ASAN unsupported"
+    return None
+
 def skipUnlessAddressSanitizer(func):
     """Decorate the item to skip test unless Clang -fsanitize=thread is supported."""
 
     def is_compiler_with_address_sanitizer(self):
+        # Also don't run tests that use address sanitizer inside an
+        # address-sanitized LLDB. The tests don't support that
+        # configuration.
+        if is_running_under_asan():
+            return "Address sanitizer tests are disabled when runing under ASAN"
+
         compiler_path = self.getCompiler()
         compiler = os.path.basename(compiler_path)
         f = tempfile.NamedTemporaryFile()
@@ -803,6 +814,10 @@ def is_compiler_with_address_sanitizer(self):
         return None
     return skipTestIfFn(is_compiler_with_address_sanitizer)(func)
 
+def skipIfAsan(func):
+    """Skip this test if the environment is set up to run LLDB *itself* under ASAN."""
+    return skipTestIfFn(is_running_under_asan)(func)
+
 def _get_bool_config_skip_if_decorator(key):
     config = lldb.SBDebugger.GetBuildConfiguration()
     value_node = config.GetValueForKey(key)
@@ -847,14 +862,6 @@ def is_feature_enabled(self):
                 return "%s is not supported on this system." % feature
     return skipTestIfFn(is_feature_enabled)
 
-def skipIfAsan(func):
-    """Skip this test if the environment is set up to run LLDB itself under ASAN."""
-    def is_asan():
-        if ('ASAN_OPTIONS' in os.environ):
-            return "ASAN unsupported"
-        return None
-    return skipTestIfFn(is_asan)(func)
-
 def skipIfReproducer(func):
     """Skip this test if the environment is set up to run LLDB with reproducers."""
     def is_reproducer():

From 01fee8aa24a6070542cfa55b2c32036d1d5869b8 Mon Sep 17 00:00:00 2001
From: Davide Italiano <ditaliano@apple.com>
Date: Tue, 26 May 2020 13:53:16 -0700
Subject: [PATCH 148/770] [MLICM] Remove unneeded option so the test doesn't
 fail.

---
 llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
index 91f77d331f184..97cdea090c9c5 100644
--- a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
@@ -1,5 +1,5 @@
 --- |
-  ; RUN: llc -start-before=phi-node-elimination -stop-after=machinelicm -debug-only=machinelicm -o - %s  | FileCheck %s
+  ; RUN: llc -start-before=phi-node-elimination -stop-after=machinelicm -o - %s  | FileCheck %s
   ; Ensure we execute machinelicm post register allocation.
   ; Line numbers should not be retained when loop invariant instructions are hoisted.
   ;

From 5cf90d6cf1b811a6693383c487f79d24d5b306bb Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 26 May 2020 21:50:15 +0100
Subject: [PATCH 149/770] [LoopUnroll] Simplify latch/header block handling
 (NFC).

I think the current code dealing with connecting the unrolled iterations
is a bit more complicated than necessary. To connect the
unrolled iterations, we have to update the unrolled latch blocks to
branch to the header of the next unrolled iteration.

We need to do this regardless of whether the latch is exiting or not.

Additionally, we try to turn the conditional branch in the exiting block
to an unconditional one. This is an optimization only; alternatively we
could leave the conditional branches in place and rely on other passes
to simplify the conditions.

Logically, this is a separate step from connecting the latches to the
headers, but it is convenient to fold them into the same loop, if the
latch is also exiting. For headers (or other non-latch exiting blocks),
this is done separately.

Hopefully the patch with additional comments makes things a bit clearer.

Reviewers: efriedma, dmgreen, hfinkel, Whitney

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D80544
---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp | 80 +++++++++++-------------
 1 file changed, 36 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 23b61c40a7567..d9323e70bef60 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -716,9 +716,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     }
   }
 
-  auto setDest = [LoopExit, ContinueOnTrue](BasicBlock *Src, BasicBlock *Dest,
-                                            BasicBlock *BlockInLoop,
-                                            bool NeedConditional) {
+  auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop,
+                    bool NeedConditional, bool ContinueOnTrue,
+                    bool IsDestLoopExit) {
     auto *Term = cast<BranchInst>(Src->getTerminator());
     if (NeedConditional) {
       // Update the conditional branch's successor for the following
@@ -726,7 +726,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       Term->setSuccessor(!ContinueOnTrue, Dest);
     } else {
       // Remove phi operands at this loop exit
-      if (Dest != LoopExit) {
+      if (!IsDestLoopExit) {
         BasicBlock *BB = Src;
         for (BasicBlock *Succ : successors(BB)) {
           // Preserve the incoming value from BB if we are jumping to the block
@@ -743,29 +743,27 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     }
   };
 
-  // Now that all the basic blocks for the unrolled iterations are in place,
-  // set up the branches to connect them.
-  if (LatchIsExiting) {
-    // Set up latches to branch to the new header in the unrolled iterations or
-    // the loop exit for the last latch in a fully unrolled loop.
-    for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
-      // The branch destination.
-      unsigned j = (i + 1) % e;
-      BasicBlock *Dest = Headers[j];
-      bool NeedConditional = true;
+  // Connect latches of the unrolled iterations to the headers of the next
+  // iteration. If the latch is also the exiting block, the conditional branch
+  // may have to be preserved.
+  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+    // The branch destination.
+    unsigned j = (i + 1) % e;
+    BasicBlock *Dest = Headers[j];
+    bool NeedConditional = LatchIsExiting;
 
-      if (RuntimeTripCount && j != 0) {
+    if (LatchIsExiting) {
+      if (RuntimeTripCount && j != 0)
         NeedConditional = false;
-      }
 
       // For a complete unroll, make the last iteration end with a branch
       // to the exit block.
       if (CompletelyUnroll) {
         if (j == 0)
           Dest = LoopExit;
-        // If using trip count upper bound to completely unroll, we need to keep
-        // the conditional branch except the last one because the loop may exit
-        // after any iteration.
+        // If using trip count upper bound to completely unroll, we need to
+        // keep the conditional branch except the last one because the loop
+        // may exit after any iteration.
         assert(NeedConditional &&
                "NeedCondition cannot be modified by both complete "
                "unrolling and runtime unrolling");
@@ -777,16 +775,18 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         // unconditional branch for some iterations.
         NeedConditional = false;
       }
-
-      setDest(Latches[i], Dest, Headers[i], NeedConditional);
     }
-  } else {
-    // Setup headers to branch to their new successors in the unrolled
-    // iterations.
+
+    setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue,
+            Dest == LoopExit);
+  }
+
+  if (!LatchIsExiting) {
+    // If the latch is not exiting, we may be able to simplify the conditional
+    // branches in the unrolled exiting blocks.
     for (unsigned i = 0, e = Headers.size(); i != e; ++i) {
       // The branch destination.
       unsigned j = (i + 1) % e;
-      BasicBlock *Dest = HeaderSucc[i];
       bool NeedConditional = true;
 
       if (RuntimeTripCount && j != 0)
@@ -802,27 +802,19 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         // unconditional branch for some iterations.
         NeedConditional = false;
 
-      setDest(Headers[i], Dest, HeaderSucc[i], NeedConditional);
+      // Conditional branches from non-latch exiting block have successors
+      // either in the same loop iteration or outside the loop. The branches are
+      // already correct.
+      if (NeedConditional)
+        continue;
+      setDest(Headers[i], HeaderSucc[i], HeaderSucc[i], NeedConditional,
+              ContinueOnTrue, false);
     }
 
-    // Set up latches to branch to the new header in the unrolled iterations or
-    // the loop exit for the last latch in a fully unrolled loop.
-
-    for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
-      // The original branch was replicated in each unrolled iteration.
-      BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
-
-      // The branch destination.
-      unsigned j = (i + 1) % e;
-      BasicBlock *Dest = Headers[j];
-
-      // When completely unrolling, the last latch becomes unreachable.
-      if (CompletelyUnroll && j == 0)
-        new UnreachableInst(Term->getContext(), Term);
-      else
-        // Replace the conditional branch with an unconditional one.
-        BranchInst::Create(Dest, Term);
-
+    // When completely unrolling, the last latch becomes unreachable.
+    if (CompletelyUnroll) {
+      BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator());
+      new UnreachableInst(Term->getContext(), Term);
       Term->eraseFromParent();
     }
   }

From 512e806a33e80058a409d205a378a6e6fc2ef39d Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Tue, 26 May 2020 13:30:19 -0700
Subject: [PATCH 150/770] [AMDGPU] Bail alloca vectorization if GEP not found

Differential Revision: https://reviews.llvm.org/D80587
---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 13 ++++++++++---
 .../AMDGPU/promote-alloca-vector-to-vector.ll  | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 03e927b3cdc4d..036f5440dc75d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -339,7 +339,9 @@ static Value *stripBitcasts(Value *V) {
 static Value *
 calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  GetElementPtrInst *GEP = cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  if (!GEP)
+    return nullptr;
 
   auto I = GEPIdx.find(GEP);
   return I == GEPIdx.end() ? nullptr : I->second;
@@ -496,10 +498,12 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
       if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
         break;
 
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      if (!Index)
+        break;
 
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
@@ -515,9 +519,12 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
           SI->getValueOperand()->getType()->isVectorTy())
         break;
 
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      if (!Index)
+        break;
+
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *Elt = SI->getValueOperand();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 15da72db4abb7..da52bcee3637c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -189,5 +189,23 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}ptr_alloca_bitcast:
+; OPT-LABEL: define i64 @ptr_alloca_bitcast
+
+; GCN-NOT: buffer_
+; GCN: v_mov_b32_e32 v1, 0
+
+; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
+; OPT: %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
+; OPT: %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
+
+define i64 @ptr_alloca_bitcast() {
+entry:
+  %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
+  %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
+  %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
+  ret i64 %tmp1
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workitem.id.y()

From ecb66f50eeb73c32f8fd955a97bb070fbdd519ed Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Sat, 23 May 2020 22:47:21 -0700
Subject: [PATCH 151/770] [NFC, StackSafety] Move FunctionInfo into ::
 namespace

---
 .../llvm/Analysis/StackSafetyAnalysis.h       | 13 ++-
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     | 83 ++++++++++---------
 2 files changed, 49 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
index c797d498b5dd8..9158f42481bf1 100644
--- a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
+++ b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
@@ -21,19 +21,18 @@ namespace llvm {
 /// Interface to access stack safety analysis results for single function.
 class StackSafetyInfo {
 public:
-  struct FunctionInfo;
+  struct InfoTy;
 
 private:
-  std::unique_ptr<FunctionInfo> Info;
+  std::unique_ptr<InfoTy> Info;
 
 public:
-  StackSafetyInfo();
-  StackSafetyInfo(FunctionInfo &&Info);
+  StackSafetyInfo(InfoTy Info);
   StackSafetyInfo(StackSafetyInfo &&);
   StackSafetyInfo &operator=(StackSafetyInfo &&);
   ~StackSafetyInfo();
 
-  FunctionInfo *getInfo() const { return Info.get(); }
+  const InfoTy &getInfo() const { return *Info; }
 
   // TODO: Add useful for client methods.
   void print(raw_ostream &O) const;
@@ -60,13 +59,13 @@ class StackSafetyPrinterPass : public PassInfoMixin<StackSafetyPrinterPass> {
 
 /// StackSafetyInfo wrapper for the legacy pass manager
 class StackSafetyInfoWrapperPass : public FunctionPass {
-  StackSafetyInfo SSI;
+  Optional<StackSafetyInfo> SSI;
 
 public:
   static char ID;
   StackSafetyInfoWrapperPass();
 
-  const StackSafetyInfo &getResult() const { return SSI; }
+  const StackSafetyInfo &getResult() const { return *SSI; }
 
   void print(raw_ostream &O, const Module *M) const override;
   void getAnalysisUsage(AnalysisUsage &AU) const override;
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 91c52e1bb9ffd..b98a0e5880463 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -15,6 +15,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
+#include <memory>
 
 using namespace llvm;
 
@@ -143,10 +144,8 @@ uint64_t getStaticAllocaAllocationSize(const AllocaInst *AI) {
   return Size;
 }
 
-} // end anonymous namespace
-
 /// Describes uses of allocas and parameters inside of a single function.
-struct StackSafetyInfo::FunctionInfo {
+struct FunctionInfo {
   // May be a Function or a GlobalAlias
   const GlobalValue *GV = nullptr;
   // Informations about allocas uses.
@@ -158,14 +157,11 @@ struct StackSafetyInfo::FunctionInfo {
   // StackSafetyDataFlowAnalysis counter stored here for faster access.
   int UpdateCount = 0;
 
-  FunctionInfo(const StackSafetyInfo &SSI) : FunctionInfo(*SSI.Info) {}
-
+  FunctionInfo() = default;
   explicit FunctionInfo(const Function *F) : GV(F){};
   // Creates FunctionInfo that forwards all the parameters to the aliasee.
   explicit FunctionInfo(const GlobalAlias *A);
 
-  FunctionInfo(FunctionInfo &&) = default;
-
   bool IsDSOLocal() const { return GV->isDSOLocal(); };
 
   bool IsInterposable() const { return GV->isInterposable(); };
@@ -184,12 +180,9 @@ struct StackSafetyInfo::FunctionInfo {
     for (auto &AS : Allocas)
       O << "      " << AS << "\n";
   }
-
-private:
-  FunctionInfo(const FunctionInfo &) = default;
 };
 
-StackSafetyInfo::FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) {
+FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) {
   unsigned PointerSize = A->getParent()->getDataLayout().getPointerSizeInBits();
   const GlobalObject *Aliasee = A->getBaseObject();
   const FunctionType *Type = cast<FunctionType>(Aliasee->getValueType());
@@ -201,6 +194,16 @@ StackSafetyInfo::FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) {
   }
 }
 
+} // namespace
+
+struct StackSafetyInfo::InfoTy {
+  FunctionInfo Info;
+};
+
+StackSafetyInfo makeSSI(FunctionInfo Info) {
+  return StackSafetyInfo(StackSafetyInfo::InfoTy{std::move(Info)});
+}
+
 namespace {
 
 class StackSafetyLocalAnalysis {
@@ -232,7 +235,7 @@ class StackSafetyLocalAnalysis {
         UnknownRange(PointerSize, true) {}
 
   // Run the transformation on the associated function.
-  StackSafetyInfo run();
+  FunctionInfo run();
 };
 
 ConstantRange
@@ -382,8 +385,8 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
   return true;
 }
 
-StackSafetyInfo StackSafetyLocalAnalysis::run() {
-  StackSafetyInfo::FunctionInfo Info(&F);
+FunctionInfo StackSafetyLocalAnalysis::run() {
+  FunctionInfo Info(&F);
   assert(!F.isDeclaration() &&
          "Can't run StackSafety on a function declaration");
 
@@ -406,12 +409,11 @@ StackSafetyInfo StackSafetyLocalAnalysis::run() {
 
   LLVM_DEBUG(dbgs() << "[StackSafety] done\n");
   LLVM_DEBUG(Info.print(dbgs()));
-  return StackSafetyInfo(std::move(Info));
+  return Info;
 }
 
 class StackSafetyDataFlowAnalysis {
-  using FunctionMap =
-      std::map<const GlobalValue *, StackSafetyInfo::FunctionInfo>;
+  using FunctionMap = std::map<const GlobalValue *, FunctionInfo>;
 
   FunctionMap Functions;
   // Callee-to-Caller multimap.
@@ -424,8 +426,7 @@ class StackSafetyDataFlowAnalysis {
   ConstantRange getArgumentAccessRange(const GlobalValue *Callee,
                                        unsigned ParamNo) const;
   bool updateOneUse(UseInfo &US, bool UpdateToFullSet);
-  void updateOneNode(const GlobalValue *Callee,
-                     StackSafetyInfo::FunctionInfo &FS);
+  void updateOneNode(const GlobalValue *Callee, FunctionInfo &FS);
   void updateOneNode(const GlobalValue *Callee) {
     updateOneNode(Callee, Functions.find(Callee)->second);
   }
@@ -440,12 +441,12 @@ class StackSafetyDataFlowAnalysis {
 
 public:
   StackSafetyDataFlowAnalysis(
-      Module &M, std::function<const StackSafetyInfo &(Function &)> FI);
+      Module &M, std::function<const FunctionInfo &(Function &)> FI);
   StackSafetyGlobalInfo run();
 };
 
 StackSafetyDataFlowAnalysis::StackSafetyDataFlowAnalysis(
-    Module &M, std::function<const StackSafetyInfo &(Function &)> FI)
+    Module &M, std::function<const FunctionInfo &(Function &)> FI)
     : PointerSize(M.getDataLayout().getPointerSizeInBits()),
       UnknownRange(PointerSize, true) {
   // Without ThinLTO, run the local analysis for every function in the TU and
@@ -455,7 +456,7 @@ StackSafetyDataFlowAnalysis::StackSafetyDataFlowAnalysis(
       Functions.emplace(&F, FI(F));
   for (auto &A : M.aliases())
     if (isa<Function>(A.getBaseObject()))
-      Functions.emplace(&A, StackSafetyInfo::FunctionInfo(&A));
+      Functions.emplace(&A, FunctionInfo(&A));
 }
 
 ConstantRange
@@ -465,7 +466,7 @@ StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
   // Unknown callee (outside of LTO domain or an indirect call).
   if (IT == Functions.end())
     return UnknownRange;
-  const StackSafetyInfo::FunctionInfo &FS = IT->second;
+  const FunctionInfo &FS = IT->second;
   // The definition of this symbol may not be the definition in this linkage
   // unit.
   if (!FS.IsDSOLocal() || FS.IsInterposable())
@@ -495,8 +496,8 @@ bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US,
   return Changed;
 }
 
-void StackSafetyDataFlowAnalysis::updateOneNode(
-    const GlobalValue *Callee, StackSafetyInfo::FunctionInfo &FS) {
+void StackSafetyDataFlowAnalysis::updateOneNode(const GlobalValue *Callee,
+                                                FunctionInfo &FS) {
   bool UpdateToFullSet = FS.UpdateCount > StackSafetyMaxIterations;
   bool Changed = false;
   for (auto &AS : FS.Allocas)
@@ -523,7 +524,7 @@ void StackSafetyDataFlowAnalysis::runDataFlow() {
   SmallVector<const GlobalValue *, 16> Callees;
   for (auto &F : Functions) {
     Callees.clear();
-    StackSafetyInfo::FunctionInfo &FS = F.second;
+    FunctionInfo &FS = F.second;
     for (auto &AS : FS.Allocas)
       for (auto &CS : AS.Use.Calls)
         Callees.push_back(CS.Callee);
@@ -561,7 +562,7 @@ StackSafetyGlobalInfo StackSafetyDataFlowAnalysis::run() {
 
   StackSafetyGlobalInfo SSI;
   for (auto &F : Functions)
-    SSI.emplace(F.first, std::move(F.second));
+    SSI.emplace(F.first, makeSSI(F.second));
   return SSI;
 }
 
@@ -590,8 +591,8 @@ bool setStackSafetyMetadata(Module &M, const StackSafetyGlobalInfo &SSGI) {
     auto Iter = SSGI.find(&F);
     if (Iter == SSGI.end())
       continue;
-    StackSafetyInfo::FunctionInfo *Summary = Iter->second.getInfo();
-    for (auto &AS : Summary->Allocas) {
+    const FunctionInfo &Summary = Iter->second.getInfo().Info;
+    for (auto &AS : Summary.Allocas) {
       ConstantRange AllocaRange{APInt(Width, 0), APInt(Width, AS.Size)};
       if (AllocaRange.contains(AS.Use.Range)) {
         AS.AI->setMetadata(M.getMDKindID("stack-safe"),
@@ -605,23 +606,22 @@ bool setStackSafetyMetadata(Module &M, const StackSafetyGlobalInfo &SSGI) {
 
 } // end anonymous namespace
 
-StackSafetyInfo::StackSafetyInfo() = default;
 StackSafetyInfo::StackSafetyInfo(StackSafetyInfo &&) = default;
 StackSafetyInfo &StackSafetyInfo::operator=(StackSafetyInfo &&) = default;
 
-StackSafetyInfo::StackSafetyInfo(FunctionInfo &&Info)
-    : Info(new FunctionInfo(std::move(Info))) {}
+StackSafetyInfo::StackSafetyInfo(InfoTy Info)
+    : Info(new InfoTy(std::move(Info))) {}
 
 StackSafetyInfo::~StackSafetyInfo() = default;
 
-void StackSafetyInfo::print(raw_ostream &O) const { Info->print(O); }
+void StackSafetyInfo::print(raw_ostream &O) const { Info->Info.print(O); }
 
 AnalysisKey StackSafetyAnalysis::Key;
 
 StackSafetyInfo StackSafetyAnalysis::run(Function &F,
                                          FunctionAnalysisManager &AM) {
   StackSafetyLocalAnalysis SSLA(F, AM.getResult<ScalarEvolutionAnalysis>(F));
-  return SSLA.run();
+  return makeSSI(SSLA.run());
 }
 
 PreservedAnalyses StackSafetyPrinterPass::run(Function &F,
@@ -643,13 +643,13 @@ void StackSafetyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 void StackSafetyInfoWrapperPass::print(raw_ostream &O, const Module *M) const {
-  SSI.print(O);
+  SSI->print(O);
 }
 
 bool StackSafetyInfoWrapperPass::runOnFunction(Function &F) {
   StackSafetyLocalAnalysis SSLA(
       F, getAnalysis<ScalarEvolutionWrapperPass>().getSE());
-  SSI = StackSafetyInfo(SSLA.run());
+  SSI = makeSSI(SSLA.run());
   return false;
 }
 
@@ -661,8 +661,8 @@ StackSafetyGlobalAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
   StackSafetyDataFlowAnalysis SSDFA(
-      M, [&FAM](Function &F) -> const StackSafetyInfo & {
-        return FAM.getResult<StackSafetyAnalysis>(F);
+      M, [&FAM](Function &F) -> const FunctionInfo & {
+        return FAM.getResult<StackSafetyAnalysis>(F).getInfo().Info;
       });
   return SSDFA.run();
 }
@@ -702,8 +702,11 @@ void StackSafetyGlobalInfoWrapperPass::getAnalysisUsage(
 
 bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
   StackSafetyDataFlowAnalysis SSDFA(
-      M, [this](Function &F) -> const StackSafetyInfo & {
-        return getAnalysis<StackSafetyInfoWrapperPass>(F).getResult();
+      M, [this](Function &F) -> const FunctionInfo & {
+        return getAnalysis<StackSafetyInfoWrapperPass>(F)
+            .getResult()
+            .getInfo()
+            .Info;
       });
   SSGI = SSDFA.run();
   return SetMetadata ? setStackSafetyMetadata(M, SSGI) : false;

From 9abb0e8d5be2ffad06ccfcc2d5530997ad093b81 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Sun, 24 May 2020 02:44:31 -0700
Subject: [PATCH 152/770] [NFC, StackSafety] Remove unnecessary data

---
 .../llvm/Analysis/StackSafetyAnalysis.h       |   3 +-
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     | 164 +++++++++---------
 2 files changed, 82 insertions(+), 85 deletions(-)

diff --git a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
index 9158f42481bf1..33a4b2c149c36 100644
--- a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
+++ b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
@@ -35,7 +35,7 @@ class StackSafetyInfo {
   const InfoTy &getInfo() const { return *Info; }
 
   // TODO: Add useful for client methods.
-  void print(raw_ostream &O) const;
+  void print(raw_ostream &O, const GlobalValue &F) const;
 };
 
 /// StackSafetyInfo wrapper for the new pass manager.
@@ -60,6 +60,7 @@ class StackSafetyPrinterPass : public PassInfoMixin<StackSafetyPrinterPass> {
 /// StackSafetyInfo wrapper for the legacy pass manager
 class StackSafetyInfoWrapperPass : public FunctionPass {
   Optional<StackSafetyInfo> SSI;
+  const Function *F = nullptr;
 
 public:
   static char ID;
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index b98a0e5880463..31f30d4b5d56f 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -13,6 +13,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
@@ -98,35 +99,6 @@ raw_ostream &operator<<(raw_ostream &OS, const UseInfo &U) {
   return OS;
 }
 
-struct AllocaInfo {
-  AllocaInst *AI = nullptr;
-  uint64_t Size = 0;
-  UseInfo Use;
-
-  AllocaInfo(unsigned PointerSize, AllocaInst *AI, uint64_t Size)
-      : AI(AI), Size(Size), Use(PointerSize) {}
-
-  StringRef getName() const { return AI->getName(); }
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const AllocaInfo &A) {
-  return OS << A.getName() << "[" << A.Size << "]: " << A.Use;
-}
-
-struct ParamInfo {
-  const Argument *Arg = nullptr;
-  UseInfo Use;
-
-  explicit ParamInfo(unsigned PointerSize, const Argument *Arg)
-      : Arg(Arg), Use(PointerSize) {}
-
-  StringRef getName() const { return Arg ? Arg->getName() : "<N/A>"; }
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const ParamInfo &P) {
-  return OS << P.getName() << "[]: " << P.Use;
-}
-
 /// Calculate the allocation size of a given alloca. Returns 0 if the
 /// size can not be statically determined.
 uint64_t getStaticAllocaAllocationSize(const AllocaInst *AI) {
@@ -146,20 +118,16 @@ uint64_t getStaticAllocaAllocationSize(const AllocaInst *AI) {
 
 /// Describes uses of allocas and parameters inside of a single function.
 struct FunctionInfo {
-  // May be a Function or a GlobalAlias
+  SmallVector<UseInfo, 4> Allocas;
+  SmallVector<UseInfo, 4> Params;
   const GlobalValue *GV = nullptr;
-  // Informations about allocas uses.
-  SmallVector<AllocaInfo, 4> Allocas;
-  // Informations about parameters uses.
-  SmallVector<ParamInfo, 4> Params;
   // TODO: describe return value as depending on one or more of its arguments.
 
   // StackSafetyDataFlowAnalysis counter stored here for faster access.
   int UpdateCount = 0;
 
   FunctionInfo() = default;
-  explicit FunctionInfo(const Function *F) : GV(F){};
-  // Creates FunctionInfo that forwards all the parameters to the aliasee.
+  FunctionInfo(const Function *F) : GV(F){};
   explicit FunctionInfo(const GlobalAlias *A);
 
   bool IsDSOLocal() const { return GV->isDSOLocal(); };
@@ -168,17 +136,36 @@ struct FunctionInfo {
 
   StringRef getName() const { return GV->getName(); }
 
-  void print(raw_ostream &O) const {
+  void print(raw_ostream &O, StringRef Name, const Function *F) const {
     // TODO: Consider different printout format after
     // StackSafetyDataFlowAnalysis. Calls and parameters are irrelevant then.
-    O << "  @" << getName() << (IsDSOLocal() ? "" : " dso_preemptable")
+    O << "  @" << Name << (IsDSOLocal() ? "" : " dso_preemptable")
       << (IsInterposable() ? " interposable" : "") << "\n";
+
     O << "    args uses:\n";
-    for (auto &P : Params)
-      O << "      " << P << "\n";
+    size_t Pos = 0;
+    for (auto &P : Params) {
+      StringRef Name = "<N/A>";
+      if (F)
+        Name = F->getArg(Pos)->getName();
+      O << "      " << Name << "[]: " << P << "\n";
+      ++Pos;
+    }
+
     O << "    allocas uses:\n";
-    for (auto &AS : Allocas)
-      O << "      " << AS << "\n";
+    if (F) {
+      size_t Pos = 0;
+      for (auto &I : instructions(F)) {
+        if (auto AI = dyn_cast<AllocaInst>(&I)) {
+          auto &AS = Allocas[Pos];
+          O << "      " << AI->getName() << "["
+            << getStaticAllocaAllocationSize(AI) << "]: " << AS << "\n";
+          ++Pos;
+        }
+      }
+    } else {
+      assert(Allocas.empty());
+    }
   }
 };
 
@@ -188,8 +175,8 @@ FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) {
   const FunctionType *Type = cast<FunctionType>(Aliasee->getValueType());
   // 'Forward' all parameters to this alias to the aliasee
   for (unsigned ArgNo = 0; ArgNo < Type->getNumParams(); ArgNo++) {
-    Params.emplace_back(PointerSize, nullptr);
-    UseInfo &US = Params.back().Use;
+    Params.emplace_back(PointerSize);
+    UseInfo &US = Params.back();
     US.Calls.emplace_back(Aliasee, ArgNo, ConstantRange(APInt(PointerSize, 0)));
   }
 }
@@ -394,21 +381,20 @@ FunctionInfo StackSafetyLocalAnalysis::run() {
 
   for (auto &I : instructions(F)) {
     if (auto AI = dyn_cast<AllocaInst>(&I)) {
-      Info.Allocas.emplace_back(PointerSize, AI,
-                                getStaticAllocaAllocationSize(AI));
-      AllocaInfo &AS = Info.Allocas.back();
-      analyzeAllUses(AI, AS.Use);
+      Info.Allocas.emplace_back(PointerSize);
+      UseInfo &AS = Info.Allocas.back();
+      analyzeAllUses(AI, AS);
     }
   }
 
   for (const Argument &A : make_range(F.arg_begin(), F.arg_end())) {
-    Info.Params.emplace_back(PointerSize, &A);
-    ParamInfo &PS = Info.Params.back();
-    analyzeAllUses(&A, PS.Use);
+    Info.Params.emplace_back(PointerSize);
+    UseInfo &PS = Info.Params.back();
+    analyzeAllUses(&A, PS);
   }
 
+  LLVM_DEBUG(Info.print(dbgs(), F.getName(), &F));
   LLVM_DEBUG(dbgs() << "[StackSafety] done\n");
-  LLVM_DEBUG(Info.print(dbgs()));
   return Info;
 }
 
@@ -473,7 +459,7 @@ StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
     return UnknownRange;
   if (ParamNo >= FS.Params.size()) // possibly vararg
     return UnknownRange;
-  return FS.Params[ParamNo].Use.Range;
+  return FS.Params[ParamNo].Range;
 }
 
 bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US,
@@ -501,14 +487,14 @@ void StackSafetyDataFlowAnalysis::updateOneNode(const GlobalValue *Callee,
   bool UpdateToFullSet = FS.UpdateCount > StackSafetyMaxIterations;
   bool Changed = false;
   for (auto &AS : FS.Allocas)
-    Changed |= updateOneUse(AS.Use, UpdateToFullSet);
+    Changed |= updateOneUse(AS, UpdateToFullSet);
   for (auto &PS : FS.Params)
-    Changed |= updateOneUse(PS.Use, UpdateToFullSet);
+    Changed |= updateOneUse(PS, UpdateToFullSet);
 
   if (Changed) {
     LLVM_DEBUG(dbgs() << "=== update [" << FS.UpdateCount
-                      << (UpdateToFullSet ? ", full-set" : "") << "] "
-                      << FS.getName() << "\n");
+                      << (UpdateToFullSet ? ", full-set" : "") << "] " << &FS
+                      << "\n");
     // Callers of this function may need updating.
     for (auto &CallerID : Callers[Callee])
       WorkList.insert(CallerID);
@@ -526,10 +512,10 @@ void StackSafetyDataFlowAnalysis::runDataFlow() {
     Callees.clear();
     FunctionInfo &FS = F.second;
     for (auto &AS : FS.Allocas)
-      for (auto &CS : AS.Use.Calls)
+      for (auto &CS : AS.Calls)
         Callees.push_back(CS.Callee);
     for (auto &PS : FS.Params)
-      for (auto &CS : PS.Use.Calls)
+      for (auto &CS : PS.Calls)
         Callees.push_back(CS.Callee);
 
     llvm::sort(Callees);
@@ -566,22 +552,6 @@ StackSafetyGlobalInfo StackSafetyDataFlowAnalysis::run() {
   return SSI;
 }
 
-void print(const StackSafetyGlobalInfo &SSI, raw_ostream &O, const Module &M) {
-  size_t Count = 0;
-  for (auto &F : M.functions())
-    if (!F.isDeclaration()) {
-      SSI.find(&F)->second.print(O);
-      O << "\n";
-      ++Count;
-    }
-  for (auto &A : M.aliases()) {
-    SSI.find(&A)->second.print(O);
-    O << "\n";
-    ++Count;
-  }
-  assert(Count == SSI.size() && "Unexpected functions in the result");
-}
-
 bool setStackSafetyMetadata(Module &M, const StackSafetyGlobalInfo &SSGI) {
   bool Changed = false;
   unsigned Width = M.getDataLayout().getPointerSizeInBits();
@@ -592,12 +562,18 @@ bool setStackSafetyMetadata(Module &M, const StackSafetyGlobalInfo &SSGI) {
     if (Iter == SSGI.end())
       continue;
     const FunctionInfo &Summary = Iter->second.getInfo().Info;
-    for (auto &AS : Summary.Allocas) {
-      ConstantRange AllocaRange{APInt(Width, 0), APInt(Width, AS.Size)};
-      if (AllocaRange.contains(AS.Use.Range)) {
-        AS.AI->setMetadata(M.getMDKindID("stack-safe"),
-                           MDNode::get(M.getContext(), None));
-        Changed = true;
+    size_t Pos = 0;
+    for (auto &I : instructions(F)) {
+      if (auto AI = dyn_cast<AllocaInst>(&I)) {
+        auto &AS = Summary.Allocas[Pos];
+        ConstantRange AllocaRange{
+            APInt(Width, 0), APInt(Width, getStaticAllocaAllocationSize(AI))};
+        if (AllocaRange.contains(AS.Range)) {
+          AI->setMetadata(M.getMDKindID("stack-safe"),
+                          MDNode::get(M.getContext(), None));
+          Changed = true;
+        }
+        ++Pos;
       }
     }
   }
@@ -614,7 +590,26 @@ StackSafetyInfo::StackSafetyInfo(InfoTy Info)
 
 StackSafetyInfo::~StackSafetyInfo() = default;
 
-void StackSafetyInfo::print(raw_ostream &O) const { Info->Info.print(O); }
+void StackSafetyInfo::print(raw_ostream &O, const GlobalValue &F) const {
+  Info->Info.print(O, F.getName(), dyn_cast<Function>(&F));
+}
+
+static void print(const StackSafetyGlobalInfo &SSI, raw_ostream &O,
+                  const Module &M) {
+  size_t Count = 0;
+  for (auto &F : M.functions())
+    if (!F.isDeclaration()) {
+      SSI.find(&F)->second.print(O, F);
+      O << "\n";
+      ++Count;
+    }
+  for (auto &A : M.aliases()) {
+    SSI.find(&A)->second.print(O, A);
+    O << "\n";
+    ++Count;
+  }
+  assert(Count == SSI.size() && "Unexpected functions in the result");
+}
 
 AnalysisKey StackSafetyAnalysis::Key;
 
@@ -627,7 +622,7 @@ StackSafetyInfo StackSafetyAnalysis::run(Function &F,
 PreservedAnalyses StackSafetyPrinterPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
   OS << "'Stack Safety Local Analysis' for function '" << F.getName() << "'\n";
-  AM.getResult<StackSafetyAnalysis>(F).print(OS);
+  AM.getResult<StackSafetyAnalysis>(F).print(OS, F);
   return PreservedAnalyses::all();
 }
 
@@ -643,13 +638,14 @@ void StackSafetyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 void StackSafetyInfoWrapperPass::print(raw_ostream &O, const Module *M) const {
-  SSI->print(O);
+  SSI->print(O, *F);
 }
 
 bool StackSafetyInfoWrapperPass::runOnFunction(Function &F) {
   StackSafetyLocalAnalysis SSLA(
       F, getAnalysis<ScalarEvolutionWrapperPass>().getSE());
   SSI = makeSSI(SSLA.run());
+  this->F = &F;
   return false;
 }
 

From 6e39379bbbe1d8aba658f638dfc42f0ba0cbb926 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Wed, 20 May 2020 15:30:58 -0700
Subject: [PATCH 153/770] [DwarfExpression] Support entry values for indirect
 parameters

Summary:
A struct argument can be passed-by-value to a callee via a pointer to a
temporary stack copy. Add support for emitting an entry value DBG_VALUE
when an indirect parameter DBG_VALUE becomes unavailable. This is done
by omitting DW_OP_stack_value from the entry value expression, to make
the expression describe the location of an object.

rdar://63373691

Reviewers: djtodoro, aprantl, dstenb

Subscribers: hiraditya, lldb-commits, llvm-commits

Tags: #lldb, #llvm

Differential Revision: https://reviews.llvm.org/D80345
---
 .../basic_entry_values/main.cpp               |  28 +++++
 llvm/docs/LangRef.rst                         |   8 +-
 .../CodeGen/AsmPrinter/DwarfCompileUnit.cpp   |   7 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    |   7 +-
 .../CodeGen/AsmPrinter/DwarfExpression.cpp    |  22 +++-
 llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h |  15 ++-
 llvm/lib/CodeGen/LiveDebugValues.cpp          |   6 +-
 ...bgcall-site-indirect-param-with-offset.mir | 102 +++++++++++++++
 .../AArch64/dbgcall-site-indirect-param.mir   | 117 ++++++++++++++++++
 9 files changed, 289 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param-with-offset.mir
 create mode 100644 llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param.mir

diff --git a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp
index c739a05f421e3..83f622cadf146 100644
--- a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp
+++ b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp
@@ -137,6 +137,30 @@ func14(int &sink, void (*target_no_tailcall)(int &, int)) {
   target_no_tailcall(sink, 123);
 }
 
+/// A structure that is guaranteed -- when passed to a callee by value -- to be
+/// passed via a pointer to a temporary copy in the caller. On x86_64 & aarch64
+/// only.
+struct StructPassedViaPointerToTemporaryCopy {
+  // Under the 64-bit AAPCS, a struct larger than 16 bytes is not SROA'd, and
+  // is instead passed via pointer to a temporary copy.
+  long a, b, c;
+  StructPassedViaPointerToTemporaryCopy() : a(1), b(2), c(3) {}
+
+  // Failing that, a virtual method forces passing via pointer to a temporary
+  // copy under the common calling conventions (e.g. 32/64-bit x86, Linux/Win,
+  // according to https://www.agner.org/optimize/calling_conventions.pdf).
+  virtual void add_vtable() {}
+};
+
+__attribute__((noinline)) void func15(StructPassedViaPointerToTemporaryCopy S) {
+  use<StructPassedViaPointerToTemporaryCopy &>(S);
+  use<int &>(dummy);
+
+  ++global;
+  //% self.filecheck("expr S", "main.cpp", "-check-prefix=FUNC15-EXPR")
+  // FUNC15-EXPR: (a = 1, b = 2, c = 3)
+}
+
 __attribute__((disable_tail_calls)) int main() {
   int sink = 0;
   S1 s1;
@@ -169,5 +193,9 @@ __attribute__((disable_tail_calls)) int main() {
   // Test that evaluation can "see through" an indirect tail call.
   func14(sink, func13);
 
+  // Test evaluation of an entry value that dereferences a temporary stack
+  // slot set up by the caller for a StructPassedViaPointerToTemporaryCopy.
+  func15(StructPassedViaPointerToTemporaryCopy());
+
   return 0;
 }
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 01f41a7ea3f17..0891392b1e61e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5104,9 +5104,11 @@ The current supported opcode vocabulary is limited:
 
   ``DW_OP_LLVM_entry_value`` is only legal in MIR. The operation is introduced
   by the ``LiveDebugValues`` pass; currently only for function parameters that
-  are unmodified throughout a function and that are described as simple
-  register location descriptions. The operation is also introduced by the
-  ``AsmPrinter`` pass when a call site parameter value
+  are unmodified throughout a function. Support is limited to function
+  parameters that are described as simple register location descriptions, or as
+  indirect locations (e.g. when a struct is passed-by-value to a callee via a
+  pointer to a temporary copy made in the caller). The entry value op is also
+  introduced by the ``AsmPrinter`` pass when a call site parameter value
   (``DW_AT_call_site_parameter_value``) is represented as entry value of the
   parameter.
 - ``DW_OP_breg`` (or ``DW_OP_bregx``) represents a content on the provided
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 93bf9d6c2f715..dce90b3c17c0d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1285,15 +1285,12 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
   DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
   const DIExpression *DIExpr = DV.getSingleExpression();
   DwarfExpr.addFragmentOffset(DIExpr);
-  if (Location.isIndirect())
-    DwarfExpr.setMemoryLocationKind();
+  DwarfExpr.setLocation(Location, DIExpr);
 
   DIExpressionCursor Cursor(DIExpr);
 
-  if (DIExpr->isEntryValue()) {
-    DwarfExpr.setEntryValueFlag();
+  if (DIExpr->isEntryValue())
     DwarfExpr.beginEntryValueExpression(Cursor);
-  }
 
   const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
   if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 17fcf692d913d..953154f0b10b6 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2399,14 +2399,11 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
       DwarfExpr.addUnsignedConstant(Value.getInt());
   } else if (Value.isLocation()) {
     MachineLocation Location = Value.getLoc();
-    if (Location.isIndirect())
-      DwarfExpr.setMemoryLocationKind();
+    DwarfExpr.setLocation(Location, DIExpr);
     DIExpressionCursor Cursor(DIExpr);
 
-    if (DIExpr->isEntryValue()) {
-      DwarfExpr.setEntryValueFlag();
+    if (DIExpr->isEntryValue())
       DwarfExpr.beginEntryValueExpression(Cursor);
-    }
 
     const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
     if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 69bc06cb94676..7b64c2238bd6a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -259,7 +259,8 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
     if (isEntryValue())
       finalizeEntryValue();
 
-    if (isEntryValue() && !isParameterValue() && DwarfVersion >= 4)
+    if (isEntryValue() && !isIndirect() && !isParameterValue() &&
+        DwarfVersion >= 4)
       emitOp(dwarf::DW_OP_stack_value);
 
     DwarfRegs.clear();
@@ -318,6 +319,25 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
   return true;
 }
 
+void DwarfExpression::setEntryValueFlags(const MachineLocation &Loc) {
+  LocationFlags |= EntryValue;
+  if (Loc.isIndirect())
+    LocationFlags |= Indirect;
+}
+
+void DwarfExpression::setLocation(const MachineLocation &Loc,
+                                  const DIExpression *DIExpr) {
+  if (Loc.isIndirect())
+    // Do not treat entry value descriptions of indirect parameters as memory
+    // locations. This allows DwarfExpression::addReg() to add DW_OP_regN to an
+    // entry value description.
+    if (!DIExpr->isEntryValue())
+      setMemoryLocationKind();
+
+  if (DIExpr->isEntryValue())
+    setEntryValueFlags(Loc);
+}
+
 void DwarfExpression::beginEntryValueExpression(
     DIExpressionCursor &ExprCursor) {
   auto Op = ExprCursor.take();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 5d43862827873..42be827cd5a09 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -30,6 +30,7 @@ class APInt;
 class DwarfCompileUnit;
 class DIELoc;
 class TargetRegisterInfo;
+class MachineLocation;
 
 /// Holds a DIExpression and keeps track of how many operands have been consumed
 /// so far.
@@ -142,14 +143,18 @@ class DwarfExpression {
   /// The kind of location description being produced.
   enum { Unknown = 0, Register, Memory, Implicit };
 
-  /// The flags of location description being produced.
-  enum { EntryValue = 1, CallSiteParamValue };
+  /// Additional location flags which may be combined with any location kind.
+  /// Currently, entry values are not supported for the Memory location kind.
+  enum { EntryValue = 1 << 0, Indirect = 1 << 1, CallSiteParamValue = 1 << 2 };
 
   unsigned LocationKind : 3;
-  unsigned LocationFlags : 2;
+  unsigned LocationFlags : 3;
   unsigned DwarfVersion : 4;
 
 public:
+  /// Set the location (\p Loc) and \ref DIExpression (\p DIExpr) to describe.
+  void setLocation(const MachineLocation &Loc, const DIExpression *DIExpr);
+
   bool isUnknownLocation() const { return LocationKind == Unknown; }
 
   bool isMemoryLocation() const { return LocationKind == Memory; }
@@ -160,6 +165,8 @@ class DwarfExpression {
 
   bool isEntryValue() const { return LocationFlags & EntryValue; }
 
+  bool isIndirect() const { return LocationFlags & Indirect; }
+
   bool isParameterValue() { return LocationFlags & CallSiteParamValue; }
 
   Optional<uint8_t> TagOffset;
@@ -296,7 +303,7 @@ class DwarfExpression {
   }
 
   /// Lock this down to become an entry value location.
-  void setEntryValueFlag() { LocationFlags |= EntryValue; }
+  void setEntryValueFlags(const MachineLocation &Loc);
 
   /// Lock this down to become a call site parameter location.
   void setCallSiteParamValueFlag() { LocationFlags |= CallSiteParamValue; }
diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp
index 470cb227fe7c8..00a6149a05404 100644
--- a/llvm/lib/CodeGen/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -1613,10 +1613,6 @@ bool LiveDebugValues::isEntryValueCandidate(
   if (MI.getDebugLoc()->getInlinedAt())
     return false;
 
-  // Do not consider indirect debug values (TODO: explain why).
-  if (MI.isIndirectDebugValue())
-    return false;
-
   // Only consider parameters that are described using registers. Parameters
   // that are passed on the stack are not yet supported, so ignore debug
   // values that are described by the frame or stack pointer.
@@ -1631,7 +1627,7 @@ bool LiveDebugValues::isEntryValueCandidate(
     return false;
 
   // TODO: Add support for parameters that have a pre-existing debug expressions
-  // (e.g. fragments, or indirect parameters using DW_OP_deref).
+  // (e.g. fragments).
   if (MI.getDebugExpression()->getNumElements() > 0)
     return false;
 
diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param-with-offset.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param-with-offset.mir
new file mode 100644
index 0000000000000..ee3a8e8ae5211
--- /dev/null
+++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param-with-offset.mir
@@ -0,0 +1,102 @@
+# RUN: llc -emit-call-site-info -start-before=livedebugvalues -stop-after=machineverifier -o - %s \
+# RUN:   | FileCheck %s -check-prefix=MIR
+
+# Copied from dbgcall-site-indirect-param.mir, with hand modifications:
+# an offset is added to the indirect parameter DBG_VALUE.
+#
+# We do not support emitting an entry value in this case.
+
+# MIR:      renamable $w0 = LDRWui killed renamable $x8
+# MIR-NOT:  DBG_VALUE $x0, 0, {{.*}}, !DIExpression(DW_OP_LLVM_entry_value
+# MIR-NEXT: BL @baz
+# MIR-NEXT: frame-destroy LDPXpost
+# MIR-NEXT: TCRETURNdi @baz
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "arm64-apple-ios10.0.0"
+
+  %struct.fat_ptr = type { i32*, i32*, i32* }
+
+  define i32 @bar(%struct.fat_ptr* nocapture readonly %f) local_unnamed_addr !dbg !13 {
+  entry:
+    call void @llvm.dbg.declare(metadata %struct.fat_ptr* %f, metadata !23, metadata !DIExpression()), !dbg !24
+    %ptr2 = bitcast %struct.fat_ptr* %f to i32**, !dbg !25
+    %0 = load i32*, i32** %ptr2, align 8, !dbg !25
+    %1 = load i32, i32* %0, align 4, !dbg !31
+    %call = tail call i32 @baz(i32 %1), !dbg !34
+    %call1 = tail call i32 @baz(i32 %call), !dbg !35
+    ret i32 %call1, !dbg !36
+  }
+
+  declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+  declare !dbg !4 i32 @baz(i32) local_unnamed_addr optsize
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!8, !9, !10, !11}
+  !llvm.ident = !{!12}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None, sysroot: "/")
+  !1 = !DIFile(filename: "indirect.c", directory: "/tmp/fatptr")
+  !2 = !{}
+  !3 = !{!4}
+  !4 = !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !5, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
+  !5 = !DISubroutineType(types: !6)
+  !6 = !{!7, !7}
+  !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !8 = !{i32 7, !"Dwarf Version", i32 4}
+  !9 = !{i32 2, !"Debug Info Version", i32 3}
+  !10 = !{i32 1, !"wchar_size", i32 4}
+  !11 = !{i32 7, !"PIC Level", i32 2}
+  !12 = !{!"clang"}
+  !13 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !14, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !22)
+  !14 = !DISubroutineType(types: !15)
+  !15 = !{!7, !16}
+  !16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "fat_ptr", file: !1, line: 1, size: 192, elements: !17)
+  !17 = !{!18, !20, !21}
+  !18 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !16, file: !1, line: 2, baseType: !19, size: 64)
+  !19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+  !20 = !DIDerivedType(tag: DW_TAG_member, name: "low", scope: !16, file: !1, line: 2, baseType: !19, size: 64, offset: 64)
+  !21 = !DIDerivedType(tag: DW_TAG_member, name: "high", scope: !16, file: !1, line: 2, baseType: !19, size: 64, offset: 128)
+  !22 = !{!23}
+  !23 = !DILocalVariable(name: "f", arg: 1, scope: !13, file: !1, line: 5, type: !16)
+  !24 = !DILocation(line: 5, column: 24, scope: !13)
+  !25 = !DILocation(line: 6, column: 23, scope: !13)
+  !31 = !DILocation(line: 6, column: 20, scope: !13)
+  !34 = !DILocation(line: 6, column: 16, scope: !13)
+  !35 = !DILocation(line: 6, column: 12, scope: !13)
+  !36 = !DILocation(line: 6, column: 5, scope: !13)
+
+...
+---
+name:            bar
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:
+  - { bb: 0, offset: 8, fwdArgRegs:
+      - { arg: 0, reg: '$w0' } }
+  - { bb: 0, offset: 10, fwdArgRegs:
+      - { arg: 0, reg: '$w0' } }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $lr
+
+    DBG_VALUE $x0, 0, !23, !DIExpression(DW_OP_plus_uconst, 12), debug-location !24
+    early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0)
+    $fp = frame-setup ADDXri $sp, 0, 0
+    frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+    frame-setup CFI_INSTRUCTION offset $w30, -8, debug-location !25
+    frame-setup CFI_INSTRUCTION offset $w29, -16, debug-location !25
+    renamable $x8 = LDRXui killed renamable $x0, 0, debug-location !25 :: (load 8 from %ir.ptr2)
+    renamable $w0 = LDRWui killed renamable $x8, 0, debug-location !31 :: (load 4 from %ir.0)
+    BL @baz, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0, debug-location !34
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2, debug-location !35 :: (load 8 from %stack.1), (load 8 from %stack.0)
+    TCRETURNdi @baz, 0, csr_aarch64_aapcs, implicit $sp, implicit $w0, debug-location !35
+
+...
diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param.mir
new file mode 100644
index 0000000000000..d7edd1c654adf
--- /dev/null
+++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-indirect-param.mir
@@ -0,0 +1,117 @@
+# RUN: llc -emit-call-site-info -start-before=livedebugvalues -stop-after=machineverifier -o - %s \
+# RUN:   | FileCheck %s -check-prefix=MIR
+
+# RUN: llc -emit-call-site-info -start-before=livedebugvalues -filetype=obj -o - %s \
+# RUN:   | llvm-dwarfdump - | FileCheck %s -check-prefix=DWARF -implicit-check-not=DW_OP_entry_value
+
+# // Original Source
+# struct fat_ptr {
+#   int *ptr, *low, *high;
+# };
+# extern int baz(int x);
+# int bar(struct fat_ptr f) {
+#   return baz(baz(*f.ptr));
+# }
+
+# MIR:      renamable $w0 = LDRWui killed renamable $x8
+# MIR-NEXT: DBG_VALUE $x0, 0, {{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1)
+# MIR-NEXT: BL @baz
+# MIR-NEXT: frame-destroy LDPXpost
+# MIR-NEXT: TCRETURNdi @baz
+
+# After w0 is clobbered, we should get an indirect parameter entry value for "f".
+
+# DWARF-LABEL: DW_TAG_formal_parameter
+# DWARF-NEXT: DW_AT_location
+# DWARF-NEXT: [0x0000000000000000, 0x0000000000000010): DW_OP_breg0 W0+0
+# DWARF-NEXT: [0x0000000000000010, 0x000000000000001c): DW_OP_entry_value(DW_OP_reg0 W0))
+# DWARF-NEXT: DW_AT_name    ("f")
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "arm64-apple-ios10.0.0"
+
+  %struct.fat_ptr = type { i32*, i32*, i32* }
+
+  define i32 @bar(%struct.fat_ptr* nocapture readonly %f) local_unnamed_addr !dbg !13 {
+  entry:
+    call void @llvm.dbg.declare(metadata %struct.fat_ptr* %f, metadata !23, metadata !DIExpression()), !dbg !24
+    %ptr2 = bitcast %struct.fat_ptr* %f to i32**, !dbg !25
+    %0 = load i32*, i32** %ptr2, align 8, !dbg !25
+    %1 = load i32, i32* %0, align 4, !dbg !31
+    %call = tail call i32 @baz(i32 %1), !dbg !34
+    %call1 = tail call i32 @baz(i32 %call), !dbg !35
+    ret i32 %call1, !dbg !36
+  }
+
+  declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+  declare !dbg !4 i32 @baz(i32) local_unnamed_addr optsize
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!8, !9, !10, !11}
+  !llvm.ident = !{!12}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None, sysroot: "/")
+  !1 = !DIFile(filename: "indirect.c", directory: "/tmp/fatptr")
+  !2 = !{}
+  !3 = !{!4}
+  !4 = !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !5, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
+  !5 = !DISubroutineType(types: !6)
+  !6 = !{!7, !7}
+  !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !8 = !{i32 7, !"Dwarf Version", i32 4}
+  !9 = !{i32 2, !"Debug Info Version", i32 3}
+  !10 = !{i32 1, !"wchar_size", i32 4}
+  !11 = !{i32 7, !"PIC Level", i32 2}
+  !12 = !{!"clang"}
+  !13 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !14, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !22)
+  !14 = !DISubroutineType(types: !15)
+  !15 = !{!7, !16}
+  !16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "fat_ptr", file: !1, line: 1, size: 192, elements: !17)
+  !17 = !{!18, !20, !21}
+  !18 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !16, file: !1, line: 2, baseType: !19, size: 64)
+  !19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+  !20 = !DIDerivedType(tag: DW_TAG_member, name: "low", scope: !16, file: !1, line: 2, baseType: !19, size: 64, offset: 64)
+  !21 = !DIDerivedType(tag: DW_TAG_member, name: "high", scope: !16, file: !1, line: 2, baseType: !19, size: 64, offset: 128)
+  !22 = !{!23}
+  !23 = !DILocalVariable(name: "f", arg: 1, scope: !13, file: !1, line: 5, type: !16)
+  !24 = !DILocation(line: 5, column: 24, scope: !13)
+  !25 = !DILocation(line: 6, column: 23, scope: !13)
+  !31 = !DILocation(line: 6, column: 20, scope: !13)
+  !34 = !DILocation(line: 6, column: 16, scope: !13)
+  !35 = !DILocation(line: 6, column: 12, scope: !13)
+  !36 = !DILocation(line: 6, column: 5, scope: !13)
+
+...
+---
+name:            bar
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:
+  - { bb: 0, offset: 8, fwdArgRegs:
+      - { arg: 0, reg: '$w0' } }
+  - { bb: 0, offset: 10, fwdArgRegs:
+      - { arg: 0, reg: '$w0' } }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $lr
+
+    DBG_VALUE $x0, 0, !23, !DIExpression(), debug-location !24
+    early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0)
+    $fp = frame-setup ADDXri $sp, 0, 0
+    frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+    frame-setup CFI_INSTRUCTION offset $w30, -8, debug-location !25
+    frame-setup CFI_INSTRUCTION offset $w29, -16, debug-location !25
+    renamable $x8 = LDRXui killed renamable $x0, 0, debug-location !25 :: (load 8 from %ir.ptr2)
+    renamable $w0 = LDRWui killed renamable $x8, 0, debug-location !31 :: (load 4 from %ir.0)
+    BL @baz, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0, debug-location !34
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2, debug-location !35 :: (load 8 from %stack.1), (load 8 from %stack.0)
+    TCRETURNdi @baz, 0, csr_aarch64_aapcs, implicit $sp, implicit $w0, debug-location !35
+
+...

From 6a74ad6baad45b8572d196f7f290593ed62075b5 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Tue, 26 May 2020 14:27:22 -0700
Subject: [PATCH 154/770] [sancov] Accommodate sancov and coverage report
 server for use under Windows

Summary:
This patch makes the following changes to SanCov and its complementary Python script in order to resolve issues pertaining to non-UNIX file paths in JSON symbolization information:
* Convert all paths to use forward slash.
* Update `coverage-report-server.py` to correctly handle paths to sources which contain spaces.
* Remove Linux platform restriction for all SanCov unit tests. All SanCov tests passed when run on my local Windows machine.

Patch by Douglas Gliner.

Reviewers: kcc, filcab, phosek, morehouse, vitalybuka, metzman

Reviewed By: vitalybuka

Subscribers: vsk, Dor1s, llvm-commits

Tags: #sanitizers, #llvm

Differential Revision: https://reviews.llvm.org/D51018
---
 llvm/test/tools/sancov/blacklist.test                   | 2 +-
 llvm/test/tools/sancov/covered_functions.test           | 2 +-
 llvm/test/tools/sancov/merge.test                       | 2 +-
 llvm/test/tools/sancov/not_covered_functions.test       | 2 +-
 llvm/test/tools/sancov/print.test                       | 2 +-
 llvm/test/tools/sancov/stats.test                       | 2 +-
 llvm/test/tools/sancov/symbolize.test                   | 8 +++++---
 llvm/test/tools/sancov/symbolize_noskip_dead_files.test | 2 +-
 llvm/test/tools/sancov/validation.test                  | 2 +-
 llvm/tools/sancov/coverage-report-server.py             | 6 ++++--
 llvm/tools/sancov/sancov.cpp                            | 2 +-
 11 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/llvm/test/tools/sancov/blacklist.test b/llvm/test/tools/sancov/blacklist.test
index 53f48534dc955..a9cf47f53cdb5 100644
--- a/llvm/test/tools/sancov/blacklist.test
+++ b/llvm/test/tools/sancov/blacklist.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target && host-byteorder-little-endian
 RUN: sancov -covered-functions %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s --check-prefix=ALL
 RUN: sancov -covered-functions -blacklist %p/Inputs/fun_blacklist.txt %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s
 RUN: sancov -covered-functions -blacklist %p/Inputs/src_blacklist.txt %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.1.sancov | FileCheck --check-prefix=CHECK1 %s
diff --git a/llvm/test/tools/sancov/covered_functions.test b/llvm/test/tools/sancov/covered_functions.test
index 8126049a0ca1d..bcdfaf8879d41 100644
--- a/llvm/test/tools/sancov/covered_functions.test
+++ b/llvm/test/tools/sancov/covered_functions.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target && host-byteorder-little-endian
 RUN: sancov -covered-functions %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s
 RUN: sancov -covered-functions -strip_path_prefix=Inputs/ %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck --check-prefix=STRIP_PATH %s
 RUN: sancov -demangle=0 -covered-functions %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck --check-prefix=NO_DEMANGLE %s
diff --git a/llvm/test/tools/sancov/merge.test b/llvm/test/tools/sancov/merge.test
index 9c5ca9e6244ca..6c867654583ca 100644
--- a/llvm/test/tools/sancov/merge.test
+++ b/llvm/test/tools/sancov/merge.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target
 RUN: sancov -merge %p/Inputs/test-linux_x86_64.0.symcov| FileCheck --check-prefix=MERGE1 %s
 RUN: sancov -merge %p/Inputs/test-linux_x86_64.0.symcov %p/Inputs/test-linux_x86_64.1.symcov| FileCheck --check-prefix=MERGE2 %s
 
diff --git a/llvm/test/tools/sancov/not_covered_functions.test b/llvm/test/tools/sancov/not_covered_functions.test
index 4e0e81a52c2c8..d1b91f6e56820 100644
--- a/llvm/test/tools/sancov/not_covered_functions.test
+++ b/llvm/test/tools/sancov/not_covered_functions.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target && host-byteorder-little-endian
 RUN: sancov -skip-dead-files=0 -not-covered-functions %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s
 RUN: sancov -not-covered-functions %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.1.sancov | FileCheck --check-prefix=CHECK1 --allow-empty %s
 
diff --git a/llvm/test/tools/sancov/print.test b/llvm/test/tools/sancov/print.test
index fe94216b051a2..62ab3d991b8e3 100644
--- a/llvm/test/tools/sancov/print.test
+++ b/llvm/test/tools/sancov/print.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target && host-byteorder-little-endian
 RUN: sancov -print %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s
 
 CHECK: 0x4e132b
diff --git a/llvm/test/tools/sancov/stats.test b/llvm/test/tools/sancov/stats.test
index 030d16a9dc616..46ff6e5e5db10 100644
--- a/llvm/test/tools/sancov/stats.test
+++ b/llvm/test/tools/sancov/stats.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target && host-byteorder-little-endian
 RUN: sancov -print-coverage-stats %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s
 
 CHECK: all-edges: 8
diff --git a/llvm/test/tools/sancov/symbolize.test b/llvm/test/tools/sancov/symbolize.test
index 3cc426f919b96..acf58ae117123 100644
--- a/llvm/test/tools/sancov/symbolize.test
+++ b/llvm/test/tools/sancov/symbolize.test
@@ -1,5 +1,6 @@
-REQUIRES: x86_64-linux
-RUN: sancov -symbolize -strip_path_prefix="llvm/" %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s
+REQUIRES: x86-registered-target && host-byteorder-little-endian
+RUN: sancov -symbolize -strip_path_prefix="llvm/" %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s --check-prefixes=CHECK,STRIP
+RUN: sancov -symbolize %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s --check-prefixes=CHECK,NOSTRIP
 
 CHECK: {
 CHECK-NEXT:  "covered-points": [
@@ -11,7 +12,8 @@ CHECK-NEXT:    "4e1586"
 CHECK-NEXT:  ],
 CHECK-NEXT:  "binary-hash": "BB3CDD5045AED83906F6ADCC1C4DAF7E2596A6B5",
 CHECK-NEXT:  "point-symbol-info": {
-CHECK-NEXT:    "test/tools/sancov/Inputs/test.cpp": {
+STRIP-NEXT:    "test/tools/sancov/Inputs/test.cpp": {
+NOSTRIP-NEXT:  "/usr/local/google/home/aizatsky/src/llvm/test/tools/sancov/Inputs/test.cpp": {
 CHECK-NEXT:      "bar(std::string)": {
 CHECK-NEXT:        "4e132b": "12:0"
 CHECK-NEXT:      },
diff --git a/llvm/test/tools/sancov/symbolize_noskip_dead_files.test b/llvm/test/tools/sancov/symbolize_noskip_dead_files.test
index 9ddf89cbf56d2..0038ea197735a 100644
--- a/llvm/test/tools/sancov/symbolize_noskip_dead_files.test
+++ b/llvm/test/tools/sancov/symbolize_noskip_dead_files.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target && host-byteorder-little-endian
 RUN: sancov -symbolize -skip-dead-files=0 -strip_path_prefix="llvm/" %p/Inputs/test-linux_x86_64 %p/Inputs/test-linux_x86_64.0.sancov | FileCheck %s
 
 CHECK: {
diff --git a/llvm/test/tools/sancov/validation.test b/llvm/test/tools/sancov/validation.test
index 437870cf597bf..fdcfd0610f82e 100644
--- a/llvm/test/tools/sancov/validation.test
+++ b/llvm/test/tools/sancov/validation.test
@@ -1,4 +1,4 @@
-REQUIRES: x86_64-linux
+REQUIRES: x86-registered-target
 RUN: not sancov -covered-functions %p/Inputs/test-linux_x86_64 2>&1 | FileCheck --check-prefix=NOCFILE %s
 
 NOCFILE: WARNING: No coverage file for {{.*}}test-linux_x86_64
diff --git a/llvm/tools/sancov/coverage-report-server.py b/llvm/tools/sancov/coverage-report-server.py
index 251d8f1b77bac..5ea978fae642a 100755
--- a/llvm/tools/sancov/coverage-report-server.py
+++ b/llvm/tools/sancov/coverage-report-server.py
@@ -32,6 +32,7 @@
 import os
 import string
 import math
+import urllib
 
 INDEX_PAGE_TMPL = """
 <html>
@@ -128,6 +129,7 @@ class ServerHandler(http.server.BaseHTTPRequestHandler):
     src_path = None
 
     def do_GET(self):
+        norm_path = os.path.normpath(urllib.parse.unquote(self.path[1:]))
         if self.path == '/':
             self.send_response(200)
             self.send_header("Content-type", "text/html; charset=utf-8")
@@ -147,8 +149,8 @@ def do_GET(self):
             response = string.Template(INDEX_PAGE_TMPL).safe_substitute(
                 filenames='\n'.join(filelist))
             self.wfile.write(response.encode('UTF-8', 'replace'))
-        elif self.symcov_data.has_file(self.path[1:]):
-            filename = self.path[1:]
+        elif self.symcov_data.has_file(norm_path):
+            filename = norm_path
             filepath = os.path.join(self.src_path, filename) 
             if not os.path.exists(filepath):
                 self.send_response(404)
diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp
index ed384a2710072..6f949f2963658 100644
--- a/llvm/tools/sancov/sancov.cpp
+++ b/llvm/tools/sancov/sancov.cpp
@@ -471,7 +471,7 @@ static std::unique_ptr<symbolize::LLVMSymbolizer> createSymbolizer() {
 static std::string normalizeFilename(const std::string &FileName) {
   SmallString<256> S(FileName);
   sys::path::remove_dots(S, /* remove_dot_dot */ true);
-  return stripPathPrefix(std::string(S));
+  return stripPathPrefix(sys::path::convert_to_slash(std::string(S)));
 }
 
 class Blacklists {

From 1e06b169be3e59799b8dcaf16d1d03bd4c12da42 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jroelofs@jroelofs.com>
Date: Fri, 22 May 2020 06:53:55 -0600
Subject: [PATCH 155/770] [clang][docs] Document additional bits of libc that
 -ffreestanding envs must provide

Differential Revision: https://reviews.llvm.org/D80436
---
 clang/docs/CommandGuide/clang.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/docs/CommandGuide/clang.rst b/clang/docs/CommandGuide/clang.rst
index 6947450beb43d..de0e0eda90974 100644
--- a/clang/docs/CommandGuide/clang.rst
+++ b/clang/docs/CommandGuide/clang.rst
@@ -246,7 +246,9 @@ Language Selection and Mode Options
 .. option:: -ffreestanding
 
  Indicate that the file should be compiled for a freestanding, not a hosted,
- environment.
+ environment. Note that it is assumed that a freestanding environment will
+ additionally provide `memcpy`, `memmove`, `memset` and `memcmp`
+ implementations, as these are needed for efficient codegen for many programs.
 
 .. option:: -fno-builtin
 

From 9eacda51fa23abf4f6503ff533dcb70071cbe569 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson@sony.com>
Date: Tue, 26 May 2020 22:33:59 +0100
Subject: [PATCH 156/770] [debuginfo] Fix broken tests from MachineLICM
 salvaging fix

Previous commit: bd7ff5d94f

- Added missing x86 triples
- Added missing asserts
---
 llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir | 4 +++-
 llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir  | 3 ++-
 llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir                | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
index 97cdea090c9c5..8b6e160cd92ae 100644
--- a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-post-regalloc.mir
@@ -1,5 +1,6 @@
 --- |
-  ; RUN: llc -start-before=phi-node-elimination -stop-after=machinelicm -o - %s  | FileCheck %s
+  ; REQUIRES: asserts
+  ; RUN: llc -start-before=phi-node-elimination -stop-after=machinelicm -debug-only=machinelicm -o - %s  | FileCheck %s
   ; Ensure we execute machinelicm post register allocation.
   ; Line numbers should not be retained when loop invariant instructions are hoisted.
   ;
@@ -7,6 +8,7 @@
   ; CHECK:        MOV64rm $rip, 1, $noreg, target-flags(x86-gotpcrel) @x, $noreg :: (load 8 from got)
   ; CHECK-LABEL:  bb.1.while.body:
   ;
+  target triple = "x86_64-unknown-linux-gnu"
 
   @x = common local_unnamed_addr global i32 0, align 4, !dbg !0
 
diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
index 8c0eb376eb408..fa5da8f1fe4c8 100644
--- a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
@@ -1,4 +1,4 @@
---- |
+--- | 
   ; RUN: llc -run-pass=machinelicm -o - %s | FileCheck %s
   ; Line numbers should not be retained when loop invariant instructions are hoisted.
   ; Doing so causes poor stepping bevavior.
@@ -23,6 +23,7 @@
   ;
   ; ModuleID = 'tx.ll'
   source_filename = "t.c"
+  target triple = "x86_64-unknown-linux-gnu"
 
   @x = common local_unnamed_addr global i32 0, align 4, !dbg !0
 
diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir
index 7b5a19ffa9e7f..24fbe71b2a349 100644
--- a/llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-sink.mir
@@ -4,6 +4,7 @@
   ; CHECK: %0:gr64 = nuw ADD64ri8 %9, 4, implicit-def dead $eflags
   ; 
   ; When instructions are sunk to prevent register spills, line numbers should not be retained.
+   target triple = "x86_64-unknown-linux-gnu"
 
   %struct.A = type { i32, i32, i32, i32, i32, i32 }
   

From 5192783bb29c32196f87044de113fc43d7dfaae8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kirst=C3=B3f=20Umann?= <dkszelethus@gmail.com>
Date: Mon, 13 Apr 2020 20:51:27 +0200
Subject: [PATCH 157/770] [analyzer][RetainCount] Tie diagnostics to
 osx.cocoa.RetainCount rather than RetainCountBase, for the most part

Similarly to other patches of mine, I'm trying to make the checker interface
uniform so that dependency checkers don't emit diagnostics. The checker that
made me most anxious so far was RetainCount, because it is definitely
impacted by backward compatibility concerns, and implements a checker hierarchy
that is very different from other examples of similar size. Also, I have
neither authority over nor expertise in ObjC-related code, so I welcome any
objection/discussion!

Differential Revision: https://reviews.llvm.org/D78099
---
 .../RetainCountChecker/RetainCountChecker.cpp |  71 ++-
 .../RetainCountChecker/RetainCountChecker.h   |  32 +-
 .../RetainCountDiagnostics.cpp                |  19 +-
 .../RetainCountDiagnostics.h                  |  17 +-
 .../Inputs/expected-plists/edges-new.mm.plist |  12 +-
 .../Inputs/expected-plists/objc-arc.m.plist   |  24 +-
 .../objc-radar17039661.m.plist                |   4 +-
 .../plist-output-alternate.m.plist            |   4 +-
 .../expected-plists/plist-output.m.plist      |   4 +-
 .../retain-release-path-notes.m.plist         | 108 ++---
 .../retain-release.m.objc.plist               | 420 +++++++++---------
 .../retain-release.m.objcpp.plist             | 420 +++++++++---------
 .../test/Analysis/incorrect-checker-names.mm  |   2 +-
 .../Inputs/expected-plists/path-notes.m.plist |   4 +-
 .../Analysis/test-separate-retaincount.cpp    |  14 +-
 15 files changed, 591 insertions(+), 564 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
index 280d511e87c56..3f3267ff93916 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
@@ -12,12 +12,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "RetainCountChecker.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 
 using namespace clang;
 using namespace ento;
 using namespace retaincountchecker;
-using llvm::StrInStrNoCase;
 
 REGISTER_MAP_WITH_PROGRAMSTATE(RefBindings, SymbolRef, RefVal)
 
@@ -701,7 +701,7 @@ void RetainCountChecker::checkSummary(const RetainSummary &Summ,
 
   for (ProgramStateRef St : Out) {
     if (DeallocSent) {
-      C.addTransition(St, C.getPredecessor(), &DeallocSentTag);
+      C.addTransition(St, C.getPredecessor(), &getDeallocSentTag());
     } else {
       C.addTransition(St);
     }
@@ -844,13 +844,13 @@ RetainCountChecker::errorKindToBugKind(RefVal::Kind ErrorKind,
                                        SymbolRef Sym) const {
   switch (ErrorKind) {
     case RefVal::ErrorUseAfterRelease:
-      return useAfterRelease;
+      return *UseAfterRelease;
     case RefVal::ErrorReleaseNotOwned:
-      return releaseNotOwned;
+      return *ReleaseNotOwned;
     case RefVal::ErrorDeallocNotOwned:
       if (Sym->getType()->getPointeeCXXRecordDecl())
-        return freeNotOwned;
-      return deallocNotOwned;
+        return *FreeNotOwned;
+      return *DeallocNotOwned;
     default:
       llvm_unreachable("Unhandled error.");
   }
@@ -946,7 +946,7 @@ bool RetainCountChecker::evalCall(const CallEvent &Call,
       // Assume that output is zero on the other branch.
       NullOutputState = NullOutputState->BindExpr(
           CE, LCtx, C.getSValBuilder().makeNull(), /*Invalidate=*/false);
-      C.addTransition(NullOutputState, &CastFailTag);
+      C.addTransition(NullOutputState, &getCastFailTag());
 
       // And on the original branch assume that both input and
       // output are non-zero.
@@ -1095,7 +1095,7 @@ ExplodedNode * RetainCountChecker::checkReturnWithRetEffect(const ReturnStmt *S,
         if (N) {
           const LangOptions &LOpts = C.getASTContext().getLangOpts();
           auto R =
-              std::make_unique<RefLeakReport>(leakAtReturn, LOpts, N, Sym, C);
+              std::make_unique<RefLeakReport>(*LeakAtReturn, LOpts, N, Sym, C);
           C.emitReport(std::move(R));
         }
         return N;
@@ -1120,7 +1120,7 @@ ExplodedNode * RetainCountChecker::checkReturnWithRetEffect(const ReturnStmt *S,
         ExplodedNode *N = C.addTransition(state, Pred, &ReturnNotOwnedTag);
         if (N) {
           auto R = std::make_unique<RefCountReport>(
-              returnNotOwnedForOwned, C.getASTContext().getLangOpts(), N, Sym);
+              *ReturnNotOwnedForOwned, C.getASTContext().getLangOpts(), N, Sym);
           C.emitReport(std::move(R));
         }
         return N;
@@ -1273,8 +1273,8 @@ RetainCountChecker::handleAutoreleaseCounts(ProgramStateRef state,
     os << "has a +" << V.getCount() << " retain count";
 
     const LangOptions &LOpts = Ctx.getASTContext().getLangOpts();
-    auto R = std::make_unique<RefCountReport>(overAutorelease, LOpts, N, Sym,
-                                               os.str());
+    auto R = std::make_unique<RefCountReport>(*OverAutorelease, LOpts, N, Sym,
+                                              os.str());
     Ctx.emitReport(std::move(R));
   }
 
@@ -1320,7 +1320,7 @@ RetainCountChecker::processLeaks(ProgramStateRef state,
 
   if (N) {
     for (SymbolRef L : Leaked) {
-      const RefCountBug &BT = Pred ? leakWithinFunction : leakAtReturn;
+      const RefCountBug &BT = Pred ? *LeakWithinFunction : *LeakAtReturn;
       Ctx.emitReport(std::make_unique<RefLeakReport>(BT, LOpts, N, L, Ctx));
     }
   }
@@ -1473,19 +1473,39 @@ void RetainCountChecker::printState(raw_ostream &Out, ProgramStateRef State,
 // Checker registration.
 //===----------------------------------------------------------------------===//
 
+std::unique_ptr<CheckerProgramPointTag> RetainCountChecker::DeallocSentTag;
+std::unique_ptr<CheckerProgramPointTag> RetainCountChecker::CastFailTag;
+
 void ento::registerRetainCountBase(CheckerManager &Mgr) {
-  Mgr.registerChecker<RetainCountChecker>();
+  auto *Chk = Mgr.registerChecker<RetainCountChecker>();
+  Chk->DeallocSentTag =
+      std::make_unique<CheckerProgramPointTag>(Chk, "DeallocSent");
+  Chk->CastFailTag =
+      std::make_unique<CheckerProgramPointTag>(Chk, "DynamicCastFail");
 }
 
 bool ento::shouldRegisterRetainCountBase(const CheckerManager &mgr) {
   return true;
 }
-
 void ento::registerRetainCountChecker(CheckerManager &Mgr) {
   auto *Chk = Mgr.getChecker<RetainCountChecker>();
   Chk->TrackObjCAndCFObjects = true;
   Chk->TrackNSCFStartParam = Mgr.getAnalyzerOptions().getCheckerBooleanOption(
       Mgr.getCurrentCheckerName(), "TrackNSCFStartParam");
+
+#define INIT_BUGTYPE(KIND)                                                     \
+  Chk->KIND = std::make_unique<RefCountBug>(Mgr.getCurrentCheckerName(),       \
+                                            RefCountBug::KIND);
+  // TODO: Ideally, we should have a checker for each of these bug types.
+  INIT_BUGTYPE(UseAfterRelease)
+  INIT_BUGTYPE(ReleaseNotOwned)
+  INIT_BUGTYPE(DeallocNotOwned)
+  INIT_BUGTYPE(FreeNotOwned)
+  INIT_BUGTYPE(OverAutorelease)
+  INIT_BUGTYPE(ReturnNotOwnedForOwned)
+  INIT_BUGTYPE(LeakWithinFunction)
+  INIT_BUGTYPE(LeakAtReturn)
+#undef INIT_BUGTYPE
 }
 
 bool ento::shouldRegisterRetainCountChecker(const CheckerManager &mgr) {
@@ -1495,6 +1515,29 @@ bool ento::shouldRegisterRetainCountChecker(const CheckerManager &mgr) {
 void ento::registerOSObjectRetainCountChecker(CheckerManager &Mgr) {
   auto *Chk = Mgr.getChecker<RetainCountChecker>();
   Chk->TrackOSObjects = true;
+
+  // FIXME: We want bug reports to always have the same checker name associated
+  // with them, yet here, if RetainCountChecker is disabled but
+  // OSObjectRetainCountChecker is enabled, the checker names will be different.
+  // This hack will make it so that the checker name depends on which checker is
+  // enabled rather than on the registration order.
+  // For the most part, we want **non-hidden checkers** to be associated with
+  // diagnostics, and **hidden checker options** with the fine-tuning of
+  // modeling. Following this logic, OSObjectRetainCountChecker should be the
+  // latter, but we can't just remove it for backward compatibility reasons.
+#define LAZY_INIT_BUGTYPE(KIND)                                                \
+  if (!Chk->KIND)                                                              \
+    Chk->KIND = std::make_unique<RefCountBug>(Mgr.getCurrentCheckerName(),     \
+                                              RefCountBug::KIND);
+  LAZY_INIT_BUGTYPE(UseAfterRelease)
+  LAZY_INIT_BUGTYPE(ReleaseNotOwned)
+  LAZY_INIT_BUGTYPE(DeallocNotOwned)
+  LAZY_INIT_BUGTYPE(FreeNotOwned)
+  LAZY_INIT_BUGTYPE(OverAutorelease)
+  LAZY_INIT_BUGTYPE(ReturnNotOwnedForOwned)
+  LAZY_INIT_BUGTYPE(LeakWithinFunction)
+  LAZY_INIT_BUGTYPE(LeakAtReturn)
+#undef LAZY_INIT_BUGTYPE
 }
 
 bool ento::shouldRegisterOSObjectRetainCountChecker(const CheckerManager &mgr) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h
index dd79bbef321c3..223e28c2c5b86 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.h
@@ -251,20 +251,20 @@ class RetainCountChecker
                     eval::Assume,
                     eval::Call > {
 
-  RefCountBug useAfterRelease{this, RefCountBug::UseAfterRelease};
-  RefCountBug releaseNotOwned{this, RefCountBug::ReleaseNotOwned};
-  RefCountBug deallocNotOwned{this, RefCountBug::DeallocNotOwned};
-  RefCountBug freeNotOwned{this, RefCountBug::FreeNotOwned};
-  RefCountBug overAutorelease{this, RefCountBug::OverAutorelease};
-  RefCountBug returnNotOwnedForOwned{this, RefCountBug::ReturnNotOwnedForOwned};
-  RefCountBug leakWithinFunction{this, RefCountBug::LeakWithinFunction};
-  RefCountBug leakAtReturn{this, RefCountBug::LeakAtReturn};
-
-  CheckerProgramPointTag DeallocSentTag{this, "DeallocSent"};
-  CheckerProgramPointTag CastFailTag{this, "DynamicCastFail"};
+public:
+  std::unique_ptr<RefCountBug> UseAfterRelease;
+  std::unique_ptr<RefCountBug> ReleaseNotOwned;
+  std::unique_ptr<RefCountBug> DeallocNotOwned;
+  std::unique_ptr<RefCountBug> FreeNotOwned;
+  std::unique_ptr<RefCountBug> OverAutorelease;
+  std::unique_ptr<RefCountBug> ReturnNotOwnedForOwned;
+  std::unique_ptr<RefCountBug> LeakWithinFunction;
+  std::unique_ptr<RefCountBug> LeakAtReturn;
 
   mutable std::unique_ptr<RetainSummaryManager> Summaries;
-public:
+
+  static std::unique_ptr<CheckerProgramPointTag> DeallocSentTag;
+  static std::unique_ptr<CheckerProgramPointTag> CastFailTag;
 
   /// Track Objective-C and CoreFoundation objects.
   bool TrackObjCAndCFObjects = false;
@@ -360,13 +360,11 @@ class RetainCountChecker
                              CheckerContext &Ctx,
                              ExplodedNode *Pred = nullptr) const;
 
-  const CheckerProgramPointTag &getDeallocSentTag() const {
-    return DeallocSentTag;
+  static const CheckerProgramPointTag &getDeallocSentTag() {
+    return *DeallocSentTag;
   }
 
-  const CheckerProgramPointTag &getCastFailTag() const {
-    return CastFailTag;
-  }
+  static const CheckerProgramPointTag &getCastFailTag() { return *CastFailTag; }
 
 private:
   /// Perform the necessary checks and state adjustments at the end of the
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp
index cfad47626354a..1d8ed90f7590c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp
@@ -18,7 +18,7 @@ using namespace clang;
 using namespace ento;
 using namespace retaincountchecker;
 
-StringRef RefCountBug::bugTypeToName(RefCountBug::RefCountBugType BT) {
+StringRef RefCountBug::bugTypeToName(RefCountBug::RefCountBugKind BT) {
   switch (BT) {
   case UseAfterRelease:
     return "Use-after-release";
@@ -37,7 +37,7 @@ StringRef RefCountBug::bugTypeToName(RefCountBug::RefCountBugType BT) {
   case LeakAtReturn:
     return "Leak of returned object";
   }
-  llvm_unreachable("Unknown RefCountBugType");
+  llvm_unreachable("Unknown RefCountBugKind");
 }
 
 StringRef RefCountBug::getDescription() const {
@@ -60,13 +60,14 @@ StringRef RefCountBug::getDescription() const {
   case LeakAtReturn:
     return "";
   }
-  llvm_unreachable("Unknown RefCountBugType");
+  llvm_unreachable("Unknown RefCountBugKind");
 }
 
-RefCountBug::RefCountBug(const CheckerBase *Checker, RefCountBugType BT)
+RefCountBug::RefCountBug(CheckerNameRef Checker, RefCountBugKind BT)
     : BugType(Checker, bugTypeToName(BT), categories::MemoryRefCount,
-              /*SuppressOnSink=*/BT == LeakWithinFunction || BT == LeakAtReturn),
-      BT(BT), Checker(Checker) {}
+              /*SuppressOnSink=*/BT == LeakWithinFunction ||
+                  BT == LeakAtReturn),
+      BT(BT) {}
 
 static bool isNumericLiteralExpression(const Expr *E) {
   // FIXME: This set of cases was copied from SemaExprObjC.
@@ -453,8 +454,6 @@ RefCountReportVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BRC,
                                  PathSensitiveBugReport &BR) {
 
   const auto &BT = static_cast<const RefCountBug&>(BR.getBugType());
-  const auto *Checker =
-      static_cast<const RetainCountChecker *>(BT.getChecker());
 
   bool IsFreeUnowned = BT.getBugType() == RefCountBug::FreeNotOwned ||
                        BT.getBugType() == RefCountBug::DeallocNotOwned;
@@ -545,11 +544,11 @@ RefCountReportVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BRC,
 
   const ProgramPointTag *Tag = N->getLocation().getTag();
 
-  if (Tag == &Checker->getCastFailTag()) {
+  if (Tag == &RetainCountChecker::getCastFailTag()) {
     os << "Assuming dynamic cast returns null due to type mismatch";
   }
 
-  if (Tag == &Checker->getDeallocSentTag()) {
+  if (Tag == &RetainCountChecker::getDeallocSentTag()) {
     // We only have summaries attached to nodes after evaluating CallExpr and
     // ObjCMessageExprs.
     const Stmt *S = N->getLocation().castAs<StmtPoint>().getStmt();
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h
index e9e2777540548..286a8ae2ef7d7 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.h
@@ -26,7 +26,7 @@ namespace retaincountchecker {
 
 class RefCountBug : public BugType {
 public:
-  enum RefCountBugType {
+  enum RefCountBugKind {
     UseAfterRelease,
     ReleaseNotOwned,
     DeallocNotOwned,
@@ -36,21 +36,14 @@ class RefCountBug : public BugType {
     LeakWithinFunction,
     LeakAtReturn,
   };
-  RefCountBug(const CheckerBase *checker, RefCountBugType BT);
+  RefCountBug(CheckerNameRef Checker, RefCountBugKind BT);
   StringRef getDescription() const;
 
-  RefCountBugType getBugType() const {
-    return BT;
-  }
-
-  const CheckerBase *getChecker() const {
-    return Checker;
-  }
+  RefCountBugKind getBugType() const { return BT; }
 
 private:
-  RefCountBugType BT;
-  const CheckerBase *Checker;
-  static StringRef bugTypeToName(RefCountBugType BT);
+  RefCountBugKind BT;
+  static StringRef bugTypeToName(RefCountBugKind BT);
 };
 
 class RefCountReport : public PathSensitiveBugReport {
diff --git a/clang/test/Analysis/Inputs/expected-plists/edges-new.mm.plist b/clang/test/Analysis/Inputs/expected-plists/edges-new.mm.plist
index b949e20ebbe86..74e11075fe3d4 100644
--- a/clang/test/Analysis/Inputs/expected-plists/edges-new.mm.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/edges-new.mm.plist
@@ -2119,9 +2119,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b2b15a95787e594ff79f02c600e9d357</string>
+   <key>issue_hash_content_of_line_in_context</key><string>29a10ca4af622b6146ca082e49d919d6</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar8331641</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -11612,9 +11612,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;foo&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ef342aeb2f2719117ddd4ef1b72f5ba7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>f533db5cbb9c20d171f9f92105789dc4</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>test2</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -21954,9 +21954,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;foo&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>f81f51dd154d0a11cab412a1cd1cd095</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5616a7601faa1a8c2ac56fa1b595b172</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>longLines</string>
   <key>issue_hash_function_offset</key><string>1</string>
diff --git a/clang/test/Analysis/Inputs/expected-plists/objc-arc.m.plist b/clang/test/Analysis/Inputs/expected-plists/objc-arc.m.plist
index 574575b6d25a3..d3a1a5c6c47fd 100644
--- a/clang/test/Analysis/Inputs/expected-plists/objc-arc.m.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/objc-arc.m.plist
@@ -312,9 +312,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>7bd4a6e187407677b2d9e717576818bf</string>
+   <key>issue_hash_content_of_line_in_context</key><string>61d185b2522d15fb327f6784e0217adf</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_cf_leak</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -843,9 +843,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;obj5&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0aed4f65cb3dba7331f9319fd1ceb003</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5baa7d5f38420d0a035aa61607675f3e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>from_cf</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -989,9 +989,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;obj6&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0851961d40a4c8331ebe713f4a3e05f4</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4665e04694fd55e7c4ed7a67860b3b74</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>from_cf</string>
   <key>issue_hash_function_offset</key><string>8</string>
@@ -1423,9 +1423,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>00045bff3b7c26fe7cb80a71f512575c</string>
+   <key>issue_hash_content_of_line_in_context</key><string>798e65f80df0526369f9bb240e3d91fd</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_unretainedObject</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -1734,9 +1734,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFStringRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>9f258122568ea8763047e98db8a52647</string>
+   <key>issue_hash_content_of_line_in_context</key><string>e1fbcc142b678b3c2c43737ee35b64d9</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>24</string>
@@ -1928,9 +1928,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;o&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>8187b0ba5cadd42594120fe05d871502</string>
+   <key>issue_hash_content_of_line_in_context</key><string>e300a279615a384d2b310329651d3978</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar11059275_positive</string>
   <key>issue_hash_function_offset</key><string>1</string>
diff --git a/clang/test/Analysis/Inputs/expected-plists/objc-radar17039661.m.plist b/clang/test/Analysis/Inputs/expected-plists/objc-radar17039661.m.plist
index 3c87e3909bec5..23bd69851c0be 100644
--- a/clang/test/Analysis/Inputs/expected-plists/objc-radar17039661.m.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/objc-radar17039661.m.plist
@@ -1329,9 +1329,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSNumber *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>c204ce6cce660a7714c801bdf9183431</string>
+   <key>issue_hash_content_of_line_in_context</key><string>500e2bbda41c8086771ad98b6bcfdc50</string>
   <key>location</key>
   <dict>
    <key>line</key><integer>53</integer>
diff --git a/clang/test/Analysis/Inputs/expected-plists/plist-output-alternate.m.plist b/clang/test/Analysis/Inputs/expected-plists/plist-output-alternate.m.plist
index 53bc4cb66ef91..1c8d962100c1f 100644
--- a/clang/test/Analysis/Inputs/expected-plists/plist-output-alternate.m.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/plist-output-alternate.m.plist
@@ -1485,9 +1485,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b2b15a95787e594ff79f02c600e9d357</string>
+   <key>issue_hash_content_of_line_in_context</key><string>29a10ca4af622b6146ca082e49d919d6</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar8331641</string>
   <key>issue_hash_function_offset</key><string>2</string>
diff --git a/clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist b/clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist
index 9203e48c46835..76fec546267cd 100644
--- a/clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/plist-output.m.plist
@@ -2372,9 +2372,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;foo&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ef342aeb2f2719117ddd4ef1b72f5ba7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>f533db5cbb9c20d171f9f92105789dc4</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>test2</string>
   <key>issue_hash_function_offset</key><string>2</string>
diff --git a/clang/test/Analysis/Inputs/expected-plists/retain-release-path-notes.m.plist b/clang/test/Analysis/Inputs/expected-plists/retain-release-path-notes.m.plist
index 2d67e6e34e123..71ccd79bf3a7b 100644
--- a/clang/test/Analysis/Inputs/expected-plists/retain-release-path-notes.m.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/retain-release-path-notes.m.plist
@@ -104,9 +104,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;leaked&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>fc2476fe550128eebe2a0a8fa4299a59</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d21e9660cc6434ef84a51f39ffcdce86</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>creationViaAlloc</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -225,9 +225,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;leaked&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>31ad4a19f94c8994ebf7e887ed4ab840</string>
+   <key>issue_hash_content_of_line_in_context</key><string>f8ec2601a04113e567aa1d09c9902c91</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>creationViaCFCreate</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -571,9 +571,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;leaked&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1b654ea7bbef1493beda9e0a667dd859</string>
+   <key>issue_hash_content_of_line_in_context</key><string>dd26a8ad9a7a057feaa636974b43ccb0</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>acquisitionViaMethod</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -770,9 +770,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;leaked&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3fc42b0b859923347e789ad601d29b2a</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2f2de5d7fe728958585598b619069e5a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>acquisitionViaProperty</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -967,9 +967,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;leaked&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0b4d42c9cc01d55bc281c067f1cc1c3d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1c02b65e83dad1b22270ff5a71de3118</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>acquisitionViaCFFunction</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -1164,9 +1164,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>baa3d5ecb7824a6997e0734ad148ec55</string>
+   <key>issue_hash_content_of_line_in_context</key><string>03c23f0f82d7f2fd880a22e0d9cf14b9</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>explicitDealloc</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -1361,9 +1361,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ce73a05e0a1055b4b451f5015edbd6ec</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6f1b3f0c6c7f79f1af9b313273a01e92</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>implicitDealloc</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -1633,9 +1633,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b8cbd4dae812cd8d8faaf3b48dad2021</string>
+   <key>issue_hash_content_of_line_in_context</key><string>cb5e4205a8f925230a70715914a2e3d2</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>overAutorelease</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -1831,9 +1831,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ee96f7e22e32b24d677efa45b2395915</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1edd178e5ad76c79ce9812f519e8f467</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>autoreleaseUnowned</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -1953,9 +1953,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;leaked&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>12887d3520c4c9fd03995feeb69967ec</string>
+   <key>issue_hash_content_of_line_in_context</key><string>3f08690fae9687c29bb23b7a7cb7995b</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>makeCollectableIgnored</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -2076,9 +2076,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>d715154641c7b248d401df12c1ce0808</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4b621ab5f8f2ef9240699119f4d874cb</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>CFCopyRuleViolation</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -2197,9 +2197,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;object&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>58d56f1d5982f5923ab07900852ea30c</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5248d2310322982d02e5f3d564249b4f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>CFGetRuleViolation</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -2318,9 +2318,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>cc20c23c14b2363ca453c24ede3bc38d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4f23ad2725fb68134cec8b8354cd295c</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>copyViolation</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -2439,9 +2439,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>4eefa164042de89f947573c1df2fce03</string>
+   <key>issue_hash_content_of_line_in_context</key><string>da1dab126ed46b144040160ae8628460</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>copyViolationIndexedSubscript</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -2560,9 +2560,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e8ad4d8a073872a91d2b0225319cd521</string>
+   <key>issue_hash_content_of_line_in_context</key><string>52877f9471b1ecdaf213b39016b84e52</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>copyViolationKeyedSubscript</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -2681,9 +2681,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;result&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>f858bd7c1720b43bd464bbec97a1cb6b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>cf8c65a18ad9982cb9848a266cd9c61b</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>getViolation</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -2877,9 +2877,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>4da16a9c4c9d9587418f276359c5f098</string>
+   <key>issue_hash_content_of_line_in_context</key><string>e7b798151545b45a994592df0d27d250</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>copyAutorelease</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -2999,9 +2999,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>18ba6f4fe59b182bee196c1a976e3aa2</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4e0c810e2b301aca3f636ad7e3d6b0b8</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testNumericLiteral</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -3120,9 +3120,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ac4375d1ab6887c27055ee00b20a212e</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1d054002016aa4360aaf23a4c4d8fbb7</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testBoxedInt</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -3241,9 +3241,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>cd2f260edad8ce1826b21acc49cba277</string>
+   <key>issue_hash_content_of_line_in_context</key><string>67ca92144b05322ee4569aea88d08595</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testBoxedString</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -3362,9 +3362,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e60765ef00b3af982aacd5471a2cdb21</string>
+   <key>issue_hash_content_of_line_in_context</key><string>32fcec71872b8f62d8d7b1b05284b0fe</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testArray</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -3483,9 +3483,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>42da4f0388822b235ed56427f2e1ac1b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d9584825bb1e62066879949e3ade8570</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testDictionary</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -3841,9 +3841,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;MyObj *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b5589615cea2321192e477d2011edf09</string>
+   <key>issue_hash_content_of_line_in_context</key><string>eef2aef4b58abf21fcfa4bbf69e19c02</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>test</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -4240,9 +4240,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;y&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b319657460942b0e8deafb79876d5479</string>
+   <key>issue_hash_content_of_line_in_context</key><string>8c27524f691296551f9e52856b824326</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>test</string>
   <key>issue_hash_function_offset</key><string>8</string>
@@ -4518,9 +4518,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>8e06af66dd0b414c095c951ac1f2cc68</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4fc36e73ba317d307dc9cc4b3d62fd0a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>CFOverAutorelease</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -4716,9 +4716,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>06eeb988e43f885cb575eba46e7ccf8f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>08e6a3931d34cda45c09dfda76976e17</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>CFAutoreleaseUnowned</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -4988,9 +4988,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e1b335bbbaad2a9c427e681a6fac6562</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d9bb23a5435fe15df9d7ffdc27a8a072</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>CFAutoreleaseUnownedMixed</string>
   <key>issue_hash_function_offset</key><string>4</string>
diff --git a/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objc.plist b/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objc.plist
index 74e8dd606a2d9..8b5ab23df9ed6 100644
--- a/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objc.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objc.plist
@@ -397,9 +397,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1089a297e77ff0c9d2d55cfb3aae26d3</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5928b2a4699cbae0686391c20e639007</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f1</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -816,9 +816,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>bb12c99d56657635b20d4a0801590eed</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6b2e175938153ac041f52ebbf50b1f43</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f2</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -1107,9 +1107,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0e9bb151f425535a0ec1b0bf0574dd7d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>3fdbd844ddb925306ba2bb1b3626f310</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f5</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -1305,9 +1305,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ad4b758c93bbe7feeee349a526293527</string>
+   <key>issue_hash_content_of_line_in_context</key><string>8529da75e357c59fb0a7fefb0b6e0952</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f6</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -1502,9 +1502,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2a319c210c1c5b4274e3f28931ead03b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>eb0faa12081b1e28b218e4c6e53d57ec</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f7</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -1659,9 +1659,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2c347e0a0af508867a6d854a3fc8f690</string>
+   <key>issue_hash_content_of_line_in_context</key><string>404d4de8faa444bc52fd510380bd0a63</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f7</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -1857,9 +1857,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0be746eb38e868156f7f57ea95735f4e</string>
+   <key>issue_hash_content_of_line_in_context</key><string>251dff6727b3d99ec95caa28672669ea</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f8</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -2562,9 +2562,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;disk&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3e83186b5b944ef7a3ec026d469d5ad7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>69ae08a90fe52a921ed423df38ed7480</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -3045,9 +3045,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dict&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ffc6479dc21fc10cdb83b4392685ed36</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a7f8c63b1cdc39df79b7457e27ff4930</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -3660,9 +3660,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;disk&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1c06fc99a1d078653ae8e4fe308e09cd</string>
+   <key>issue_hash_content_of_line_in_context</key><string>cace8e35bed93ecdfa0455ac166aaa97</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>10</string>
@@ -4345,9 +4345,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;disk&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>460f099c6ae21a4b3ae818c9f65df2b0</string>
+   <key>issue_hash_content_of_line_in_context</key><string>778f70549a15e78703b4dcb3a287df33</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -5162,9 +5162,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dissenter&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>65004e269b1b5cb5d9b5c6f7a02926e3</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6c188b4716e84cdc55b93d40e6c2daf3</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -6044,9 +6044,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;session&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e9c1be038ef498b7985f5b1ddcb5444f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>35b9ac7ff198890c88d5839a898b7fea</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>17</string>
@@ -6161,9 +6161,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;f&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>9c7c3b2bf298c7d046fd6fc7f6fe688e</string>
+   <key>issue_hash_content_of_line_in_context</key><string>17d84d673b35235b52d8f8f00c1d1eea</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testLeakCoreMediaReferenceType</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -6282,9 +6282,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>69932084739a429d667d8de6de42af0b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1702285448a953b02ab74a8eb9a610d9</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testOverReleaseMediaReferenceType</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -6674,9 +6674,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;buffer&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0f30258c45ed9ecd8646db90eaf20c4a</string>
+   <key>issue_hash_content_of_line_in_context</key><string>402566b4ddf1683dac1aefc1ab3e76e9</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCMBufferQueueDequeueAndRetain</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -6829,9 +6829,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>13e672795c0e57433c642c84f26f6c9b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>143ef5974bfece95e9894da5250aaff0</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f11</string>
   <key>issue_hash_function_offset</key><string>21</string>
@@ -6941,9 +6941,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;o&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>eeff9e133573bdbc1aeb633284cbdb2b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>af4ad99c5fb565d82e1b4848aaca4e24</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f12</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -7197,9 +7197,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>620a4245edc8df18036da34702ca01c8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>58a0b3f8332f42561f89b11f6eb5e91f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f13_autorelease_b</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -7470,9 +7470,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1a87a5f904c165069a731b0325d45edf</string>
+   <key>issue_hash_content_of_line_in_context</key><string>612dc6574d54c8010703a9776d8a4a0a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f13_autorelease_c</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -7777,9 +7777,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6ed645efdfe968f31d4356610bb6dd02</string>
+   <key>issue_hash_content_of_line_in_context</key><string>c57037289bc3acc586de325df25951ed</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f13_autorelease_d</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -7885,9 +7885,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5295be41524e9e28f4b1a608006801fe</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6abb479bc4c7782a125d680fddf825ef</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f14_leakimmediately</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -8891,9 +8891,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;bmap&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2e5affde083280f6d31ed412ac8c2396</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2cfebefee7b63ce3954419e571be4f63</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f18</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -9012,9 +9012,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>fdd0cb02c08c718da2686b6e0f04aad7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>dcd3becc58a149abe6ade5598138d3dd</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>newString</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -9230,9 +9230,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;kind&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>03f39b74e1ccafa9c613ba4bb71de560</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6688c9cb12f0c76ec80eb03b1d2eddf8</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_6659160</string>
   <key>issue_hash_function_offset</key><string>5</string>
@@ -10529,9 +10529,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>c8a4713a734a4f6e747423ef88af6bf8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d04966e9b8e981d8f69bf03823253033</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_6659160</string>
   <key>issue_hash_function_offset</key><string>33</string>
@@ -10737,9 +10737,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>83c7891609f8efb616060d0c6ae6bb43</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1b35183a6aca4df5a8732c8da94e3205</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>pr3820_ReleaseAfterDealloc</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -10969,9 +10969,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>9fe338c720f25b3b1d5a68930d3ae4b8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>54f2bd1534fa675b58c4f8eef3120373</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>pr3820_DeallocAfterRelease</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -11221,9 +11221,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dict&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>df3400f53fc437aede21f685ca1955d4</string>
+   <key>issue_hash_content_of_line_in_context</key><string>055e6f3413539276fedeac241fccd9b8</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>applicationDidFinishLaunching:</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -11535,9 +11535,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dict&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5104ca579763af0f8c66da3fdc42b95f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>444f6019b048a95dd71c6be49ecb73ff</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>radar10102244</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -11691,9 +11691,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>a4a85a3991cb3888217d5c62346107dc</string>
+   <key>issue_hash_content_of_line_in_context</key><string>641de26edd3d85ca241de577afbcda86</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_6257780_Case1</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -11847,9 +11847,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;RDar6320065Subclass *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>75b7ad344b1d4665d918188bd10429df</string>
+   <key>issue_hash_content_of_line_in_context</key><string>8e8ae80fd006f27a952f77494bd1c05f</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>_initReturningNewClassBad</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -12044,9 +12044,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>791e285d27d610c4c016065dd5addd37</string>
+   <key>issue_hash_content_of_line_in_context</key><string>625e26ef3ae9de238f30175e4e9f4937</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>initReturningNewClassBad2</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -12132,9 +12132,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>58cf9e4228ab9cbe375ddf37d04d45f1</string>
+   <key>issue_hash_content_of_line_in_context</key><string>666dce676597e2cfa3199521864f7b96</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>NoCopyString</string>
   <key>issue_hash_function_offset</key><string>0</string>
@@ -12217,9 +12217,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e1b0176b31382e7e75129dd78883c91b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>31104cdb408dbc3faf693a5c31973486</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>noCopyString</string>
   <key>issue_hash_function_offset</key><string>0</string>
@@ -12442,9 +12442,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5ff4d17e82026ccd84121b0a361fc135</string>
+   <key>issue_hash_content_of_line_in_context</key><string>909638940b4d7020f51062089653b231</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_RDar6859457</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -12704,9 +12704,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>964683651b544d6c1cce0c4ae6961936</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2a37743e32cfa0a86958fed215c30e87</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_RDar6859457</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -12794,9 +12794,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ca046c4c96c27a0e8c84dd707563bba9</string>
+   <key>issue_hash_content_of_line_in_context</key><string>20b25f0ba6268e055d8491c67c6a26bd</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>:</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -12914,9 +12914,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;id&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>12515c1f2d3343496d32a54ef376347d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>706b9d732ece93a88487dbbf0b82fd23</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13071,9 +13071,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;id&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e10d7d441805b9f66c118bfeccf32f29</string>
+   <key>issue_hash_content_of_line_in_context</key><string>631eebb0c921191c24734f98fe93f6bf</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -13229,9 +13229,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGImageRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3ae54947ad02e14773ac126982de301d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ee36a48521a32c183a086066d3c5ae1f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -13373,9 +13373,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGImageRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6dba0d2672617f7eb2c512129fb17bb3</string>
+   <key>issue_hash_content_of_line_in_context</key><string>70a2dd4ee6b6f7caad87a46dc6dd3580</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -13484,9 +13484,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGLayerRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b065641c4257dac33ff15b08859d09e2</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a82448687d1cbf5cb517914dbe6de4fe</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6945561</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13590,9 +13590,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>7cbb4f547b5c1fb1a456ecc47f27d853</string>
+   <key>issue_hash_content_of_line_in_context</key><string>540e0145994c1e14ea750fe91a497855</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOBSDNameMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13696,9 +13696,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0b329ce97e1baf94f89590888a4af794</string>
+   <key>issue_hash_content_of_line_in_context</key><string>99d7012d797e181ef8e9a289ee9099eb</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13802,9 +13802,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e207241fbe4666cffeeca3f47966425f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5d956e58f05bcc1b67ff65e02cbba302</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceNameMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13998,9 +13998,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ae61d11111bc6c9f049a5ca8935b7bae</string>
+   <key>issue_hash_content_of_line_in_context</key><string>84a53bfb58a3a929535b47e28b997382</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceAddNotification_wrapper</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -14107,9 +14107,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>62fc802833a96d44d2fa008826c46c64</string>
+   <key>issue_hash_content_of_line_in_context</key><string>36337ff486f6a8b702e68d13393bc975</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IORegistryEntryIDMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -14213,9 +14213,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>644a1e5f3d844a5d9b140de26e6e5645</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ee83ca968ddc2ecad7ae4318ce7d1d95</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOOpenFirmwarePathMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -14410,9 +14410,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>904a99d378144e5aa011649cec493695</string>
+   <key>issue_hash_content_of_line_in_context</key><string>e8c08b2b3d53f5890907888e16927805</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceGetMatchingService_wrapper</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -14607,9 +14607,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>23c94c459003beb49ea078f75a86ccc5</string>
+   <key>issue_hash_content_of_line_in_context</key><string>31664b5acc7980da73f5545fb16b0910</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceGetMatchingServices_wrapper</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -14804,9 +14804,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>06e6fa1f7f96818fbd619dfe8b210b0d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6edae46016a9671e2d5400b100d5efb5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceAddMatchingNotification_wrapper</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -15111,9 +15111,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1692047c1a2ab283584ae01c84e3ae35</string>
+   <key>issue_hash_content_of_line_in_context</key><string>dcec4e2bd254a3c24e84e598b5a827bf</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7152619</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -15311,9 +15311,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGColorSpaceRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>17e5c3184216ca3aef86288dc1f41d8d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>9317a6bf07dd10dc988f2415cc2c4ef7</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7184450</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -15511,9 +15511,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGColorSpaceRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>c2225660bdec84d2ae183eda303a1abb</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ec3e6216b279aa48d8403c6aab30d996</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7184450_pos</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -15729,9 +15729,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;myGradient&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6415d6b7dd7d48a2ef27f4c4d0168c64</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4b3d6bb6b8dc5c51b7dfa8554b24eb66</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7184450_pos</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -15848,9 +15848,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>08a69979bb4fa932512da1327fbf3b23</string>
+   <key>issue_hash_content_of_line_in_context</key><string>42a83016e862ec323e24920873073a5a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7299394_positive</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -15988,9 +15988,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGContextRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>32b76a1b35c681cad8093c7e79e36388</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a416473fed3a9dbc6bfee885bee38216</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7358899</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -16099,9 +16099,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;y&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>7e6172f0b4b6af27712153519e1934e1</string>
+   <key>issue_hash_content_of_line_in_context</key><string>980dd45e9cf6581dbc2be9ebfc500b7f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar7265711_a</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -16239,9 +16239,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5eb97f906bb3af4befe63c891484f791</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ebf51fb2b16499cf3a5c57d251a91061</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar7306898</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -16682,9 +16682,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6b9b51ce7b68ca0ba6a85e8924601a96</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1174ccc2a30887ebf80fe25fc6722b1a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr_1</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -16788,9 +16788,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>eb040d5ec198d092ec9894af4dce6af8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ce9963dd1c85ac22cea4e4fef615354e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr_1b</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -16977,9 +16977,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str2&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>21b45a41bb0c3c70a0efe89359ff3385</string>
+   <key>issue_hash_content_of_line_in_context</key><string>0183088266857082f35eb17f1377fd69</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr1c</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -17227,9 +17227,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str4&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>60396abae77bacd747ea9081b63a32db</string>
+   <key>issue_hash_content_of_line_in_context</key><string>352a17ef8eddd3aa5f7f6e74a74a4df3</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr1c</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -17336,9 +17336,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e258a710e07550a3dc5f47361a7380e1</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d0e564404585060990202acb33f0bb1e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testattr2_a</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -17442,9 +17442,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>dc245145c78c3421392a20775cdd6f23</string>
+   <key>issue_hash_content_of_line_in_context</key><string>567dfcbc22471ca4ba9f2fccd9ff14fb</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testattr2_b</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -17582,9 +17582,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>77b970319b12b0c189e46ad65fa848c7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>83cd2670977d513443836653fee8147b</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testattr2_b_11358224_self_assign_looses_the_leak</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -17670,9 +17670,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>4a8d774d2b821ce1601df7edabf66097</string>
+   <key>issue_hash_content_of_line_in_context</key><string>f83246e7e738918426df1adc915f4eca</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>newString</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18179,9 +18179,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2a609b8807dab6d3cb1a1db524094f2f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5f233261d96f1d461af36fc3e0efc8eb</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>newCFRetainedAsCFNoAttr</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18444,9 +18444,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFDateRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>944f189da47b1406f9cca6f17ad9f77c</string>
+   <key>issue_hash_content_of_line_in_context</key><string>7ee55b74b5ee01c6ffa2a3d83c8cf88b</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>alsoReturnsRetained</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18707,9 +18707,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFDateRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>30ebf65449c31336f8a97555d79f1943</string>
+   <key>issue_hash_content_of_line_in_context</key><string>177b2cf7eb3d8334393ee0861f5a38ac</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>alsoReturnsRetainedAsCF</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18849,9 +18849,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2ab1a2345ddfa1fd48777c7c179d4e33</string>
+   <key>issue_hash_content_of_line_in_context</key><string>85e9d8130a1f1ec37f0ba26746abd749</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_panic_negative</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -19087,9 +19087,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>f96bb4f5c1af6cf932d7ab58b678c235</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4a0b16976e0517b38b2ccc16e2928c2e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_panic_neg_2</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -19210,9 +19210,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>14182fb28ed03595f896c2f8536ac111</string>
+   <key>issue_hash_content_of_line_in_context</key><string>af73d9c62952a300a7c393ebd5073f75</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_blocks_1_pos</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -19497,9 +19497,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>dbf800f836ff675d2f779f7417877c1b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>771b2a332053388ffbdd9ba74ea84c5e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_blocks_1_indirect_retain_via_call</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -19895,9 +19895,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;info&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>64424de797303506a3dfdb52fa765645</string>
+   <key>issue_hash_content_of_line_in_context</key><string>39f8c30f7436f678d5259c0fdd3a0dad</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_8724287</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -19988,9 +19988,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>7b7fc0c36e58713202141cb584150903</string>
+   <key>issue_hash_content_of_line_in_context</key><string>107e3efdeb8cdff4bef4c64183c4f6fa</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camelcase_createno</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20074,9 +20074,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>32912dd9518de1b3f4cc8ba38368f7e6</string>
+   <key>issue_hash_content_of_line_in_context</key><string>20c973a013858abb0a926276c956f858</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camelcase_copying</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20160,9 +20160,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1dccc42846a9ef9bf1a1830e277d5b78</string>
+   <key>issue_hash_content_of_line_in_context</key><string>80ee99e51561a37297429740e3a4da0c</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camel_creat</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20246,9 +20246,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2a0ba33097f6e9362a79689e2ac0cf4a</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a4e28a04f6a8d87c8aaf4d71c37cac0f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camel_copymachine</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20385,9 +20385,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;vals&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>43f6c1be372d09a4a4cffaefa69d0148</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6b727a438d8411c058fd32867b9402bc</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6582778</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -20650,9 +20650,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ebe7e868c0075bfa7480e3359e4fbce8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>b39dcf9df7cec8dd73cbbe25b2a7d6c5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar10232019_positive</string>
   <key>issue_hash_function_offset</key><string>6</string>
@@ -20807,9 +20807,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>507c3679ae27249e01844b7555843688</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a501f743b22f1feb5dc317fcad4f7556</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -21033,9 +21033,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a2&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>821f8268a0b7d3f90e4dd88fa1edf39b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a141a6ad33e8ff2ae3b13da0ad36ebc5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>12</string>
@@ -21442,9 +21442,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a3&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>37b00e6e0e6b792ea3294a9ffd6f4886</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2b072d75e8da8e3fe8f7968a85efb37c</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>20</string>
@@ -21815,9 +21815,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>62fc5b80705a03ab1d8b50bdcfbfb179</string>
+   <key>issue_hash_content_of_line_in_context</key><string>0bfdfb7e392626e0fccc6ab9f58f1ca8</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>28</string>
@@ -22370,9 +22370,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3eee239ca30a84ef6ecc5d154ae8df28</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ff7c34e661a42d06a7fb3e9669e70339</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>37</string>
@@ -22643,9 +22643,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>cb86fdadd2217db6b784b37dc29eba34</string>
+   <key>issue_hash_content_of_line_in_context</key><string>73e84c042932d2e17e00f00dc3d36d5a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_integer_literals</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -22874,9 +22874,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>4ad9235c4885452c3034fef815598a63</string>
+   <key>issue_hash_content_of_line_in_context</key><string>465e592d4f7a187717d00b8154a614b5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_boxed_expressions</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -23159,9 +23159,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>9d3a52ee2efe90fef76f91f143f0d9e7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>c701bd0c60f51d96c047aa78c9e0eb99</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_boxed_expressions</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -23523,9 +23523,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0aad7b0550b51ebc0a2323c482d8eefd</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a4cedbb647e9632da7a5072cb839e54a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar11400885</string>
   <key>issue_hash_function_offset</key><string>9</string>
@@ -23683,9 +23683,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3b63deb8c998b2d73dd63da9f89672bb</string>
+   <key>issue_hash_content_of_line_in_context</key><string>fd9427d86a2357fd92478c9c7abbc1f4</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testConsumeAndStopTracking</string>
   <key>issue_hash_function_offset</key><string>10</string>
@@ -23842,9 +23842,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>a4fe04db2f5fa1aa2b6d8d18ccb5dd02</string>
+   <key>issue_hash_content_of_line_in_context</key><string>0e65e51476e5671dcd37f632806e5147</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCFConsumeAndStopTracking</string>
   <key>issue_hash_function_offset</key><string>10</string>
@@ -23952,9 +23952,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>55f656da79f1b87a4b5618167f68c233</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a0ba9c47505e923763ea5323ad2f71b7</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_custom_cf</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24058,9 +24058,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;obj&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>a7b4693fabae95c6b2091c7816fb2358</string>
+   <key>issue_hash_content_of_line_in_context</key><string>7a6cf8cb3c5e0ca3125d7e27695a810a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCustomReturnsRetained</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24145,9 +24145,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>51de919c9df9dec2d383d050bf73d2d8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>810fce32373fe40ba8e2d0894d46f667</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCustomReturnsNotRetained</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24502,9 +24502,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;MyObj12706177 *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>d8890e44d330279fd91ce8fdb35d7c81</string>
+   <key>issue_hash_content_of_line_in_context</key><string>68ee7961ffb62c575cc2298cb4836090</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>test12706177</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24734,9 +24734,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>d4c839aab11cc39188d1054f3270d67f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1dc376fbbe90d14b6766585a0e2b7bee</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>getIncorrectlyAutoreleasedCFType</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -24963,9 +24963,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>d2d9e8a977772482263591670a124c5d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6ae8ea9fe4bf203e6b7bfaf649a6ca6a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>createIncorrectlyAutoreleasedCFType</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -25158,9 +25158,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>c483bb676bdbea00f7e99b3617b4b6e2</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d4e28f96fc8610b5b4b849f4760956eb</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>useAfterRelease</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -25415,9 +25415,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;obj&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5bbb9b1720912f3fd2c67b3332de793b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>7986c4b7fb29301c109343dfe4155202</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testAutoreleaseReturnsInput</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -25673,9 +25673,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;arr&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ea7d6978bcb6da71c23b4bb6fef51a87</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2e0dbfdf379acf2f09e46db47d753e8a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>autoreleaseReturningTypedObject</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -25890,9 +25890,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1f4f3ca2f399a94e54304b4a0dcb1e85</string>
+   <key>issue_hash_content_of_line_in_context</key><string>41a2d6f91fdfa9b5f396102a60571e21</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>autoreleaseObjC</string>
   <key>issue_hash_function_offset</key><string>6</string>
@@ -26048,9 +26048,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ced44137127627330194b72c97aef162</string>
+   <key>issue_hash_content_of_line_in_context</key><string>95dd5581ae4195b71e9a11f34290af5d</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCFReturnsNotRetained</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -26204,9 +26204,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e7615a640885cbd55bc856bfc07d7123</string>
+   <key>issue_hash_content_of_line_in_context</key><string>014103674df4a8a65a96bcdf936637a2</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCFReturnsNotRetainedAnnotated</string>
   <key>issue_hash_function_offset</key><string>4</string>
diff --git a/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objcpp.plist b/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objcpp.plist
index 79145156f2649..d797626af86df 100644
--- a/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objcpp.plist
+++ b/clang/test/Analysis/Inputs/expected-plists/retain-release.m.objcpp.plist
@@ -397,9 +397,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1089a297e77ff0c9d2d55cfb3aae26d3</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5928b2a4699cbae0686391c20e639007</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f1</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -816,9 +816,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>bb12c99d56657635b20d4a0801590eed</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6b2e175938153ac041f52ebbf50b1f43</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f2</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -1107,9 +1107,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0e9bb151f425535a0ec1b0bf0574dd7d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>3fdbd844ddb925306ba2bb1b3626f310</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f5</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -1305,9 +1305,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ad4b758c93bbe7feeee349a526293527</string>
+   <key>issue_hash_content_of_line_in_context</key><string>8529da75e357c59fb0a7fefb0b6e0952</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f6</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -1502,9 +1502,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2a319c210c1c5b4274e3f28931ead03b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>eb0faa12081b1e28b218e4c6e53d57ec</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f7</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -1659,9 +1659,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2c347e0a0af508867a6d854a3fc8f690</string>
+   <key>issue_hash_content_of_line_in_context</key><string>404d4de8faa444bc52fd510380bd0a63</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f7</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -1857,9 +1857,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;date&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0be746eb38e868156f7f57ea95735f4e</string>
+   <key>issue_hash_content_of_line_in_context</key><string>251dff6727b3d99ec95caa28672669ea</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f8</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -2562,9 +2562,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;disk&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3e83186b5b944ef7a3ec026d469d5ad7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>69ae08a90fe52a921ed423df38ed7480</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -3045,9 +3045,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dict&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ffc6479dc21fc10cdb83b4392685ed36</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a7f8c63b1cdc39df79b7457e27ff4930</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -3660,9 +3660,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;disk&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1c06fc99a1d078653ae8e4fe308e09cd</string>
+   <key>issue_hash_content_of_line_in_context</key><string>cace8e35bed93ecdfa0455ac166aaa97</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>10</string>
@@ -4345,9 +4345,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;disk&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>460f099c6ae21a4b3ae818c9f65df2b0</string>
+   <key>issue_hash_content_of_line_in_context</key><string>778f70549a15e78703b4dcb3a287df33</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -5162,9 +5162,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dissenter&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>65004e269b1b5cb5d9b5c6f7a02926e3</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6c188b4716e84cdc55b93d40e6c2daf3</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -6044,9 +6044,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;session&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e9c1be038ef498b7985f5b1ddcb5444f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>35b9ac7ff198890c88d5839a898b7fea</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f10</string>
   <key>issue_hash_function_offset</key><string>17</string>
@@ -6161,9 +6161,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;f&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>9c7c3b2bf298c7d046fd6fc7f6fe688e</string>
+   <key>issue_hash_content_of_line_in_context</key><string>17d84d673b35235b52d8f8f00c1d1eea</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testLeakCoreMediaReferenceType</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -6282,9 +6282,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>69932084739a429d667d8de6de42af0b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1702285448a953b02ab74a8eb9a610d9</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testOverReleaseMediaReferenceType</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -6674,9 +6674,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;buffer&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0f30258c45ed9ecd8646db90eaf20c4a</string>
+   <key>issue_hash_content_of_line_in_context</key><string>402566b4ddf1683dac1aefc1ab3e76e9</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCMBufferQueueDequeueAndRetain</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -6829,9 +6829,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>13e672795c0e57433c642c84f26f6c9b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>143ef5974bfece95e9894da5250aaff0</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f11</string>
   <key>issue_hash_function_offset</key><string>21</string>
@@ -6941,9 +6941,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;o&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>eeff9e133573bdbc1aeb633284cbdb2b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>af4ad99c5fb565d82e1b4848aaca4e24</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f12</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -7197,9 +7197,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>620a4245edc8df18036da34702ca01c8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>58a0b3f8332f42561f89b11f6eb5e91f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f13_autorelease_b</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -7470,9 +7470,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1a87a5f904c165069a731b0325d45edf</string>
+   <key>issue_hash_content_of_line_in_context</key><string>612dc6574d54c8010703a9776d8a4a0a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f13_autorelease_c</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -7777,9 +7777,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6ed645efdfe968f31d4356610bb6dd02</string>
+   <key>issue_hash_content_of_line_in_context</key><string>c57037289bc3acc586de325df25951ed</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f13_autorelease_d</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -7885,9 +7885,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5295be41524e9e28f4b1a608006801fe</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6abb479bc4c7782a125d680fddf825ef</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f14_leakimmediately</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -8891,9 +8891,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;bmap&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2e5affde083280f6d31ed412ac8c2396</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2cfebefee7b63ce3954419e571be4f63</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>f18</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -9012,9 +9012,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>fdd0cb02c08c718da2686b6e0f04aad7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>dcd3becc58a149abe6ade5598138d3dd</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>newString</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -9230,9 +9230,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;kind&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>03f39b74e1ccafa9c613ba4bb71de560</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6688c9cb12f0c76ec80eb03b1d2eddf8</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_6659160</string>
   <key>issue_hash_function_offset</key><string>5</string>
@@ -10529,9 +10529,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>c8a4713a734a4f6e747423ef88af6bf8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d04966e9b8e981d8f69bf03823253033</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_6659160</string>
   <key>issue_hash_function_offset</key><string>33</string>
@@ -10737,9 +10737,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>83c7891609f8efb616060d0c6ae6bb43</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1b35183a6aca4df5a8732c8da94e3205</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>pr3820_ReleaseAfterDealloc</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -10969,9 +10969,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>9fe338c720f25b3b1d5a68930d3ae4b8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>54f2bd1534fa675b58c4f8eef3120373</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>pr3820_DeallocAfterRelease</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -11221,9 +11221,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dict&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>df3400f53fc437aede21f685ca1955d4</string>
+   <key>issue_hash_content_of_line_in_context</key><string>055e6f3413539276fedeac241fccd9b8</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>applicationDidFinishLaunching:</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -11535,9 +11535,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;dict&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5104ca579763af0f8c66da3fdc42b95f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>444f6019b048a95dd71c6be49ecb73ff</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>radar10102244</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -11691,9 +11691,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>a4a85a3991cb3888217d5c62346107dc</string>
+   <key>issue_hash_content_of_line_in_context</key><string>641de26edd3d85ca241de577afbcda86</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_6257780_Case1</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -11847,9 +11847,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;RDar6320065Subclass *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>75b7ad344b1d4665d918188bd10429df</string>
+   <key>issue_hash_content_of_line_in_context</key><string>8e8ae80fd006f27a952f77494bd1c05f</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>_initReturningNewClassBad</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -12044,9 +12044,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>791e285d27d610c4c016065dd5addd37</string>
+   <key>issue_hash_content_of_line_in_context</key><string>625e26ef3ae9de238f30175e4e9f4937</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>initReturningNewClassBad2</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -12132,9 +12132,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>58cf9e4228ab9cbe375ddf37d04d45f1</string>
+   <key>issue_hash_content_of_line_in_context</key><string>666dce676597e2cfa3199521864f7b96</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>NoCopyString</string>
   <key>issue_hash_function_offset</key><string>0</string>
@@ -12217,9 +12217,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e1b0176b31382e7e75129dd78883c91b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>31104cdb408dbc3faf693a5c31973486</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>noCopyString</string>
   <key>issue_hash_function_offset</key><string>0</string>
@@ -12442,9 +12442,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5ff4d17e82026ccd84121b0a361fc135</string>
+   <key>issue_hash_content_of_line_in_context</key><string>909638940b4d7020f51062089653b231</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_RDar6859457</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -12704,9 +12704,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>964683651b544d6c1cce0c4ae6961936</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2a37743e32cfa0a86958fed215c30e87</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_RDar6859457</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -12794,9 +12794,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ca046c4c96c27a0e8c84dd707563bba9</string>
+   <key>issue_hash_content_of_line_in_context</key><string>20b25f0ba6268e055d8491c67c6a26bd</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>:</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -12914,9 +12914,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;id&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>12515c1f2d3343496d32a54ef376347d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>706b9d732ece93a88487dbbf0b82fd23</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13105,9 +13105,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;id&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e10d7d441805b9f66c118bfeccf32f29</string>
+   <key>issue_hash_content_of_line_in_context</key><string>631eebb0c921191c24734f98fe93f6bf</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -13297,9 +13297,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGImageRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3ae54947ad02e14773ac126982de301d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ee36a48521a32c183a086066d3c5ae1f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -13441,9 +13441,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGImageRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6dba0d2672617f7eb2c512129fb17bb3</string>
+   <key>issue_hash_content_of_line_in_context</key><string>70a2dd4ee6b6f7caad87a46dc6dd3580</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6902710</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -13552,9 +13552,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGLayerRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b065641c4257dac33ff15b08859d09e2</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a82448687d1cbf5cb517914dbe6de4fe</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6945561</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13658,9 +13658,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>7cbb4f547b5c1fb1a456ecc47f27d853</string>
+   <key>issue_hash_content_of_line_in_context</key><string>540e0145994c1e14ea750fe91a497855</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOBSDNameMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13764,9 +13764,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0b329ce97e1baf94f89590888a4af794</string>
+   <key>issue_hash_content_of_line_in_context</key><string>99d7012d797e181ef8e9a289ee9099eb</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -13870,9 +13870,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e207241fbe4666cffeeca3f47966425f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5d956e58f05bcc1b67ff65e02cbba302</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceNameMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -14066,9 +14066,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ae61d11111bc6c9f049a5ca8935b7bae</string>
+   <key>issue_hash_content_of_line_in_context</key><string>84a53bfb58a3a929535b47e28b997382</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceAddNotification_wrapper</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -14175,9 +14175,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>62fc802833a96d44d2fa008826c46c64</string>
+   <key>issue_hash_content_of_line_in_context</key><string>36337ff486f6a8b702e68d13393bc975</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IORegistryEntryIDMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -14281,9 +14281,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableDictionaryRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>644a1e5f3d844a5d9b140de26e6e5645</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ee83ca968ddc2ecad7ae4318ce7d1d95</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOOpenFirmwarePathMatching_wrapper</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -14478,9 +14478,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>904a99d378144e5aa011649cec493695</string>
+   <key>issue_hash_content_of_line_in_context</key><string>e8c08b2b3d53f5890907888e16927805</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceGetMatchingService_wrapper</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -14675,9 +14675,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>23c94c459003beb49ea078f75a86ccc5</string>
+   <key>issue_hash_content_of_line_in_context</key><string>31664b5acc7980da73f5545fb16b0910</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceGetMatchingServices_wrapper</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -14872,9 +14872,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>06e6fa1f7f96818fbd619dfe8b210b0d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6edae46016a9671e2d5400b100d5efb5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>IOServiceAddMatchingNotification_wrapper</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -15179,9 +15179,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1692047c1a2ab283584ae01c84e3ae35</string>
+   <key>issue_hash_content_of_line_in_context</key><string>dcec4e2bd254a3c24e84e598b5a827bf</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7152619</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -15380,9 +15380,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGColorSpaceRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>17e5c3184216ca3aef86288dc1f41d8d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>9317a6bf07dd10dc988f2415cc2c4ef7</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7184450</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -15580,9 +15580,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGColorSpaceRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>c2225660bdec84d2ae183eda303a1abb</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ec3e6216b279aa48d8403c6aab30d996</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7184450_pos</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -15798,9 +15798,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;myGradient&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6415d6b7dd7d48a2ef27f4c4d0168c64</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4b3d6bb6b8dc5c51b7dfa8554b24eb66</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7184450_pos</string>
   <key>issue_hash_function_offset</key><string>13</string>
@@ -15917,9 +15917,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>08a69979bb4fa932512da1327fbf3b23</string>
+   <key>issue_hash_content_of_line_in_context</key><string>42a83016e862ec323e24920873073a5a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7299394_positive</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -16057,9 +16057,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CGContextRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>32b76a1b35c681cad8093c7e79e36388</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a416473fed3a9dbc6bfee885bee38216</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_7358899</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -16168,9 +16168,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;y&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>7e6172f0b4b6af27712153519e1934e1</string>
+   <key>issue_hash_content_of_line_in_context</key><string>980dd45e9cf6581dbc2be9ebfc500b7f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar7265711_a</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -16308,9 +16308,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5eb97f906bb3af4befe63c891484f791</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ebf51fb2b16499cf3a5c57d251a91061</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar7306898</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -16751,9 +16751,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>6b9b51ce7b68ca0ba6a85e8924601a96</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1174ccc2a30887ebf80fe25fc6722b1a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr_1</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -16857,9 +16857,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>eb040d5ec198d092ec9894af4dce6af8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ce9963dd1c85ac22cea4e4fef615354e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr_1b</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -17046,9 +17046,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str2&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>21b45a41bb0c3c70a0efe89359ff3385</string>
+   <key>issue_hash_content_of_line_in_context</key><string>0183088266857082f35eb17f1377fd69</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr1c</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -17296,9 +17296,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;str4&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>60396abae77bacd747ea9081b63a32db</string>
+   <key>issue_hash_content_of_line_in_context</key><string>352a17ef8eddd3aa5f7f6e74a74a4df3</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_attr1c</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -17405,9 +17405,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e258a710e07550a3dc5f47361a7380e1</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d0e564404585060990202acb33f0bb1e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testattr2_a</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -17511,9 +17511,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>dc245145c78c3421392a20775cdd6f23</string>
+   <key>issue_hash_content_of_line_in_context</key><string>567dfcbc22471ca4ba9f2fccd9ff14fb</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testattr2_b</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -17651,9 +17651,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>77b970319b12b0c189e46ad65fa848c7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>83cd2670977d513443836653fee8147b</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testattr2_b_11358224_self_assign_looses_the_leak</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -17739,9 +17739,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;NSString *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>4a8d774d2b821ce1601df7edabf66097</string>
+   <key>issue_hash_content_of_line_in_context</key><string>f83246e7e738918426df1adc915f4eca</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>newString</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18248,9 +18248,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2a609b8807dab6d3cb1a1db524094f2f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>5f233261d96f1d461af36fc3e0efc8eb</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>newCFRetainedAsCFNoAttr</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18513,9 +18513,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFDateRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>944f189da47b1406f9cca6f17ad9f77c</string>
+   <key>issue_hash_content_of_line_in_context</key><string>7ee55b74b5ee01c6ffa2a3d83c8cf88b</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>alsoReturnsRetained</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18776,9 +18776,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFDateRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>30ebf65449c31336f8a97555d79f1943</string>
+   <key>issue_hash_content_of_line_in_context</key><string>177b2cf7eb3d8334393ee0861f5a38ac</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>alsoReturnsRetainedAsCF</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -18918,9 +18918,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2ab1a2345ddfa1fd48777c7c179d4e33</string>
+   <key>issue_hash_content_of_line_in_context</key><string>85e9d8130a1f1ec37f0ba26746abd749</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_panic_negative</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -19156,9 +19156,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>f96bb4f5c1af6cf932d7ab58b678c235</string>
+   <key>issue_hash_content_of_line_in_context</key><string>4a0b16976e0517b38b2ccc16e2928c2e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_panic_neg_2</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -19279,9 +19279,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>14182fb28ed03595f896c2f8536ac111</string>
+   <key>issue_hash_content_of_line_in_context</key><string>af73d9c62952a300a7c393ebd5073f75</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_blocks_1_pos</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -19566,9 +19566,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;number&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>dbf800f836ff675d2f779f7417877c1b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>771b2a332053388ffbdd9ba74ea84c5e</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_blocks_1_indirect_retain_via_call</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -19964,9 +19964,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;info&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>64424de797303506a3dfdb52fa765645</string>
+   <key>issue_hash_content_of_line_in_context</key><string>39f8c30f7436f678d5259c0fdd3a0dad</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar_8724287</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -20057,9 +20057,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>7b7fc0c36e58713202141cb584150903</string>
+   <key>issue_hash_content_of_line_in_context</key><string>107e3efdeb8cdff4bef4c64183c4f6fa</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camelcase_createno</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20143,9 +20143,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>32912dd9518de1b3f4cc8ba38368f7e6</string>
+   <key>issue_hash_content_of_line_in_context</key><string>20c973a013858abb0a926276c956f858</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camelcase_copying</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20229,9 +20229,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1dccc42846a9ef9bf1a1830e277d5b78</string>
+   <key>issue_hash_content_of_line_in_context</key><string>80ee99e51561a37297429740e3a4da0c</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camel_creat</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20315,9 +20315,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;CFMutableArrayRef&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak of returned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>2a0ba33097f6e9362a79689e2ac0cf4a</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a4e28a04f6a8d87c8aaf4d71c37cac0f</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>camel_copymachine</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -20454,9 +20454,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;vals&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>43f6c1be372d09a4a4cffaefa69d0148</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6b727a438d8411c058fd32867b9402bc</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar6582778</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -20719,9 +20719,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ebe7e868c0075bfa7480e3359e4fbce8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>b39dcf9df7cec8dd73cbbe25b2a7d6c5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar10232019_positive</string>
   <key>issue_hash_function_offset</key><string>6</string>
@@ -20876,9 +20876,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>507c3679ae27249e01844b7555843688</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a501f743b22f1feb5dc317fcad4f7556</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>3</string>
@@ -21102,9 +21102,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a2&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>821f8268a0b7d3f90e4dd88fa1edf39b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a141a6ad33e8ff2ae3b13da0ad36ebc5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>12</string>
@@ -21511,9 +21511,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a3&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>37b00e6e0e6b792ea3294a9ffd6f4886</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2b072d75e8da8e3fe8f7968a85efb37c</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>20</string>
@@ -21884,9 +21884,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>62fc5b80705a03ab1d8b50bdcfbfb179</string>
+   <key>issue_hash_content_of_line_in_context</key><string>0bfdfb7e392626e0fccc6ab9f58f1ca8</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>28</string>
@@ -22439,9 +22439,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;a&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3eee239ca30a84ef6ecc5d154ae8df28</string>
+   <key>issue_hash_content_of_line_in_context</key><string>ff7c34e661a42d06a7fb3e9669e70339</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_arrays</string>
   <key>issue_hash_function_offset</key><string>37</string>
@@ -22712,9 +22712,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>cb86fdadd2217db6b784b37dc29eba34</string>
+   <key>issue_hash_content_of_line_in_context</key><string>73e84c042932d2e17e00f00dc3d36d5a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_integer_literals</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -22943,9 +22943,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>4ad9235c4885452c3034fef815598a63</string>
+   <key>issue_hash_content_of_line_in_context</key><string>465e592d4f7a187717d00b8154a614b5</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_boxed_expressions</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -23228,9 +23228,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;value&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>9d3a52ee2efe90fef76f91f143f0d9e7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>c701bd0c60f51d96c047aa78c9e0eb99</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_objc_boxed_expressions</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -23592,9 +23592,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>0aad7b0550b51ebc0a2323c482d8eefd</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a4cedbb647e9632da7a5072cb839e54a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>rdar11400885</string>
   <key>issue_hash_function_offset</key><string>9</string>
@@ -23752,9 +23752,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>3b63deb8c998b2d73dd63da9f89672bb</string>
+   <key>issue_hash_content_of_line_in_context</key><string>fd9427d86a2357fd92478c9c7abbc1f4</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testConsumeAndStopTracking</string>
   <key>issue_hash_function_offset</key><string>10</string>
@@ -23911,9 +23911,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>a4fe04db2f5fa1aa2b6d8d18ccb5dd02</string>
+   <key>issue_hash_content_of_line_in_context</key><string>0e65e51476e5671dcd37f632806e5147</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCFConsumeAndStopTracking</string>
   <key>issue_hash_function_offset</key><string>10</string>
@@ -24021,9 +24021,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;x&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>55f656da79f1b87a4b5618167f68c233</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a0ba9c47505e923763ea5323ad2f71b7</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>test_custom_cf</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24127,9 +24127,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;obj&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>a7b4693fabae95c6b2091c7816fb2358</string>
+   <key>issue_hash_content_of_line_in_context</key><string>7a6cf8cb3c5e0ca3125d7e27695a810a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCustomReturnsRetained</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24214,9 +24214,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>51de919c9df9dec2d383d050bf73d2d8</string>
+   <key>issue_hash_content_of_line_in_context</key><string>810fce32373fe40ba8e2d0894d46f667</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCustomReturnsNotRetained</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24571,9 +24571,9 @@
    <key>description</key><string>Potential leak of an object of type &apos;MyObj12706177 *&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>d8890e44d330279fd91ce8fdb35d7c81</string>
+   <key>issue_hash_content_of_line_in_context</key><string>68ee7961ffb62c575cc2298cb4836090</string>
   <key>issue_context_kind</key><string>Objective-C method</string>
   <key>issue_context</key><string>test12706177</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -24803,9 +24803,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>d4c839aab11cc39188d1054f3270d67f</string>
+   <key>issue_hash_content_of_line_in_context</key><string>1dc376fbbe90d14b6766585a0e2b7bee</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>getIncorrectlyAutoreleasedCFType</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -25032,9 +25032,9 @@
    <key>description</key><string>Object with a +0 retain count returned to caller where a +1 (owning) retain count is expected</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Method should return an owned object</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>d2d9e8a977772482263591670a124c5d</string>
+   <key>issue_hash_content_of_line_in_context</key><string>6ae8ea9fe4bf203e6b7bfaf649a6ca6a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>createIncorrectlyAutoreleasedCFType</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -25227,9 +25227,9 @@
    <key>description</key><string>Reference-counted object is used after it is released</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Use-after-release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>c483bb676bdbea00f7e99b3617b4b6e2</string>
+   <key>issue_hash_content_of_line_in_context</key><string>d4e28f96fc8610b5b4b849f4760956eb</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>useAfterRelease</string>
   <key>issue_hash_function_offset</key><string>7</string>
@@ -25484,9 +25484,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;obj&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>5bbb9b1720912f3fd2c67b3332de793b</string>
+   <key>issue_hash_content_of_line_in_context</key><string>7986c4b7fb29301c109343dfe4155202</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testAutoreleaseReturnsInput</string>
   <key>issue_hash_function_offset</key><string>2</string>
@@ -25742,9 +25742,9 @@
    <key>description</key><string>Potential leak of an object stored into &apos;arr&apos;</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Leak</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ea7d6978bcb6da71c23b4bb6fef51a87</string>
+   <key>issue_hash_content_of_line_in_context</key><string>2e0dbfdf379acf2f09e46db47d753e8a</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>autoreleaseReturningTypedObject</string>
   <key>issue_hash_function_offset</key><string>1</string>
@@ -25959,9 +25959,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>1f4f3ca2f399a94e54304b4a0dcb1e85</string>
+   <key>issue_hash_content_of_line_in_context</key><string>41a2d6f91fdfa9b5f396102a60571e21</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>autoreleaseObjC</string>
   <key>issue_hash_function_offset</key><string>6</string>
@@ -26117,9 +26117,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>ced44137127627330194b72c97aef162</string>
+   <key>issue_hash_content_of_line_in_context</key><string>95dd5581ae4195b71e9a11f34290af5d</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCFReturnsNotRetained</string>
   <key>issue_hash_function_offset</key><string>4</string>
@@ -26273,9 +26273,9 @@
    <key>description</key><string>Incorrect decrement of the reference count of an object that is not owned at this point by the caller</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Bad release</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>e7615a640885cbd55bc856bfc07d7123</string>
+   <key>issue_hash_content_of_line_in_context</key><string>014103674df4a8a65a96bcdf936637a2</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testCFReturnsNotRetainedAnnotated</string>
   <key>issue_hash_function_offset</key><string>4</string>
diff --git a/clang/test/Analysis/incorrect-checker-names.mm b/clang/test/Analysis/incorrect-checker-names.mm
index 861f81e98eb1b..bf7c6c071153a 100644
--- a/clang/test/Analysis/incorrect-checker-names.mm
+++ b/clang/test/Analysis/incorrect-checker-names.mm
@@ -125,7 +125,7 @@ - (void)myMethodWhichMayFail:(NSError **)error {                  // expected-wa
 void use_out_param_leak() {
   OSObject *obj;
   // FIXME: This shouldn't be tied to a modeling checker.
-  write_into_out_param_on_success(&obj); // expected-warning{{Potential leak of an object stored into 'obj' [osx.cocoa.RetainCountBase]}}
+  write_into_out_param_on_success(&obj); // expected-warning{{Potential leak of an object stored into 'obj' [osx.cocoa.RetainCount]}}
 }
 
 typedef struct dispatch_queue_s *dispatch_queue_t;
diff --git a/clang/test/Analysis/inlining/Inputs/expected-plists/path-notes.m.plist b/clang/test/Analysis/inlining/Inputs/expected-plists/path-notes.m.plist
index 6b3f36721fb83..b14ffffbfc231 100644
--- a/clang/test/Analysis/inlining/Inputs/expected-plists/path-notes.m.plist
+++ b/clang/test/Analysis/inlining/Inputs/expected-plists/path-notes.m.plist
@@ -1965,9 +1965,9 @@
    <key>description</key><string>Object autoreleased too many times</string>
    <key>category</key><string>Memory (Core Foundation/Objective-C/OSObject)</string>
    <key>type</key><string>Object autoreleased too many times</string>
-   <key>check_name</key><string>osx.cocoa.RetainCountBase</string>
+   <key>check_name</key><string>osx.cocoa.RetainCount</string>
    <!-- This hash is experimental and going to change! -->
-   <key>issue_hash_content_of_line_in_context</key><string>b6a556c71184371a9567489c8477c2f7</string>
+   <key>issue_hash_content_of_line_in_context</key><string>a3c91a7a52619d81ebe032dcc49ebb93</string>
   <key>issue_context_kind</key><string>function</string>
   <key>issue_context</key><string>testAutoreleaseTakesEffectInDispatch</string>
   <key>issue_hash_function_offset</key><string>11</string>
diff --git a/clang/test/Analysis/test-separate-retaincount.cpp b/clang/test/Analysis/test-separate-retaincount.cpp
index 41efad452e5ac..5ca4907e7291c 100644
--- a/clang/test/Analysis/test-separate-retaincount.cpp
+++ b/clang/test/Analysis/test-separate-retaincount.cpp
@@ -1,8 +1,8 @@
-// RUN: %clang_analyze_cc1 -std=c++14 -DNO_CF_OBJECT -verify %s \
+// RUN: %clang_analyze_cc1 -std=c++14 -verify=no-retain-count %s \
 // RUN:   -analyzer-checker=core,osx \
 // RUN:   -analyzer-disable-checker osx.cocoa.RetainCount
 //
-// RUN: %clang_analyze_cc1 -std=c++14 -DNO_OS_OBJECT -verify %s \
+// RUN: %clang_analyze_cc1 -std=c++14 -verify=no-os-object %s \
 // RUN:   -analyzer-checker=core,osx \
 // RUN:   -analyzer-disable-checker osx.OSObjectRetainCount
 
@@ -20,17 +20,11 @@ using size_t = decltype(sizeof(int));
 void cf_overrelease() {
   CFTypeRef cf = CFCreate();
   CFRelease(cf);
-  CFRelease(cf);
-#ifndef NO_CF_OBJECT
-  // expected-warning@-2{{Reference-counted object is used after it is released}}
-#endif
+  CFRelease(cf); // no-os-object-warning{{Reference-counted object is used after it is released}}
 }
 
 void osobject_overrelease() {
   OSObject *o = new OSObject;
   o->release();
-  o->release();
-#ifndef NO_OS_OBJECT
-  // expected-warning@-2{{Reference-counted object is used after it is released}}
-#endif
+  o->release(); // no-retain-count-warning{{Reference-counted object is used after it is released}}
 }

From efd1a8e66eaa13afff709ebf16ff6280caa82ead Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krist=C3=B3f=20Umann?= <dkszelethus@gmail.com>
Date: Tue, 25 Feb 2020 18:02:18 +0100
Subject: [PATCH 158/770] [analyzer][MallocChecker] Make NewDeleteLeaks depend
 on DynamicMemoryModeling rather than NewDelete

If you remember the mail [1] I sent out about how I envision the future of the
already existing checkers to look dependency-wise, one of my main points was
that no checker that emits diagnostics should be a dependency. This is more
problematic for some checkers (ahem, RetainCount [2]) than for others, like
this one.

The MallocChecker family is mostly a big monolithic modeling class with some
small reporting checkers that only come into action when we are constructing a
warning message, after the actual bug was detected. The implication of this is
that NewDeleteChecker doesn't really do anything to depend on, so this change
was relatively simple.

The only thing that complicates this change is that FreeMemAux (MallocChecker's
method that models general memory deallocation) returns after calling a bug
reporting method, regardless of whether the report was ever emitted (which may
not always happen, for instance, if the checker responsible for the report isn't
enabled). This return unfortunately happens before cleaning up the maps in the
GDM keeping track of the state of symbols (whether they are released, whether
that release was successful, etc). What this means is that upon disabling some
checkers, we would never clean up the map and that could have led to false
positives, e.g.:

error: 'warning' diagnostics seen but not expected:
  File clang/test/Analysis/NewDelete-intersections.mm Line 66: Potential leak of memory pointed to by 'p'
  File clang/test/Analysis/NewDelete-intersections.mm Line 73: Potential leak of memory pointed to by 'p'
  File clang/test/Analysis/NewDelete-intersections.mm Line 77: Potential leak of memory pointed to by 'p'

error: 'warning' diagnostics seen but not expected:
  File clang/test/Analysis/NewDelete-checker-test.cpp Line 111: Undefined or garbage value returned to caller
  File clang/test/Analysis/NewDelete-checker-test.cpp Line 200: Potential leak of memory pointed to by 'p'

error: 'warning' diagnostics seen but not expected:
  File clang/test/Analysis/new.cpp Line 137: Potential leak of memory pointed to by 'x'

There were two possible approaches I had in mind:

* Make the bug reporting methods of MallocChecker return whether they
  succeeded, and proceed with the rest of FreeMemAux if not,
* Halt execution with a sink node upon failure.

I decided to go with the latter, as described in the code.

As you can see from the removed/changed test files, before the big checker
dependency effort landed, there were tests to check for all the weird
configurations of enabled/disabled checkers and their messy interactions; I
largely repurposed these.

[1] http://lists.llvm.org/pipermail/cfe-dev/2019-August/063070.html
[2] http://lists.llvm.org/pipermail/cfe-dev/2019-August/063205.html

Differential Revision: https://reviews.llvm.org/D77474
---
 .../clang/StaticAnalyzer/Checkers/Checkers.td |   4 +-
 .../StaticAnalyzer/Checkers/MallocChecker.cpp | 153 ++++++++++--------
 .../Malloc+NewDelete_intersections.cpp        |  15 --
 .../test/Analysis/NewDelete-checker-test.cpp  | 105 ++++--------
 .../test/Analysis/NewDelete-intersections.mm  |  47 +++---
 clang/test/Analysis/new.cpp                   |  11 +-
 6 files changed, 154 insertions(+), 181 deletions(-)
 delete mode 100644 clang/test/Analysis/Malloc+NewDelete_intersections.cpp

diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index bc4b7d00e2d40..2ba3881c61351 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -556,13 +556,13 @@ def NewDeleteChecker : Checker<"NewDelete">,
 
 def NewDeleteLeaksChecker : Checker<"NewDeleteLeaks">,
   HelpText<"Check for memory leaks. Traces memory managed by new/delete.">,
-  Dependencies<[NewDeleteChecker]>,
+  Dependencies<[DynamicMemoryModeling]>,
   Documentation<HasDocumentation>;
 
 def PlacementNewChecker : Checker<"PlacementNew">,
   HelpText<"Check if default placement new is provided with pointers to "
            "sufficient storage capacity">,
-  Dependencies<[NewDeleteChecker]>,
+  Dependencies<[DynamicMemoryModeling]>,
   Documentation<HasDocumentation>;
 
 def CXXSelfAssignmentChecker : Checker<"SelfAssignment">,
diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index a7c62a7e8046f..fa69bc253fbd0 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -684,41 +684,42 @@ class MallocChecker
   static bool SummarizeValue(raw_ostream &os, SVal V);
   static bool SummarizeRegion(raw_ostream &os, const MemRegion *MR);
 
-  void ReportBadFree(CheckerContext &C, SVal ArgVal, SourceRange Range,
-                     const Expr *DeallocExpr, AllocationFamily Family) const;
+  void HandleNonHeapDealloc(CheckerContext &C, SVal ArgVal, SourceRange Range,
+                            const Expr *DeallocExpr,
+                            AllocationFamily Family) const;
 
-  void ReportFreeAlloca(CheckerContext &C, SVal ArgVal,
+  void HandleFreeAlloca(CheckerContext &C, SVal ArgVal,
                         SourceRange Range) const;
 
-  void ReportMismatchedDealloc(CheckerContext &C, SourceRange Range,
+  void HandleMismatchedDealloc(CheckerContext &C, SourceRange Range,
                                const Expr *DeallocExpr, const RefState *RS,
                                SymbolRef Sym, bool OwnershipTransferred) const;
 
-  void ReportOffsetFree(CheckerContext &C, SVal ArgVal, SourceRange Range,
+  void HandleOffsetFree(CheckerContext &C, SVal ArgVal, SourceRange Range,
                         const Expr *DeallocExpr, AllocationFamily Family,
                         const Expr *AllocExpr = nullptr) const;
 
-  void ReportUseAfterFree(CheckerContext &C, SourceRange Range,
+  void HandleUseAfterFree(CheckerContext &C, SourceRange Range,
                           SymbolRef Sym) const;
 
-  void ReportDoubleFree(CheckerContext &C, SourceRange Range, bool Released,
+  void HandleDoubleFree(CheckerContext &C, SourceRange Range, bool Released,
                         SymbolRef Sym, SymbolRef PrevSym) const;
 
-  void ReportDoubleDelete(CheckerContext &C, SymbolRef Sym) const;
+  void HandleDoubleDelete(CheckerContext &C, SymbolRef Sym) const;
 
-  void ReportUseZeroAllocated(CheckerContext &C, SourceRange Range,
-                              SymbolRef Sym) const;
+  void HandleUseZeroAlloc(CheckerContext &C, SourceRange Range,
+                          SymbolRef Sym) const;
 
-  void ReportFunctionPointerFree(CheckerContext &C, SVal ArgVal,
-                                 SourceRange Range, const Expr *FreeExpr,
-                                 AllocationFamily Family) const;
+  void HandleFunctionPtrFree(CheckerContext &C, SVal ArgVal, SourceRange Range,
+                             const Expr *FreeExpr,
+                             AllocationFamily Family) const;
 
   /// Find the location of the allocation for Sym on the path leading to the
   /// exploded node N.
   static LeakInfo getAllocationSite(const ExplodedNode *N, SymbolRef Sym,
                                     CheckerContext &C);
 
-  void reportLeak(SymbolRef Sym, ExplodedNode *N, CheckerContext &C) const;
+  void HandleLeak(SymbolRef Sym, ExplodedNode *N, CheckerContext &C) const;
 
   /// Test if value in ArgVal equals to value in macro `ZERO_SIZE_PTR`.
   bool isArgZERO_SIZE_PTR(ProgramStateRef State, CheckerContext &C,
@@ -1743,6 +1744,15 @@ ProgramStateRef MallocChecker::FreeMemAux(
   const MemRegion *R = ArgVal.getAsRegion();
   const Expr *ParentExpr = Call.getOriginExpr();
 
+  // NOTE: We detected a bug, but the checker under whose name we would emit the
+  // error could be disabled. Generally speaking, the MallocChecker family is an
+  // integral part of the Static Analyzer, and disabling any part of it should
+  // only be done under exceptional circumstances, such as frequent false
+  // positives. If this is the case, we can reasonably believe that there are
+  // serious faults in our understanding of the source code, and even if we
+  // don't emit a warning, we should terminate further analysis with a sink
+  // node.
+
   // Nonlocs can't be freed, of course.
   // Non-region locations (labels and fixed addresses) also shouldn't be freed.
   if (!R) {
@@ -1752,7 +1762,8 @@ ProgramStateRef MallocChecker::FreeMemAux(
     // zero-sized memory block which is allowed to be freed, despite not being a
     // null pointer.
     if (Family != AF_Malloc || !isArgZERO_SIZE_PTR(State, C, ArgVal))
-      ReportBadFree(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr, Family);
+      HandleNonHeapDealloc(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
+                           Family);
     return nullptr;
   }
 
@@ -1760,7 +1771,8 @@ ProgramStateRef MallocChecker::FreeMemAux(
 
   // Blocks might show up as heap data, but should not be free()d
   if (isa<BlockDataRegion>(R)) {
-    ReportBadFree(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr, Family);
+    HandleNonHeapDealloc(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
+                         Family);
     return nullptr;
   }
 
@@ -1778,9 +1790,10 @@ ProgramStateRef MallocChecker::FreeMemAux(
     // False negatives are better than false positives.
 
     if (isa<AllocaRegion>(R))
-      ReportFreeAlloca(C, ArgVal, ArgExpr->getSourceRange());
+      HandleFreeAlloca(C, ArgVal, ArgExpr->getSourceRange());
     else
-      ReportBadFree(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr, Family);
+      HandleNonHeapDealloc(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
+                           Family);
 
     return nullptr;
   }
@@ -1802,14 +1815,14 @@ ProgramStateRef MallocChecker::FreeMemAux(
 
     // Memory returned by alloca() shouldn't be freed.
     if (RsBase->getAllocationFamily() == AF_Alloca) {
-      ReportFreeAlloca(C, ArgVal, ArgExpr->getSourceRange());
+      HandleFreeAlloca(C, ArgVal, ArgExpr->getSourceRange());
       return nullptr;
     }
 
     // Check for double free first.
     if ((RsBase->isReleased() || RsBase->isRelinquished()) &&
         !didPreviousFreeFail(State, SymBase, PreviousRetStatusSymbol)) {
-      ReportDoubleFree(C, ParentExpr->getSourceRange(), RsBase->isReleased(),
+      HandleDoubleFree(C, ParentExpr->getSourceRange(), RsBase->isReleased(),
                        SymBase, PreviousRetStatusSymbol);
       return nullptr;
 
@@ -1821,8 +1834,8 @@ ProgramStateRef MallocChecker::FreeMemAux(
       // Check if an expected deallocation function matches the real one.
       bool DeallocMatchesAlloc = RsBase->getAllocationFamily() == Family;
       if (!DeallocMatchesAlloc) {
-        ReportMismatchedDealloc(C, ArgExpr->getSourceRange(),
-                                ParentExpr, RsBase, SymBase, Hold);
+        HandleMismatchedDealloc(C, ArgExpr->getSourceRange(), ParentExpr,
+                                RsBase, SymBase, Hold);
         return nullptr;
       }
 
@@ -1833,7 +1846,7 @@ ProgramStateRef MallocChecker::FreeMemAux(
           !Offset.hasSymbolicOffset() &&
           Offset.getOffset() != 0) {
         const Expr *AllocExpr = cast<Expr>(RsBase->getStmt());
-        ReportOffsetFree(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
+        HandleOffsetFree(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
                          Family, AllocExpr);
         return nullptr;
       }
@@ -1841,8 +1854,8 @@ ProgramStateRef MallocChecker::FreeMemAux(
   }
 
   if (SymBase->getType()->isFunctionPointerType()) {
-    ReportFunctionPointerFree(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
-                              Family);
+    HandleFunctionPtrFree(C, ArgVal, ArgExpr->getSourceRange(), ParentExpr,
+                          Family);
     return nullptr;
   }
 
@@ -2009,13 +2022,15 @@ bool MallocChecker::SummarizeRegion(raw_ostream &os,
   }
 }
 
-void MallocChecker::ReportBadFree(CheckerContext &C, SVal ArgVal,
-                                  SourceRange Range, const Expr *DeallocExpr,
-                                  AllocationFamily Family) const {
+void MallocChecker::HandleNonHeapDealloc(CheckerContext &C, SVal ArgVal,
+                                         SourceRange Range,
+                                         const Expr *DeallocExpr,
+                                         AllocationFamily Family) const {
 
-  if (!ChecksEnabled[CK_MallocChecker] &&
-      !ChecksEnabled[CK_NewDeleteChecker])
+  if (!ChecksEnabled[CK_MallocChecker] && !ChecksEnabled[CK_NewDeleteChecker]) {
+    C.addSink();
     return;
+  }
 
   Optional<MallocChecker::CheckKind> CheckKind = getCheckIfTracked(Family);
   if (!CheckKind.hasValue())
@@ -2055,7 +2070,7 @@ void MallocChecker::ReportBadFree(CheckerContext &C, SVal ArgVal,
   }
 }
 
-void MallocChecker::ReportFreeAlloca(CheckerContext &C, SVal ArgVal,
+void MallocChecker::HandleFreeAlloca(CheckerContext &C, SVal ArgVal,
                                      SourceRange Range) const {
 
   Optional<MallocChecker::CheckKind> CheckKind;
@@ -2064,8 +2079,10 @@ void MallocChecker::ReportFreeAlloca(CheckerContext &C, SVal ArgVal,
     CheckKind = CK_MallocChecker;
   else if (ChecksEnabled[CK_MismatchedDeallocatorChecker])
     CheckKind = CK_MismatchedDeallocatorChecker;
-  else
+  else {
+    C.addSink();
     return;
+  }
 
   if (ExplodedNode *N = C.generateErrorNode()) {
     if (!BT_FreeAlloca[*CheckKind])
@@ -2081,15 +2098,16 @@ void MallocChecker::ReportFreeAlloca(CheckerContext &C, SVal ArgVal,
   }
 }
 
-void MallocChecker::ReportMismatchedDealloc(CheckerContext &C,
+void MallocChecker::HandleMismatchedDealloc(CheckerContext &C,
                                             SourceRange Range,
                                             const Expr *DeallocExpr,
-                                            const RefState *RS,
-                                            SymbolRef Sym,
+                                            const RefState *RS, SymbolRef Sym,
                                             bool OwnershipTransferred) const {
 
-  if (!ChecksEnabled[CK_MismatchedDeallocatorChecker])
+  if (!ChecksEnabled[CK_MismatchedDeallocatorChecker]) {
+    C.addSink();
     return;
+  }
 
   if (ExplodedNode *N = C.generateErrorNode()) {
     if (!BT_MismatchedDealloc)
@@ -2137,14 +2155,15 @@ void MallocChecker::ReportMismatchedDealloc(CheckerContext &C,
   }
 }
 
-void MallocChecker::ReportOffsetFree(CheckerContext &C, SVal ArgVal,
+void MallocChecker::HandleOffsetFree(CheckerContext &C, SVal ArgVal,
                                      SourceRange Range, const Expr *DeallocExpr,
                                      AllocationFamily Family,
                                      const Expr *AllocExpr) const {
 
-  if (!ChecksEnabled[CK_MallocChecker] &&
-      !ChecksEnabled[CK_NewDeleteChecker])
+  if (!ChecksEnabled[CK_MallocChecker] && !ChecksEnabled[CK_NewDeleteChecker]) {
+    C.addSink();
     return;
+  }
 
   Optional<MallocChecker::CheckKind> CheckKind = getCheckIfTracked(Family);
   if (!CheckKind.hasValue())
@@ -2194,13 +2213,14 @@ void MallocChecker::ReportOffsetFree(CheckerContext &C, SVal ArgVal,
   C.emitReport(std::move(R));
 }
 
-void MallocChecker::ReportUseAfterFree(CheckerContext &C, SourceRange Range,
+void MallocChecker::HandleUseAfterFree(CheckerContext &C, SourceRange Range,
                                        SymbolRef Sym) const {
 
-  if (!ChecksEnabled[CK_MallocChecker] &&
-      !ChecksEnabled[CK_NewDeleteChecker] &&
-      !ChecksEnabled[CK_InnerPointerChecker])
+  if (!ChecksEnabled[CK_MallocChecker] && !ChecksEnabled[CK_NewDeleteChecker] &&
+      !ChecksEnabled[CK_InnerPointerChecker]) {
+    C.addSink();
     return;
+  }
 
   Optional<MallocChecker::CheckKind> CheckKind = getCheckIfTracked(C, Sym);
   if (!CheckKind.hasValue())
@@ -2232,13 +2252,14 @@ void MallocChecker::ReportUseAfterFree(CheckerContext &C, SourceRange Range,
   }
 }
 
-void MallocChecker::ReportDoubleFree(CheckerContext &C, SourceRange Range,
+void MallocChecker::HandleDoubleFree(CheckerContext &C, SourceRange Range,
                                      bool Released, SymbolRef Sym,
                                      SymbolRef PrevSym) const {
 
-  if (!ChecksEnabled[CK_MallocChecker] &&
-      !ChecksEnabled[CK_NewDeleteChecker])
+  if (!ChecksEnabled[CK_MallocChecker] && !ChecksEnabled[CK_NewDeleteChecker]) {
+    C.addSink();
     return;
+  }
 
   Optional<MallocChecker::CheckKind> CheckKind = getCheckIfTracked(C, Sym);
   if (!CheckKind.hasValue())
@@ -2263,10 +2284,12 @@ void MallocChecker::ReportDoubleFree(CheckerContext &C, SourceRange Range,
   }
 }
 
-void MallocChecker::ReportDoubleDelete(CheckerContext &C, SymbolRef Sym) const {
+void MallocChecker::HandleDoubleDelete(CheckerContext &C, SymbolRef Sym) const {
 
-  if (!ChecksEnabled[CK_NewDeleteChecker])
+  if (!ChecksEnabled[CK_NewDeleteChecker]) {
+    C.addSink();
     return;
+  }
 
   Optional<MallocChecker::CheckKind> CheckKind = getCheckIfTracked(C, Sym);
   if (!CheckKind.hasValue())
@@ -2287,13 +2310,13 @@ void MallocChecker::ReportDoubleDelete(CheckerContext &C, SymbolRef Sym) const {
   }
 }
 
-void MallocChecker::ReportUseZeroAllocated(CheckerContext &C,
-                                           SourceRange Range,
-                                           SymbolRef Sym) const {
+void MallocChecker::HandleUseZeroAlloc(CheckerContext &C, SourceRange Range,
+                                       SymbolRef Sym) const {
 
-  if (!ChecksEnabled[CK_MallocChecker] &&
-      !ChecksEnabled[CK_NewDeleteChecker])
+  if (!ChecksEnabled[CK_MallocChecker] && !ChecksEnabled[CK_NewDeleteChecker]) {
+    C.addSink();
     return;
+  }
 
   Optional<MallocChecker::CheckKind> CheckKind = getCheckIfTracked(C, Sym);
 
@@ -2318,12 +2341,14 @@ void MallocChecker::ReportUseZeroAllocated(CheckerContext &C,
   }
 }
 
-void MallocChecker::ReportFunctionPointerFree(CheckerContext &C, SVal ArgVal,
-                                              SourceRange Range,
-                                              const Expr *FreeExpr,
-                                              AllocationFamily Family) const {
-  if (!ChecksEnabled[CK_MallocChecker])
+void MallocChecker::HandleFunctionPtrFree(CheckerContext &C, SVal ArgVal,
+                                          SourceRange Range,
+                                          const Expr *FreeExpr,
+                                          AllocationFamily Family) const {
+  if (!ChecksEnabled[CK_MallocChecker]) {
+    C.addSink();
     return;
+  }
 
   Optional<MallocChecker::CheckKind> CheckKind = getCheckIfTracked(Family);
   if (!CheckKind.hasValue())
@@ -2521,7 +2546,7 @@ MallocChecker::LeakInfo MallocChecker::getAllocationSite(const ExplodedNode *N,
   return LeakInfo(AllocNode, ReferenceRegion);
 }
 
-void MallocChecker::reportLeak(SymbolRef Sym, ExplodedNode *N,
+void MallocChecker::HandleLeak(SymbolRef Sym, ExplodedNode *N,
                                CheckerContext &C) const {
 
   if (!ChecksEnabled[CK_MallocChecker] &&
@@ -2637,7 +2662,7 @@ void MallocChecker::checkDeadSymbols(SymbolReaper &SymReaper,
     if (N) {
       for (SmallVectorImpl<SymbolRef>::iterator
            I = Errors.begin(), E = Errors.end(); I != E; ++I) {
-        reportLeak(*I, N, C);
+        HandleLeak(*I, N, C);
       }
     }
   }
@@ -2822,7 +2847,7 @@ bool MallocChecker::checkUseAfterFree(SymbolRef Sym, CheckerContext &C,
                                       const Stmt *S) const {
 
   if (isReleased(Sym, C)) {
-    ReportUseAfterFree(C, S->getSourceRange(), Sym);
+    HandleUseAfterFree(C, S->getSourceRange(), Sym);
     return true;
   }
 
@@ -2835,17 +2860,17 @@ void MallocChecker::checkUseZeroAllocated(SymbolRef Sym, CheckerContext &C,
 
   if (const RefState *RS = C.getState()->get<RegionState>(Sym)) {
     if (RS->isAllocatedOfSizeZero())
-      ReportUseZeroAllocated(C, RS->getStmt()->getSourceRange(), Sym);
+      HandleUseZeroAlloc(C, RS->getStmt()->getSourceRange(), Sym);
   }
   else if (C.getState()->contains<ReallocSizeZeroSymbols>(Sym)) {
-    ReportUseZeroAllocated(C, S->getSourceRange(), Sym);
+    HandleUseZeroAlloc(C, S->getSourceRange(), Sym);
   }
 }
 
 bool MallocChecker::checkDoubleDelete(SymbolRef Sym, CheckerContext &C) const {
 
   if (isReleased(Sym, C)) {
-    ReportDoubleDelete(C, Sym);
+    HandleDoubleDelete(C, Sym);
     return true;
   }
   return false;
diff --git a/clang/test/Analysis/Malloc+NewDelete_intersections.cpp b/clang/test/Analysis/Malloc+NewDelete_intersections.cpp
deleted file mode 100644
index 9140e1f4a372b..0000000000000
--- a/clang/test/Analysis/Malloc+NewDelete_intersections.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete -std=c++11 -verify %s
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete,cplusplus.NewDeleteLeaks -std=c++11 -verify %s
-
-typedef __typeof(sizeof(int)) size_t;
-void *malloc(size_t);
-void free(void *);
-
-//-------------------------------------------------------------------
-// Check that unix.Malloc + cplusplus.NewDelete does not enable
-// warnings produced by unix.MismatchedDeallocator.
-//-------------------------------------------------------------------
-void testMismatchedDeallocator() {
-  int *p = (int *)malloc(sizeof(int));
-  delete p;
-} // expected-warning{{Potential leak of memory pointed to by 'p'}}
diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp
index ba179749510cc..f0d42171a8756 100644
--- a/clang/test/Analysis/NewDelete-checker-test.cpp
+++ b/clang/test/Analysis/NewDelete-checker-test.cpp
@@ -1,42 +1,31 @@
-// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \
+// RUN:   -verify=expected,newdelete \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-checker=cplusplus.NewDelete
 //
-// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks -verify %s \
+// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks %s \
+// RUN:   -verify=expected,newdelete,leak \
 // RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=cplusplus.NewDelete \
 // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
 //
-// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \
+// RUN:   -verify=expected,newdelete \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-checker=cplusplus.NewDelete \
 // RUN:   -analyzer-config c++-allocator-inlining=true
 //
-// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks -verify %s \
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \
+// RUN:   -verify=expected,newdelete,leak \
 // RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=cplusplus.NewDelete \
 // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks \
 // RUN:   -analyzer-config c++-allocator-inlining=true
 //
-// RUN: %clang_analyze_cc1 -DTEST_INLINABLE_ALLOCATORS \
-// RUN:   -std=c++11 -fblocks -verify %s \
-// RUN:   -analyzer-checker=core \
-// RUN:   -analyzer-checker=cplusplus.NewDelete
-//
-// RUN: %clang_analyze_cc1 -DLEAKS -DTEST_INLINABLE_ALLOCATORS \
-// RUN:   -std=c++11 -fblocks -verify %s \
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \
+// RUN:   -verify=expected,leak \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
-//
-// RUN: %clang_analyze_cc1 -DTEST_INLINABLE_ALLOCATORS \
-// RUN:   -std=c++11 -fblocks -verify %s \
-// RUN:   -analyzer-checker=core \
-// RUN:   -analyzer-checker=cplusplus.NewDelete \
-// RUN:   -analyzer-config c++-allocator-inlining=true
-//
-// RUN: %clang_analyze_cc1 -DLEAKS -DTEST_INLINABLE_ALLOCATORS \
-// RUN:   -std=c++11 -fblocks -verify %s \
-// RUN:   -analyzer-checker=core \
-// RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks \
-// RUN:   -analyzer-config c++-allocator-inlining=true
 
 #include "Inputs/system-header-simulator-cxx.h"
 
@@ -52,50 +41,28 @@ int *global;
 //----- Standard non-placement operators
 void testGlobalOpNew() {
   void *p = operator new(0);
-}
-#ifdef LEAKS
-// expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
-#endif
+} // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
 void testGlobalOpNewArray() {
   void *p = operator new[](0);
-}
-#ifdef LEAKS
-// expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
-#endif
+} // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
 void testGlobalNewExpr() {
   int *p = new int;
-}
-#ifdef LEAKS
-// expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
-#endif
+} // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
 void testGlobalNewExprArray() {
   int *p = new int[0];
-}
-#ifdef LEAKS
-// expected-warning@-2{{Potential leak of memory pointed to by 'p'}}
-#endif
+} // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
 //----- Standard nothrow placement operators
 void testGlobalNoThrowPlacementOpNewBeforeOverload() {
   void *p = operator new(0, std::nothrow);
-}
-#ifdef LEAKS
-#ifndef TEST_INLINABLE_ALLOCATORS
-// expected-warning@-3{{Potential leak of memory pointed to by 'p'}}
-#endif
-#endif
+} // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
 void testGlobalNoThrowPlacementExprNewBeforeOverload() {
   int *p = new(std::nothrow) int;
-}
-#ifdef LEAKS
-#ifndef TEST_INLINABLE_ALLOCATORS
-// expected-warning@-3{{Potential leak of memory pointed to by 'p'}}
-#endif
-#endif
+} // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
 //----- Standard pointer placement operators
 void testGlobalPointerPlacementNew() {
@@ -135,13 +102,13 @@ void testNewInvalidationPlacement(PtrWrapper *w) {
 
 void testUseZeroAlloc1() {
   int *p = (int *)operator new(0);
-  *p = 1; // expected-warning {{Use of zero-allocated memory}}
+  *p = 1; // newdelete-warning {{Use of zero-allocated memory}}
   delete p;
 }
 
 int testUseZeroAlloc2() {
   int *p = (int *)operator new[](0);
-  return p[0]; // expected-warning {{Use of zero-allocated memory}}
+  return p[0]; // newdelete-warning {{Use of zero-allocated memory}}
   delete[] p;
 }
 
@@ -149,7 +116,7 @@ void f(int);
 
 void testUseZeroAlloc3() {
   int *p = new int[0];
-  f(*p); // expected-warning {{Use of zero-allocated memory}}
+  f(*p); // newdelete-warning {{Use of zero-allocated memory}}
   delete[] p;
 }
 
@@ -168,70 +135,68 @@ void g(SomeClass &c, ...);
 void testUseFirstArgAfterDelete() {
   int *p = new int;
   delete p;
-  f(p); // expected-warning{{Use of memory after it is freed}}
+  f(p); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testUseMiddleArgAfterDelete(int *p) {
   delete p;
-  f(0, p); // expected-warning{{Use of memory after it is freed}}
+  f(0, p); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testUseLastArgAfterDelete(int *p) {
   delete p;
-  f(0, 0, p); // expected-warning{{Use of memory after it is freed}}
+  f(0, 0, p); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testUseSeveralArgsAfterDelete(int *p) {
   delete p;
-  f(p, p, p); // expected-warning{{Use of memory after it is freed}}
+  f(p, p, p); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testUseRefArgAfterDelete(SomeClass &c) {
   delete &c;
-  g(c); // expected-warning{{Use of memory after it is freed}}
+  g(c); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testVariadicArgAfterDelete() {
   SomeClass c;
   int *p = new int;
   delete p;
-  g(c, 0, p); // expected-warning{{Use of memory after it is freed}}
+  g(c, 0, p); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testUseMethodArgAfterDelete(int *p) {
   SomeClass *c = new SomeClass;
   delete p;
-  c->f(p); // expected-warning{{Use of memory after it is freed}}
+  c->f(p); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testUseThisAfterDelete() {
   SomeClass *c = new SomeClass;
   delete c;
-  c->f(0); // expected-warning{{Use of memory after it is freed}}
+  c->f(0); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testDoubleDelete() {
   int *p = new int;
   delete p;
-  delete p; // expected-warning{{Attempt to free released memory}}
+  delete p; // newdelete-warning{{Attempt to free released memory}}
 }
 
 void testExprDeleteArg() {
   int i;
-  delete &i; // expected-warning{{Argument to 'delete' is the address of the local variable 'i', which is not memory allocated by 'new'}}
+  delete &i; // newdelete-warning{{Argument to 'delete' is the address of the local variable 'i', which is not memory allocated by 'new'}}
 }
 
 void testExprDeleteArrArg() {
   int i;
-  delete[] &i; // expected-warning{{Argument to 'delete[]' is the address of the local variable 'i', which is not memory allocated by 'new[]'}}
+  delete[] & i; // newdelete-warning{{Argument to 'delete[]' is the address of the local variable 'i', which is not memory allocated by 'new[]'}}
 }
 
 void testAllocDeallocNames() {
   int *p = new(std::nothrow) int[1];
   delete[] (++p);
-#ifndef TEST_INLINABLE_ALLOCATORS
-  // expected-warning@-2{{Argument to 'delete[]' is offset by 4 bytes from the start of memory allocated by 'new[]'}}
-#endif
+  // newdelete-warning@-1{{Argument to 'delete[]' is offset by 4 bytes from the start of memory allocated by 'new[]'}}
 }
 
 //--------------------------------
@@ -408,7 +373,7 @@ class DerefClass{
 void testDoubleDeleteClassInstance() {
   DerefClass *foo = new DerefClass();
   delete foo;
-  delete foo; // expected-warning {{Attempt to delete released memory}}
+  delete foo; // newdelete-warning {{Attempt to delete released memory}}
 }
 
 class EmptyClass{
@@ -420,7 +385,7 @@ class EmptyClass{
 void testDoubleDeleteEmptyClass() {
   EmptyClass *foo = new EmptyClass();
   delete foo;
-  delete foo;  // expected-warning {{Attempt to delete released memory}}
+  delete foo; // newdelete-warning {{Attempt to delete released memory}}
 }
 
 struct Base {
diff --git a/clang/test/Analysis/NewDelete-intersections.mm b/clang/test/Analysis/NewDelete-intersections.mm
index b3707858f00ca..f01d62f8d365a 100644
--- a/clang/test/Analysis/NewDelete-intersections.mm
+++ b/clang/test/Analysis/NewDelete-intersections.mm
@@ -1,7 +1,20 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -verify %s
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks -std=c++11 -DLEAKS -fblocks -verify %s
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete -std=c++11 -fblocks -DTEST_INLINABLE_ALLOCATORS -verify %s
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,cplusplus.NewDelete,cplusplus.NewDeleteLeaks -std=c++11 -DLEAKS -fblocks -DTEST_INLINABLE_ALLOCATORS -verify %s
+// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \
+// RUN:  -verify=newdelete \
+// RUN:  -analyzer-checker=core \
+// RUN:  -analyzer-checker=cplusplus.NewDelete
+
+// RUN: %clang_analyze_cc1 -std=c++11 -DLEAKS -fblocks %s \
+// RUN:   -verify=leak \
+// RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
+
+// leak-no-diagnostics
+
+// RUN: %clang_analyze_cc1 -std=c++11 -DLEAKS -fblocks %s \
+// RUN:   -verify=mismatch \
+// RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=unix.MismatchedDeallocator
+
 #include "Inputs/system-header-simulator-cxx.h"
 #include "Inputs/system-header-simulator-objc.h"
 
@@ -10,12 +23,6 @@
 extern "C" void *alloca(size_t);
 extern "C" void free(void *);
 
-//----------------------------------------------------------------------------
-// Check for intersections with unix.Malloc and unix.MallocWithAnnotations 
-// checkers bounded with cplusplus.NewDelete.
-//----------------------------------------------------------------------------
-
-//----- malloc()/free() are subjects of unix.Malloc and unix.MallocWithAnnotations
 void testMallocFreeNoWarn() {
   int i;
   free(&i); // no warn
@@ -39,7 +46,8 @@ void testMallocFreeNoWarn() {
 
 void testDeleteMalloced() {
   int *p1 = (int *)malloc(sizeof(int));
-  delete p1; // no warn
+  delete p1;
+  // mismatch-warning@-1{{Memory allocated by malloc() should be deallocated by free(), not 'delete'}}
 
   int *p2 = (int *)__builtin_alloca(sizeof(int));
   delete p2; // no warn
@@ -54,35 +62,30 @@ void testUseZeroAllocatedMalloced() {
 void testFreeOpNew() {
   void *p = operator new(0);
   free(p);
+  // mismatch-warning@-1{{Memory allocated by operator new should be deallocated by 'delete', not free()}}
 }
-#ifdef LEAKS
-// expected-warning@-2 {{Potential leak of memory pointed to by 'p'}}
-#endif
 
 void testFreeNewExpr() {
   int *p = new int;
   free(p);
+  // mismatch-warning@-1{{Memory allocated by 'new' should be deallocated by 'delete', not free()}}
+  free(p);
 }
-#ifdef LEAKS
-// expected-warning@-2 {{Potential leak of memory pointed to by 'p'}}
-#endif
 
 void testObjcFreeNewed() {
   int *p = new int;
   NSData *nsdata = [NSData dataWithBytesNoCopy:p length:sizeof(int) freeWhenDone:1];
-#ifdef LEAKS
-  // expected-warning@-2 {{Potential leak of memory pointed to by 'p'}}
-#endif
+  // mismatch-warning@-1{{+dataWithBytesNoCopy:length:freeWhenDone: cannot take ownership of memory allocated by 'new'}}
 }
 
 void testFreeAfterDelete() {
   int *p = new int;  
   delete p;
-  free(p); // expected-warning{{Use of memory after it is freed}}
+  free(p); // newdelete-warning{{Use of memory after it is freed}}
 }
 
 void testStandardPlacementNewAfterDelete() {
   int *p = new int;  
   delete p;
-  p = new(p) int; // expected-warning{{Use of memory after it is freed}}
+  p = new (p) int; // newdelete-warning{{Use of memory after it is freed}}
 }
diff --git a/clang/test/Analysis/new.cpp b/clang/test/Analysis/new.cpp
index 3384cfeb61417..2c3eb2825a6bb 100644
--- a/clang/test/Analysis/new.cpp
+++ b/clang/test/Analysis/new.cpp
@@ -115,11 +115,6 @@ void testUseAfter(int *p) {
   delete c;
 }
 
-//--------------------------------------------------------------------
-// Check for intersection with other checkers from MallocChecker.cpp 
-// bounded with unix.Malloc
-//--------------------------------------------------------------------
-
 // new/delete oparators are subjects of cplusplus.NewDelete.
 void testNewDeleteNoWarn() {
   int i;
@@ -135,11 +130,11 @@ void testNewDeleteNoWarn() {
   int *p3 = new int; // no-warning
 }
 
-// unix.Malloc does not know about operators new/delete.
 void testDeleteMallocked() {
   int *x = (int *)malloc(sizeof(int));
-  delete x; // FIXME: Should detect pointer escape and keep silent after 'delete' is modeled properly.
-} // expected-warning{{Potential leak of memory pointed to by 'x'}}
+  // unix.MismatchedDeallocator would catch this, but we're not testing it here.
+  delete x;
+}
 
 void testDeleteOpAfterFree() {
   int *p = (int *)malloc(sizeof(int));

From a924dac44f31ffa19508165fc61a9f10cd1d4836 Mon Sep 17 00:00:00 2001
From: stevewan <wan.yu@ibm.com>
Date: Tue, 26 May 2020 15:39:21 -0400
Subject: [PATCH 159/770] [NFC] Fix formatting for the 'aix-ld.c' test case.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Based on comments received in D80415 pertinent to test case format, the following fixes are provided to other tests in 'aix-ld.c' for the sake of consistency and readability:
  - Align flags in RUN directives vertically.
  - Align patterns in CHECK directives vertically.
  - Remove the ‘-o %t.o’ as it’s unnecessary for tests with ‘-###’.
  - Fix typos in comments.

Reviewers: ZarkoCA, hubert.reinterpretcast, daltenty

Reviewed By: hubert.reinterpretcast

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80532
---
 clang/test/Driver/aix-ld.c | 249 ++++++++++++++++++-------------------
 1 file changed, 124 insertions(+), 125 deletions(-)

diff --git a/clang/test/Driver/aix-ld.c b/clang/test/Driver/aix-ld.c
index 218fbd2bb3802..59e35248af30c 100644
--- a/clang/test/Driver/aix-ld.c
+++ b/clang/test/Driver/aix-ld.c
@@ -2,177 +2,177 @@
 // sysroot to make these tests independent of the host system.
 
 // Check powerpc-ibm-aix7.1.0.0, 32-bit.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -target powerpc-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -target powerpc-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD32 %s
 // CHECK-LD32-NOT: warning:
-// CHECK-LD32: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
-// CHECK-LD32: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD32: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD32:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD32:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD32-NOT: "-bnso"
-// CHECK-LD32: "-b32" 
-// CHECK-LD32: "-bpT:0x10000000" "-bpD:0x20000000" 
-// CHECK-LD32: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
-// CHECK-LD32: "-L[[SYSROOT]]/usr/lib" 
-// CHECK-LD32: "-lc"
+// CHECK-LD32:     "-b32"
+// CHECK-LD32:     "-bpT:0x10000000" "-bpD:0x20000000"
+// CHECK-LD32:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
+// CHECK-LD32:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD32:     "-lc"
 
 // Check powerpc64-ibm-aix7.1.0.0, 64-bit.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -target powerpc64-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -target powerpc64-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD64 %s
 // CHECK-LD64-NOT: warning:
-// CHECK-LD64: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
-// CHECK-LD64: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD64: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD64:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
+// CHECK-LD64:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD64:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD64-NOT: "-bnso"
-// CHECK-LD64: "-b64" 
-// CHECK-LD64: "-bpT:0x100000000" "-bpD:0x110000000" 
-// CHECK-LD64: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0_64.o"
-// CHECK-LD64: "-L[[SYSROOT]]/usr/lib" 
-// CHECK-LD64: "-lc"
+// CHECK-LD64:     "-b64"
+// CHECK-LD64:     "-bpT:0x100000000" "-bpD:0x110000000"
+// CHECK-LD64:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0_64.o"
+// CHECK-LD64:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD64:     "-lc"
 
 // Check powerpc-ibm-aix7.1.0.0, 32-bit. Enable POSIX thread support.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -pthread \
-// RUN:         -target powerpc-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -pthread \
+// RUN:        -target powerpc-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD32-PTHREAD %s
 // CHECK-LD32-PTHREAD-NOT: warning:
-// CHECK-LD32-PTHREAD: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
-// CHECK-LD32-PTHREAD: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD32-PTHREAD: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD32-PTHREAD:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32-PTHREAD:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD32-PTHREAD:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD32-PTHREAD-NOT: "-bnso"
-// CHECK-LD32-PTHREAD: "-b32" 
-// CHECK-LD32-PTHREAD: "-bpT:0x10000000" "-bpD:0x20000000" 
-// CHECK-LD32-PTHREAD: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
-// CHECK-LD32-PTHREAD: "-L[[SYSROOT]]/usr/lib"
-// CHECK-LD32-PTHREAD: "-lpthreads"
-// CHECK-LD32-PTHREAD: "-lc"
+// CHECK-LD32-PTHREAD:     "-b32"
+// CHECK-LD32-PTHREAD:     "-bpT:0x10000000" "-bpD:0x20000000"
+// CHECK-LD32-PTHREAD:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
+// CHECK-LD32-PTHREAD:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD32-PTHREAD:     "-lpthreads"
+// CHECK-LD32-PTHREAD:     "-lc"
 
-// Check powerpc-ibm-aix7.1.0.0, 64-bit. POSIX thread alias.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -pthreads \
-// RUN:         -target powerpc64-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// Check powerpc64-ibm-aix7.1.0.0, 64-bit. POSIX thread alias.
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -pthreads \
+// RUN:        -target powerpc64-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD64-PTHREAD %s
 // CHECK-LD64-PTHREAD-NOT: warning:
-// CHECK-LD64-PTHREAD: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
-// CHECK-LD64-PTHREAD: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD64-PTHREAD: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD64-PTHREAD:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
+// CHECK-LD64-PTHREAD:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD64-PTHREAD:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD64-PTHREAD-NOT: "-bnso"
-// CHECK-LD64-PTHREAD: "-b64" 
-// CHECK-LD64-PTHREAD: "-bpT:0x100000000" "-bpD:0x110000000" 
-// CHECK-LD64-PTHREAD: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0_64.o"
-// CHECK-LD64-PTHREAD: "-L[[SYSROOT]]/usr/lib"
-// CHECK-LD64-PTHREAD: "-lpthreads"
-// CHECK-LD64-PTHREAD: "-lc"
+// CHECK-LD64-PTHREAD:     "-b64"
+// CHECK-LD64-PTHREAD:     "-bpT:0x100000000" "-bpD:0x110000000"
+// CHECK-LD64-PTHREAD:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0_64.o"
+// CHECK-LD64-PTHREAD:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD64-PTHREAD:     "-lpthreads"
+// CHECK-LD64-PTHREAD:     "-lc"
 
 // Check powerpc-ibm-aix7.1.0.0, 32-bit. Enable profiling.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -p \
-// RUN:         -target powerpc-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -p \
+// RUN:        -target powerpc-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD32-PROF %s
 // CHECK-LD32-PROF-NOT: warning:
-// CHECK-LD32-PROF: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
-// CHECK-LD32-PROF: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD32-PROF: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD32-PROF:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32-PROF:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD32-PROF:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD32-PROF-NOT: "-bnso"
-// CHECK-LD32-PROF: "-b32" 
-// CHECK-LD32-PROF: "-bpT:0x10000000" "-bpD:0x20000000" 
-// CHECK-LD32-PROF: "[[SYSROOT]]/usr/lib{{/|\\\\}}mcrt0.o"
-// CHECK-LD32-PROF: "-L[[SYSROOT]]/usr/lib" 
-// CHECK-LD32-PROF: "-lc"
+// CHECK-LD32-PROF:     "-b32"
+// CHECK-LD32-PROF:     "-bpT:0x10000000" "-bpD:0x20000000"
+// CHECK-LD32-PROF:     "[[SYSROOT]]/usr/lib{{/|\\\\}}mcrt0.o"
+// CHECK-LD32-PROF:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD32-PROF:     "-lc"
 
 // Check powerpc64-ibm-aix7.1.0.0, 64-bit. Enable g-profiling.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -pg \
-// RUN:         -target powerpc64-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -pg \
+// RUN:        -target powerpc64-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD64-GPROF %s
 // CHECK-LD64-GPROF-NOT: warning:
-// CHECK-LD64-GPROF: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
-// CHECK-LD64-GPROF: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD64-GPROF: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD64-GPROF:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
+// CHECK-LD64-GPROF:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD64-GPROF:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD64-GPROF-NOT: "-bnso"
-// CHECK-LD64-GPROF: "-b64" 
-// CHECK-LD64-GPROF: "-bpT:0x100000000" "-bpD:0x110000000" 
-// CHECK-LD64-GPROF: "[[SYSROOT]]/usr/lib{{/|\\\\}}gcrt0_64.o"
-// CHECK-LD64-GPROF: "-L[[SYSROOT]]/usr/lib" 
-// CHECK-LD64-GPROF: "-lc"
+// CHECK-LD64-GPROF:     "-b64"
+// CHECK-LD64-GPROF:     "-bpT:0x100000000" "-bpD:0x110000000"
+// CHECK-LD64-GPROF:     "[[SYSROOT]]/usr/lib{{/|\\\\}}gcrt0_64.o"
+// CHECK-LD64-GPROF:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD64-GPROF:     "-lc"
 
 // Check powerpc-ibm-aix7.1.0.0, 32-bit. Static linking.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -static \
-// RUN:         -target powerpc-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -static \
+// RUN:        -target powerpc-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD32-STATIC %s
 // CHECK-LD32-STATIC-NOT: warning:
-// CHECK-LD32-STATIC: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
-// CHECK-LD32-STATIC: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD32-STATIC: "{{.*}}ld{{(.exe)?}}" 
-// CHECK-LD32-STATIC: "-bnso"
-// CHECK-LD32-STATIC: "-b32" 
-// CHECK-LD32-STATIC: "-bpT:0x10000000" "-bpD:0x20000000" 
-// CHECK-LD32-STATIC: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
-// CHECK-LD32-STATIC: "-L[[SYSROOT]]/usr/lib" 
-// CHECK-LD32-STATIC: "-lc"
+// CHECK-LD32-STATIC:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32-STATIC:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD32-STATIC:     "{{.*}}ld{{(.exe)?}}"
+// CHECK-LD32-STATIC:     "-bnso"
+// CHECK-LD32-STATIC:     "-b32"
+// CHECK-LD32-STATIC:     "-bpT:0x10000000" "-bpD:0x20000000"
+// CHECK-LD32-STATIC:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
+// CHECK-LD32-STATIC:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD32-STATIC:     "-lc"
 
 // Check powerpc-ibm-aix7.1.0.0, 32-bit. Library search path.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -L%S/Inputs/aix_ppc_tree/powerpc-ibm-aix7.1.0.0 \
-// RUN:         -target powerpc-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -L%S/Inputs/aix_ppc_tree/powerpc-ibm-aix7.1.0.0 \
+// RUN:        -target powerpc-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD32-LIBP %s
 // CHECK-LD32-LIBP-NOT: warning:
-// CHECK-LD32-LIBP: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
-// CHECK-LD32-LIBP: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD32-LIBP: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD32-LIBP:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32-LIBP:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD32-LIBP:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD32-LIBP-NOT: "-bnso"
-// CHECK-LD32-LIBP: "-b32" 
-// CHECK-LD32-LIBP: "-bpT:0x10000000" "-bpD:0x20000000" 
-// CHECK-LD32-LIBP: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
-// CHECK-LD32-LIBP: "-L[[SYSROOT]]/powerpc-ibm-aix7.1.0.0" 
-// CHECK-LD32-LIBP: "-L[[SYSROOT]]/usr/lib" 
-// CHECK-LD32-LIBP: "-lc"
+// CHECK-LD32-LIBP:     "-b32"
+// CHECK-LD32-LIBP:     "-bpT:0x10000000" "-bpD:0x20000000"
+// CHECK-LD32-LIBP:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
+// CHECK-LD32-LIBP:     "-L[[SYSROOT]]/powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32-LIBP:     "-L[[SYSROOT]]/usr/lib"
+// CHECK-LD32-LIBP:     "-lc"
 
 // Check powerpc-ibm-aix7.1.0.0, 32-bit. nostdlib.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -nostdlib \
-// RUN:         -pthread \
-// RUN:         -target powerpc-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -nostdlib \
+// RUN:        -pthread \
+// RUN:        -target powerpc-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD32-NO-STD-LIB %s
 // CHECK-LD32-NO-STD-LIB-NOT: warning:
-// CHECK-LD32-NO-STD-LIB: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
-// CHECK-LD32-NO-STD-LIB: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD32-NO-STD-LIB: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD32-NO-STD-LIB:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
+// CHECK-LD32-NO-STD-LIB:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD32-NO-STD-LIB:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD32-NO-STD-LIB-NOT: "-bnso"
-// CHECK-LD32-NO-STD-LIB: "-b32" 
-// CHECK-LD32-NO-STD-LIB: "-bpT:0x10000000" "-bpD:0x20000000" 
+// CHECK-LD32-NO-STD-LIB:     "-b32"
+// CHECK-LD32-NO-STD-LIB:     "-bpT:0x10000000" "-bpD:0x20000000"
 // CHECK-LD32-NO-STD-LIB-NOT: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0.o"
-// CHECK-LD32-NO-STD-LIB: "-L[[SYSROOT]]/usr/lib" 
+// CHECK-LD32-NO-STD-LIB:     "-L[[SYSROOT]]/usr/lib"
 // CHECK-LD32-NO-STD-LIB-NOT: "-lpthreads"
 // CHECK-LD32-NO-STD-LIB-NOT: "-lc"
 
-// Check powerpc-ibm-aix7.1.0.0, 64-bit. nodefaultlibs.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:         -nodefaultlibs \
-// RUN:         -pthread \
-// RUN:         -target powerpc64-ibm-aix7.1.0.0 \
-// RUN:         --sysroot %S/Inputs/aix_ppc_tree \
+// Check powerpc64-ibm-aix7.1.0.0, 64-bit. nodefaultlibs.
+// RUN: %clang -no-canonical-prefixes %s -### 2>&1 \
+// RUN:        -nodefaultlibs \
+// RUN:        -pthread \
+// RUN:        -target powerpc64-ibm-aix7.1.0.0 \
+// RUN:        --sysroot %S/Inputs/aix_ppc_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD64-NO-DEFAULT-LIBS %s
 // CHECK-LD64-NO-DEFAULT-LIBS-NOT: warning:
-// CHECK-LD64-NO-DEFAULT-LIBS: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
-// CHECK-LD64-NO-DEFAULT-LIBS: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-LD64-NO-DEFAULT-LIBS: "{{.*}}ld{{(.exe)?}}" 
+// CHECK-LD64-NO-DEFAULT-LIBS:     {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0"
+// CHECK-LD64-NO-DEFAULT-LIBS:     "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-LD64-NO-DEFAULT-LIBS:     "{{.*}}ld{{(.exe)?}}"
 // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-bnso"
-// CHECK-LD64-NO-DEFAULT-LIBS: "-b64" 
-// CHECK-LD64-NO-DEFAULT-LIBS: "-bpT:0x100000000" "-bpD:0x110000000" 
-// CHECK-LD64-NO-DEFAULT-LIBS: "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0_64.o"
-// CHECK-LD64-NO-DEFAULT-LIBS: "-L[[SYSROOT]]/usr/lib" 
+// CHECK-LD64-NO-DEFAULT-LIBS:     "-b64"
+// CHECK-LD64-NO-DEFAULT-LIBS:     "-bpT:0x100000000" "-bpD:0x110000000"
+// CHECK-LD64-NO-DEFAULT-LIBS:     "[[SYSROOT]]/usr/lib{{/|\\\\}}crt0_64.o"
+// CHECK-LD64-NO-DEFAULT-LIBS:     "-L[[SYSROOT]]/usr/lib"
 // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lpthreads"
 // CHECK-LD64-NO-DEFAULT-LIBS-NOT: "-lc"
 
@@ -181,8 +181,7 @@
 // RUN:          -Wl,-bnocdtors \
 // RUN:          -target powerpc-ibm-aix7.1.0.0 \
 // RUN:          --sysroot %S/Inputs/aix_ppc_tree \
-// RUN: | FileCheck --check-prefix=CHECK-LD32-CXX-ARG-ORDER %s
-
+// RUN:   | FileCheck --check-prefix=CHECK-LD32-CXX-ARG-ORDER %s
 // CHECK-LD32-CXX-ARG-ORDER:     {{.*}}clang{{.*}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0"
 // CHECK-LD32-CXX-ARG-ORDER:     "-isysroot" "[[SYSROOT:[^"]+]]"
 // CHECK-LD32-CXX-ARG-ORDER:     "{{.*}}ld{{(.exe)?}}"

From 9a0b0855a96ad91e082c6fb066e0ebabe72eb6b3 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 26 May 2020 12:36:03 -0700
Subject: [PATCH 160/770] Modify verifier checks to support musttail +
 preallocated

Summary:
preallocated and musttail can work together, but we don't want to call
@llvm.call.preallocated.setup() to modify the stack in musttail calls.
So we shouldn't have the "preallocated" operand bundle when a
preallocated call is musttail.

Also disallow use of the preallocated intrinsics on calls without
preallocated arguments.

Codegen not yet implemented.

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80581
---
 llvm/docs/LangRef.rst                      | 17 +++++++++++------
 llvm/lib/IR/Verifier.cpp                   | 16 ++++++++++------
 llvm/test/Verifier/preallocated-invalid.ll | 21 +++++++++++++++++----
 llvm/test/Verifier/preallocated-valid.ll   | 11 +++++++++++
 4 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 0891392b1e61e..61a0085c6f881 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -1065,17 +1065,22 @@ Currently, only the following parameter attributes are defined:
     form and the known alignment of the pointer specified to the call
     site. If the alignment is not specified, then the code generator
     makes a target-specific assumption.
+
+.. _attr_preallocated:
+
 ``preallocated(<ty>)``
     This indicates that the pointer parameter should really be passed by
     value to the function, and that the pointer parameter's pointee has
     already been initialized before the call instruction. This attribute
     is only valid on LLVM pointer arguments. The argument must be the value
     returned by the appropriate
-    :ref:`llvm.call.preallocated.arg<int_call_preallocated_arg>`, although is
-    ignored during codegen.
+    :ref:`llvm.call.preallocated.arg<int_call_preallocated_arg>` on non
+    ``musttail`` calls, or the corresponding caller parameter in ``musttail``
+    calls, although it is ignored during codegen.
 
-    Any function call with a ``preallocated`` attribute in any parameter
-    must have a ``"preallocated"`` operand bundle.
+    A non ``musttail`` function call with a ``preallocated`` attribute in
+    any parameter must have a ``"preallocated"`` operand bundle. A ``musttail``
+    function call cannot have a ``"preallocated"`` operand bundle.
 
     The preallocated attribute requires a type argument, which must be
     the same as the pointee type of the argument.
@@ -10634,8 +10639,8 @@ This instruction requires several arguments:
 
    #. The call will not cause unbounded stack growth if it is part of a
       recursive cycle in the call graph.
-   #. Arguments with the :ref:`inalloca <attr_inalloca>` attribute are
-      forwarded in place.
+   #. Arguments with the :ref:`inalloca <attr_inalloca>` or
+      :ref:`preallocated <attr_preallocated>` attribute are forwarded in place.
    #. If the musttail call appears in a function with the ``"thunk"`` attribute
       and the caller and callee both have varargs, than any unprototyped
       arguments in register or memory are forwarded to the callee. Similarly,
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 4d64af3e8de4c..5ca6762d1c7fd 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2988,9 +2988,13 @@ void Verifier::visitCallBase(CallBase &Call) {
 
     if (Call.paramHasAttr(i, Attribute::Preallocated)) {
       Value *ArgVal = Call.getArgOperand(i);
-      Assert(Call.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0,
-             "preallocated operand requires a preallocated bundle", ArgVal,
-             Call);
+      bool hasOB =
+          Call.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0;
+      bool isMustTail = Call.isMustTailCall();
+      Assert(hasOB != isMustTail,
+             "preallocated operand either requires a preallocated bundle or "
+             "the call to be musttail (but not both)",
+             ArgVal, Call);
     }
   }
 
@@ -3150,9 +3154,6 @@ static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) {
 
 void Verifier::verifyMustTailCall(CallInst &CI) {
   Assert(!CI.isInlineAsm(), "cannot use musttail call with inline asm", &CI);
-  // FIXME: support musttail + preallocated
-  Assert(!CI.countOperandBundlesOfType(LLVMContext::OB_preallocated),
-         "musttail and preallocated not yet supported", &CI);
 
   // - The caller and callee prototypes must match.  Pointer types of
   //   parameters or return types may differ in pointee type, but not
@@ -4533,6 +4534,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
             ++NumPreallocatedArgs;
           }
         }
+        Assert(NumPreallocatedArgs != 0,
+               "cannot use preallocated intrinsics on a call without "
+               "preallocated arguments");
         Assert(NumArgs->equalsInt(NumPreallocatedArgs),
                "llvm.call.preallocated.setup arg size must be equal to number "
                "of preallocated arguments "
diff --git a/llvm/test/Verifier/preallocated-invalid.ll b/llvm/test/Verifier/preallocated-invalid.ll
index faa4c7a9fbfe6..7fdab33167e51 100644
--- a/llvm/test/Verifier/preallocated-invalid.ll
+++ b/llvm/test/Verifier/preallocated-invalid.ll
@@ -89,7 +89,7 @@ define void @preallocated_attribute_type_mismatch() {
     ret void
 }
 
-; CHECK: preallocated operand requires a preallocated bundle
+; CHECK: preallocated operand either requires a preallocated bundle or the call to be musttail
 define void @preallocated_require_bundle() {
     %cs = call token @llvm.call.preallocated.setup(i32 1)
     %x = call i8* @llvm.call.preallocated.arg(token %cs, i32 0) preallocated(i32)
@@ -117,9 +117,22 @@ define void @preallocated_arg_token() {
     ret void
 }
 
-; CHECK: musttail and preallocated not yet supported
-define void @musttail() {
+; CHECK: cannot use preallocated intrinsics on a call without preallocated arguments
+define void @preallocated_no_preallocated_args() {
     %cs = call token @llvm.call.preallocated.setup(i32 0)
-    musttail call void @foo0() ["preallocated"(token %cs)]
+    call void @foo0() ["preallocated"(token %cs)]
+    ret void
+}
+
+; CHECK: preallocated operand either requires a preallocated bundle or the call to be musttail
+define void @musttail_and_bundle(i32* preallocated(i32) %a) {
+    %cs = call token @llvm.call.preallocated.setup(i32 0)
+    musttail call void @musttail_and_bundle(i32* preallocated(i32) %a) ["preallocated"(token %cs)]
+    ret void
+}
+
+; CHECK: cannot guarantee tail call due to mismatched ABI impacting function attributes
+define void @musttail_attr_no_match(i32* preallocated(i32) %a) {
+    musttail call void @musttail_and_bundle(i32* %a)
     ret void
 }
diff --git a/llvm/test/Verifier/preallocated-valid.ll b/llvm/test/Verifier/preallocated-valid.ll
index 07f748ca8678b..483493c0c7470 100644
--- a/llvm/test/Verifier/preallocated-valid.ll
+++ b/llvm/test/Verifier/preallocated-valid.ll
@@ -4,6 +4,7 @@ declare token @llvm.call.preallocated.setup(i32)
 declare i8* @llvm.call.preallocated.arg(token, i32)
 
 declare void @foo1(i32* preallocated(i32))
+declare i64 @foo1_i64(i32* preallocated(i32))
 declare void @foo2(i32* preallocated(i32), i32*, i32* preallocated(i32))
 
 define void @preallocated() {
@@ -38,3 +39,13 @@ define void @preallocated_num_args() {
     call void @foo2(i32* preallocated(i32) %x1, i32* %a, i32* preallocated(i32) %y1) ["preallocated"(token %cs)]
     ret void
 }
+
+define void @preallocate_musttail(i32* preallocated(i32) %a) {
+    musttail call void @foo1(i32* preallocated(i32) %a)
+    ret void
+}
+
+define i64 @preallocate_musttail_i64(i32* preallocated(i32) %a) {
+    %r = musttail call i64 @foo1_i64(i32* preallocated(i32) %a)
+    ret i64 %r
+}

From 842a8cc10c4146cee6cedd94fbf556c94b8ec365 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <alexshap@fb.com>
Date: Tue, 26 May 2020 16:49:56 -0700
Subject: [PATCH 161/770] [llvm-objcopy][MachO] Add support for removing Swift
 symbols

cctools strip has the option "-T" which removes Swift symbols.
This diff implements this option in llvm-strip for MachO.

Test plan: make check-all

Differential revision: https://reviews.llvm.org/D80099
---
 llvm/docs/CommandGuide/llvm-strip.rst         |   4 +
 .../MachO/remove-swift-symbols.test           | 221 ++++++++++++++++++
 llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp  |   3 +-
 llvm/tools/llvm-objcopy/CopyConfig.cpp        |   1 +
 llvm/tools/llvm-objcopy/CopyConfig.h          |   1 +
 llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp    |   4 +-
 .../tools/llvm-objcopy/MachO/MachOObjcopy.cpp |   6 +-
 llvm/tools/llvm-objcopy/MachO/MachOReader.cpp |  23 ++
 llvm/tools/llvm-objcopy/MachO/MachOReader.h   |   1 +
 llvm/tools/llvm-objcopy/MachO/Object.h        |   7 +
 llvm/tools/llvm-objcopy/StripOpts.td          |   3 +
 11 files changed, 271 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/tools/llvm-objcopy/MachO/remove-swift-symbols.test

diff --git a/llvm/docs/CommandGuide/llvm-strip.rst b/llvm/docs/CommandGuide/llvm-strip.rst
index 455dc07e9c5cb..a40537bd51c17 100644
--- a/llvm/docs/CommandGuide/llvm-strip.rst
+++ b/llvm/docs/CommandGuide/llvm-strip.rst
@@ -181,6 +181,10 @@ them.
  segments. Note that many tools will not be able to use an object without
  section headers.
 
+.. option:: -T
+
+ Remove Swift symbols.
+
 EXIT STATUS
 -----------
 
diff --git a/llvm/test/tools/llvm-objcopy/MachO/remove-swift-symbols.test b/llvm/test/tools/llvm-objcopy/MachO/remove-swift-symbols.test
new file mode 100644
index 0000000000000..a47a2dfb9f37b
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/remove-swift-symbols.test
@@ -0,0 +1,221 @@
+## Verify that -T removes Swift symbols.
+# RUN: yaml2obj -D FLAGS=0x00200085 -D SEGMENT_NAME=__DATA \
+# RUN:  -D SECTION_NAME=__objc_imageinfo -D SECTION_SIZE=8 \
+# RUN:  -D SECTION_CONTENT=0000000040070105 %s -o %t1
+# RUN: llvm-strip -x -T %t1
+# RUN: llvm-readobj -symbols %t1 | FileCheck --check-prefix=NO-SWIFT-SYMBOLS %s
+
+# RUN: yaml2obj -D FLAGS=0x00200085 -D SEGMENT_NAME=__DATA_CONST \
+# RUN:  -D SECTION_NAME=__objc_imageinfo -D SECTION_SIZE=8 \
+# RUN:  -D SECTION_CONTENT=0000000040070105 %s -o %t2
+# RUN: llvm-strip -x -T %t2
+# RUN: llvm-readobj -symbols %t2 | FileCheck --check-prefix=NO-SWIFT-SYMBOLS %s
+
+# RUN: yaml2obj -D FLAGS=0x00200085 -D SEGMENT_NAME=__DATA_DIRTY \
+# RUN:  -D SECTION_NAME=__objc_imageinfo -D SECTION_SIZE=8 \
+# RUN:  -D SECTION_CONTENT=0000000040070105 %s -o %t3
+# RUN: llvm-strip -x -T %t3
+# RUN: llvm-readobj -symbols %t3 | FileCheck --check-prefix=NO-SWIFT-SYMBOLS %s
+
+# NO-SWIFT-SYMBOLS:      Symbols [
+# NO-SWIFT-SYMBOLS-NEXT:  Symbol {
+# NO-SWIFT-SYMBOLS-NEXT:    Name: _main (1)
+# NO-SWIFT-SYMBOLS-NEXT:    Extern
+# NO-SWIFT-SYMBOLS-NEXT:    Type: Section (0xE)
+# NO-SWIFT-SYMBOLS-NEXT:    Section: __text (0x1)
+# NO-SWIFT-SYMBOLS-NEXT:    RefType: UndefinedNonLazy (0x0)
+# NO-SWIFT-SYMBOLS-NEXT:    Flags [ (0x0)
+# NO-SWIFT-SYMBOLS-NEXT:    ]
+# NO-SWIFT-SYMBOLS-NEXT:    Value: 0x100000B70
+# NO-SWIFT-SYMBOLS-NEXT:  }
+# NO-SWIFT-SYMBOLS-NEXT: ]
+
+## Verify that -T does not remove (public) Swift symbols when the binary
+## does not contain __objc_imageinfo in one of the expected segments.
+
+# RUN: yaml2obj -D FLAGS=0x00200085 -D SEGMENT_NAME=__DATA \
+# RUN:  -D SECTION_NAME=__not_objc_imageinfo -D SECTION_SIZE=8 \
+# RUN:  -D SECTION_CONTENT=0000000040070105 %s -o %t4
+# RUN: llvm-strip -x -T %t4
+# RUN: llvm-readobj -symbols %t4 | FileCheck --check-prefix=SWIFT-SYMBOLS %s
+
+# RUN: yaml2obj -D FLAGS=0x00200085 -D SEGMENT_NAME=__NOT_DATA \
+# RUN:  -D SECTION_NAME=__objc_imageinfo -D SECTION_SIZE=8 \
+# RUN:  -D SECTION_CONTENT=0000000040070105 %s -o %t5
+# RUN: llvm-strip -x -T %t5
+# RUN: llvm-readobj -symbols %t5 | FileCheck --check-prefix=SWIFT-SYMBOLS %s
+
+## Verify that -T does not remove (public) Swift symbols when swift_version is zero.
+
+# RUN: yaml2obj -D FLAGS=0x00200085 -D SEGMENT_NAME=__DATA \
+# RUN:  -D SECTION_NAME=__objc_imageinfo -D SECTION_SIZE=8 \
+# RUN:  -D SECTION_CONTENT=0000000000000000 %s -o %t6
+# RUN: llvm-strip -x -T %t6
+# RUN: llvm-readobj -symbols %t6 | FileCheck --check-prefix=SWIFT-SYMBOLS %s
+
+## Verify that -T does not remove (public) Swift symbols when the binary
+## contains invalid (too small) __objc_imageinfo.
+
+# RUN: yaml2obj -D FLAGS=0x00200085 -D SEGMENT_NAME=__DATA \
+# RUN:  -D SECTION_NAME=__objc_imageinfo -D SECTION_SIZE=4 \
+# RUN:  -D SECTION_CONTENT=00000000 %s -o %t7
+# RUN: llvm-strip -x -T %t7
+# RUN: llvm-readobj -symbols %t7 | FileCheck --check-prefix=SWIFT-SYMBOLS %s
+
+## Verify that -T does not remove (public) Swift symbols
+## when the flag MH_DYLDLINK is not set.
+
+# RUN: yaml2obj -D FLAGS=0x00200000 -D SEGMENT_NAME=__DATA \
+# RUN:  -D SECTION_NAME=__objc_imageinfo -D SECTION_SIZE=8 \
+# RUN:  -D SECTION_CONTENT=0000000040070105 %s -o %t8
+# RUN: llvm-strip -x -T %t8
+# RUN: llvm-readobj -symbols %t8 | FileCheck --check-prefix=SWIFT-SYMBOLS %s
+
+# SWIFT-SYMBOLS:      Symbols [
+# SWIFT-SYMBOLS-NEXT:  Symbol {
+# SWIFT-SYMBOLS-NEXT:    Name: _$S1a13PublicSymbol1Sivp (26)
+# SWIFT-SYMBOLS-NEXT:    Extern
+# SWIFT-SYMBOLS-NEXT:    Type: Section (0xE)
+# SWIFT-SYMBOLS-NEXT:    Section: __text (0x1)
+# SWIFT-SYMBOLS-NEXT:    RefType: UndefinedNonLazy (0x0)
+# SWIFT-SYMBOLS-NEXT:    Flags [ (0x0)
+# SWIFT-SYMBOLS-NEXT:    ]
+# SWIFT-SYMBOLS-NEXT:    Value: 0x100001160
+# SWIFT-SYMBOLS-NEXT:  }
+# SWIFT-SYMBOLS-NEXT:  Symbol {
+# SWIFT-SYMBOLS-NEXT:    Name: _$s1a13PublicSymbol2Sivp (1)
+# SWIFT-SYMBOLS-NEXT:    Extern
+# SWIFT-SYMBOLS-NEXT:    Type: Section (0xE)
+# SWIFT-SYMBOLS-NEXT:    Section: __text (0x1)
+# SWIFT-SYMBOLS-NEXT:    RefType: UndefinedNonLazy (0x0)
+# SWIFT-SYMBOLS-NEXT:    Flags [ (0x0)
+# SWIFT-SYMBOLS-NEXT:    ]
+# SWIFT-SYMBOLS-NEXT:    Value: 0x100001168
+# SWIFT-SYMBOLS-NEXT:  }
+# SWIFT-SYMBOLS-NEXT:  Symbol {
+# SWIFT-SYMBOLS-NEXT:    Name: _main (51)
+# SWIFT-SYMBOLS-NEXT:    Extern
+# SWIFT-SYMBOLS-NEXT:    Type: Section (0xE)
+# SWIFT-SYMBOLS-NEXT:    Section: __text (0x1)
+# SWIFT-SYMBOLS-NEXT:    RefType: UndefinedNonLazy (0x0)
+# SWIFT-SYMBOLS-NEXT:    Flags [ (0x0)
+# SWIFT-SYMBOLS-NEXT:    ]
+# SWIFT-SYMBOLS-NEXT:    Value: 0x100000B70
+# SWIFT-SYMBOLS-NEXT:  }
+# SWIFT-SYMBOLS-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x80000003
+  filetype:        0x00000002
+  ncmds:           4
+  sizeofcmds:      400
+  flags:           [[FLAGS]]
+  reserved:        0x00000000
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x0000000100000B70
+        size:            845
+        offset:          0x00000B70
+        align:           4
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         [[SEGMENT_NAME]]
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        [[SECTION_NAME]]
+        segname:         [[SEGMENT_NAME]]
+        addr:            0x0000000100001090
+        size:            [[SECTION_SIZE]]
+        offset:          0x00001090
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+        content:         "[[SECTION_CONTENT]]"
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294975488
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        188
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          8192
+    nsyms:           5
+    stroff:          8272
+    strsize:         108
+LinkEditData:
+  NameList:
+    - n_strx:          50
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971760
+    - n_strx:          1
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971768
+    - n_strx:          74
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971744
+    - n_strx:          25
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971752
+    - n_strx:          99
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294970224
+  StringTable:
+    - ''
+    - '_$s1a12LocalSymbol2Sivp'
+    - '_$s1a13PublicSymbol2Sivp'
+    - '_$S1a12LocalSymbol1Sivp'
+    - '_$S1a13PublicSymbol1Sivp'
+    - _main
+    - ''
+    - ''
+    - ''
+...
diff --git a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
index 2e363f26eaccc..43ec2b1fa82f2 100644
--- a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp
@@ -251,7 +251,8 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) {
       !Config.SymbolsToKeepGlobal.empty() || !Config.SectionsToRename.empty() ||
       !Config.SetSectionAlignment.empty() || Config.ExtractDWO ||
       Config.LocalizeHidden || Config.PreserveDates || Config.StripDWO ||
-      Config.StripNonAlloc || Config.StripSections || Config.Weaken ||
+      Config.StripNonAlloc || Config.StripSections ||
+      Config.StripSwiftSymbols || Config.Weaken ||
       Config.DecompressDebugSections ||
       Config.DiscardMode == DiscardType::Locals ||
       !Config.SymbolsToAdd.empty() || Config.EntryExpr) {
diff --git a/llvm/tools/llvm-objcopy/CopyConfig.cpp b/llvm/tools/llvm-objcopy/CopyConfig.cpp
index ff12e4bd89f3e..1e151f01e01e2 100644
--- a/llvm/tools/llvm-objcopy/CopyConfig.cpp
+++ b/llvm/tools/llvm-objcopy/CopyConfig.cpp
@@ -912,6 +912,7 @@ parseStripOptions(ArrayRef<const char *> ArgsArr,
   if (auto Arg = InputArgs.getLastArg(STRIP_strip_all, STRIP_no_strip_all))
     Config.StripAll = Arg->getOption().getID() == STRIP_strip_all;
   Config.StripAllGNU = InputArgs.hasArg(STRIP_strip_all_gnu);
+  Config.StripSwiftSymbols = InputArgs.hasArg(STRIP_strip_swift_symbols);
   Config.OnlyKeepDebug = InputArgs.hasArg(STRIP_only_keep_debug);
   Config.KeepFileSymbols = InputArgs.hasArg(STRIP_keep_file_symbols);
 
diff --git a/llvm/tools/llvm-objcopy/CopyConfig.h b/llvm/tools/llvm-objcopy/CopyConfig.h
index be1dca46b9682..acf783c7f2789 100644
--- a/llvm/tools/llvm-objcopy/CopyConfig.h
+++ b/llvm/tools/llvm-objcopy/CopyConfig.h
@@ -219,6 +219,7 @@ struct CopyConfig {
   bool StripDebug = false;
   bool StripNonAlloc = false;
   bool StripSections = false;
+  bool StripSwiftSymbols = false;
   bool StripUnneeded = false;
   bool Weaken = false;
   bool DecompressDebugSections = false;
diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index f7332b7f66fe0..8e14c887170d9 100644
--- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -604,7 +604,9 @@ static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) {
 // system. The only priority is that keeps/copies overrule removes.
 static Error handleArgs(const CopyConfig &Config, Object &Obj,
                         const Reader &Reader, ElfType OutputElfType) {
-
+  if (Config.StripSwiftSymbols)
+    return createStringError(llvm::errc::invalid_argument,
+                             "option not supported by llvm-objcopy for ELF");
   if (!Config.SplitDWO.empty())
     if (Error E =
             splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType))
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp
index be44fdbe45f95..ae8889af8c42f 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp
@@ -65,13 +65,17 @@ static void updateAndRemoveSymbols(const CopyConfig &Config, Object &Obj) {
       Sym.Name = std::string(I->getValue());
   }
 
-  auto RemovePred = [Config](const std::unique_ptr<SymbolEntry> &N) {
+  auto RemovePred = [Config, &Obj](const std::unique_ptr<SymbolEntry> &N) {
     if (N->Referenced)
       return false;
     if (Config.StripAll)
       return true;
     if (Config.DiscardMode == DiscardType::All && !(N->n_type & MachO::N_EXT))
       return true;
+    // This behavior is consistent with cctools' strip.
+    if (Config.StripSwiftSymbols && (Obj.Header.Flags & MachO::MH_DYLDLINK) &&
+        Obj.SwiftVersion && *Obj.SwiftVersion && N->isSwiftSymbol())
+      return true;
     return false;
   };
 
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp b/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp
index cf32f00f36153..39a8893c1eb1b 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp
+++ b/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp
@@ -283,6 +283,28 @@ void MachOReader::readIndirectSymbolTable(Object &O) const {
   }
 }
 
+void MachOReader::readSwiftVersion(Object &O) const {
+  struct ObjCImageInfo {
+    uint32_t Version;
+    uint32_t Flags;
+  } ImageInfo;
+
+  for (const LoadCommand &LC : O.LoadCommands)
+    for (const std::unique_ptr<Section> &Sec : LC.Sections)
+      if (Sec->Sectname == "__objc_imageinfo" &&
+          (Sec->Segname == "__DATA" || Sec->Segname == "__DATA_CONST" ||
+           Sec->Segname == "__DATA_DIRTY") &&
+          Sec->Content.size() >= sizeof(ObjCImageInfo)) {
+        memcpy(&ImageInfo, Sec->Content.data(), sizeof(ObjCImageInfo));
+        if (MachOObj.isLittleEndian() != sys::IsLittleEndianHost) {
+          sys::swapByteOrder(ImageInfo.Version);
+          sys::swapByteOrder(ImageInfo.Flags);
+        }
+        O.SwiftVersion = (ImageInfo.Flags >> 8) & 0xff;
+        return;
+      }
+}
+
 std::unique_ptr<Object> MachOReader::create() const {
   auto Obj = std::make_unique<Object>();
   readHeader(*Obj);
@@ -297,6 +319,7 @@ std::unique_ptr<Object> MachOReader::create() const {
   readDataInCodeData(*Obj);
   readFunctionStartsData(*Obj);
   readIndirectSymbolTable(*Obj);
+  readSwiftVersion(*Obj);
   return Obj;
 }
 
diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.h b/llvm/tools/llvm-objcopy/MachO/MachOReader.h
index 00c8f0d55f61f..a369907147d6f 100644
--- a/llvm/tools/llvm-objcopy/MachO/MachOReader.h
+++ b/llvm/tools/llvm-objcopy/MachO/MachOReader.h
@@ -39,6 +39,7 @@ class MachOReader : public Reader {
   void readDataInCodeData(Object &O) const;
   void readFunctionStartsData(Object &O) const;
   void readIndirectSymbolTable(Object &O) const;
+  void readSwiftVersion(Object &O) const;
 
 public:
   explicit MachOReader(const object::MachOObjectFile &Obj) : MachOObj(Obj) {}
diff --git a/llvm/tools/llvm-objcopy/MachO/Object.h b/llvm/tools/llvm-objcopy/MachO/Object.h
index b0123732f80ae..b9ecd1e7818f2 100644
--- a/llvm/tools/llvm-objcopy/MachO/Object.h
+++ b/llvm/tools/llvm-objcopy/MachO/Object.h
@@ -115,6 +115,11 @@ struct SymbolEntry {
     return (n_type & MachO::N_TYPE) == MachO::N_UNDF;
   }
 
+  bool isSwiftSymbol() const {
+    return StringRef(Name).startswith("_$s") ||
+           StringRef(Name).startswith("_$S");
+  }
+
   Optional<uint32_t> section() const {
     return n_sect == MachO::NO_SECT ? None : Optional<uint32_t>(n_sect);
   }
@@ -298,6 +303,8 @@ struct Object {
   LinkData DataInCode;
   LinkData FunctionStarts;
 
+  Optional<uint32_t> SwiftVersion;
+
   /// The index of LC_SYMTAB load command if present.
   Optional<size_t> SymTabCommandIndex;
   /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present.
diff --git a/llvm/tools/llvm-objcopy/StripOpts.td b/llvm/tools/llvm-objcopy/StripOpts.td
index cd02cffae6732..001da23528d78 100644
--- a/llvm/tools/llvm-objcopy/StripOpts.td
+++ b/llvm/tools/llvm-objcopy/StripOpts.td
@@ -15,3 +15,6 @@ def d : Flag<["-"], "d">,
 def S : Flag<["-"], "S">,
         Alias<strip_debug>,
         HelpText<"Alias for --strip-debug">;
+
+def strip_swift_symbols : Flag<["-"], "T">,
+                          HelpText<"Remove Swift symbols">;

From cf42b704391c44e84485dd2547ae006196998266 Mon Sep 17 00:00:00 2001
From: Sean Silva <silvasean@google.com>
Date: Thu, 21 May 2020 11:04:43 -0700
Subject: [PATCH 162/770] [mlir][shape] Add `shape.get_extent`.

Summary:
This op extracts an extent from a shape.

This also is the first op which constant folds to shape.const_size,
which revealed that shape.const_size needs a folder (ConstantLike ops
seem to always need folders for the constant folding infra to work).

Differential Revision: https://reviews.llvm.org/D80394
---
 .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 32 ++++++++++++++++++
 mlir/lib/Dialect/Shape/IR/Shape.cpp           | 33 +++++++++++++++++++
 mlir/test/Dialect/Shape/canonicalize.mlir     | 30 +++++++++++++++++
 3 files changed, 95 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 7d62cebff8e66..0278d7bbeb065 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -131,6 +131,7 @@ def Shape_ConstSizeOp : Shape_Op<"const_size",
   let results = (outs Shape_SizeType:$result);
 
   let assemblyFormat = "attr-dict $value";
+  let hasFolder = 1;
 }
 
 def Shape_FromExtentsOp : Shape_Op<"from_extents", [
@@ -190,6 +191,37 @@ def Shape_ToExtentTensorOp : Shape_Op<"to_extent_tensor", []> {
   let hasFolder = 1;
 }
 
+def Shape_GetExtentOp : Shape_Op<"get_extent",
+    [NoSideEffect, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let summary = "Gets the specified extent from a shape";
+  let description = [{
+    Gets the extent indexed by `dim` from `shape`.
+
+    If the shape is an error, it returns an error size.
+  }];
+  let arguments = (ins
+    Shape_ShapeType:$shape,
+    Confined<I64Attr, [IntNonNegative]>:$dim
+  );
+  let results = (outs Shape_SizeType:$extent);
+  let assemblyFormat = "$shape `,` $dim attr-dict";
+
+  let builders = [
+    // Builder that allows passing a simple integer instead of an IntegerAttr.
+    OpBuilder<
+      [{
+        OpBuilder &builder, OperationState &result,
+        Value shape, int64_t dim
+      }],
+      [{
+        build(builder, result, shape, builder.getI64IntegerAttr(dim));
+      }]
+    >
+  ];
+
+  let hasFolder = 1;
+}
+
 def Shape_JoinOp : Shape_Op<"join", []> {
   let summary = "Returns the least general shape.size of its operands";
   let description = [{
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index 5c6a0c2204c3c..095c41720fbae 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -245,6 +245,8 @@ ConstSizeOp::inferReturnTypes(MLIRContext *context, Optional<Location> location,
   return success();
 }
 
+OpFoldResult ConstSizeOp::fold(ArrayRef<Attribute>) { return valueAttr(); }
+
 //===----------------------------------------------------------------------===//
 // FromExtentsOp
 //===----------------------------------------------------------------------===//
@@ -267,6 +269,37 @@ OpFoldResult FromExtentsOp::fold(ArrayRef<Attribute> operands) {
   return builder.getI64TensorAttr(extents);
 }
 
+//===----------------------------------------------------------------------===//
+// GetExtentOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+GetExtentOp::inferReturnTypes(MLIRContext *context, Optional<Location> location,
+                              ValueRange operands, DictionaryAttr attributes,
+                              RegionRange regions,
+                              SmallVectorImpl<Type> &inferredReturnTypes) {
+  inferredReturnTypes.push_back(SizeType::get(context));
+  return success();
+}
+
+OpFoldResult GetExtentOp::fold(ArrayRef<Attribute> operands) {
+  auto elements = operands[0].dyn_cast_or_null<DenseIntElementsAttr>();
+  if (!elements)
+    return nullptr;
+  uint64_t dimToGet = dim().getLimitedValue();
+  // TODO: Constant fold this to some kind of constant error.
+  if (dimToGet >= (uint64_t)elements.getNumElements())
+    return nullptr;
+  // This is a little inconvenient because getValue returns an IntegerAttr
+  // that is not of IndexType, but the result here needs to be of
+  // IndexType.
+  // TODO: Make ConstShapeOp hold a tensor of index instead of i64.
+  Builder builder(getContext());
+  return builder.getIntegerAttr(
+      builder.getIndexType(),
+      elements.getValue<IntegerAttr>({dimToGet}).getInt());
+}
+
 //===----------------------------------------------------------------------===//
 // ShapeOfOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir
index 2e35fc748d86b..018f5b212b4e4 100644
--- a/mlir/test/Dialect/Shape/canonicalize.mlir
+++ b/mlir/test/Dialect/Shape/canonicalize.mlir
@@ -106,3 +106,33 @@ func @no_fold(%arg0: index) -> !shape.shape {
   %ret = shape.from_extents %e0, %arg0
   return %ret : !shape.shape
 }
+
+// -----
+// Canonicalization of shape.get_extent
+
+// Basic folding.
+// CHECK-LABEL: func @basic
+func @basic() -> !shape.size {
+  // CHECK: shape.const_size 2
+  %0 = shape.const_shape [0, 1, 2]
+  %1 = shape.get_extent %0, 2
+  return %1 : !shape.size
+}
+
+// Should not fold.
+// CHECK-LABEL: func @out_of_bounds
+func @out_of_bounds() -> !shape.size {
+  // CHECK: shape.const_shape
+  // CHECK: shape.get_extent
+  %0 = shape.const_shape [0, 1, 2]
+  %1 = shape.get_extent %0, 3
+  return %1 : !shape.size
+}
+
+// Should not fold.
+// CHECK-LABEL: func @not_const
+func @not_const(%arg0: !shape.shape) -> !shape.size {
+  // CHECK: shape.get_extent
+  %0 = shape.get_extent %arg0, 3
+  return %0 : !shape.size
+}

From e724db03752a0cd06a86153fea0d95e377f999c0 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 17:00:48 -0700
Subject: [PATCH 163/770] [lldb/Test] Modify TestSymbolTable.py for reproducers

Work around global module caching during reproducer replay. See inline
comment for the details.
---
 .../API/lang/objc/foundation/TestSymbolTable.py     | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
index df4860f148260..02c551b83876f 100644
--- a/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
+++ b/lldb/test/API/lang/objc/foundation/TestSymbolTable.py
@@ -39,7 +39,18 @@ def test_with_python_api(self):
         self.assertTrue(process, PROCESS_IS_VALID)
 
         # Create the filespec by which to locate our a.out module.
-        filespec = lldb.SBFileSpec(exe, False)
+        #
+        #  - Use the absolute path to get the module for the current variant.
+        #  - Use the relative path for reproducers. The modules are never
+        #    orphaned because the SB objects are leaked intentionally. This
+        #    causes LLDB to reuse the same module for every variant, because the
+        #    UUID is the same for all the inferiors. FindModule below only
+        #    compares paths and is oblivious to the fact that the UUIDs are the
+        #    same.
+        if configuration.is_reproducer():
+            filespec = lldb.SBFileSpec('a.out', False)
+        else:
+            filespec = lldb.SBFileSpec(exe, False)
 
         module = target.FindModule(filespec)
         self.assertTrue(module, VALID_MODULE)

From 1079978b3c506abca2b4dd9a5b131c024330206b Mon Sep 17 00:00:00 2001
From: Alex Langford <apl@fb.com>
Date: Mon, 11 May 2020 16:24:42 -0700
Subject: [PATCH 164/770] [lldb][Core] Remove dead codepath in Mangled

Summary:
Objective-C names are stored in m_demangled, not in m_mangled. The
method in the condition will never return true.

Differential Revision: https://reviews.llvm.org/D79823
---
 lldb/source/Core/Mangled.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp
index 56914ae117ddb..143ec8770bf47 100644
--- a/lldb/source/Core/Mangled.cpp
+++ b/lldb/source/Core/Mangled.cpp
@@ -413,8 +413,6 @@ lldb::LanguageType Mangled::GuessLanguage() const {
     const char *mangled_name = mangled.GetCString();
     if (CPlusPlusLanguage::IsCPPMangledName(mangled_name))
       return lldb::eLanguageTypeC_plus_plus;
-    else if (ObjCLanguage::IsPossibleObjCMethodName(mangled_name))
-      return lldb::eLanguageTypeObjC;
   } else {
     // ObjC names aren't really mangled, so they won't necessarily be in the
     // mangled name slot.

From b90eb0f23b5bf3db4a091748b3ea6de9a45645c9 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Tue, 26 May 2020 17:27:46 -0700
Subject: [PATCH 165/770] Autogen a couple of test files to make a future diff
 easier to read

---
 .../base-pointers-4.ll                        | 39 +++++++--
 .../RewriteStatepointsForGC/basic.ll          | 84 +++++++++++++------
 .../deopt-lowering-attrs.ll                   | 15 +++-
 3 files changed, 103 insertions(+), 35 deletions(-)

diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll b/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll
index b9f67c1a37400..7fe70b22eb100 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
 ; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
 
@@ -10,17 +11,43 @@ declare i64 addrspace(1)* @generate_obj()
 declare void @consume_obj(i64 addrspace(1)*)
 
 define void @test(i32 %condition) gc "statepoint-example" {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i64 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* ()* @generate_obj, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 addrspace(1)* @llvm.experimental.gc.result.p1i64(token [[STATEPOINT_TOKEN]])
+; CHECK-NEXT:    switch i32 [[CONDITION:%.*]], label [[DEST_A:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEST_B:%.*]]
+; CHECK-NEXT:    i32 1, label [[DEST_C:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       dest_a:
+; CHECK-NEXT:    br label [[MERGE:%.*]]
+; CHECK:       dest_b:
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       dest_c:
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       merge:
+; CHECK-NEXT:    [[OBJ_TO_CONSUME_BASE:%.*]] = phi i64 addrspace(1)* [ [[TMP0]], [[DEST_A]] ], [ null, [[DEST_B]] ], [ null, [[DEST_C]] ], !is_base_value !0
+; CHECK-NEXT:    [[OBJ_TO_CONSUME:%.*]] = phi i64 addrspace(1)* [ [[TMP0]], [[DEST_A]] ], [ null, [[DEST_B]] ], [ null, [[DEST_C]] ]
+; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 2882400000, i32 0, void (i64 addrspace(1)*)* @consume_obj, i32 1, i32 0, i64 addrspace(1)* [[OBJ_TO_CONSUME]], i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* [[OBJ_TO_CONSUME_BASE]], i64 addrspace(1)* [[OBJ_TO_CONSUME]])
+; CHECK-NEXT:    [[OBJ_TO_CONSUME_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 13, i32 13)
+; CHECK-NEXT:    [[OBJ_TO_CONSUME_BASE_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[OBJ_TO_CONSUME_BASE_RELOCATED]] to i64 addrspace(1)*
+; CHECK-NEXT:    [[OBJ_TO_CONSUME_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 13, i32 14)
+; CHECK-NEXT:    [[OBJ_TO_CONSUME_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[OBJ_TO_CONSUME_RELOCATED]] to i64 addrspace(1)*
+; CHECK-NEXT:    br label [[MERGE_SPLIT:%.*]]
+; CHECK:       merge.split:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    br label [[LOOP]]
+;
 entry:
   br label %loop
 
 loop:                                             ; preds = %merge.split, %entry
-; CHECK: loop:
-; CHECK:  [[TOKEN_0:%[^ ]+]] = call token (i64, i32, i64 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* ()* @generate_obj, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i3
-; CHECK-NEXT:  [[RESULT_0:%[^ ]+]] = call i64 addrspace(1)* @llvm.experimental.gc.result
   %0 = call i64 addrspace(1)* @generate_obj() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
   switch i32 %condition, label %dest_a [
-    i32 0, label %dest_b
-    i32 1, label %dest_c
+  i32 0, label %dest_b
+  i32 1, label %dest_c
   ]
 
 dest_a:                                           ; preds = %loop
@@ -33,8 +60,6 @@ dest_c:                                           ; preds = %loop
   br label %merge
 
 merge:                                            ; preds = %dest_c, %dest_b, %dest_a
-; CHECK: merge:
-; CHECK:  %obj_to_consume = phi i64 addrspace(1)* [ [[RESULT_0]], %dest_a ], [ null, %dest_b ], [ null, %dest_c ]
   %obj_to_consume = phi i64 addrspace(1)* [ %0, %dest_a ], [ null, %dest_b ], [ null, %dest_c ]
   call void @consume_obj(i64 addrspace(1)* %obj_to_consume) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
   br label %merge.split
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll b/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll
index c1c160b14274b..8e052a61a4dc3 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
 ; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
 
@@ -6,69 +7,104 @@ declare i32 @h()
 
 define i32 addrspace(1)* @f0(i32 addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-LABEL: @f0(
- entry:
-; CHECK: [[TOKEN_0:%[^ ]+]] = call token {{[^@]*}} @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
+;
+  entry:
   call void @g() [ "deopt"(i32 100) ]
 
-; CHECK: %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_0]], i32 8, i32 8)
   ret i32 addrspace(1)* %arg
 }
 
 define i32 addrspace(1)* @f1(i32 addrspace(1)* %arg) gc "statepoint-example"  personality i32 8  {
 ; CHECK-LABEL: @f1(
- entry:
-; CHECK: [[TOKEN_1:%[^ ]+]] = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    to label [[NORMAL_DEST:%.*]] unwind label [[UNWIND_DEST:%.*]]
+; CHECK:       normal_dest:
+; CHECK-NEXT:    [[ARG_RELOCATED1:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED1_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED1]] to i32 addrspace(1)*
+; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED1_CASTED]]
+; CHECK:       unwind_dest:
+; CHECK-NEXT:    [[LPAD:%.*]] = landingpad token
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[LPAD]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    resume token undef
+;
+  entry:
   invoke void @g() [ "deopt"(i32 100) ] to label %normal_dest unwind label %unwind_dest
 
- normal_dest:
-; CHECK: %arg.relocated1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_1]], i32 8, i32 8)
+  normal_dest:
   ret i32 addrspace(1)* %arg
 
- unwind_dest: 
+  unwind_dest:
   %lpad = landingpad token cleanup
   resume token undef
 }
 
 define i32 addrspace(1)* @f2(i32 addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-LABEL: @f2(
- entry:
-; CHECK: [[TOKEN_2:%[^ ]+]] = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @llvm.experimental.gc.result.i32(token [[STATEPOINT_TOKEN]])
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    store i32 [[VAL1]], i32 addrspace(1)* [[ARG_RELOCATED_CASTED]], align 4
+; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
+;
+  entry:
   %val = call i32 @h() [ "deopt"(i32 100) ]
 
-; CHECK: [[RESULT_F2:%[^ ]+]] = call i32 @llvm.experimental.gc.result.i32(token [[TOKEN_2]])
-; CHECK: %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_2]], i32 8, i32 8)
-; CHECK: %arg.relocated.casted = bitcast i8 addrspace(1)* %arg.relocated to i32 addrspace(1)*
 
   store i32 %val, i32 addrspace(1)* %arg
-; CHECK: store i32 [[RESULT_F2]], i32 addrspace(1)* %arg.relocated.casted
   ret i32 addrspace(1)* %arg
 }
 
 define i32 addrspace(1)* @f3(i32 addrspace(1)* %arg) gc "statepoint-example"  personality i32 8  {
 ; CHECK-LABEL: @f3(
- entry:
-; CHECK: [[TOKEN_3:%[^ ]+]] = invoke token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = invoke token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    to label [[NORMAL_DEST:%.*]] unwind label [[UNWIND_DEST:%.*]]
+; CHECK:       normal_dest:
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @llvm.experimental.gc.result.i32(token [[STATEPOINT_TOKEN]])
+; CHECK-NEXT:    [[ARG_RELOCATED2:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED2_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED2]] to i32 addrspace(1)*
+; CHECK-NEXT:    store i32 [[VAL1]], i32 addrspace(1)* [[ARG_RELOCATED2_CASTED]], align 4
+; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED2_CASTED]]
+; CHECK:       unwind_dest:
+; CHECK-NEXT:    [[LPAD:%.*]] = landingpad token
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[LPAD]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    resume token undef
+;
+  entry:
   %val = invoke i32 @h() [ "deopt"(i32 100) ] to label %normal_dest unwind label %unwind_dest
 
- normal_dest:
-; CHECK: [[RESULT_F3:%[^ ]+]] = call i32 @llvm.experimental.gc.result.i32(token [[TOKEN_3]])
-; CHECK: [[ARG_RELOCATED:%[^ ]+]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_3]], i32 8, i32 8)
-; CHECK: [[ARG_RELOCATED_CASTED:%[^ ]+]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
+  normal_dest:
 
   store i32 %val, i32 addrspace(1)* %arg
 
-; CHECK: store i32 [[RESULT_F3]], i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
   ret i32 addrspace(1)* %arg
 
- unwind_dest: 
+  unwind_dest:
   %lpad = landingpad token cleanup
   resume token undef
 }
 
 define i32 addrspace(1)* @f4(i32 addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-LABEL: @f4(
- entry:
-; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 1, i32 2, i32 400, i8 90,
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 1, i32 2, i32 400, i8 90, i32 0, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 9, i32 9)
+; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
+;
+  entry:
   call void @g() [ "gc-transition"(i32 400, i8 90) ]
   ret i32 addrspace(1)* %arg
 }
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll b/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
index d0a331905088f..65e38d9d37587 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
 ; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
 ; Check that the "deopt-lowering" function attribute gets transcoded into
@@ -12,9 +13,12 @@ declare void @baz() "deopt-lowering"="live-through"
 
 define void @test1() gc "statepoint-example" {
 ; CHECK-LABEL: @test1(
-; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 1, i32 57)
-; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 42)
-; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 13)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 1, i32 57)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 42)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 13)
+; CHECK-NEXT:    ret void
+;
 
 entry:
   call void @foo() [ "deopt"(i32 57) ]
@@ -26,7 +30,10 @@ entry:
 ; add deopt-lowering attribute as part of callsite
 define void @test2() gc "statepoint-example" {
 ; CHECK-LABEL: @test2(
-; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 2, i32 0, i32 1, i32 57)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 2, i32 0, i32 1, i32 57) #0
+; CHECK-NEXT:    ret void
+;
 
 entry:
   call void @foo()  "deopt-lowering"="live-in"  [ "deopt"(i32 57) ]

From bed6624ac43bc223114d0b9380d593f2dfd749ff Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Tue, 26 May 2020 17:33:07 -0700
Subject: [PATCH 166/770] Split a test file so that most of it can be autogened

---
 .../scalar-base-vector-2.ll                   | 76 +++++++++++++++++++
 .../scalar-base-vector.ll                     | 74 +-----------------
 2 files changed, 77 insertions(+), 73 deletions(-)
 create mode 100644 llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll

diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll
new file mode 100644
index 0000000000000..1cfda09b2c1b0
--- /dev/null
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll
@@ -0,0 +1,76 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck  %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck  %s
+
+; Assertions are almost autogenerated except for last testcase widget, which was
+; updated (with -DAG instead of -NEXT) to fix buildbot failure reproducible only on two boxes.
+
+; Uses of extractelement that are of scalar type should not have the BDV
+; incorrectly identified as a vector type.
+define void @widget() gc "statepoint-example" {
+; CHECK-LABEL: @widget(
+; CHECK-NEXT:  bb6:
+; CHECK-NEXT:    [[BASE_EE:%.*]] = extractelement <2 x i8 addrspace(1)*> zeroinitializer, i32 1, !is_base_value !0
+; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i8 addrspace(1)*> undef, i32 1
+; CHECK-NEXT:    br i1 undef, label [[BB7:%.*]], label [[BB9:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP]], i64 12
+; CHECK-NEXT:    br label [[BB11:%.*]]
+; CHECK:       bb9:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP]], i64 12
+; CHECK-NEXT:    br i1 undef, label [[BB11]], label [[BB15:%.*]]
+; CHECK:       bb11:
+; CHECK-NEXT:    [[TMP12_BASE:%.*]] = phi i8 addrspace(1)* [ [[BASE_EE]], [[BB7]] ], [ [[BASE_EE]], [[BB9]] ], !is_base_value !0
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i8 addrspace(1)* [ [[TMP8]], [[BB7]] ], [ [[TMP10]], [[BB9]] ]
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 1, i32 undef, i8 addrspace(1)* [[TMP12_BASE]], i8 addrspace(1)* [[TMP12]])
+; CHECK-NEXT:    [[TMP12_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[TMP12_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 9)
+; CHECK-NEXT:    br label [[BB15]]
+; CHECK:       bb15:
+; CHECK-NEXT:    [[TMP16_BASE:%.*]] = phi i8 addrspace(1)* [ [[BASE_EE]], [[BB9]] ], [ [[TMP12_BASE_RELOCATED]], [[BB11]] ], !is_base_value !0
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i8 addrspace(1)* [ [[TMP10]], [[BB9]] ], [ [[TMP12_RELOCATED]], [[BB11]] ]
+; CHECK-NEXT:    br i1 undef, label [[BB17:%.*]], label [[BB20:%.*]]
+; CHECK:       bb17:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 1, i32 undef, i8 addrspace(1)* [[TMP16_BASE]], i8 addrspace(1)* [[TMP16]])
+; CHECK-NEXT:    [[TMP16_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 8)
+; CHECK-NEXT:    [[TMP16_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 9)
+; CHECK-NEXT:    br label [[BB20]]
+; CHECK:       bb20:
+; CHECK-DAG:    [[DOT05:%.*]] = phi i8 addrspace(1)* [ [[TMP16_BASE_RELOCATED]], [[BB17]] ], [ [[TMP16_BASE]], [[BB15]] ]
+; CHECK-DAG:    [[DOT0:%.*]] = phi i8 addrspace(1)* [ [[TMP16_RELOCATED]], [[BB17]] ], [ [[TMP16]], [[BB15]] ]
+; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void (i8 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i8f(i64 2882400000, i32 0, void (i8 addrspace(1)*)* @foo, i32 1, i32 0, i8 addrspace(1)* [[DOT0]], i32 0, i32 0, i8 addrspace(1)* [[DOT05]], i8 addrspace(1)* [[DOT0]])
+; CHECK-NEXT:    [[TMP16_BASE_RELOCATED3:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 8, i32 8)
+; CHECK-NEXT:    [[TMP16_RELOCATED4:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 8, i32 9)
+; CHECK-NEXT:    ret void
+;
+bb6:                                              ; preds = %bb3
+  %tmp = extractelement <2 x i8 addrspace(1)*> undef, i32 1
+  br i1 undef, label %bb7, label %bb9
+
+bb7:                                              ; preds = %bb6
+  %tmp8 = getelementptr inbounds i8, i8 addrspace(1)* %tmp, i64 12
+  br label %bb11
+
+bb9:                                              ; preds = %bb6, %bb6
+  %tmp10 = getelementptr inbounds i8, i8 addrspace(1)* %tmp, i64 12
+  br i1 undef, label %bb11, label %bb15
+
+bb11:                                             ; preds = %bb9, %bb7
+  %tmp12 = phi i8 addrspace(1)* [ %tmp8, %bb7 ], [ %tmp10, %bb9 ]
+  call void @snork() [ "deopt"(i32 undef) ]
+  br label %bb15
+
+bb15:                                             ; preds = %bb11, %bb9, %bb9
+  %tmp16 = phi i8 addrspace(1)* [ %tmp10, %bb9 ], [ %tmp12, %bb11 ]
+  br i1 undef, label %bb17, label %bb20
+
+bb17:                                             ; preds = %bb15
+  call void @snork() [ "deopt"(i32 undef) ]
+  br label %bb20
+
+bb20:                                             ; preds = %bb17, %bb15, %bb15
+  call void @foo(i8 addrspace(1)* %tmp16)
+  ret void
+}
+
+declare void @snork()
+declare void @foo(i8 addrspace(1)*)
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll
index e5e765be2b846..34af81cd7337e 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll
@@ -1,9 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck  %s
 ; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck  %s
 
-; Assertions are almost autogenerated except for last testcase widget, which was
-; updated (with -DAG instead of -NEXT) to fix buildbot failure reproducible only on two boxes.
-
 declare void @do_safepoint()
 declare i8 addrspace(1)* @def_ptr()
 
@@ -194,75 +192,5 @@ latch:                                              ; preds = %bb25, %bb7
   br label %header
 }
 
-; Uses of extractelement that are of scalar type should not have the BDV
-; incorrectly identified as a vector type.
-define void @widget() gc "statepoint-example" {
-; CHECK-LABEL: @widget(
-; CHECK-NEXT:  bb6:
-; CHECK-NEXT:    [[BASE_EE:%.*]] = extractelement <2 x i8 addrspace(1)*> zeroinitializer, i32 1, !is_base_value !0
-; CHECK-NEXT:    [[TMP:%.*]] = extractelement <2 x i8 addrspace(1)*> undef, i32 1
-; CHECK-NEXT:    br i1 undef, label [[BB7:%.*]], label [[BB9:%.*]]
-; CHECK:       bb7:
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP]], i64 12
-; CHECK-NEXT:    br label [[BB11:%.*]]
-; CHECK:       bb9:
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP]], i64 12
-; CHECK-NEXT:    br i1 undef, label [[BB11]], label [[BB15:%.*]]
-; CHECK:       bb11:
-; CHECK-NEXT:    [[TMP12_BASE:%.*]] = phi i8 addrspace(1)* [ [[BASE_EE]], [[BB7]] ], [ [[BASE_EE]], [[BB9]] ], !is_base_value !0
-; CHECK-NEXT:    [[TMP12:%.*]] = phi i8 addrspace(1)* [ [[TMP8]], [[BB7]] ], [ [[TMP10]], [[BB9]] ]
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 1, i32 undef, i8 addrspace(1)* [[TMP12_BASE]], i8 addrspace(1)* [[TMP12]])
-; CHECK-NEXT:    [[TMP12_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
-; CHECK-NEXT:    [[TMP12_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 9)
-; CHECK-NEXT:    br label [[BB15]]
-; CHECK:       bb15:
-; CHECK-NEXT:    [[TMP16_BASE:%.*]] = phi i8 addrspace(1)* [ [[BASE_EE]], [[BB9]] ], [ [[TMP12_BASE_RELOCATED]], [[BB11]] ], !is_base_value !0
-; CHECK-NEXT:    [[TMP16:%.*]] = phi i8 addrspace(1)* [ [[TMP10]], [[BB9]] ], [ [[TMP12_RELOCATED]], [[BB11]] ]
-; CHECK-NEXT:    br i1 undef, label [[BB17:%.*]], label [[BB20:%.*]]
-; CHECK:       bb17:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 1, i32 undef, i8 addrspace(1)* [[TMP16_BASE]], i8 addrspace(1)* [[TMP16]])
-; CHECK-NEXT:    [[TMP16_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 8)
-; CHECK-NEXT:    [[TMP16_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 9)
-; CHECK-NEXT:    br label [[BB20]]
-; CHECK:       bb20:
-; CHECK-DAG:    [[DOT05:%.*]] = phi i8 addrspace(1)* [ [[TMP16_BASE_RELOCATED]], [[BB17]] ], [ [[TMP16_BASE]], [[BB15]] ]
-; CHECK-DAG:    [[DOT0:%.*]] = phi i8 addrspace(1)* [ [[TMP16_RELOCATED]], [[BB17]] ], [ [[TMP16]], [[BB15]] ]
-; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void (i8 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i8f(i64 2882400000, i32 0, void (i8 addrspace(1)*)* @foo, i32 1, i32 0, i8 addrspace(1)* [[DOT0]], i32 0, i32 0, i8 addrspace(1)* [[DOT05]], i8 addrspace(1)* [[DOT0]])
-; CHECK-NEXT:    [[TMP16_BASE_RELOCATED3:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 8, i32 8)
-; CHECK-NEXT:    [[TMP16_RELOCATED4:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 8, i32 9)
-; CHECK-NEXT:    ret void
-;
-bb6:                                              ; preds = %bb3
-  %tmp = extractelement <2 x i8 addrspace(1)*> undef, i32 1
-  br i1 undef, label %bb7, label %bb9
-
-bb7:                                              ; preds = %bb6
-  %tmp8 = getelementptr inbounds i8, i8 addrspace(1)* %tmp, i64 12
-  br label %bb11
-
-bb9:                                              ; preds = %bb6, %bb6
-  %tmp10 = getelementptr inbounds i8, i8 addrspace(1)* %tmp, i64 12
-  br i1 undef, label %bb11, label %bb15
-
-bb11:                                             ; preds = %bb9, %bb7
-  %tmp12 = phi i8 addrspace(1)* [ %tmp8, %bb7 ], [ %tmp10, %bb9 ]
-  call void @snork() [ "deopt"(i32 undef) ]
-  br label %bb15
-
-bb15:                                             ; preds = %bb11, %bb9, %bb9
-  %tmp16 = phi i8 addrspace(1)* [ %tmp10, %bb9 ], [ %tmp12, %bb11 ]
-  br i1 undef, label %bb17, label %bb20
-
-bb17:                                             ; preds = %bb15
-  call void @snork() [ "deopt"(i32 undef) ]
-  br label %bb20
-
-bb20:                                             ; preds = %bb17, %bb15, %bb15
-  call void @foo(i8 addrspace(1)* %tmp16)
-  ret void
-}
-
-declare void @snork()
-declare void @foo(i8 addrspace(1)*)
 declare void @spam()
 declare <2 x i8 addrspace(1)*> @baz()

From 40c4ecabc238cfdd639bc1e927800337457e69e3 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 26 May 2020 17:22:53 -0700
Subject: [PATCH 167/770] [lldb/Docs] Add the application specific lldbinit to
 the man page

This used to be part of the man page but got lost when we moved to
generating it with Sphinx.
---
 lldb/docs/man/lldb.rst | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lldb/docs/man/lldb.rst b/lldb/docs/man/lldb.rst
index a3a0736680ad4..842a693f47518 100644
--- a/lldb/docs/man/lldb.rst
+++ b/lldb/docs/man/lldb.rst
@@ -303,10 +303,13 @@ CONFIGURATION FILES
 -------------------
 
 :program:`lldb` reads things like settings, aliases and commands from the
-.lldbinit file. It will first look for ~/.lldbinit and load that first.
-Secondly, it will look for an .lldbinit file in the current working directory.
-For security reasons, :program:`lldb` will print a warning and not source this
-file by default. This behavior can be changed by changing the
+.lldbinit file. First, it will read the application specific init file whose
+name is ~/.lldbinit followed by a "-" and the name of the current program. This
+would be ~/.lldbinit-lldb for the command line :program:`lldb` and
+~/.lldbinit-Xcode for Xcode. Secondly, the global ~/.lldbinit will be read.
+Finally, :program:`lldb` will look for an .lldbinit file in the current working
+directory. For security reasons, :program:`lldb` will print a warning and not
+source this file by default. This behavior can be changed by changing the
 target.load-cwd-lldbinit setting.
 
 To always load the .lldbinit file in the current working directory, add the

From 323d850427472ed060fc4c495b2010e6174b875b Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Tue, 26 May 2020 17:34:54 -0700
Subject: [PATCH 168/770] Add self as code owner for SCEV and IndVars

This was discussed on llvm-dev thread "Transferring code ownership for SCEV and IndVars" a few months back.  I just forgot to make the actual change.
---
 llvm/CODE_OWNERS.TXT | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT
index 35f8ef81c1ea9..5cc5b87364c15 100644
--- a/llvm/CODE_OWNERS.TXT
+++ b/llvm/CODE_OWNERS.TXT
@@ -52,8 +52,8 @@ N: Pete Couperus
 E: petecoup@synopsys.com
 D: ARC backend (lib/Target/ARC/*)
 
-N: Sanjoy Das
-E: sanjoy@playingwithpointers.com
+N: Philip Reames
+E: listmail@philipreames.com
 D: IndVar Simplify, Scalar Evolution
 
 N: Marshall Clow

From ae597a771ed4d7530e2ef232d02a253067e3312f Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Tue, 26 May 2020 12:56:14 -0700
Subject: [PATCH 169/770] [AArch64][GlobalISel] Do not modify predicate when
 optimizing G_ICMP

This fixes a bug in `tryOptArithImmedIntegerCompare`.

It is unsafe to update the predicate on a MachineOperand when optimizing a
G_ICMP, because it may be used in more than one place.

For example, when we are optimizing G_SELECT, we allow compares which are used
in more than one G_SELECT. If we modify the G_ICMP, then we'll break one of
the G_SELECTs.

Since the compare is being produced to either

1) Select a G_ICMP
2) Fold a G_ICMP into an instruction when profitable

there's no reason to actually modify it. The change is local to the specific
compare.

Instead, pass a `CmpInst::Predicate` to `tryOptArithImmedIntegerCompare` which
can be modified by reference.

Differential Revision: https://reviews.llvm.org/D80585
---
 .../AArch64/AArch64InstructionSelector.cpp    | 33 ++++----
 .../GlobalISel/select-arith-immed-compare.mir | 78 +++++++++++++++++++
 2 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 57eaf140a6380..1b321260ed02c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -155,7 +155,9 @@ class AArch64InstructionSelector : public InstructionSelector {
 
   // Emit an integer compare between LHS and RHS, which checks for Predicate.
   //
-  // This may update Predicate when emitting the compare.
+  // This returns the produced compare instruction, and the predicate which
+  // was ultimately used in the compare. The predicate may differ from what
+  // is passed in \p Predicate due to optimization.
   std::pair<MachineInstr *, CmpInst::Predicate>
   emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                      MachineOperand &Predicate,
@@ -307,7 +309,7 @@ class AArch64InstructionSelector : public InstructionSelector {
                                       MachineIRBuilder &MIRBuilder) const;
   MachineInstr *tryOptArithImmedIntegerCompare(MachineOperand &LHS,
                                                MachineOperand &RHS,
-                                               MachineOperand &Predicate,
+                                               CmpInst::Predicate &Predicate,
                                                MachineIRBuilder &MIB) const;
   MachineInstr *tryOptArithShiftedCompare(MachineOperand &LHS,
                                           MachineOperand &RHS,
@@ -3685,13 +3687,16 @@ AArch64InstructionSelector::emitIntegerCompare(
     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
     MachineIRBuilder &MIRBuilder) const {
   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+  assert(Predicate.isPredicate() && "Expected predicate?");
   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
 
+  CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
+
   // Fold the compare if possible.
   MachineInstr *FoldCmp =
       tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder);
   if (FoldCmp)
-    return {FoldCmp, (CmpInst::Predicate)Predicate.getPredicate()};
+    return {FoldCmp, P};
 
   // Can't fold into a CMN. Just emit a normal compare.
   unsigned CmpOpc = 0;
@@ -3712,21 +3717,21 @@ AArch64InstructionSelector::emitIntegerCompare(
 
   // Try to match immediate forms.
   MachineInstr *ImmedCmp =
-      tryOptArithImmedIntegerCompare(LHS, RHS, Predicate, MIRBuilder);
+      tryOptArithImmedIntegerCompare(LHS, RHS, P, MIRBuilder);
   if (ImmedCmp)
-    return {ImmedCmp, (CmpInst::Predicate)Predicate.getPredicate()};
+    return {ImmedCmp, P};
 
   // If we don't have an immediate, we may have a shift which can be folded
   // into the compare.
   MachineInstr *ShiftedCmp = tryOptArithShiftedCompare(LHS, RHS, MIRBuilder);
   if (ShiftedCmp)
-    return {ShiftedCmp, (CmpInst::Predicate)Predicate.getPredicate()};
+    return {ShiftedCmp, P};
 
   auto CmpMI =
       MIRBuilder.buildInstr(CmpOpc, {ZReg}, {LHS.getReg(), RHS.getReg()});
   // Make sure that we can constrain the compare that we emitted.
   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
-  return {&*CmpMI, (CmpInst::Predicate)Predicate.getPredicate()};
+  return {&*CmpMI, P};
 }
 
 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
@@ -4042,7 +4047,7 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
 }
 
 MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
-    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
+    MachineOperand &LHS, MachineOperand &RHS, CmpInst::Predicate &P,
     MachineIRBuilder &MIB) const {
   // Attempt to select the immediate form of an integer compare.
   MachineRegisterInfo &MRI = *MIB.getMRI();
@@ -4051,7 +4056,6 @@ MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
   unsigned Size = Ty.getSizeInBits();
   assert((Size == 32 || Size == 64) &&
          "Expected 32 bit or 64 bit compare only?");
-  auto P = (CmpInst::Predicate)Predicate.getPredicate();
 
   // Check if this is a case we can already handle.
   InstructionSelector::ComplexRendererFns ImmFns;
@@ -4066,6 +4070,7 @@ MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
     // We have a constant, but it doesn't fit. Try adjusting it by one and
     // updating the predicate if possible.
     uint64_t C = *MaybeImmed;
+    CmpInst::Predicate NewP;
     switch (P) {
     default:
       return nullptr;
@@ -4080,7 +4085,7 @@ MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
       if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) ||
           (Size == 32 && static_cast<int32_t>(C) == INT32_MIN))
         return nullptr;
-      P = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT;
+      NewP = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT;
       C -= 1;
       break;
     case CmpInst::ICMP_ULT:
@@ -4093,7 +4098,7 @@ MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
       // When c is not zero.
       if (C == 0)
         return nullptr;
-      P = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+      NewP = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
       C -= 1;
       break;
     case CmpInst::ICMP_SLE:
@@ -4107,7 +4112,7 @@ MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
       if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) ||
           (Size == 64 && static_cast<int64_t>(C) == INT64_MAX))
         return nullptr;
-      P = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE;
+      NewP = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE;
       C += 1;
       break;
     case CmpInst::ICMP_ULE:
@@ -4121,7 +4126,7 @@ MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
       if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) ||
           (Size == 64 && C == UINT64_MAX))
         return nullptr;
-      P = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
+      NewP = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
       C += 1;
       break;
     }
@@ -4132,7 +4137,7 @@ MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
     ImmFns = select12BitValueWithLeftShift(C);
     if (!ImmFns)
       return nullptr;
-    Predicate.setPredicate(P);
+    P = NewP;
   }
 
   // At this point, we know we can select an immediate form. Go ahead and do
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-arith-immed-compare.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-arith-immed-compare.mir
index 59fcbd09c4c12..37d7ec60f553a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-arith-immed-compare.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-arith-immed-compare.mir
@@ -627,4 +627,82 @@ body:             |
     %3:gpr(s64) = G_AND %6, %5
     $x0 = COPY %3(s64)
     RET_ReallyLR implicit $x0
+
+...
+---
+name:            more_than_one_use_select
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; Both of these selects use the same compare.
+    ;
+    ; They should both be optimized in the same way, so the SUBS produced for
+    ; each CSEL should be the same.
+
+    ; CHECK-LABEL: name: more_than_one_use_select
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %a:gpr64common = COPY $x0
+    ; CHECK: %b:gpr64 = COPY $x1
+    ; CHECK: %c:gpr64 = COPY $x2
+    ; CHECK: $xzr = SUBSXri %a, 0, 0, implicit-def $nzcv
+    ; CHECK: %select1:gpr64 = CSELXr %a, %b, 11, implicit $nzcv
+    ; CHECK: $xzr = SUBSXri %a, 0, 0, implicit-def $nzcv
+    ; CHECK: %select2:gpr64 = CSELXr %b, %c, 11, implicit $nzcv
+    ; CHECK: %add:gpr64 = ADDXrr %select1, %select2
+    ; CHECK: $x0 = COPY %add
+    ; CHECK: RET_ReallyLR implicit $x0
+    %a:gpr(s64) = COPY $x0
+    %b:gpr(s64) = COPY $x1
+    %c:gpr(s64) = COPY $x2
+    %cst:gpr(s64) = G_CONSTANT i64 -1
+    %cmp:gpr(s32) = G_ICMP intpred(sle), %a(s64), %cst
+    %trunc_cmp:gpr(s1) = G_TRUNC %cmp(s32)
+    %select1:gpr(s64) = G_SELECT %trunc_cmp(s1), %a, %b
+    %select2:gpr(s64) = G_SELECT %trunc_cmp(s1), %b, %c
+    %add:gpr(s64) = G_ADD %select1, %select2
+    $x0 = COPY %add(s64)
+    RET_ReallyLR implicit $x0
+...
+---
+name:            more_than_one_use_select_no_opt
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+
+    ; When we don't end up doing the optimization, we should not change the
+    ; predicate.
+    ;
+    ; In this case, the CSELXrs should both have predicate code 13.
+
+    ; CHECK-LABEL: name: more_than_one_use_select_no_opt
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %a:gpr64 = COPY $x0
+    ; CHECK: %b:gpr64 = COPY $x1
+    ; CHECK: %c:gpr64 = COPY $x2
+    ; CHECK: %cst:gpr64 = MOVi64imm 922337203685477580
+    ; CHECK: [[SUBSXrr:%[0-9]+]]:gpr64 = SUBSXrr %a, %cst, implicit-def $nzcv
+    ; CHECK: %select1:gpr64 = CSELXr %a, %b, 13, implicit $nzcv
+    ; CHECK: [[SUBSXrr1:%[0-9]+]]:gpr64 = SUBSXrr %a, %cst, implicit-def $nzcv
+    ; CHECK: %select2:gpr64 = CSELXr %b, %c, 13, implicit $nzcv
+    ; CHECK: %add:gpr64 = ADDXrr %select1, %select2
+    ; CHECK: $x0 = COPY %add
+    ; CHECK: RET_ReallyLR implicit $x0
+    %a:gpr(s64) = COPY $x0
+    %b:gpr(s64) = COPY $x1
+    %c:gpr(s64) = COPY $x2
+    %cst:gpr(s64) = G_CONSTANT i64 922337203685477580
+    %cmp:gpr(s32) = G_ICMP intpred(sle), %a(s64), %cst
+    %trunc_cmp:gpr(s1) = G_TRUNC %cmp(s32)
+    %select1:gpr(s64) = G_SELECT %trunc_cmp(s1), %a, %b
+    %select2:gpr(s64) = G_SELECT %trunc_cmp(s1), %b, %c
+    %add:gpr(s64) = G_ADD %select1, %select2
+    $x0 = COPY %add(s64)
+    RET_ReallyLR implicit $x0
 ...

From f20ace6f333fa56af1879f7480a0e7979201c374 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Tue, 26 May 2020 16:12:08 -0700
Subject: [PATCH 170/770] [NFC, StackSafety] Better names for internal stuff

Remove const from some parameters, as upcoming changes to ScalarEvolution
calls will need non-const pointers.
---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 36 ++++++++++-------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 31f30d4b5d56f..4985647c29d41 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -201,15 +201,14 @@ class StackSafetyLocalAnalysis {
 
   const ConstantRange UnknownRange;
 
-  ConstantRange offsetFromAlloca(Value *Addr, const Value *AllocaPtr);
-  ConstantRange getAccessRange(Value *Addr, const Value *AllocaPtr,
+  ConstantRange offsetFrom(Value *Addr, Value *Base);
+  ConstantRange getAccessRange(Value *Addr, Value *Base,
                                ConstantRange SizeRange);
-  ConstantRange getAccessRange(Value *Addr, const Value *AllocaPtr,
-                               TypeSize Size);
+  ConstantRange getAccessRange(Value *Addr, Value *Base, TypeSize Size);
   ConstantRange getMemIntrinsicAccessRange(const MemIntrinsic *MI, const Use &U,
-                                           const Value *AllocaPtr);
+                                           Value *Base);
 
-  bool analyzeAllUses(const Value *Ptr, UseInfo &AS);
+  bool analyzeAllUses(Value *Ptr, UseInfo &AS);
 
   ConstantRange getRange(uint64_t Lower, uint64_t Upper) const {
     return ConstantRange(APInt(PointerSize, Lower), APInt(PointerSize, Upper));
@@ -225,13 +224,11 @@ class StackSafetyLocalAnalysis {
   FunctionInfo run();
 };
 
-ConstantRange
-StackSafetyLocalAnalysis::offsetFromAlloca(Value *Addr,
-                                           const Value *AllocaPtr) {
+ConstantRange StackSafetyLocalAnalysis::offsetFrom(Value *Addr, Value *Base) {
   if (!SE.isSCEVable(Addr->getType()))
     return UnknownRange;
 
-  AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+  AllocaOffsetRewriter Rewriter(SE, Base);
   const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
   ConstantRange Offset = SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
   assert(!Offset.isEmptySet());
@@ -239,7 +236,7 @@ StackSafetyLocalAnalysis::offsetFromAlloca(Value *Addr,
 }
 
 ConstantRange
-StackSafetyLocalAnalysis::getAccessRange(Value *Addr, const Value *AllocaPtr,
+StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
                                          ConstantRange SizeRange) {
   // Zero-size loads and stores do not access memory.
   if (SizeRange.isEmptySet())
@@ -248,7 +245,7 @@ StackSafetyLocalAnalysis::getAccessRange(Value *Addr, const Value *AllocaPtr,
   if (!SE.isSCEVable(Addr->getType()))
     return UnknownRange;
 
-  AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+  AllocaOffsetRewriter Rewriter(SE, Base);
   const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
 
   ConstantRange AccessStartRange =
@@ -258,17 +255,16 @@ StackSafetyLocalAnalysis::getAccessRange(Value *Addr, const Value *AllocaPtr,
   return AccessRange;
 }
 
-ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr,
-                                                       const Value *AllocaPtr,
+ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
                                                        TypeSize Size) {
   ConstantRange SizeRange = Size.isScalable()
                                 ? ConstantRange::getFull(PointerSize)
                                 : getRange(0, Size.getFixedSize());
-  return getAccessRange(Addr, AllocaPtr, SizeRange);
+  return getAccessRange(Addr, Base, SizeRange);
 }
 
 ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
-    const MemIntrinsic *MI, const Use &U, const Value *AllocaPtr) {
+    const MemIntrinsic *MI, const Use &U, Value *Base) {
   if (auto MTI = dyn_cast<MemTransferInst>(MI)) {
     if (MTI->getRawSource() != U && MTI->getRawDest() != U)
       return getRange(0, 1);
@@ -281,13 +277,13 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
   if (!Len)
     return UnknownRange;
   ConstantRange AccessRange =
-      getAccessRange(U, AllocaPtr, getRange(0, Len->getZExtValue()));
+      getAccessRange(U, Base, getRange(0, Len->getZExtValue()));
   return AccessRange;
 }
 
 /// The function analyzes all local uses of Ptr (alloca or argument) and
 /// calculates local access range and all function calls where it was used.
-bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
+bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, UseInfo &US) {
   SmallPtrSet<const Value *, 16> Visited;
   SmallVector<const Value *, 8> WorkList;
   WorkList.push_back(Ptr);
@@ -354,7 +350,7 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
         auto B = CB.arg_begin(), E = CB.arg_end();
         for (auto A = B; A != E; ++A) {
           if (A->get() == V) {
-            ConstantRange Offset = offsetFromAlloca(UI, Ptr);
+            ConstantRange Offset = offsetFrom(UI, Ptr);
             US.Calls.emplace_back(Callee, A - B, Offset);
           }
         }
@@ -387,7 +383,7 @@ FunctionInfo StackSafetyLocalAnalysis::run() {
     }
   }
 
-  for (const Argument &A : make_range(F.arg_begin(), F.arg_end())) {
+  for (Argument &A : make_range(F.arg_begin(), F.arg_end())) {
     Info.Params.emplace_back(PointerSize);
     UseInfo &PS = Info.Params.back();
     analyzeAllUses(&A, PS);

From 5afef79ff465e1711a9412f6814d66ff80f50dcf Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Tue, 26 May 2020 16:48:08 -0700
Subject: [PATCH 171/770] [NFC, StackSafety] Remove duplicate code

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 4985647c29d41..223f99804bda4 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -242,14 +242,7 @@ StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
   if (SizeRange.isEmptySet())
     return ConstantRange::getEmpty(PointerSize);
 
-  if (!SE.isSCEVable(Addr->getType()))
-    return UnknownRange;
-
-  AllocaOffsetRewriter Rewriter(SE, Base);
-  const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
-
-  ConstantRange AccessStartRange =
-      SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
+  ConstantRange AccessStartRange = offsetFrom(Addr, Base);
   ConstantRange AccessRange = AccessStartRange.add(SizeRange);
   assert(!AccessRange.isEmptySet());
   return AccessRange;

From 4320d4aa1c1c7d8bd75537703f7a11140552b0fa Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Tue, 26 May 2020 16:53:02 -0700
Subject: [PATCH 172/770] [NFC, StackSafety] Add some missing includes

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 223f99804bda4..4b2fc300b1188 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -9,13 +9,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <memory>
 
 using namespace llvm;

From b5ae70046b0211ff75be8459f7282fe07ad918d8 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Tue, 26 May 2020 17:04:09 -0700
Subject: [PATCH 173/770] [StackSafety] Simplify SCEVRewriteVisitor

Probably NFC.
---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp       | 13 ++-----------
 llvm/test/Analysis/StackSafetyAnalysis/local.ll |  1 +
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 4b2fc300b1188..10b9f14bc75a7 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -42,16 +42,6 @@ class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
   AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr)
       : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {}
 
-  const SCEV *visit(const SCEV *Expr) {
-    // Only re-write the expression if the alloca is used in an addition
-    // expression (it can be used in other types of expressions if it's cast to
-    // an int and passed as an argument.)
-    if (!isa<SCEVAddRecExpr>(Expr) && !isa<SCEVAddExpr>(Expr) &&
-        !isa<SCEVUnknown>(Expr))
-      return Expr;
-    return SCEVRewriteVisitor<AllocaOffsetRewriter>::visit(Expr);
-  }
-
   const SCEV *visitUnknown(const SCEVUnknown *Expr) {
     // FIXME: look through one or several levels of definitions?
     // This can be inttoptr(AllocaPtr) and SCEV would not unwrap
@@ -237,7 +227,8 @@ ConstantRange StackSafetyLocalAnalysis::offsetFrom(Value *Addr, Value *Base) {
   AllocaOffsetRewriter Rewriter(SE, Base);
   const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
   ConstantRange Offset = SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
-  assert(!Offset.isEmptySet());
+  if (Offset.isEmptySet())
+    return UnknownRange;
   return Offset;
 }
 
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/local.ll b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
index 0ba1694e99eb4..b7c9eb1d29539 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/local.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
@@ -177,6 +177,7 @@ define void @NonConstantOffset(i1 zeroext %z) {
 ; CHECK-NEXT: args uses:
 ; CHECK-NEXT: z[]: full-set{{$}}
 ; CHECK-NEXT: allocas uses:
+; FIXME: SCEV can't look through selects.
 ; CHECK-NEXT: x[4]: [0,4){{$}}
 ; CHECK-NOT: ]:
 entry:

From ef3e83122665adcb2f7a7f380c9deb3dac68cb80 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 23 May 2020 18:10:34 -0400
Subject: [PATCH 174/770] GlobalISel: Basic legalization for G_PTRMASK

---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h   |   3 +
 .../CodeGen/GlobalISel/LegalityPredicates.cpp |   6 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  16 ++
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  19 +-
 .../AMDGPU/GlobalISel/legalize-ptrmask.mir    | 221 ++++++++++++++++++
 5 files changed, 254 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrmask.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 624fa70f1aa69..f913f5f41b8e8 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -252,6 +252,9 @@ LegalityPredicate sizeNotPow2(unsigned TypeIdx);
 /// is not a power of 2.
 LegalityPredicate scalarOrEltSizeNotPow2(unsigned TypeIdx);
 
+/// True if the total bitwidth of the specified type index is \p Size bits.
+LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size);
+
 /// True iff the specified type indices are both the same bit size.
 LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1);
 /// True iff the specified MMO index has a size that is not a power of 2
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 601d50e9806fd..b6fb061a8334b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -126,6 +126,12 @@ LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
   };
 }
 
+LegalityPredicate LegalityPredicates::sizeIs(unsigned TypeIdx, unsigned Size) {
+  return [=](const LegalityQuery &Query) {
+    return Query.Types[TypeIdx].getSizeInBits() == Size;
+  };
+}
+
 LegalityPredicate LegalityPredicates::sameSize(unsigned TypeIdx0,
                                                unsigned TypeIdx1) {
   return [=](const LegalityQuery &Query) {
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 704f1c4f96285..189c645ad9f4f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1210,6 +1210,14 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_PTRMASK: {
+    if (TypeIdx != 1)
+      return UnableToLegalize;
+    Observer.changingInstr(MI);
+    narrowScalarSrc(MI, NarrowTy, 2);
+    Observer.changedInstr(MI);
+    return Legalized;
+  }
   }
 }
 
@@ -2143,6 +2151,14 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
     Observer.changedInstr(MI);
     return Legalized;
+  case TargetOpcode::G_PTRMASK: {
+    if (TypeIdx != 1)
+      return UnableToLegalize;
+    Observer.changingInstr(MI);
+    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+    Observer.changedInstr(MI);
+    return Legalized;
+  }
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c24996b93fa06..74e03e1d99199 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -66,12 +66,6 @@ static LegalityPredicate isMultiple32(unsigned TypeIdx,
   };
 }
 
-static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
-  return [=](const LegalityQuery &Query) {
-    return Query.Types[TypeIdx].getSizeInBits() == Size;
-  };
-}
-
 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT Ty = Query.Types[TypeIdx];
@@ -560,14 +554,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0);
   }
 
+  // FIXME: Clamp offset operand.
   getActionDefinitionsBuilder(G_PTR_ADD)
-    .scalarize(0)
-    .alwaysLegal();
+    .legalIf(isPointer(0))
+    .scalarize(0);
 
-  // TODO: Clamp mask to pointer sizes
   getActionDefinitionsBuilder(G_PTRMASK)
-    .scalarize(0)
-    .alwaysLegal();
+    .legalIf(typeInSet(1, {S64, S32}))
+    .minScalar(1, S32)
+    .maxScalarIf(sizeIs(0, 32), 1, S32)
+    .maxScalarIf(sizeIs(0, 64), 1, S64)
+    .scalarize(0);
 
   auto &CmpBuilder =
     getActionDefinitionsBuilder(G_ICMP)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrmask.mir
new file mode 100644
index 0000000000000..fe819107ce6e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrmask.mir
@@ -0,0 +1,221 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: ptrmask_p1_s16
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ptrmask_p1_s16
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p1) = G_PTRMASK [[COPY]], [[AND]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p1)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(s16) = G_TRUNC %1
+    %3:_(p1) = G_PTRMASK %0, %2
+    $vgpr0_vgpr1 = COPY %3
+...
+
+---
+name: ptrmask_p1_s32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ptrmask_p1_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p1) = G_PTRMASK [[COPY]], [[COPY1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p1)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(p1) = G_PTRMASK %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: ptrmask_p1_s64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: ptrmask_p1_s64
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p1) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p1)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(p1) = G_PTRMASK %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: ptrmask_p1_s96
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+
+    ; CHECK-LABEL: name: ptrmask_p1_s96
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr2_vgpr3_vgpr4
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[COPY1]](s96)
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p1) = G_PTRMASK [[COPY]], [[TRUNC]](s64)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p1)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s96) = COPY $vgpr2_vgpr3_vgpr4
+    %2:_(p1) = G_PTRMASK %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: ptrmask_p0_s16
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ptrmask_p0_s16
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[COPY]], [[AND]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p0)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(s16) = G_TRUNC %1
+    %3:_(p0) = G_PTRMASK %0, %2
+    $vgpr0_vgpr1 = COPY %3
+...
+
+---
+name: ptrmask_p0_s32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: ptrmask_p0_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[COPY]], [[COPY1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p0)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(p0) = G_PTRMASK %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: ptrmask_p0_s64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: ptrmask_p0_s64
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p0)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(p0) = G_PTRMASK %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: ptrmask_p0_s96
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+
+    ; CHECK-LABEL: name: ptrmask_p0_s96
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr2_vgpr3_vgpr4
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[COPY1]](s96)
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[COPY]], [[TRUNC]](s64)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[PTRMASK]](p0)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s96) = COPY $vgpr2_vgpr3_vgpr4
+    %2:_(p0) = G_PTRMASK %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: ptrmask_p3_s16
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p3_s16
+    ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[AND]](s32)
+    ; CHECK: $vgpr0 = COPY [[PTRMASK]](p3)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s16) = G_TRUNC %1
+    %3:_(p3) = G_PTRMASK %0, %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: ptrmask_p3_s32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: ptrmask_p3_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[COPY1]](s32)
+    ; CHECK: $vgpr0 = COPY [[PTRMASK]](p3)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p3) = G_PTRMASK %0, %1
+    $vgpr0 = COPY %2
+...
+
+---
+name: ptrmask_p3_s64
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: ptrmask_p3_s64
+    ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr1_vgpr2
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[COPY1]](s64)
+    ; CHECK: $vgpr0 = COPY [[PTRMASK]](p3)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s64) = COPY $vgpr1_vgpr2
+    %2:_(p3) = G_PTRMASK %0, %1
+    $vgpr0 = COPY %2
+...
+
+---
+name: ptrmask_p3_s96
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1_vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: ptrmask_p3_s96
+    ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr1_vgpr2_vgpr3
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s96)
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[TRUNC]](s32)
+    ; CHECK: $vgpr0 = COPY [[PTRMASK]](p3)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s96) = COPY $vgpr1_vgpr2_vgpr3
+    %2:_(p3) = G_PTRMASK %0, %1
+    $vgpr0 = COPY %2
+...

From 8e3307f5519fa58827c7b030274f122b1ed36617 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 24 May 2020 10:49:22 -0400
Subject: [PATCH 175/770] GlobalISel: Add a clarification to G_STORE
 documentation

Mirror the note on G_LOAD. We probably do need to add an explicit
G_TRUNCSTORE opcode for the vector case, although I do not have a use
for it.
---
 llvm/docs/GlobalISel/GenericOpcode.rst | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index 6372192c0088f..2350b9cf37645 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -567,7 +567,11 @@ Same as G_INDEXED_LOAD except that the load performed is zero-extending, as with
 G_STORE
 ^^^^^^^
 
-Generic store. Expects a MachineMemOperand in addition to explicit operands.
+Generic store. Expects a MachineMemOperand in addition to explicit
+operands. If the stored value size is greater than the memory size,
+the high bits are implicitly truncated. If this is a vector store, the
+high elements are discarded (i.e. this does not function as a per-lane
+vector, truncating store).
 
 G_INDEXED_STORE
 ^^^^^^^^^^^^^^^

From 97a133f15724aa7ddf5d9b62dc9c0657a4efd115 Mon Sep 17 00:00:00 2001
From: Eric Christopher <echristo@gmail.com>
Date: Tue, 26 May 2020 18:50:19 -0700
Subject: [PATCH 176/770] Temporarily Revert "[Clang][AArch64] Capturing proper
 pointer alignment for Neon vld1 intrinsicts" as it's causing crashes on code
 generation and https://bugs.llvm.org/show_bug.cgi?id=46084

This reverts commit 98cad555e29187a03e2bc3db5780762981913902.
---
 clang/lib/CodeGen/CGBuiltin.cpp              | 12 ++---
 clang/test/CodeGen/aarch64-neon-intrinsics.c | 52 ++++++++++----------
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index bef0ad27145f3..b5129249c016d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10329,9 +10329,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v: {
-    auto Alignment = CGM.getNaturalPointeeTypeAlignment(
-        E->getArg(0)->IgnoreParenCasts()->getType());
     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
+    auto Alignment = CharUnits::fromQuantity(
+        BuiltinID == NEON::BI__builtin_neon_vld1_v ? 8 : 16);
     return Builder.CreateAlignedLoad(VTy, Ops[0], Alignment);
   }
   case NEON::BI__builtin_neon_vst1_v:
@@ -10344,8 +10344,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    auto Alignment = CGM.getNaturalPointeeTypeAlignment(
-        E->getArg(0)->IgnoreParenCasts()->getType());
+    auto Alignment = CharUnits::fromQuantity(
+        BuiltinID == NEON::BI__builtin_neon_vld1_lane_v ? 8 : 16);
     Ops[0] =
         Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
@@ -10355,8 +10355,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Value *V = UndefValue::get(Ty);
     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    auto Alignment = CGM.getNaturalPointeeTypeAlignment(
-        E->getArg(0)->IgnoreParenCasts()->getType());
+    auto Alignment = CharUnits::fromQuantity(
+        BuiltinID == NEON::BI__builtin_neon_vld1_dup_v ? 8 : 16);
     Ops[0] =
         Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment);
     llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CodeGen/aarch64-neon-intrinsics.c
index 1fb245f3d3429..7744b4f4a159d 100644
--- a/clang/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-neon-intrinsics.c
@@ -8956,7 +8956,7 @@ float64_t test_vrsqrted_f64(float64_t a) {
 
 // CHECK-LABEL: @test_vld1q_u8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
 // CHECK:   ret <16 x i8> [[TMP1]]
 uint8x16_t test_vld1q_u8(uint8_t const *a) {
   return vld1q_u8(a);
@@ -8965,7 +8965,7 @@ uint8x16_t test_vld1q_u8(uint8_t const *a) {
 // CHECK-LABEL: @test_vld1q_u16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
 // CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vld1q_u16(uint16_t const *a) {
   return vld1q_u16(a);
@@ -8974,7 +8974,7 @@ uint16x8_t test_vld1q_u16(uint16_t const *a) {
 // CHECK-LABEL: @test_vld1q_u32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
 // CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vld1q_u32(uint32_t const *a) {
   return vld1q_u32(a);
@@ -8983,7 +8983,7 @@ uint32x4_t test_vld1q_u32(uint32_t const *a) {
 // CHECK-LABEL: @test_vld1q_u64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
 // CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vld1q_u64(uint64_t const *a) {
   return vld1q_u64(a);
@@ -8991,7 +8991,7 @@ uint64x2_t test_vld1q_u64(uint64_t const *a) {
 
 // CHECK-LABEL: @test_vld1q_s8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
 // CHECK:   ret <16 x i8> [[TMP1]]
 int8x16_t test_vld1q_s8(int8_t const *a) {
   return vld1q_s8(a);
@@ -9000,7 +9000,7 @@ int8x16_t test_vld1q_s8(int8_t const *a) {
 // CHECK-LABEL: @test_vld1q_s16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
 // CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vld1q_s16(int16_t const *a) {
   return vld1q_s16(a);
@@ -9009,7 +9009,7 @@ int16x8_t test_vld1q_s16(int16_t const *a) {
 // CHECK-LABEL: @test_vld1q_s32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
 // CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vld1q_s32(int32_t const *a) {
   return vld1q_s32(a);
@@ -9018,7 +9018,7 @@ int32x4_t test_vld1q_s32(int32_t const *a) {
 // CHECK-LABEL: @test_vld1q_s64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
 // CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vld1q_s64(int64_t const *a) {
   return vld1q_s64(a);
@@ -9027,7 +9027,7 @@ int64x2_t test_vld1q_s64(int64_t const *a) {
 // CHECK-LABEL: @test_vld1q_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK:   [[TMP2:%.*]] = load <8 x half>, <8 x half>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <8 x half>, <8 x half>* [[TMP1]]
 // CHECK:   ret <8 x half> [[TMP2]]
 float16x8_t test_vld1q_f16(float16_t const *a) {
   return vld1q_f16(a);
@@ -9036,7 +9036,7 @@ float16x8_t test_vld1q_f16(float16_t const *a) {
 // CHECK-LABEL: @test_vld1q_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
-// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
 // CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vld1q_f32(float32_t const *a) {
   return vld1q_f32(a);
@@ -9045,7 +9045,7 @@ float32x4_t test_vld1q_f32(float32_t const *a) {
 // CHECK-LABEL: @test_vld1q_f64(
 // CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
-// CHECK:   [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+// CHECK:   [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]]
 // CHECK:   ret <2 x double> [[TMP2]]
 float64x2_t test_vld1q_f64(float64_t const *a) {
   return vld1q_f64(a);
@@ -9053,7 +9053,7 @@ float64x2_t test_vld1q_f64(float64_t const *a) {
 
 // CHECK-LABEL: @test_vld1q_p8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
 // CHECK:   ret <16 x i8> [[TMP1]]
 poly8x16_t test_vld1q_p8(poly8_t const *a) {
   return vld1q_p8(a);
@@ -9062,7 +9062,7 @@ poly8x16_t test_vld1q_p8(poly8_t const *a) {
 // CHECK-LABEL: @test_vld1q_p16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
 // CHECK:   ret <8 x i16> [[TMP2]]
 poly16x8_t test_vld1q_p16(poly16_t const *a) {
   return vld1q_p16(a);
@@ -9070,7 +9070,7 @@ poly16x8_t test_vld1q_p16(poly16_t const *a) {
 
 // CHECK-LABEL: @test_vld1_u8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
 // CHECK:   ret <8 x i8> [[TMP1]]
 uint8x8_t test_vld1_u8(uint8_t const *a) {
   return vld1_u8(a);
@@ -9079,7 +9079,7 @@ uint8x8_t test_vld1_u8(uint8_t const *a) {
 // CHECK-LABEL: @test_vld1_u16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
 // CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vld1_u16(uint16_t const *a) {
   return vld1_u16(a);
@@ -9088,7 +9088,7 @@ uint16x4_t test_vld1_u16(uint16_t const *a) {
 // CHECK-LABEL: @test_vld1_u32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
 // CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vld1_u32(uint32_t const *a) {
   return vld1_u32(a);
@@ -9097,7 +9097,7 @@ uint32x2_t test_vld1_u32(uint32_t const *a) {
 // CHECK-LABEL: @test_vld1_u64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]], align 8
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
 // CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vld1_u64(uint64_t const *a) {
   return vld1_u64(a);
@@ -9105,7 +9105,7 @@ uint64x1_t test_vld1_u64(uint64_t const *a) {
 
 // CHECK-LABEL: @test_vld1_s8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
 // CHECK:   ret <8 x i8> [[TMP1]]
 int8x8_t test_vld1_s8(int8_t const *a) {
   return vld1_s8(a);
@@ -9114,7 +9114,7 @@ int8x8_t test_vld1_s8(int8_t const *a) {
 // CHECK-LABEL: @test_vld1_s16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
 // CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vld1_s16(int16_t const *a) {
   return vld1_s16(a);
@@ -9123,7 +9123,7 @@ int16x4_t test_vld1_s16(int16_t const *a) {
 // CHECK-LABEL: @test_vld1_s32(
 // CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
-// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
 // CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vld1_s32(int32_t const *a) {
   return vld1_s32(a);
@@ -9132,7 +9132,7 @@ int32x2_t test_vld1_s32(int32_t const *a) {
 // CHECK-LABEL: @test_vld1_s64(
 // CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
-// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]], align 8
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
 // CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vld1_s64(int64_t const *a) {
   return vld1_s64(a);
@@ -9141,7 +9141,7 @@ int64x1_t test_vld1_s64(int64_t const *a) {
 // CHECK-LABEL: @test_vld1_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK:   [[TMP2:%.*]] = load <4 x half>, <4 x half>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <4 x half>, <4 x half>* [[TMP1]]
 // CHECK:   ret <4 x half> [[TMP2]]
 float16x4_t test_vld1_f16(float16_t const *a) {
   return vld1_f16(a);
@@ -9150,7 +9150,7 @@ float16x4_t test_vld1_f16(float16_t const *a) {
 // CHECK-LABEL: @test_vld1_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
-// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
 // CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vld1_f32(float32_t const *a) {
   return vld1_f32(a);
@@ -9159,7 +9159,7 @@ float32x2_t test_vld1_f32(float32_t const *a) {
 // CHECK-LABEL: @test_vld1_f64(
 // CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
-// CHECK:   [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]], align 8
+// CHECK:   [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]]
 // CHECK:   ret <1 x double> [[TMP2]]
 float64x1_t test_vld1_f64(float64_t const *a) {
   return vld1_f64(a);
@@ -9167,7 +9167,7 @@ float64x1_t test_vld1_f64(float64_t const *a) {
 
 // CHECK-LABEL: @test_vld1_p8(
 // CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
 // CHECK:   ret <8 x i8> [[TMP1]]
 poly8x8_t test_vld1_p8(poly8_t const *a) {
   return vld1_p8(a);
@@ -9176,7 +9176,7 @@ poly8x8_t test_vld1_p8(poly8_t const *a) {
 // CHECK-LABEL: @test_vld1_p16(
 // CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
-// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
 // CHECK:   ret <4 x i16> [[TMP2]]
 poly16x4_t test_vld1_p16(poly16_t const *a) {
   return vld1_p16(a);

From 23a2f4521467a708fb1f9ae1f9536f302a1dc7e3 Mon Sep 17 00:00:00 2001
From: Kang Zhang <shkzhang@cn.ibm.com>
Date: Wed, 27 May 2020 02:35:45 +0000
Subject: [PATCH 177/770] [NFC][PowerPC] Modify the test case
 two-address-crash.mir

---
 .../CodeGen/PowerPC/two-address-crash.mir     | 54 ++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/PowerPC/two-address-crash.mir b/llvm/test/CodeGen/PowerPC/two-address-crash.mir
index 6e98d3d8d398b..caf036358af9f 100644
--- a/llvm/test/CodeGen/PowerPC/two-address-crash.mir
+++ b/llvm/test/CodeGen/PowerPC/two-address-crash.mir
@@ -1,5 +1,7 @@
 # RUN: not --crash llc -mtriple=ppc32-- %s -run-pass=phi-node-elimination \
 # RUN:   -verify-machineinstrs -o /dev/null 2>&1 | FileCheck %s
+# RUN: llc -mtriple=ppc32-- %s -start-before=phi-node-elimination \
+# RUN:   -verify-machineinstrs -o /dev/null 2>&1
 
 --- |
   define void @VerifyTwoAddressCrash(i16 %div.0.i.i.i.i, i32 %L_num.0.i.i.i.i, i32 %tmp1.i.i206.i.i, i16* %P) {
@@ -16,6 +18,56 @@
 ...
 ---
 name:            VerifyTwoAddressCrash
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+  - { id: 0, class: gprc, preferred-register: '' }
+  - { id: 1, class: gprc, preferred-register: '' }
+  - { id: 2, class: gprc, preferred-register: '' }
+  - { id: 3, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 4, class: gprc, preferred-register: '' }
+  - { id: 5, class: crrc, preferred-register: '' }
+  - { id: 6, class: crbitrc, preferred-register: '' }
+  - { id: 7, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 8, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 9, class: gprc, preferred-register: '' }
+  - { id: 10, class: gprc, preferred-register: '' }
+  - { id: 11, class: gprc, preferred-register: '' }
+liveins:
+  - { reg: '$r3', virtual-reg: '%0' }
+  - { reg: '$r4', virtual-reg: '%1' }
+  - { reg: '$r5', virtual-reg: '%2' }
+  - { reg: '$r6', virtual-reg: '%3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
 body:             |
   bb.0 (%ir-block.0):
     liveins: $r3, $r4, $r5, $r6
@@ -40,6 +92,6 @@ body:             |
 # CHECK-LABEL: Bad machine code: Two-address instruction operands must be identical
 # CHECK-NEXT:  - function:    VerifyTwoAddressCrash
 # CHECK-NEXT:  - basic block: %bb.0
-# CHECK-NEXT:  - instruction: %10:gprc = RLWIMI killed %9:gprc(tied-def 0), killed %3:gprc, 1, 0, 30
+# CHECK-NEXT:  - instruction: %10:gprc = RLWIMI killed %9:gprc(tied-def 0), killed %0:gprc, 1, 0, 30
 # CHECK-NEXT:  - operand 1:   killed %9:gprc(tied-def 0)
 # CHECK-NEXT:  LLVM ERROR: Found 1 machine code errors.

From a7141480fb04eadf8d7d60c03494bcc885979a8e Mon Sep 17 00:00:00 2001
From: Jinsong Ji <jji@us.ibm.com>
Date: Wed, 27 May 2020 02:37:04 +0000
Subject: [PATCH 178/770] [compiler-rt][NFC]Fix Wdeprecated warnings for
 fsanitize-coverage

A few testcases are still using deprecated options.

warning: argument '-fsanitize-coverage=[func|bb|edge]' is deprecated,
use '-fsanitize-coverage=[func|bb|edge],[trace-pc-guard|trace-pc]'
instead [-Wdeprecated]

Reviewed By: vitalybuka

Differential Revision: https://reviews.llvm.org/D79741
---
 .../test/asan/TestCases/Windows/coverage-basic.cpp   |  2 +-
 .../test/asan/TestCases/coverage-disabled.cpp        |  2 +-
 compiler-rt/test/msan/coverage-levels.cpp            |  8 ++++----
 .../test/ubsan/TestCases/Misc/coverage-levels.cpp    | 12 ++++++------
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/compiler-rt/test/asan/TestCases/Windows/coverage-basic.cpp b/compiler-rt/test/asan/TestCases/Windows/coverage-basic.cpp
index 1469e1c30ae34..163247e09bf7d 100644
--- a/compiler-rt/test/asan/TestCases/Windows/coverage-basic.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/coverage-basic.cpp
@@ -1,6 +1,6 @@
 // RUN: rm -rf %t-dir
 // RUN: mkdir %t-dir && cd %t-dir
-// RUN: %clangxx_asan -fsanitize-coverage=func %s -o test.exe
+// RUN: %clangxx_asan -fsanitize-coverage=func,trace-pc-guard %s -o test.exe
 // RUN: %env_asan_opts=coverage=1 %run ./test.exe
 //
 // RUN: %sancov print *.sancov | FileCheck %s
diff --git a/compiler-rt/test/asan/TestCases/coverage-disabled.cpp b/compiler-rt/test/asan/TestCases/coverage-disabled.cpp
index 46a822dff08c1..2a283b4652121 100644
--- a/compiler-rt/test/asan/TestCases/coverage-disabled.cpp
+++ b/compiler-rt/test/asan/TestCases/coverage-disabled.cpp
@@ -3,7 +3,7 @@
 // RUN: rm -rf %t-dir
 // RUN: mkdir -p %t-dir
 //
-// RUN: %clangxx_asan -fsanitize-coverage=func %s -o %t
+// RUN: %clangxx_asan -fsanitize-coverage=func,trace-pc-guard %s -o %t
 //
 // RUN: %env_asan_opts=coverage_direct=0:coverage_dir='"%t-dir"':verbosity=1 %run %t
 // RUN: not %sancov print %t-dir/*.sancov 2>&1
diff --git a/compiler-rt/test/msan/coverage-levels.cpp b/compiler-rt/test/msan/coverage-levels.cpp
index 5ca3b717d04fb..1b7778e9d7aa8 100644
--- a/compiler-rt/test/msan/coverage-levels.cpp
+++ b/compiler-rt/test/msan/coverage-levels.cpp
@@ -1,13 +1,13 @@
 // Test various levels of coverage
 //
-// RUN: %clangxx_msan -DINIT_VAR=1 -O1 -fsanitize-coverage=func  %s -o %t
+// RUN: %clangxx_msan -DINIT_VAR=1 -O1 -fsanitize-coverage=func,trace-pc-guard  %s -o %t
 // RUN: mkdir -p %t-dir
 // RUN: MSAN_OPTIONS=coverage=1:verbosity=1:coverage_dir=%t-dir %run %t 2>&1 | FileCheck %s --check-prefix=CHECK1 --check-prefix=CHECK_NOWARN
-// RUN: %clangxx_msan -O1 -fsanitize-coverage=func  %s -o %t
+// RUN: %clangxx_msan -O1 -fsanitize-coverage=func,trace-pc-guard  %s -o %t
 // RUN: MSAN_OPTIONS=coverage=1:verbosity=1:coverage_dir=%t-dir not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK1 --check-prefix=CHECK_WARN
-// RUN: %clangxx_msan -O1 -fsanitize-coverage=bb  %s -o %t
+// RUN: %clangxx_msan -O1 -fsanitize-coverage=bb,trace-pc-guard  %s -o %t
 // RUN: MSAN_OPTIONS=coverage=1:verbosity=1:coverage_dir=%t-dir not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK2 --check-prefix=CHECK_WARN
-// RUN: %clangxx_msan -O1 -fsanitize-coverage=edge  %s -o %t
+// RUN: %clangxx_msan -O1 -fsanitize-coverage=edge,trace-pc-guard  %s -o %t
 // RUN: MSAN_OPTIONS=coverage=1:verbosity=1:coverage_dir=%t-dir not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK3 --check-prefix=CHECK_WARN
 
 volatile int sink;
diff --git a/compiler-rt/test/ubsan/TestCases/Misc/coverage-levels.cpp b/compiler-rt/test/ubsan/TestCases/Misc/coverage-levels.cpp
index 364f985c50514..4a94350ec1620 100644
--- a/compiler-rt/test/ubsan/TestCases/Misc/coverage-levels.cpp
+++ b/compiler-rt/test/ubsan/TestCases/Misc/coverage-levels.cpp
@@ -4,20 +4,20 @@
 // REQUIRES: shell
 //
 // RUN: rm -rf %t-dir && mkdir %t-dir
-// RUN: %clangxx -fsanitize=shift                        -DGOOD_SHIFT=1 -O1 -fsanitize-coverage=func  %s -o %t
+// RUN: %clangxx -fsanitize=shift                        -DGOOD_SHIFT=1 -O1 -fsanitize-coverage=func,trace-pc-guard  %s -o %t
 // RUN: %env_ubsan_opts=coverage=1:verbosity=1:coverage_dir='"%t-dir"' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK1 --check-prefix=CHECK_NOWARN
-// RUN: %clangxx -fsanitize=undefined                    -DGOOD_SHIFT=1 -O1 -fsanitize-coverage=func  %s -o %t
+// RUN: %clangxx -fsanitize=undefined                    -DGOOD_SHIFT=1 -O1 -fsanitize-coverage=func,trace-pc-guard  %s -o %t
 // RUN: %env_ubsan_opts=coverage=1:verbosity=1:coverage_dir='"%t-dir"' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK1 --check-prefix=CHECK_NOWARN
 
 // Also works without any sanitizer.
-// RUN: %clangxx                                         -DGOOD_SHIFT=1 -O1 -fsanitize-coverage=func  %s -o %t
+// RUN: %clangxx                                         -DGOOD_SHIFT=1 -O1 -fsanitize-coverage=func,trace-pc-guard  %s -o %t
 // RUN: %env_ubsan_opts=coverage=1:verbosity=1:coverage_dir='"%t-dir"' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK1 --check-prefix=CHECK_NOWARN
 
-// RUN: %clangxx -fsanitize=shift -O1 -fsanitize-coverage=func  %s -o %t
+// RUN: %clangxx -fsanitize=shift -O1 -fsanitize-coverage=func,trace-pc-guard  %s -o %t
 // RUN: %env_ubsan_opts=coverage=1:verbosity=1:coverage_dir='"%t-dir"' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK1 --check-prefix=CHECK_WARN
-// RUN: %clangxx -fsanitize=shift -O1 -fsanitize-coverage=bb  %s -o %t
+// RUN: %clangxx -fsanitize=shift -O1 -fsanitize-coverage=bb,trace-pc-guard  %s -o %t
 // RUN: %env_ubsan_opts=coverage=1:verbosity=1:coverage_dir='"%t-dir"' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK2 --check-prefix=CHECK_WARN
-// RUN: %clangxx -fsanitize=shift -O1 -fsanitize-coverage=edge  %s -o %t
+// RUN: %clangxx -fsanitize=shift -O1 -fsanitize-coverage=edge,trace-pc-guard  %s -o %t
 // RUN: %env_ubsan_opts=coverage=1:verbosity=1:coverage_dir='"%t-dir"' %run %t 2>&1 | FileCheck %s --check-prefix=CHECK3 --check-prefix=CHECK_WARN
 
 // Coverage is not yet implemented in TSan.

From 5759e4731635e1f28fef2c4619491a1b4a2bc305 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <ravishankarm@google.com>
Date: Tue, 26 May 2020 16:35:20 -0700
Subject: [PATCH 179/770] [mlir][Linalg] Avoid using scf.parallel for
 non-parallel loops in Linalg ops.

Modifying the loop nest builder for generating scf.parallel loops to
not generate scf.parallel loops for non-parallel iterator types in
Linalg operations. The existing implementation incorrectly generated
scf.parallel for all tiled loops. It is rectified by refactoring logic
used while lowering to loops that accounted for this.

Differential Revision: https://reviews.llvm.org/D80188
---
 .../include/mlir/Dialect/Linalg/Utils/Utils.h |  31 +++++
 mlir/lib/Dialect/Linalg/Transforms/Loops.cpp  |  81 ++-----------
 mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp |  10 +-
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp       |  89 +++++++++++++++
 mlir/test/Dialect/Linalg/parallel_loops.mlir  |  38 +++++-
 .../Dialect/Linalg/tile_parallel_reduce.mlir  | 108 ++++++++++++++++++
 .../Dialect/Linalg/transform-patterns.mlir    |  25 +++-
 .../lib/Transforms/TestLinalgTransforms.cpp   |   8 ++
 8 files changed, 312 insertions(+), 78 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/tile_parallel_reduce.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index 9caec484659e8..c8a5d83438f56 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -9,14 +9,21 @@
 #ifndef MLIR_DIALECT_LINALG_UTILS_H_
 #define MLIR_DIALECT_LINALG_UTILS_H_
 
+#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
+#include "mlir/Dialect/Linalg/EDSC/Builders.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 
 #include "llvm/ADT/SetVector.h"
 
+using mlir::edsc::intrinsics::AffineIndexedValue;
+using mlir::edsc::intrinsics::StdIndexedValue;
+
 namespace mlir {
 class AffineExpr;
+class AffineForOp;
 class AffineMap;
 class OperationFolder;
 class PatternRewriter;
@@ -49,6 +56,15 @@ struct RegionMatcher {
   static Optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
 };
 
+/// Checks if an iterator_type attribute is parallel.
+bool isParallelIteratorType(Attribute attr);
+
+/// Checks if an iterator_type attribute is reduction.
+bool isReductionIteratorType(Attribute attr);
+
+/// Checks if an iterator_type attribute is window.
+bool isWindowIteratorType(Attribute attr);
+
 /// Checks whether the specific `producer` is the last write to exactly the
 /// whole `consumedView`. This checks structural dominance, that the dependence
 /// is a RAW without any interleaved write to any piece of `consumedView`.
@@ -141,6 +157,21 @@ void applyPermutationToVector(SmallVector<T, N> &inVec,
   inVec = auxVec;
 }
 
+/// Utility class used to generate nested loops with ranges described by
+/// `loopRanges` and loop type described by the `iteratorTypes`. `allIvs` is
+/// populated with induction variables for all generated loops on return, with
+/// `fun` used to generate the body of the innermost loop.
+template <typename LoopTy>
+struct GenerateLoopNest {
+  using IndexedValueTy =
+      typename std::conditional<std::is_same<LoopTy, AffineForOp>::value,
+                                AffineIndexedValue, StdIndexedValue>::type;
+  static void doit(MutableArrayRef<Value> allIvs,
+                   ArrayRef<SubViewOp::Range> loopRanges,
+                   ArrayRef<Attribute> iteratorTypes,
+                   std::function<void(void)> fun);
+};
+
 } // namespace linalg
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
index 74da63dafee37..910078875f57b 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp
@@ -487,80 +487,9 @@ class LinalgScopedEmitter<IndexedValueType, IndexedGenericOp> {
   }
 };
 
-namespace {
-/// Helper struct to generate the loop nest for the op. This factored out here
-/// to be able to partially specialize this for different LoopTy.
-template <typename LoopTy, typename ConcreteOpTy>
-class GenerateLoopNest {
-public:
-  using IndexedValueTy =
-      typename std::conditional<std::is_same<LoopTy, AffineForOp>::value,
-                                AffineIndexedValue, StdIndexedValue>::type;
-  static void doit(ConcreteOpTy linalgOp, ArrayRef<SubViewOp::Range> loopRanges,
-                   MutableArrayRef<Value> allIvs) {
-    GenericLoopNestRangeBuilder<LoopTy>(allIvs, loopRanges)([&] {
-      SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
-      LinalgScopedEmitter<IndexedValueTy,
-                          ConcreteOpTy>::emitScalarImplementation(allIvValues,
-                                                                  linalgOp);
-    });
-  }
-};
-
-/// Generates loop nest using scf.parallel. scf.parallel is only used for the
-/// outer parallel loops. All other loops are generated using scf.for
-/// operation.
-template <typename ConcreteOpTy>
-class GenerateLoopNest<scf::ParallelOp, ConcreteOpTy> {
-public:
-  using IndexedValueTy = StdIndexedValue;
-
-  static void doit(ConcreteOpTy linalgOp, ArrayRef<SubViewOp::Range> loopRanges,
-                   MutableArrayRef<Value> allIvs) {
-    // Only generate scf.parallel for outer consecutive "parallel"
-    // iterator_types.
-    // TODO(ravishankarm): Generate scf.parallel for all "parallel" iterator
-    // types, not just the outer most ones. Also handle "reduction" iterator
-    // types.
-    auto nOuterPar = linalgOp.iterator_types()
-                         .getValue()
-                         .take_while([](Attribute attr) {
-                           return attr.cast<StringAttr>().getValue() ==
-                                  getParallelIteratorTypeName();
-                         })
-                         .size();
-    // If there are no outer parallel loops, then number of loop ops is same as
-    // the number of loops, and they are all scf.for ops.
-    if (nOuterPar) {
-      GenericLoopNestRangeBuilder<scf::ParallelOp>(
-          allIvs.take_front(nOuterPar), loopRanges.take_front(nOuterPar))([&] {
-        GenericLoopNestRangeBuilder<scf::ForOp>(
-            allIvs.drop_front(nOuterPar),
-            loopRanges.drop_front(nOuterPar))([&] {
-          SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
-          LinalgScopedEmitter<StdIndexedValue, ConcreteOpTy>::
-              emitScalarImplementation(allIvValues, linalgOp);
-        });
-      });
-    } else {
-      // If there are no parallel loops then fallback to generating all scf.for
-      // operations.
-      GenericLoopNestRangeBuilder<scf::ForOp>(allIvs, loopRanges)([&] {
-        SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
-        LinalgScopedEmitter<StdIndexedValue,
-                            ConcreteOpTy>::emitScalarImplementation(allIvValues,
-                                                                    linalgOp);
-      });
-    }
-  }
-};
-} // namespace
-
 template <typename LoopTy, typename ConcreteOpTy>
 Optional<LinalgLoops> linalgOpToLoopsImpl(Operation *op, OpBuilder &builder) {
-  using Impl = GenerateLoopNest<LoopTy, ConcreteOpTy>;
-  using IndexedValueTy =
-      typename GenerateLoopNest<LoopTy, ConcreteOpTy>::IndexedValueTy;
+  using IndexedValueTy = typename GenerateLoopNest<LoopTy>::IndexedValueTy;
 
   ScopedContext scope(builder, op->getLoc());
 
@@ -591,7 +520,13 @@ Optional<LinalgLoops> linalgOpToLoopsImpl(Operation *op, OpBuilder &builder) {
       emitLoopRanges(scope.getBuilderRef(), scope.getLocation(), invertedMap,
                      getViewSizes(builder, linalgOp));
   assert(loopRanges.size() == allIvs.size());
-  Impl::doit(linalgOp, loopRanges, allIvs);
+  GenerateLoopNest<LoopTy>::doit(
+      allIvs, loopRanges, linalgOp.iterator_types().getValue(), [&] {
+        SmallVector<Value, 4> allIvValues(allIvs.begin(), allIvs.end());
+        LinalgScopedEmitter<IndexedValueTy,
+                            ConcreteOpTy>::emitScalarImplementation(allIvValues,
+                                                                    linalgOp);
+      });
   // Number of loop ops might be different from the number of ivs since some
   // loops like affine.parallel and scf.parallel have multiple ivs.
   llvm::SetVector<Operation *> loopSet;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 2d875d4e95e4d..5b4fec4bbf20e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -376,7 +376,11 @@ Optional<TiledLinalgOp> static tileLinalgOpImpl(
   // 3. Create the tiled loops.
   LinalgOp res = op;
   SmallVector<Value, 4> ivs(loopRanges.size());
-  GenericLoopNestRangeBuilder<LoopTy>(ivs, loopRanges)([&] {
+  SmallVector<Attribute, 4> iteratorTypes =
+      llvm::to_vector<4>(op.iterator_types().cast<ArrayAttr>().getValue());
+  if (!options.interchangeVector.empty())
+    applyPermutationToVector(iteratorTypes, options.interchangeVector);
+  GenerateLoopNest<LoopTy>::doit(ivs, loopRanges, iteratorTypes, [&] {
     auto &b = ScopedContext::getBuilderRef();
     auto loc = ScopedContext::getLocation();
     SmallVector<Value, 4> ivValues(ivs.begin(), ivs.end());
@@ -384,8 +388,8 @@ Optional<TiledLinalgOp> static tileLinalgOpImpl(
     // If we have to apply a permutation to the tiled loop nest, we have to
     // reorder the induction variables This permutation is the right one
     // assuming that loopRanges have previously been permuted by
-    // (i,j,k)->(k,i,j) So this permutation should be the inversePermutation of
-    // that one: (d0,d1,d2)->(d2,d0,d1)
+    // (i,j,k)->(k,i,j) So this permutation should be the inversePermutation
+    // of that one: (d0,d1,d2)->(d2,d0,d1)
     if (!options.interchangeVector.empty())
       ivValues = applyMapToValues(b, loc, invPermutationMap, ivValues);
 
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 4f86b934172b0..cd8b17650bb11 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
+#include "mlir/Dialect/SCF/EDSC/Builders.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/AffineExpr.h"
@@ -101,3 +102,91 @@ mlir::linalg::getAssumedNonViewOperands(LinalgOp linalgOp) {
   }
   return res;
 }
+
+bool mlir::linalg::isParallelIteratorType(Attribute attr) {
+  if (auto strAttr = attr.dyn_cast<StringAttr>()) {
+    return strAttr.getValue() == getParallelIteratorTypeName();
+  }
+  return false;
+}
+
+bool mlir::linalg::isReductionIteratorType(Attribute attr) {
+  if (auto strAttr = attr.dyn_cast<StringAttr>()) {
+    return strAttr.getValue() == getReductionIteratorTypeName();
+  }
+  return false;
+}
+
+bool mlir::linalg::isWindowIteratorType(Attribute attr) {
+  if (auto strAttr = attr.dyn_cast<StringAttr>()) {
+    return strAttr.getValue() == getWindowIteratorTypeName();
+  }
+  return false;
+}
+
+/// Explicit instantiation of loop nest generator for different loop types.
+template struct mlir::linalg::GenerateLoopNest<scf::ForOp>;
+template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>;
+template struct mlir::linalg::GenerateLoopNest<AffineForOp>;
+
+/// Specialization of loop nest generator for scf.parallel loops to handle
+/// iterator types that are not parallel. These are generated as sequential
+/// loops.
+template <>
+void mlir::linalg::GenerateLoopNest<scf::ForOp>::doit(
+    MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
+    ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
+  edsc::GenericLoopNestRangeBuilder<scf::ForOp>(allIvs, loopRanges)(fun);
+}
+
+template <>
+void mlir::linalg::GenerateLoopNest<AffineForOp>::doit(
+    MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
+    ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
+  edsc::GenericLoopNestRangeBuilder<AffineForOp>(allIvs, loopRanges)(fun);
+}
+
+template <>
+void mlir::linalg::GenerateLoopNest<scf::ParallelOp>::doit(
+    MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
+    ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
+  // Check if there is nothing to do here. This is also the recursion
+  // termination.
+  if (loopRanges.empty())
+    return;
+  size_t nOuterPar = iteratorTypes.take_front(loopRanges.size())
+                         .take_while(isParallelIteratorType)
+                         .size();
+  if (nOuterPar == 0 && loopRanges.size() == 1)
+    // Generate the sequential for loop for the remaining non-parallel loop.
+    return GenerateLoopNest<scf::ForOp>::doit(allIvs, loopRanges, iteratorTypes,
+                                              fun);
+  if (nOuterPar == 0) {
+    // The immediate outer loop is not parallel. Generate a scf.for op for this
+    // loop, but there might be subsequent loops that are parallel. Use
+    // recursion to find those.
+    auto nestedFn = [&]() {
+      GenerateLoopNest<scf::ParallelOp>::doit(allIvs.drop_front(),
+                                              loopRanges.drop_front(),
+                                              iteratorTypes.drop_front(), fun);
+    };
+    return GenerateLoopNest<scf::ForOp>::doit(allIvs[0], loopRanges[0],
+                                              iteratorTypes[0], nestedFn);
+  }
+  if (nOuterPar == loopRanges.size()) {
+    // All loops are parallel, so generate the scf.parallel op.
+    return edsc::GenericLoopNestRangeBuilder<scf::ParallelOp>(allIvs,
+                                                              loopRanges)(fun);
+  }
+  // Generate scf.parallel for the outer parallel loops. The next inner loop is
+  // sequential, but there might be more parallel loops after that. So recurse
+  // into the same method.
+  auto nestedFn = [&]() {
+    GenerateLoopNest<scf::ParallelOp>::doit(
+        allIvs.drop_front(nOuterPar), loopRanges.drop_front(nOuterPar),
+        iteratorTypes.drop_front(nOuterPar), fun);
+  };
+  return GenerateLoopNest<scf::ParallelOp>::doit(
+      allIvs.take_front(nOuterPar), loopRanges.take_front(nOuterPar),
+      iteratorTypes.take_front(nOuterPar), nestedFn);
+}
diff --git a/mlir/test/Dialect/Linalg/parallel_loops.mlir b/mlir/test/Dialect/Linalg/parallel_loops.mlir
index abe9cccc8b75b..2174ddc3c269d 100644
--- a/mlir/test/Dialect/Linalg/parallel_loops.mlir
+++ b/mlir/test/Dialect/Linalg/parallel_loops.mlir
@@ -57,6 +57,42 @@ func @lower_outer_parallel(%A: memref<?x?x?x?xf32>, %B: memref<?x?x?xf32>) {
 //   CHECK-DAG: %[[D3:.*]] = dim %{{.*}}, 3
 //       CHECK: scf.parallel (%[[IV0:.*]], %[[IV1:.*]]) = (%[[C0]], %[[C0]]) to (%[[D0]], %[[D1]]) step (%[[C1]], %[[C1]])
 //       CHECK:   scf.for %[[IV2:.*]] = %[[C0]] to %[[D2]] step %[[C1]]
-//       CHECK:     scf.for %[[IV3:.*]] = %[[C0]] to %[[D3]] step %[[C1]]
+//       CHECK:     scf.parallel (%[[IV3:.*]]) = (%[[C0]]) to (%[[D3]]) step (%[[C1]])
 //       CHECK:       load %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
 //       CHECK:       store %{{.*}}, %{{.*}}[%[[IV0]], %[[IV1]], %[[IV3]]]
+
+// -----
+
+#accesses = [
+  affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>,
+  affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
+]
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"],
+  indexing_maps = #accesses
+}
+
+func @lower_mixed_parallel(%A: memref<?x?x?x?x?x?xf32>, %B: memref<?x?x?x?xf32>) {
+  linalg.generic #trait %A, %B {
+    ^bb0(%a: f32, %b: f32):
+      linalg.yield %a: f32
+  } : memref<?x?x?x?x?x?xf32>, memref<?x?x?x?xf32>
+  return
+}
+// CHECK-LABEL: @lower_mixed_parallel
+//   CHECK-DAG: %[[C0:.*]] = constant 0
+//   CHECK-DAG: %[[C1:.*]] = constant 1
+//   CHECK-DAG: %[[D0:.*]] = dim %{{.*}}, 0
+//   CHECK-DAG: %[[D1:.*]] = dim %{{.*}}, 1
+//   CHECK-DAG: %[[D2:.*]] = dim %{{.*}}, 2
+//   CHECK-DAG: %[[D3:.*]] = dim %{{.*}}, 3
+//   CHECK-DAG: %[[D4:.*]] = dim %{{.*}}, 4
+//   CHECK-DAG: %[[D5:.*]] = dim %{{.*}}, 5
+//       CHECK: scf.parallel (%[[IV0:.*]], %[[IV1:.*]]) = (%[[C0]], %[[C0]]) to (%[[D0]], %[[D1]]) step (%[[C1]], %[[C1]])
+//       CHECK:   scf.for %[[IV2:.*]] = %[[C0]] to %[[D2]] step %[[C1]]
+//       CHECK:     scf.parallel (%[[IV3:.*]], %[[IV4:.*]]) = (%[[C0]], %[[C0]]) to (%[[D3]], %[[D4]]) step (%[[C1]], %[[C1]])
+//       CHECK:       scf.for %[[IV5:.*]] = %[[C0]] to %[[D5]] step %[[C1]]
+//       CHECK:       load %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]], %[[IV4]], %[[IV5]]]
+//       CHECK:       store %{{.*}}, %{{.*}}[%[[IV0]], %[[IV2]], %[[IV4]], %[[IV5]]]
diff --git a/mlir/test/Dialect/Linalg/tile_parallel_reduce.mlir b/mlir/test/Dialect/Linalg/tile_parallel_reduce.mlir
new file mode 100644
index 0000000000000..bfa14570aef13
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/tile_parallel_reduce.mlir
@@ -0,0 +1,108 @@
+// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=2,4,8" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=2" -split-input-file | FileCheck %s -check-prefix=TILE1
+// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=2,4" -split-input-file | FileCheck %s -check-prefix=TILE2
+
+func @gemm(%arg0 : memref<?x?xf32>,
+           %arg1 : memref<?x?xf32>,
+           %arg2 : memref<?x?xf32>)
+{
+  linalg.matmul(%arg0, %arg1, %arg2)
+    : memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>
+  return
+}
+// CHECK-LABEL: func @gemm
+//   CHECK-DAG:   %[[C2:.*]] = constant 2 : index
+//   CHECK-DAG:   %[[C4:.*]] = constant 4 : index
+//   CHECK-DAG:   %[[C8:.*]] = constant 8 : index
+//       CHECK:   scf.parallel (%[[ARG3:.*]], %[[ARG4:.*]]) =
+//  CHECK-SAME:     step (%[[C2]], %[[C4]])
+//       CHECK:     scf.for %[[ARG5:.*]] =
+//  CHECK-SAME:       step %[[C8]]
+//       CHECK:       %[[SV1:.*]] = subview %{{.*}}[%[[ARG3]], %[[ARG5]]]
+//       CHECK:       %[[SV2:.*]] = subview %{{.*}}[%[[ARG5]], %[[ARG4]]]
+//       CHECK:       %[[SV3:.*]] = subview %{{.*}}[%[[ARG3]], %[[ARG4]]]
+//       CHECK:       linalg.matmul(%[[SV1]], %[[SV2]], %[[SV3]])
+
+// TILE1-LABEL: func @gemm
+//   TILE1-DAG:   %[[C2:.*]] = constant 2 : index
+//       TILE1:   scf.parallel (%[[ARG3:.*]]) =
+//  TILE1-SAME:     step (%[[C2]])
+//       TILE1:     %[[SV1:.*]] = subview %{{.*}}[%[[ARG3]], 0]
+//       TILE1:     %[[SV3:.*]] = subview %{{.*}}[%[[ARG3]], 0]
+//   TILE1-NOT:     subview
+//       TILE1:     linalg.matmul(%[[SV1]], %{{.*}}, %[[SV3]])
+
+// TILE2-LABEL: func @gemm
+//   TILE2-DAG:   %[[C2:.*]] = constant 2 : index
+//   TILE2-DAG:   %[[C4:.*]] = constant 4 : index
+//       TILE2:   scf.parallel (%[[ARG3:.*]], %[[ARG4:.*]]) =
+//  TILE2-SAME:     step (%[[C2]], %[[C4]])
+//       TILE2:       %[[SV1:.*]] = subview %{{.*}}[%[[ARG3]], 0]
+//       TILE2:       %[[SV2:.*]] = subview %{{.*}}[0, %[[ARG4]]]
+//       TILE2:       %[[SV3:.*]] = subview %{{.*}}[%[[ARG3]], %[[ARG4]]]
+//       TILE2:       linalg.matmul(%[[SV1]], %[[SV2]], %[[SV3]])
+
+// -----
+
+#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1)>
+#accesses = [#map0, #map1, #map2]
+#trait = {
+  args_in = 2 : i64,
+  args_out = 1 : i64,
+  iterator_types = ["reduction", "parallel", "reduction"],
+  indexing_maps = #accesses
+}
+
+func @reduction(%arg0 : memref<?x?x?xf32>,
+                %arg1 : memref<?x?xf32>,
+                %arg2 : memref<?xf32>)
+{
+  linalg.generic #trait %arg0, %arg1, %arg2 {
+  ^bb0(%arg3 : f32, %arg4 : f32, %arg5 : f32):
+    %0 = addf %arg3, %arg4 : f32
+    %1 = addf %0, %arg5 : f32
+    linalg.yield %1 : f32
+  } : memref<?x?x?xf32>, memref<?x?xf32>, memref<?xf32>
+  return
+}
+
+// CHECK-LABEL: func @reduction
+//   CHECK-DAG:   %[[C2:.*]] = constant 2 : index
+//   CHECK-DAG:   %[[C4:.*]] = constant 4 : index
+//   CHECK-DAG:   %[[C8:.*]] = constant 8 : index
+//       CHECK:   scf.for %[[ARG3:.*]] =
+//  CHECK-SAME:     step %[[C2]]
+//       CHECK:     scf.parallel (%[[ARG4:.*]]) =
+//  CHECK-SAME:       step (%[[C4]])
+//       CHECK:       scf.for %[[ARG5:.*]] =
+//  CHECK-SAME:         step %[[C8]]
+//       CHECK:         %[[SV1:.*]] = subview %{{.*}}[%[[ARG3]], %[[ARG4]], %[[ARG5]]]
+//       CHECK:         %[[SV2:.*]] = subview %{{.*}}[%[[ARG3]], %[[ARG5]]]
+//       CHECK:         %[[SV3:.*]] = subview %{{.*}}[%[[ARG4]]]
+//       CHECK:         linalg.generic
+//  CHECK-SAME:           %[[SV1]], %[[SV2]], %[[SV3]]
+
+// TILE1-LABEL: func @reduction
+//   TILE1-DAG:   %[[C2:.*]] = constant 2 : index
+//       TILE1:   scf.for %[[ARG3:.*]] =
+//  TILE1-SAME:     step %[[C2]]
+//       TILE1:         %[[SV1:.*]] = subview %{{.*}}[%[[ARG3]], 0, 0]
+//       TILE1:         %[[SV2:.*]] = subview %{{.*}}[%[[ARG3]], 0]
+//   TILE1-NOT:         subview
+//       TILE1:         linalg.generic
+//  TILE1-SAME:           %[[SV1]], %[[SV2]], %{{.*}}
+
+// TILE2-LABEL: func @reduction
+//   TILE2-DAG:   %[[C2:.*]] = constant 2 : index
+//   TILE2-DAG:   %[[C4:.*]] = constant 4 : index
+//       TILE2:   scf.for %[[ARG3:.*]] =
+//  TILE2-SAME:     step %[[C2]]
+//       TILE2:     scf.parallel (%[[ARG4:.*]]) =
+//  TILE2-SAME:       step (%[[C4]])
+//       TILE2:         %[[SV1:.*]] = subview %{{.*}}[%[[ARG3]], %[[ARG4]], 0]
+//       TILE2:         %[[SV2:.*]] = subview %{{.*}}[%[[ARG3]], 0]
+//       TILE2:         %[[SV3:.*]] = subview %{{.*}}[%[[ARG4]]]
+//       TILE2:         linalg.generic
+//  TILE2-SAME:           %[[SV1]], %[[SV2]], %[[SV3]]
diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir
index ce868d156f6d8..4c46c74fe4909 100644
--- a/mlir/test/Dialect/Linalg/transform-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir
@@ -44,7 +44,8 @@ func @matvec(%A: memref<?x?xf32, offset: ?, strides: [?, 1]>,
 // CHECK-DAG:     %[[c0:.*]] = constant 0 : index
 // CHECK-DAG:     %[[c5:.*]] = constant 5 : index
 // CHECK-DAG:     %[[c6:.*]] = constant 6 : index
-// CHECK:         scf.parallel {{.*}} step (%[[c5]], %[[c6]])
+// CHECK:         scf.parallel {{.*}} step (%[[c5]])
+// CHECK:           scf.for {{.*}} step %[[c6]]
 // CHECK:             linalg.matvec({{.*}}, {{.*}}, {{.*}}) : memref<?x?xf32, #[[STRIDED_2D]]>, memref<?xf32, #[[STRIDED_1D]]>, memref<?xf32, #[[STRIDED_1D]]>
 
 func @matmul(%A: memref<?x?xf32, offset: ?, strides: [?, 1]>,
@@ -364,3 +365,25 @@ func @aligned_promote_fill(%arg0: memref<?x?xf32, offset: ?, strides: [?, 1]>) {
 // CHECK:         linalg.fill(%[[v0]], {{%.*}}) : memref<?x?xf32>, f32
 // CHECK:         linalg.copy(%[[s0]], %[[l0]]) : memref<?x?xf32, #map{{.*}}>, memref<?x?xf32, #map{{.*}}>
 // CHECK:         linalg.fill(%[[v0]], %[[cf]]) : memref<?x?xf32>, f32
+
+func @tile_permute_parallel_loop(%arg0: memref<?x?xf32>,
+                                 %arg1: memref<?x?xf32>,
+                                 %arg2: memref<?x?xf32>) {
+  linalg.matmul(%arg0, %arg1, %arg2) {__internal_linalg_transform__ = "par__with_perm__"}
+    : memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>
+  return
+}
+// CHECK-LABEL: func @tile_permute_parallel_loop
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//   CHECK-DAG:   %[[C16:.*]] = constant 16 : index
+//   CHECK-DAG:   %[[C8:.*]] = constant 8 : index
+//   CHECK-DAG:   %[[C4:.*]] = constant 4 : index
+//   CHECK-DAG:   %[[C0:.*]] = constant 0 : index
+//   CHECK-DAG:   %[[D0:.*]] = dim %[[ARG0]], 0
+//   CHECK-DAG:   %[[D1:.*]] = dim %[[ARG0]], 1
+//   CHECK-DAG:   %[[D2:.*]] = dim %[[ARG1]], 1
+//       CHECK:   scf.parallel (%{{.*}}) = (%[[C0]]) to (%[[D2]]) step (%[[C8]])
+//       CHECK:     scf.for %{{.*}} = %[[C0]] to %[[D1]] step %[[C4]]
+//       CHECK:       scf.parallel (%{{.*}}) = (%[[C0]]) to (%[[D0]]) step (%[[C16]])
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index e38153058419c..7547e2953ef21 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -101,6 +101,14 @@ static void applyPatterns(FuncOp funcOp) {
       ctx, LinalgTilingOptions().setTileSizes({5, 6}).setInterchange({1, 0}),
       LinalgMarker({"__with_perm__"}, "L1__with_perm__"));
 
+  patterns.insert<LinalgTilingPattern<MatmulOp>>(
+      ctx,
+      LinalgTilingOptions()
+          .setTileSizes({16, 8, 4})
+          .setInterchange({1, 2, 0})
+          .setLoopType(LinalgTilingLoopType::ParallelLoops),
+      LinalgMarker({"par__with_perm__"}, "after_par__with_perm__"));
+
   //===--------------------------------------------------------------------===//
   // Linalg to loops patterns.
   //===--------------------------------------------------------------------===//

From 0ed2d4c7cba8fb15e51d0f6f4e9011027c17085c Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <ravishankarm@google.com>
Date: Tue, 26 May 2020 16:35:59 -0700
Subject: [PATCH 180/770] [mlir][linalg] Allow promotion to use callbacks for
 alloc/dealloc/copies.

Add options to LinalgPromotion to use callbacks for implementing the
allocation and deallocation of buffers used for the promoted subviews,
and to copy data between the original subviews and the allocated
buffers.
Also some misc. cleanup of the code.

Differential Revision: https://reviews.llvm.org/D80365
---
 .../Dialect/Linalg/Transforms/Transforms.h    |  73 +++-
 .../include/mlir/Dialect/Linalg/Utils/Utils.h |  22 --
 .../Dialect/Linalg/Transforms/Promotion.cpp   | 331 +++++++++++-------
 .../Dialect/Linalg/Transforms/Transforms.cpp  |  19 +-
 .../Dialect/Linalg/promotion_options.mlir     |  33 ++
 .../lib/Transforms/TestLinalgTransforms.cpp   |  70 ++++
 6 files changed, 369 insertions(+), 179 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/promotion_options.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 6d34a0943e5e3..2da631956572f 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -57,18 +57,27 @@ Optional<TiledLinalgOp> tileLinalgOp(OpBuilder &b, LinalgOp op,
 /// (i.e. `[1,1,2]` is an invalid permutation).
 LinalgOp interchange(LinalgOp op, ArrayRef<unsigned> interchangeVector);
 
-/// Promotes the `subViews` into a new buffer allocated at the insertion point
-/// `b`. Promotion occurs in 3 steps:
-///   1. Create a new buffer for a full tile (i.e. not clipped at the boundary).
-///   2. Take a full view on the buffer and `linalg.fill` it with zeros (use
-///      float zero for now).
-///   3. Take a partial slice of the full view in step 2. and copy into it.
-/// Infers statically sized buffers from subViews unless `dynamicBuffers` is
-/// true.
-///
-/// Returns a list of PromotionInfo which hold the promoted buffer and the
-/// full and partial views indexing into the buffer.
-// TODO: revisit dynamicBuffers option.
+/// Callback function type used to perform the allocation for the promoted
+/// `subView`. In `boundingSubViewsize` a best attempt is made to find the
+/// smallest constant value for the size of the buffer needed for each
+/// dimension. If that is not possible, contains the dynamic size of the
+/// subview. The call back should return the buffer to use.
+using AllocBufferCallbackFn = std::function<Optional<Value>(
+    OpBuilder &b, SubViewOp subView, ArrayRef<Value> boundingSubViewSize,
+    OperationFolder *folder)>;
+
+/// Callback function type used to deallocate the buffers used to hold the
+/// promoted subview.
+using DeallocBufferCallbackFn =
+    std::function<LogicalResult(OpBuilder &b, Value buffer)>;
+
+/// Callback function type used to insert copy from original subview to subview
+/// of the promoted region for the read operands/subview of promoted region to
+/// original subview for the results. The copy has to happen from `src` to
+/// `dst`.
+using CopyCallbackFn =
+    std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>;
+
 struct LinalgPromotionOptions {
   /// Indices of subViews to promote. If `None`, try to promote all operands.
   Optional<DenseSet<unsigned>> operandsToPromote = None;
@@ -111,10 +120,44 @@ struct LinalgPromotionOptions {
     alignment = align;
     return *this;
   }
+  /// Callback function to do the allocation of the promoted buffer. If None,
+  /// then the default allocation scheme of allocating a memref<?xi8> buffer
+  /// followed by a view operation is used.
+  Optional<AllocBufferCallbackFn> allocationFn = None;
+  Optional<DeallocBufferCallbackFn> deallocationFn = None;
+  LinalgPromotionOptions &
+  setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn,
+                               DeallocBufferCallbackFn const &deallocFn) {
+    allocationFn = allocFn;
+    deallocationFn = deallocFn;
+    return *this;
+  }
+
+  /// Callback function to do the copy of data to and from the promoted
+  /// subview. If None then a linalg.copy is used.
+  Optional<CopyCallbackFn> copyInFn = None;
+  Optional<CopyCallbackFn> copyOutFn = None;
+  LinalgPromotionOptions &setCopyInOutFns(CopyCallbackFn const &copyIn,
+                                          CopyCallbackFn const &copyOut) {
+    copyInFn = copyIn;
+    copyOutFn = copyOut;
+    return *this;
+  }
 };
-LinalgOp promoteSubViews(OpBuilder &b, LinalgOp op,
-                         LinalgPromotionOptions options,
-                         OperationFolder *folder = nullptr);
+
+/// Promotes the `subViews` into a new buffer allocated at the insertion point
+/// `b`. Promotion occurs in 3 steps:
+///   1. Create a new buffer for a full tile (i.e. not clipped at the boundary).
+///   2. Take a full view on the buffer.
+///   3. Take a partial slice of the full view in step 2. and copy into it.
+/// Infers statically sized buffers from subViews unless `dynamicBuffers` is
+/// true.
+///
+/// Returns the modified linalg op (the modification happens in place) as well
+/// as all the copy ops created.
+Optional<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
+                                   LinalgPromotionOptions options,
+                                   OperationFolder *folder = nullptr);
 
 /// Emit a suitable vector form for a Linalg op with fully static shape.
 void vectorizeLinalgOp(OpBuilder &builder, Operation *op);
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index c8a5d83438f56..235dedd604017 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -117,28 +117,6 @@ SmallVector<Value, 4> applyMapToValues(OpBuilder &b, Location loc,
                                        AffineMap map, ArrayRef<Value> values,
                                        OperationFolder *folder = nullptr);
 
-struct PromotionInfo {
-  Value buffer;
-  Value fullLocalView;
-  Value partialLocalView;
-};
-
-/// Promotes the `subViews` into a new buffer allocated at the insertion point
-/// `b`. For now, promotion occurs in 3 steps:
-///   1. Create a new buffer for a full tile (i.e. not clipped at the boundary).
-///   2. Take a full view on the buffer and `linalg.fill` it with zeros (use
-///      float zero for now).
-///   3. Take a partial slice of the full view in step 2. and copy into it.
-/// Infers statically sized buffers from subViews unless `dynamicBuffers` is
-/// true.
-///
-/// Returns a list of PromotionInfo which hold the promoted buffer and the
-/// full and partial views indexing into the buffer.
-SmallVector<PromotionInfo, 8>
-promoteSubViews(OpBuilder &b, Location loc, ArrayRef<Value> subViews,
-                bool dynamicBuffers = false, int64_t alignment = 0,
-                OperationFolder *folder = nullptr);
-
 /// Returns all the operands of `linalgOp` that are not views.
 /// Asserts that these operands are value types to allow transformations like
 /// tiling to just use the values when cloning `linalgOp`.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
index 44de2a1021c27..de8514f0fa41a 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -25,8 +25,7 @@
 #include "mlir/IR/AffineMap.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/FoldUtils.h"
-
-#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace mlir;
@@ -35,7 +34,7 @@ using namespace mlir::edsc::intrinsics;
 using namespace mlir::linalg;
 using namespace mlir::scf;
 
-using llvm::SetVector;
+using llvm::MapVector;
 
 using folded_affine_min = FoldedValueBuilder<AffineMinOp>;
 using folded_linalg_range = FoldedValueBuilder<linalg::RangeOp>;
@@ -45,6 +44,87 @@ using folded_std_view = FoldedValueBuilder<ViewOp>;
 
 #define DEBUG_TYPE "linalg-promotion"
 
+/// If `size` comes from an AffineMinOp and one of the values of AffineMinOp
+/// is a constant then return a new value set to the smallest such constant.
+/// Otherwise return size.
+static Value extractSmallestConstantBoundingSize(OpBuilder &b, Location loc,
+                                                 Value size) {
+  Optional<int64_t> boundingConst = {};
+  if (auto affineMinOp = size.getDefiningOp<AffineMinOp>()) {
+    for (auto e : affineMinOp.getAffineMap().getResults())
+      if (auto cst = e.dyn_cast<AffineConstantExpr>())
+        boundingConst = boundingConst
+                            ? std::min(boundingConst.getValue(), cst.getValue())
+                            : cst.getValue();
+  } else if (auto constIndexOp = size.getDefiningOp<ConstantOp>()) {
+    if (constIndexOp.getType().isa<IndexType>())
+      boundingConst = constIndexOp.value().cast<IntegerAttr>().getInt();
+  }
+  return boundingConst && *boundingConst >= 0
+             ? b.create<ConstantIndexOp>(loc, *boundingConst)
+             : size;
+}
+
+/// Alloc a new buffer of `size`. If `dynamicBuffers` is true allocate exactly
+/// the size needed, otherwise try to allocate a static bounding box.
+static Value allocBuffer(Type elementType, Value size, bool dynamicBuffers,
+                         OperationFolder *folder,
+                         Optional<unsigned> alignment = None) {
+  auto *ctx = size.getContext();
+  auto width = llvm::divideCeil(elementType.getIntOrFloatBitWidth(), 8);
+  IntegerAttr alignment_attr;
+  if (alignment.hasValue())
+    alignment_attr =
+        IntegerAttr::get(IntegerType::get(64, ctx), alignment.getValue());
+  if (!dynamicBuffers)
+    if (auto cst = size.getDefiningOp<ConstantIndexOp>())
+      return std_alloc(
+          MemRefType::get(width * cst.getValue(), IntegerType::get(8, ctx)),
+          ValueRange{}, alignment_attr);
+  Value mul =
+      folded_std_muli(folder, folded_std_constant_index(folder, width), size);
+  return std_alloc(MemRefType::get(-1, IntegerType::get(8, ctx)), mul,
+                   alignment_attr);
+}
+
+/// Default allocation callback function. This allocates a promoted buffer when
+/// no call back to do so is provided. The default is to allocate a
+/// memref<..xi8> and return a view to get a memref type of shape
+/// boundingSubViewSize.
+static Optional<Value>
+allocBufferCallBack(OpBuilder &builder, SubViewOp subView,
+                    ArrayRef<Value> boundingSubViewSize, bool dynamicBuffers,
+                    Optional<unsigned> alignment, OperationFolder *folder) {
+  ShapedType viewType = subView.getType();
+  int64_t rank = viewType.getRank();
+  (void)rank;
+  assert(rank > 0 && boundingSubViewSize.size() == static_cast<size_t>(rank));
+  auto zero = folded_std_constant_index(folder, 0);
+  auto one = folded_std_constant_index(folder, 1);
+
+  Value allocSize = one;
+  for (auto size : llvm::enumerate(boundingSubViewSize))
+    allocSize = folded_std_muli(folder, allocSize, size.value());
+  Value buffer = allocBuffer(viewType.getElementType(), allocSize,
+                             dynamicBuffers, folder, alignment);
+  SmallVector<int64_t, 4> dynSizes(boundingSubViewSize.size(),
+                                   ShapedType::kDynamicSize);
+  Value view = folded_std_view(
+      folder, MemRefType::get(dynSizes, viewType.getElementType()), buffer,
+      zero, boundingSubViewSize);
+  return view;
+}
+
+/// Default implementation of deallocation of the buffer use for promotion. It
+/// expects to get the same value that the default allocation method returned,
+/// i.e. result of a ViewOp.
+static LogicalResult deallocCallBack(OpBuilder &b, Value fullLocalView) {
+  auto viewOp = fullLocalView.getDefiningOp<ViewOp>();
+  assert(viewOp && "expected full local view to be a ViewOp");
+  std_dealloc(viewOp.source());
+  return success();
+}
+
 namespace {
 
 /// Helper struct that captures the information required to apply the
@@ -55,81 +135,65 @@ struct LinalgOpInstancePromotionOptions {
   LinalgOpInstancePromotionOptions(LinalgOp op,
                                    const LinalgPromotionOptions &options);
   /// SubViews to promote.
-  SetVector<Value> subViews;
+  MapVector<unsigned, Value> subViews;
   /// True if the full view should be used for the promoted buffer.
   DenseMap<Value, bool> useFullTileBuffers;
+
+  /// Callback functions for allocation and deallocation of promoted buffers, as
+  /// well as to copy the data into and out of these buffers.
+  AllocBufferCallbackFn allocationFn;
+  DeallocBufferCallbackFn deallocationFn;
+  CopyCallbackFn copyInFn;
+  CopyCallbackFn copyOutFn;
+
   /// Allow the use of dynamicaly-sized buffers.
   bool dynamicBuffers;
   /// Alignment of promoted buffer.
   Optional<unsigned> alignment;
 };
+
+struct PromotionInfo {
+  Value fullLocalView;
+  Value partialLocalView;
+};
 } // namespace
 
 LinalgOpInstancePromotionOptions::LinalgOpInstancePromotionOptions(
     LinalgOp linalgOp, const LinalgPromotionOptions &options)
-    : subViews(), useFullTileBuffers(), dynamicBuffers(options.dynamicBuffers),
+    : subViews(), dynamicBuffers(options.dynamicBuffers),
       alignment(options.alignment) {
   unsigned nBuffers = linalgOp.getNumInputsAndOutputBuffers();
   auto vUseFullTileBuffers =
       options.useFullTileBuffers.getValueOr(llvm::SmallBitVector());
   vUseFullTileBuffers.resize(nBuffers, options.useFullTileBuffersDefault);
 
-  if (options.operandsToPromote.hasValue()) {
-    for (auto it : llvm::enumerate(options.operandsToPromote.getValue())) {
-      auto *op = linalgOp.getBuffer(it.value()).getDefiningOp();
-      if (auto sv = dyn_cast_or_null<SubViewOp>(op)) {
-        subViews.insert(sv);
-        useFullTileBuffers[sv] = vUseFullTileBuffers[it.index()];
-      }
-    }
-  } else {
-    for (unsigned idx = 0; idx < nBuffers; ++idx) {
-      auto *op = linalgOp.getBuffer(idx).getDefiningOp();
-      if (auto sv = dyn_cast_or_null<SubViewOp>(op)) {
-        subViews.insert(sv);
-        useFullTileBuffers[sv] = vUseFullTileBuffers[idx];
-      }
+  for (unsigned idx = 0; idx != nBuffers; ++idx) {
+    if (options.operandsToPromote && !options.operandsToPromote->count(idx))
+      continue;
+    auto *op = linalgOp.getBuffer(idx).getDefiningOp();
+    if (auto sv = dyn_cast_or_null<SubViewOp>(op)) {
+      subViews[idx] = sv;
+      useFullTileBuffers[sv] = vUseFullTileBuffers[idx];
     }
   }
-}
-
-/// If `size` comes from an AffineMinOp and one of the values of AffineMinOp
-/// is a constant then return a new value set to the smallest such constant.
-/// Otherwise return size.
-static Value extractSmallestConstantBoundingSize(OpBuilder &b, Location loc,
-                                                 Value size) {
-  auto affineMinOp = size.getDefiningOp<AffineMinOp>();
-  if (!affineMinOp)
-    return size;
-  int64_t minConst = std::numeric_limits<int64_t>::max();
-  for (auto e : affineMinOp.getAffineMap().getResults())
-    if (auto cst = e.dyn_cast<AffineConstantExpr>())
-      minConst = std::min(minConst, cst.getValue());
-  return (minConst == std::numeric_limits<int64_t>::max())
-             ? size
-             : b.create<ConstantIndexOp>(loc, minConst);
-}
 
-/// Alloc a new buffer of `size`. If `dynamicBuffers` is true allocate exactly
-/// the size needed, otherwise try to allocate a static bounding box.
-static Value allocBuffer(Type elementType, Value size, bool dynamicBuffers,
-                         OperationFolder *folder,
-                         Optional<unsigned> alignment = None) {
-  auto *ctx = size.getContext();
-  auto width = llvm::divideCeil(elementType.getIntOrFloatBitWidth(), 8);
-  IntegerAttr alignment_attr;
-  if (alignment.hasValue())
-    alignment_attr =
-        IntegerAttr::get(IntegerType::get(64, ctx), alignment.getValue());
-  if (!dynamicBuffers)
-    if (auto cst = size.getDefiningOp<ConstantIndexOp>())
-      return std_alloc(
-          MemRefType::get(width * cst.getValue(), IntegerType::get(8, ctx)),
-          ValueRange{}, alignment_attr);
-  Value mul =
-      folded_std_muli(folder, folded_std_constant_index(folder, width), size);
-  return std_alloc(MemRefType::get(-1, IntegerType::get(8, ctx)), mul,
-                   alignment_attr);
+  allocationFn =
+      (options.allocationFn ? *(options.allocationFn)
+                            : [&](OpBuilder &builder, SubViewOp subViewOp,
+                                  ArrayRef<Value> boundingSubViewSize,
+                                  OperationFolder *folder) -> Optional<Value> {
+        return allocBufferCallBack(builder, subViewOp, boundingSubViewSize,
+                                   dynamicBuffers, alignment, folder);
+      });
+  deallocationFn =
+      (options.deallocationFn ? *(options.deallocationFn) : deallocCallBack);
+  auto defaultCopyCallBack = [&](OpBuilder &builder, Value src,
+                                 Value dst) -> LogicalResult {
+    linalg_copy(src, dst);
+    return success();
+  };
+  copyInFn = (options.copyInFn ? *(options.copyInFn) : defaultCopyCallBack);
+  copyOutFn = (options.copyOutFn ? *(options.copyOutFn) : defaultCopyCallBack);
 }
 
 // Performs promotion of a `subView` into a local buffer of the size of the
@@ -149,45 +213,41 @@ static Value allocBuffer(Type elementType, Value size, bool dynamicBuffers,
 // To account for general boundary effects, padding must be performed on the
 // boundary tiles. For now this is done with an unconditional `fill` op followed
 // by a partial `copy` op.
-static PromotionInfo promoteSubviewAsNewBuffer(OpBuilder &b, Location loc,
-                                               SubViewOp subView,
-                                               bool dynamicBuffers,
-                                               Optional<unsigned> alignment,
-                                               OperationFolder *folder) {
-  auto zero = folded_std_constant_index(folder, 0);
-  auto one = folded_std_constant_index(folder, 1);
-
+static Optional<PromotionInfo>
+promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, SubViewOp subView,
+                          LinalgOpInstancePromotionOptions const &options,
+                          OperationFolder *folder) {
   auto viewType = subView.getType();
   auto rank = viewType.getRank();
-  Value allocSize = one;
-  SmallVector<Value, 8> fullSizes, partialSizes;
+  SmallVector<Value, 4> fullSizes, partialSizes;
   fullSizes.reserve(rank);
   partialSizes.reserve(rank);
   for (auto en : llvm::enumerate(subView.getOrCreateRanges(b, loc))) {
-    auto rank = en.index();
     auto rangeValue = en.value();
     // Try to extract a tight constant.
     LLVM_DEBUG(llvm::dbgs() << "Extract tightest: " << rangeValue.size << "\n");
     Value size = extractSmallestConstantBoundingSize(b, loc, rangeValue.size);
     LLVM_DEBUG(llvm::dbgs() << "Extracted tightest: " << size << "\n");
-    allocSize = folded_std_muli(folder, allocSize, size);
     fullSizes.push_back(size);
-    partialSizes.push_back(folded_std_dim(folder, subView, rank));
+    partialSizes.push_back(folded_std_dim(folder, subView, en.index()));
   }
   SmallVector<int64_t, 4> dynSizes(fullSizes.size(), -1);
-  auto buffer = allocBuffer(viewType.getElementType(), allocSize,
-                            dynamicBuffers, folder, alignment);
-  auto fullLocalView = folded_std_view(
-      folder, MemRefType::get(dynSizes, viewType.getElementType()), buffer,
-      zero, fullSizes);
+  // If a callback is not specified, then use the default implementation for
+  // allocating the promoted buffer.
+  Optional<Value> fullLocalView =
+      options.allocationFn(b, subView, fullSizes, folder);
+  if (!fullLocalView)
+    return {};
+  auto zero = folded_std_constant_index(folder, 0);
+  auto one = folded_std_constant_index(folder, 1);
   SmallVector<Value, 4> zeros(fullSizes.size(), zero);
   SmallVector<Value, 4> ones(fullSizes.size(), one);
   auto partialLocalView =
-      folded_std_subview(folder, fullLocalView, zeros, partialSizes, ones);
-  return PromotionInfo{buffer, fullLocalView, partialLocalView};
+      folded_std_subview(folder, *fullLocalView, zeros, partialSizes, ones);
+  return PromotionInfo{*fullLocalView, partialLocalView};
 }
 
-static SmallVector<PromotionInfo, 8>
+static Optional<MapVector<unsigned, PromotionInfo>>
 promoteSubViews(OpBuilder &b, Location loc,
                 LinalgOpInstancePromotionOptions options,
                 OperationFolder *folder) {
@@ -195,24 +255,18 @@ promoteSubViews(OpBuilder &b, Location loc,
     return {};
 
   ScopedContext scope(b, loc);
-  SmallVector<PromotionInfo, 8> res;
-  res.reserve(options.subViews.size());
-  DenseMap<Value, PromotionInfo> promotionInfoMap;
-  for (auto v : options.subViews) {
-    SubViewOp subView = cast<SubViewOp>(v.getDefiningOp());
-    auto promotionInfo = promoteSubviewAsNewBuffer(
-        b, loc, subView, options.dynamicBuffers, options.alignment, folder);
-    promotionInfoMap.insert(std::make_pair(subView.getResult(), promotionInfo));
-    res.push_back(promotionInfo);
-  }
+  MapVector<unsigned, PromotionInfo> promotionInfoMap;
 
   for (auto v : options.subViews) {
-    SubViewOp subView = cast<SubViewOp>(v.getDefiningOp());
-    auto info = promotionInfoMap.find(v);
-    if (info == promotionInfoMap.end())
-      continue;
+    SubViewOp subView = cast<SubViewOp>(v.second.getDefiningOp());
+    Optional<PromotionInfo> promotionInfo =
+        promoteSubviewAsNewBuffer(b, loc, subView, options, folder);
+    if (!promotionInfo)
+      return {};
+    promotionInfoMap[v.first] = *promotionInfo;
+
     // Only fill the buffer if the full local view is used
-    if (!options.useFullTileBuffers[v])
+    if (!options.useFullTileBuffers[v.second])
       continue;
     Value fillVal;
     if (auto t = subView.getType().getElementType().dyn_cast<FloatType>())
@@ -220,75 +274,80 @@ promoteSubViews(OpBuilder &b, Location loc,
     else if (auto t =
                  subView.getType().getElementType().dyn_cast<IntegerType>())
       fillVal = folded_std_constant_int(folder, 0, t);
-    // TODO(ntv): fill is only necessary if `promotionInfo` has a full local
-    // view that is different from the partial local view and we are on the
-    // boundary.
-    linalg_fill(info->second.fullLocalView, fillVal);
+    linalg_fill(promotionInfo->fullLocalView, fillVal);
   }
 
+  // Copy data into the promoted buffers. Use callback if provided.
   for (auto v : options.subViews) {
-    auto info = promotionInfoMap.find(v);
+    auto info = promotionInfoMap.find(v.first);
     if (info == promotionInfoMap.end())
       continue;
-    linalg_copy(cast<SubViewOp>(v.getDefiningOp()),
-                info->second.partialLocalView);
+    if (failed(options.copyInFn(b, cast<SubViewOp>(v.second.getDefiningOp()),
+                                info->second.partialLocalView)))
+      return {};
   }
-  return res;
+  return promotionInfoMap;
 }
 
-static void promoteSubViews(OpBuilder &b, LinalgOp op,
-                            LinalgOpInstancePromotionOptions options,
-                            OperationFolder *folder) {
+static Optional<LinalgOp>
+promoteSubViews(OpBuilder &b, LinalgOp op,
+                LinalgOpInstancePromotionOptions options,
+                OperationFolder *folder) {
   assert(op.hasBufferSemantics() && "expected linalg op with buffer semantics");
 
   if (auto convOp = dyn_cast<linalg::ConvOp>(op.getOperation())) {
     // TODO(ntv): add a level of indirection to linalg.generic.
     if (convOp.padding())
-      llvm_unreachable("Unexpected conv with padding");
+      return {};
   }
 
   // 1. Promote the specified views and use them in the new op.
   auto loc = op.getLoc();
-  auto promotedBufferAndViews = promoteSubViews(b, loc, options, folder);
+  auto promotedBuffersAndViews = promoteSubViews(b, loc, options, folder);
+  if (!promotedBuffersAndViews ||
+      promotedBuffersAndViews->size() != options.subViews.size())
+    return {};
+
+  // 2. Append all other operands as they appear, this enforces that such
+  // operands are not views. This is to support cases such as FillOp taking
+  // extra scalars etc. Keep a reference to output buffers.
   SmallVector<Value, 8> opViews;
   opViews.reserve(op.getNumInputsAndOutputs());
   SmallVector<std::pair<Value, Value>, 8> writebackViews;
-  writebackViews.reserve(promotedBufferAndViews.size());
-  unsigned promotedIdx = 0;
-  for (auto view : op.getInputsAndOutputBuffers()) {
-    if (options.subViews.count(view) != 0) {
-      if (options.useFullTileBuffers[view])
-        opViews.push_back(promotedBufferAndViews[promotedIdx].fullLocalView);
+  writebackViews.reserve(promotedBuffersAndViews->size());
+  for (auto view : llvm::enumerate(op.getInputsAndOutputBuffers())) {
+    if (options.subViews.count(view.index()) != 0) {
+      if (options.useFullTileBuffers[view.value()])
+        opViews.push_back(
+            (*promotedBuffersAndViews)[view.index()].fullLocalView);
       else
-        opViews.push_back(promotedBufferAndViews[promotedIdx].partialLocalView);
-      writebackViews.emplace_back(std::make_pair(
-          view, promotedBufferAndViews[promotedIdx].partialLocalView));
-      promotedIdx++;
+        opViews.push_back(
+            (*promotedBuffersAndViews)[view.index()].partialLocalView);
+      if (view.index() >= op.getNumInputs())
+        writebackViews.emplace_back(std::make_pair(
+            view.value(),
+            (*promotedBuffersAndViews)[view.index()].partialLocalView));
     } else {
-      opViews.push_back(view);
+      opViews.push_back(view.value());
     }
   }
-
-  // 2. Append all other operands as they appear, this enforces that such
-  // operands are not views. This is to support cases such as FillOp taking
-  // extra scalars etc.
-  // Keep a reference to output buffers;
-  DenseSet<Value> originalOutputs(op.getOutputBuffers().begin(),
-                                  op.getOutputBuffers().end());
   op.getOperation()->setOperands(0, opViews.size(), opViews);
 
   OpBuilder::InsertionGuard guard(b);
   b.setInsertionPointAfter(op);
   ScopedContext scope(b, loc);
   // 3. Emit write-back for the promoted output views: copy the partial view.
-  for (auto viewAndPartialLocalView : writebackViews)
-    if (originalOutputs.count(viewAndPartialLocalView.first))
-      linalg_copy(viewAndPartialLocalView.second,
-                  viewAndPartialLocalView.first);
+  for (auto viewAndPartialLocalView : writebackViews) {
+    if (failed(options.copyOutFn(b, viewAndPartialLocalView.second,
+                                 viewAndPartialLocalView.first)))
+      return {};
+  }
 
   // 4. Dealloc all local buffers.
-  for (const auto &pi : promotedBufferAndViews)
-    std_dealloc(pi.buffer);
+  for (const auto &pi : *promotedBuffersAndViews) {
+    options.deallocationFn(b, pi.second.fullLocalView);
+  }
+  return op;
 }
 
 LogicalResult
@@ -312,13 +371,13 @@ mlir::linalg::promoteSubviewsPrecondition(Operation *op,
   return failure();
 }
 
-LinalgOp mlir::linalg::promoteSubViews(OpBuilder &b, LinalgOp linalgOp,
-                                       LinalgPromotionOptions options,
-                                       OperationFolder *folder) {
+Optional<LinalgOp> mlir::linalg::promoteSubViews(OpBuilder &b,
+                                                 LinalgOp linalgOp,
+                                                 LinalgPromotionOptions options,
+                                                 OperationFolder *folder) {
   LinalgOpInstancePromotionOptions linalgOptions(linalgOp, options);
-  ::promoteSubViews(
+  return ::promoteSubViews(
       b, linalgOp, LinalgOpInstancePromotionOptions(linalgOp, options), folder);
-  return linalgOp;
 }
 
 namespace {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 2ce949aa034c4..527d162298bf4 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -179,12 +179,19 @@ LogicalResult mlir::linalg::LinalgBasePromotionPattern::matchAndRewrite(
     return failure();
   if (failed(promoteSubviewsPrecondition(op, options)))
     return failure();
-  rewriter.updateRootInPlace(op, [&]() {
-    auto promotedOp = promoteSubViews(rewriter, op, options);
-    (void)promotedOp;
-    assert(promotedOp && "Unexpected pattern failure");
-    marker.replaceLinalgMarker(rewriter, op);
-  });
+
+  // TODO: We cannot use root update here. This pattern is creating other ops,
+  // so if the promotion fails, those need to be cleaned up, which doesn't seem
+  // to be happening here. So to fail properly, we should be cloning the op and
+  // deleting the previous op. This needs more investigation.
+  rewriter.startRootUpdate(op);
+  Optional<LinalgOp> promotedOp = promoteSubViews(rewriter, op, options);
+  if (!promotedOp) {
+    rewriter.cancelRootUpdate(op);
+    return op->emitError("subview promotion failed");
+  }
+  rewriter.finalizeRootUpdate(op);
+  marker.replaceLinalgMarker(rewriter, op);
   return success();
 }
 
diff --git a/mlir/test/Dialect/Linalg/promotion_options.mlir b/mlir/test/Dialect/Linalg/promotion_options.mlir
new file mode 100644
index 0000000000000..e6c8e2158fc3e
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/promotion_options.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-linalg-promotion-options -split-input-file | FileCheck %s
+
+func @gemm(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+   linalg.matmul(%a, %b, %c) {__internal_linalg_transform__ = "START"}
+     : memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>
+   return
+}
+
+//      CHECK: func @gemm
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//  CHECK-DAG: %[[C42:.+]] = constant 4.200000e+01 : f32
+//      CHECK: scf.for
+//      CHECK:   scf.for
+//      CHECK:     scf.for
+//      CHECK:       %[[T7:.+]] = subview %[[ARG0]]
+//      CHECK:       %[[T12:.+]] = subview %[[ARG1]]
+//      CHECK:       %[[T17:.+]] = subview %[[ARG2]]
+//      CHECK:       %[[T18:.+]] = alloc(%{{.*}}, %{{.*}}) : memref<?x?xf32, 3>
+//      CHECK:       %[[T19:.+]] = subview %[[T18]]
+//      CHECK:       %[[T20:.+]] = alloc(%{{.*}}, %{{.*}}) : memref<?x?xf32, 3>
+//      CHECK:       %[[T21:.+]] = subview %[[T20]]
+//      CHECK:       linalg.fill(%[[T19]], %[[C42]])
+//      CHECK:       linalg.copy(%[[T7]], %[[T19]])
+//      CHECK:       linalg.fill(%[[T21]], %[[C42]])
+//      CHECK:       linalg.copy(%[[T17]], %[[T21]])
+//      CHECK:       linalg.matmul(%[[T19]], %[[T12]], %[[T21]])
+//  CHECK-NOT:       linalg.fill
+//      CHECK:       linalg.copy(%[[T21]], %[[T17]])
+//      CHECK:       dealloc %[[T18]]
+//      CHECK:       dealloc %[[T20]]
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index 7547e2953ef21..c38494fe27783 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -45,6 +45,9 @@ struct TestLinalgTransforms
           "Test a fused pass that applies patterns from matmul to vectors via "
           "2-d tiling"),
       llvm::cl::init(false)};
+  Option<bool> testPromotionOptions{*this, "test-linalg-promotion-options",
+                                    llvm::cl::desc("Test promotion options"),
+                                    llvm::cl::init(false)};
 };
 } // end anonymous namespace
 
@@ -197,10 +200,77 @@ static void fillL1TilingAndMatmulToVectorPatterns(
               LinalgVectorizationPattern<CopyOp>>(context);
 }
 
+//===----------------------------------------------------------------------===//
+// Test promotion callbacks
+//===----------------------------------------------------------------------===//
+
+// Allocation call back
+static Optional<Value> allocCallBackFn(OpBuilder &b, SubViewOp subView,
+                                       ArrayRef<Value> boundingSubViewSize,
+                                       OperationFolder *folder) {
+  SmallVector<int64_t, 4> shape(boundingSubViewSize.size(), -1);
+  return b
+      .create<AllocOp>(subView.getLoc(),
+                       MemRefType::get(shape,
+                                       subView.getType().getElementType(),
+                                       /*affineMapComposition =*/{}, 3),
+                       boundingSubViewSize)
+      .getResult();
+}
+
+// Deallocation callback
+static LogicalResult deallocCallBackFn(OpBuilder &b, Value buffer) {
+  b.create<DeallocOp>(buffer.getLoc(), buffer);
+  return success();
+}
+
+// Copy callback used for both copy-in and copy-out (selected by isOutput)
+static LogicalResult copyCallBackFn(OpBuilder &b, Value src, Value dst,
+                                    bool isOutput) {
+  auto floatType = src.getType().cast<MemRefType>().getElementType();
+  if (!floatType.isa<FloatType>())
+    return failure();
+  if (!isOutput)
+    b.create<FillOp>(
+        src.getLoc(), dst,
+        b.create<ConstantOp>(src.getLoc(), FloatAttr::get(floatType, 42.0)));
+  b.create<CopyOp>(src.getLoc(), src, dst);
+  return success();
+}
+
+void fillPromotionCallBackPatterns(MLIRContext *context,
+                                   OwningRewritePatternList &patterns) {
+  patterns.insert<LinalgTilingPattern<MatmulOp>>(
+      context, LinalgTilingOptions().setTileSizes({16, 16, 16}),
+      LinalgMarker({"START"}, "PROMOTE"));
+  patterns.insert<LinalgPromotionPattern<MatmulOp>>(
+      context,
+      LinalgPromotionOptions()
+          .setOperandsToPromote({0, 2})
+          .setUseFullTileBuffers({false, false})
+          .setAllocationDeallocationFns(allocCallBackFn, deallocCallBackFn)
+          .setCopyInOutFns(
+              [](OpBuilder &b, Value src, Value dst) -> LogicalResult {
+                copyCallBackFn(b, src, dst, false);
+                return success();
+              },
+              [](OpBuilder &b, Value src, Value dst) -> LogicalResult {
+                copyCallBackFn(b, src, dst, true);
+                return success();
+              }),
+      LinalgMarker({"PROMOTE"}));
+}
+
 /// Apply transformations specified as patterns.
 void TestLinalgTransforms::runOnFunction() {
   if (testPatterns) {
     applyPatterns(getFunction());
+    return;
+  }
+  if (testPromotionOptions) {
+    OwningRewritePatternList patterns;
+    fillPromotionCallBackPatterns(&getContext(), patterns);
+    applyPatternsAndFoldGreedily(getFunction(), patterns);
   } else {
     SmallVector<OwningRewritePatternList, 4> stage1Patterns;
     if (testMatmulToVectorPatterns1dTiling) {

From 9f69d3d0bc65ff50b1dc3ab0a6a08ddc32b190a6 Mon Sep 17 00:00:00 2001
From: Siva Chandra Reddy <sivachandra@google.com>
Date: Tue, 26 May 2020 22:22:09 -0700
Subject: [PATCH 181/770] [libc][NFC][Obvious] Convert the MPFR operations enum
 to an enum class.

This was suggested in https://reviews.llvm.org/D79149.
---
 libc/test/src/math/cosf_test.cpp     |  8 ++++----
 libc/test/src/math/exp2f_test.cpp    | 19 ++++++++++---------
 libc/test/src/math/expf_test.cpp     | 10 +++++-----
 libc/test/src/math/fabs_test.cpp     |  2 +-
 libc/test/src/math/fabsf_test.cpp    |  3 ++-
 libc/test/src/math/sincosf_test.cpp  | 16 ++++++++--------
 libc/test/src/math/sinf_test.cpp     | 10 +++++-----
 libc/utils/MPFRWrapper/MPFRUtils.cpp | 14 +++++++-------
 libc/utils/MPFRWrapper/MPFRUtils.h   |  2 +-
 9 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
index f9fc9c2e2d0d1..1f9dffd87c10a 100644
--- a/libc/test/src/math/cosf_test.cpp
+++ b/libc/test/src/math/cosf_test.cpp
@@ -80,7 +80,7 @@ TEST(CosfTest, InFloatRange) {
     float x = valueFromBits(v);
     if (isnan(x) || isinf(x))
       continue;
-    ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, __llvm_libc::cosf(x), tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, __llvm_libc::cosf(x), tolerance);
   }
 }
 
@@ -88,12 +88,12 @@ TEST(CosfTest, InFloatRange) {
 TEST(CosfTest, SmallValues) {
   float x = valueFromBits(0x17800000U);
   float result = __llvm_libc::cosf(x);
-  EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result, tolerance);
   EXPECT_EQ(BitPatterns::one, valueAsBits(result));
 
   x = valueFromBits(0x0040000U);
   result = __llvm_libc::cosf(x);
-  EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result, tolerance);
   EXPECT_EQ(BitPatterns::one, valueAsBits(result));
 }
 
@@ -102,6 +102,6 @@ TEST(CosfTest, SmallValues) {
 TEST(CosfTest, SDCOMP_26094) {
   for (uint32_t v : sdcomp26094Values) {
     float x = valueFromBits(v);
-    ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, __llvm_libc::cosf(x), tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, __llvm_libc::cosf(x), tolerance);
   }
 }
diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
index dbb7046e28bda..c900ec6695444 100644
--- a/libc/test/src/math/exp2f_test.cpp
+++ b/libc/test/src/math/exp2f_test.cpp
@@ -92,27 +92,27 @@ TEST(ExpfTest, Borderline) {
 
   llvmlibc_errno = 0;
   x = valueFromBits(0x42fa0001U);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0x42ffffffU);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0xc2fa0001U);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0xc2fc0000U);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0xc2fc0001U);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0xc3150000U);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 }
 
@@ -124,12 +124,12 @@ TEST(ExpfTest, Underflow) {
 
   llvmlibc_errno = 0;
   float x = valueFromBits(0xc3158000U);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, ERANGE);
 
   llvmlibc_errno = 0;
   x = valueFromBits(0xc3165432U);
-  EXPECT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, ERANGE);
 }
 
@@ -149,6 +149,7 @@ TEST(exp2fTest, InFloatRange) {
     // wider precision.
     if (isnan(result) || isinf(result) || llvmlibc_errno != 0)
       continue;
-    ASSERT_MPFR_MATCH(mpfr::OP_Exp2, x, __llvm_libc::exp2f(x), tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Exp2, x, __llvm_libc::exp2f(x),
+                      tolerance);
   }
 }
diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
index aa50bd71974b5..c99058dbf6e5f 100644
--- a/libc/test/src/math/expf_test.cpp
+++ b/libc/test/src/math/expf_test.cpp
@@ -109,19 +109,19 @@ TEST(ExpfTest, Borderline) {
 
   llvmlibc_errno = 0;
   x = valueFromBits(0x42affff8U);
-  ASSERT_MPFR_MATCH(mpfr::OP_Exp, x, __llvm_libc::expf(x), tolerance);
+  ASSERT_MPFR_MATCH(mpfr::Operation::Exp, x, __llvm_libc::expf(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0x42b00008U);
-  ASSERT_MPFR_MATCH(mpfr::OP_Exp, x, __llvm_libc::expf(x), tolerance);
+  ASSERT_MPFR_MATCH(mpfr::Operation::Exp, x, __llvm_libc::expf(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0xc2affff8U);
-  ASSERT_MPFR_MATCH(mpfr::OP_Exp, x, __llvm_libc::expf(x), tolerance);
+  ASSERT_MPFR_MATCH(mpfr::Operation::Exp, x, __llvm_libc::expf(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 
   x = valueFromBits(0xc2b00008U);
-  ASSERT_MPFR_MATCH(mpfr::OP_Exp, x, __llvm_libc::expf(x), tolerance);
+  ASSERT_MPFR_MATCH(mpfr::Operation::Exp, x, __llvm_libc::expf(x), tolerance);
   EXPECT_EQ(llvmlibc_errno, 0);
 }
 
@@ -141,6 +141,6 @@ TEST(ExpfTest, InFloatRange) {
     // wider precision.
     if (isnan(result) || isinf(result) || llvmlibc_errno != 0)
       continue;
-    ASSERT_MPFR_MATCH(mpfr::OP_Exp, x, __llvm_libc::expf(x), tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Exp, x, __llvm_libc::expf(x), tolerance);
   }
 }
diff --git a/libc/test/src/math/fabs_test.cpp b/libc/test/src/math/fabs_test.cpp
index a4c934b07f5a6..a9ce9e764298b 100644
--- a/libc/test/src/math/fabs_test.cpp
+++ b/libc/test/src/math/fabs_test.cpp
@@ -59,6 +59,6 @@ TEST(FabsTest, InDoubleRange) {
     double x = valueFromBits(v);
     if (isnan(x) || isinf(x))
       continue;
-    ASSERT_MPFR_MATCH(mpfr::OP_Abs, x, __llvm_libc::fabs(x), tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Abs, x, __llvm_libc::fabs(x), tolerance);
   }
 }
diff --git a/libc/test/src/math/fabsf_test.cpp b/libc/test/src/math/fabsf_test.cpp
index 40e61e6091b64..4231a251bf137 100644
--- a/libc/test/src/math/fabsf_test.cpp
+++ b/libc/test/src/math/fabsf_test.cpp
@@ -61,6 +61,7 @@ TEST(FabsfTest, InFloatRange) {
     double x = valueFromBits(v);
     if (isnan(x) || isinf(x))
       continue;
-    ASSERT_MPFR_MATCH(mpfr::OP_Abs, x, __llvm_libc::fabsf(x), tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Abs, x, __llvm_libc::fabsf(x),
+                      tolerance);
   }
 }
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index 9a87e2c9e58c5..66b247aeb0afd 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -95,8 +95,8 @@ TEST(SinCosfTest, InFloatRange) {
 
     float sin, cos;
     __llvm_libc::sincosf(x, &sin, &cos);
-    ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, cos, tolerance);
-    ASSERT_MPFR_MATCH(mpfr::OP_Sin, x, sin, tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, cos, tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Sin, x, sin, tolerance);
   }
 }
 
@@ -106,16 +106,16 @@ TEST(SinCosfTest, SmallValues) {
   float x = valueFromBits(bits);
   float result_cos, result_sin;
   __llvm_libc::sincosf(x, &result_sin, &result_cos);
-  EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result_cos, tolerance);
-  EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result_sin, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result_cos, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, result_sin, tolerance);
   EXPECT_EQ(BitPatterns::one, valueAsBits(result_cos));
   EXPECT_EQ(bits, valueAsBits(result_sin));
 
   bits = 0x00400000;
   x = valueFromBits(bits);
   __llvm_libc::sincosf(x, &result_sin, &result_cos);
-  EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result_cos, tolerance);
-  EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result_sin, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result_cos, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, result_sin, tolerance);
   EXPECT_EQ(BitPatterns::one, valueAsBits(result_cos));
   EXPECT_EQ(bits, valueAsBits(result_sin));
 }
@@ -127,7 +127,7 @@ TEST(SinCosfTest, SDCOMP_26094) {
     float x = valueFromBits(v);
     float sin, cos;
     __llvm_libc::sincosf(x, &sin, &cos);
-    EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, cos, tolerance);
-    EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, sin, tolerance);
+    EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, cos, tolerance);
+    EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, sin, tolerance);
   }
 }
diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
index e0821c621dccf..437281ada43a1 100644
--- a/libc/test/src/math/sinf_test.cpp
+++ b/libc/test/src/math/sinf_test.cpp
@@ -80,13 +80,13 @@ TEST(SinfTest, InFloatRange) {
     float x = valueFromBits(v);
     if (isnan(x) || isinf(x))
       continue;
-    ASSERT_MPFR_MATCH(mpfr::OP_Sin, x, __llvm_libc::sinf(x), tolerance);
+    ASSERT_MPFR_MATCH(mpfr::Operation::Sin, x, __llvm_libc::sinf(x), tolerance);
   }
 }
 
 TEST(SinfTest, SpecificBitPatterns) {
   float x = valueFromBits(0xc70d39a1);
-  EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, __llvm_libc::sinf(x), tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, __llvm_libc::sinf(x), tolerance);
 }
 
 // For small values, sin(x) is x.
@@ -94,13 +94,13 @@ TEST(SinfTest, SmallValues) {
   uint32_t bits = 0x17800000;
   float x = valueFromBits(bits);
   float result = __llvm_libc::sinf(x);
-  EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, result, tolerance);
   EXPECT_EQ(bits, valueAsBits(result));
 
   bits = 0x00400000;
   x = valueFromBits(bits);
   result = __llvm_libc::sinf(x);
-  EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result, tolerance);
+  EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, result, tolerance);
   EXPECT_EQ(bits, valueAsBits(result));
 }
 
@@ -109,6 +109,6 @@ TEST(SinfTest, SmallValues) {
 TEST(SinfTest, SDCOMP_26094) {
   for (uint32_t v : sdcomp26094Values) {
     float x = valueFromBits(v);
-    EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, __llvm_libc::sinf(x), tolerance);
+    EXPECT_MPFR_MATCH(mpfr::Operation::Sin, x, __llvm_libc::sinf(x), tolerance);
   }
 }
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 74c2f760f0347..51c8c37592921 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -86,21 +86,21 @@ class MPFRNumber {
     mpfr_init2(value, mpfrPrecision);
     MPFRNumber mpfrInput(rawValue);
     switch (op) {
-    case OP_Abs:
+    case Operation::Abs:
       mpfr_abs(value, mpfrInput.value, MPFR_RNDN);
       break;
-    case OP_Cos:
+    case Operation::Cos:
       mpfr_cos(value, mpfrInput.value, MPFR_RNDN);
       break;
-    case OP_Sin:
-      mpfr_sin(value, mpfrInput.value, MPFR_RNDN);
-      break;
-    case OP_Exp:
+    case Operation::Exp:
       mpfr_exp(value, mpfrInput.value, MPFR_RNDN);
       break;
-    case OP_Exp2:
+    case Operation::Exp2:
       mpfr_exp2(value, mpfrInput.value, MPFR_RNDN);
       break;
+    case Operation::Sin:
+      mpfr_sin(value, mpfrInput.value, MPFR_RNDN);
+      break;
     }
   }
 
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index f6660f2fa78e7..e39ed91281a9d 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -39,7 +39,7 @@ struct Tolerance {
   uint32_t bits;
 };
 
-enum Operation { OP_Abs, OP_Cos, OP_Sin, OP_Exp, OP_Exp2 };
+enum class Operation : int { Abs, Cos, Exp, Exp2, Sin };
 
 namespace internal {
 

From 6bbaa62d26b6061c93eb62c82048c14014ab7bd7 Mon Sep 17 00:00:00 2001
From: Denys Petrov <dpetrov@accesssoftek.com>
Date: Fri, 22 May 2020 18:01:53 +0300
Subject: [PATCH 182/770] [analyzer] Add support for IE of keyboard and mouse
 navigation in HTML report

IE throws errors when navigating through the error path tips with the keyboard or mouse.
The querySelectorAll method returns a NodeList, which belongs to the browser API, and IE's NodeList has no forEach method. Array, on the other hand, is a JavaScript object and can be used instead. The fix converts the NodeList into an Array and keeps using the forEach method as before.

Checked in IE11, Chrome and Opera.

Differential Revision: https://reviews.llvm.org/D80444
---
 clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
index 184fdcfb3d4b7..bc7c41d039c4d 100644
--- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
+++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
@@ -1070,8 +1070,13 @@ StringRef HTMLDiagnostics::generateKeyboardNavigationJavascript() {
 <script type='text/javascript'>
 var digitMatcher = new RegExp("[0-9]+");
 
+var querySelectorAllArray = function(selector) {
+  return Array.prototype.slice.call(
+    document.querySelectorAll(selector));
+}
+
 document.addEventListener("DOMContentLoaded", function() {
-    document.querySelectorAll(".PathNav > a").forEach(
+    querySelectorAllArray(".PathNav > a").forEach(
         function(currentValue, currentIndex) {
             var hrefValue = currentValue.getAttribute("href");
             currentValue.onclick = function() {
@@ -1091,7 +1096,7 @@ var findNum = function() {
 };
 
 var scrollTo = function(el) {
-    document.querySelectorAll(".selected").forEach(function(s) {
+    querySelectorAllArray(".selected").forEach(function(s) {
         s.classList.remove("selected");
     });
     el.classList.add("selected");

From d0f1f5adfa574ece80d566f400ebb689ae822a16 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Tue, 26 May 2020 22:05:41 -0700
Subject: [PATCH 183/770] [StackSafety] Use getSignedRange for offsets

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     |  6 +--
 llvm/test/Analysis/StackSafetyAnalysis/ipa.ll |  4 +-
 .../Analysis/StackSafetyAnalysis/local.ll     | 37 ++++++++++++++++---
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 10b9f14bc75a7..f72197e827a2a 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -226,10 +226,10 @@ ConstantRange StackSafetyLocalAnalysis::offsetFrom(Value *Addr, Value *Base) {
 
   AllocaOffsetRewriter Rewriter(SE, Base);
   const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
-  ConstantRange Offset = SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
-  if (Offset.isEmptySet())
+  ConstantRange Offset = SE.getSignedRange(Expr);
+  if (Offset.isEmptySet() || Offset.isFullSet() || Offset.isSignWrappedSet())
     return UnknownRange;
-  return Offset;
+  return Offset.sextOrTrunc(PointerSize);
 }
 
 ConstantRange
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll b/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll
index 6791dd0866b85..fe378d355adc3 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll
@@ -434,14 +434,14 @@ entry:
 ; CHECK-NEXT: args uses:
 ; LOCAL-NEXT: p[]: [0,4), @RecursiveNoOffset(arg0, [4,5)){{$}}
 ; GLOBAL-NEXT: p[]: full-set, @RecursiveNoOffset(arg0, [4,5)){{$}}
-; CHECK-NEXT: size[]: empty-set, @RecursiveNoOffset(arg1, [4294967295,4294967296)){{$}}
+; CHECK-NEXT: size[]: empty-set, @RecursiveNoOffset(arg1, [-1,0)){{$}}
 ; CHECK-NEXT: acc[]: [0,4), @RecursiveNoOffset(arg2, [0,1)){{$}}
 ; CHECK-NEXT: allocas uses:
 ; CHECK-NOT: ]:
 
 ; CHECK-LABEL: @RecursiveWithOffset{{$}}
 ; CHECK-NEXT: args uses:
-; CHECK-NEXT: size[]: empty-set, @RecursiveWithOffset(arg0, [4294967295,4294967296)){{$}}
+; CHECK-NEXT: size[]: empty-set, @RecursiveWithOffset(arg0, [-1,0)){{$}}
 ; LOCAL-NEXT: acc[]: [0,4), @RecursiveWithOffset(arg1, [4,5)){{$}}
 ; GLOBAL-NEXT: acc[]: full-set, @RecursiveWithOffset(arg1, [4,5)){{$}}
 ; CHECK-NEXT: allocas uses:
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/local.ll b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
index b7c9eb1d29539..445fb5c46a907 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/local.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
@@ -70,7 +70,7 @@ define void @StoreInBounds4() {
 ; CHECK-LABEL: @StoreInBounds4 dso_preemptable{{$}}
 ; CHECK-NEXT: args uses:
 ; CHECK-NEXT: allocas uses:
-; CHECK-NEXT: x[4]: [2,-1){{$}}
+; CHECK-NEXT: x[4]: [-9223372036854775808,9223372036854775807){{$}}
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4
@@ -178,7 +178,7 @@ define void @NonConstantOffset(i1 zeroext %z) {
 ; CHECK-NEXT: z[]: full-set{{$}}
 ; CHECK-NEXT: allocas uses:
 ; FIXME: SCEV can't look through selects.
-; CHECK-NEXT: x[4]: [0,4){{$}}
+; CHECK-NEXT: x[4]: [-4,4){{$}}
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4
@@ -189,12 +189,39 @@ entry:
   ret void
 }
 
+define void @NegativeOffset() {
+; CHECK-LABEL: @NegativeOffset dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[40]: [-1600000000000,-1599999999996){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, i32 10, align 4
+  %x2 = getelementptr i32, i32* %x, i64 -400000000000
+  store i32 0, i32* %x2, align 1
+  ret void
+}
+
+define void @PossiblyNegativeOffset(i16 %z) {
+; CHECK-LABEL: @PossiblyNegativeOffset dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: z[]: full-set
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[40]: [-131072,131072){{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, i32 10, align 4
+  %x2 = getelementptr i32, i32* %x, i16 %z
+  store i32 0, i32* %x2, align 1
+  ret void
+}
+
 define void @NonConstantOffsetOOB(i1 zeroext %z) {
 ; CHECK-LABEL: @NonConstantOffsetOOB dso_preemptable{{$}}
 ; CHECK-NEXT: args uses:
 ; CHECK-NEXT: z[]: full-set{{$}}
 ; CHECK-NEXT: allocas uses:
-; CHECK-NEXT: x[4]: [0,6){{$}}
+; CHECK-NEXT: x[4]: [-8,8){{$}}
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4
@@ -251,7 +278,7 @@ entry:
 define void @DynamicAlloca(i64 %size) {
 ; CHECK-LABEL: @DynamicAlloca dso_preemptable{{$}}
 ; CHECK-NEXT: args uses:
-; CHECK-NEXT: size[]: [0,-12){{$}}
+; CHECK-NEXT: size[]: [-9223372036854775808,9223372036854775796){{$}}
 ; CHECK-NEXT: allocas uses:
 ; CHECK-NEXT: x[0]: [0,4){{$}}
 ; CHECK-NOT: ]:
@@ -266,7 +293,7 @@ entry:
 define void @DynamicAllocaFiniteSizeRange(i1 zeroext %z) {
 ; CHECK-LABEL: @DynamicAllocaFiniteSizeRange dso_preemptable{{$}}
 ; CHECK-NEXT: args uses:
-; CHECK-NEXT: z[]: [0,-12){{$}}
+; CHECK-NEXT: z[]: [-9223372036854775808,9223372036854775796){{$}}
 ; CHECK-NEXT: allocas uses:
 ; CHECK-NEXT: x[0]: [0,4){{$}}
 ; CHECK-NOT: ]:

From 32a1f60d11f7295c1b93c33c190303c606b1b41d Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Tue, 26 May 2020 23:20:12 -0700
Subject: [PATCH 184/770] [StackSafety] Use SCEV to find mem operation length

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     | 25 ++++++++++++-------
 .../Analysis/StackSafetyAnalysis/local.ll     |  2 +-
 .../Analysis/StackSafetyAnalysis/memintrin.ll |  4 +--
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index f72197e827a2a..1581ca80726b1 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -247,9 +247,8 @@ StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
 
 ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
                                                        TypeSize Size) {
-  ConstantRange SizeRange = Size.isScalable()
-                                ? ConstantRange::getFull(PointerSize)
-                                : getRange(0, Size.getFixedSize());
+  ConstantRange SizeRange =
+      Size.isScalable() ? UnknownRange : getRange(0, Size.getFixedSize());
   return getAccessRange(Addr, Base, SizeRange);
 }
 
@@ -262,13 +261,21 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
     if (MI->getRawDest() != U)
       return getRange(0, 1);
   }
-  const auto *Len = dyn_cast<ConstantInt>(MI->getLength());
-  // Non-constant size => unsafe. FIXME: try SCEV getRange.
-  if (!Len)
+  auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize);
+  if (!SE.isSCEVable(MI->getLength()->getType()))
     return UnknownRange;
-  ConstantRange AccessRange =
-      getAccessRange(U, Base, getRange(0, Len->getZExtValue()));
-  return AccessRange;
+
+  const SCEV *Expr =
+      SE.getTruncateOrZeroExtend(SE.getSCEV(MI->getLength()), CalculationTy);
+  ConstantRange LenRange = SE.getSignedRange(Expr);
+  assert(!LenRange.isEmptySet());
+  if (LenRange.isSignWrappedSet() || LenRange.isFullSet() ||
+      LenRange.getUpper().isNegative())
+    return UnknownRange;
+  LenRange = LenRange.sextOrTrunc(PointerSize);
+  ConstantRange SizeRange(APInt::getNullValue(PointerSize),
+                          LenRange.getUpper() - 1);
+  return getAccessRange(U, Base, SizeRange);
 }
 
 /// The function analyzes all local uses of Ptr (alloca or argument) and
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/local.ll b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
index 445fb5c46a907..f749dc07bb768 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/local.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
@@ -362,7 +362,7 @@ define dso_local void @SizeCheck(i32 %sz) {
 ; CHECK-NEXT: args uses:
 ; CHECK-NEXT: sz[]: [0,1){{$}}
 ; CHECK-NEXT: allocas uses:
-; CHECK-NEXT: x1[128]: full-set{{$}}
+; CHECK-NEXT: x1[128]: [0,4294967295){{$}}
 ; CHECK-NOT: ]:
 entry:
   %x1 = alloca [128 x i8], align 16
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll b/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
index 2eea9ea74bdf9..87c92aca35388 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
@@ -55,7 +55,7 @@ define void @MemsetNonConst(i32 %size) {
 ; CHECK-NEXT: args uses:
 ; CHECK-NEXT: size[]: [0,1){{$}}
 ; CHECK-NEXT: allocas uses:
-; CHECK-NEXT: x[4]: full-set{{$}}
+; CHECK-NEXT: x[4]: [0,4294967295){{$}}
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4
@@ -71,7 +71,7 @@ define void @MemsetNonConstInBounds(i1 zeroext %z) {
 ; CHECK-NEXT: args uses:
 ; CHECK-NEXT: z[]: [0,1){{$}}
 ; CHECK-NEXT: allocas uses:
-; CHECK-NEXT: x[4]: full-set{{$}}
+; CHECK-NEXT: x[4]: [0,4294967295){{$}}
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4

From 6565b5858444ba7dcf799467f5be63d2c2370715 Mon Sep 17 00:00:00 2001
From: "Wang, Pengfei" <pengfei.wang@intel.com>
Date: Wed, 27 May 2020 10:50:30 +0800
Subject: [PATCH 185/770] [X86][llvm-mc] Make the suffix matcher more accurate.

Summary:
Some instructions, like VPMULDQ, are NOT suffixed variants of another
instruction (here VPMULD) but distinct instructions.
So we should make sure the suffix matcher only matches memory variants
whose size is the same as the size implied by the suffix.
Currently we only check this for SSE/AVX* instructions, because many legacy
instructions didn't declare the alias instructions of their variants.
Differential Revision: https://reviews.llvm.org/D80608
---
 .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 39 ++++++++++++++++---
 llvm/lib/Target/X86/AsmParser/X86Operand.h    |  8 ++++
 llvm/test/MC/X86/avx512-err.s                 |  6 +++
 .../X86/BdVer2/dependent-pmuld-paddd.s        |  2 +-
 .../X86/BtVer2/dependent-pmuld-paddd.s        |  2 +-
 5 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 6b06656410eb5..a842a91bbb069 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3441,20 +3441,47 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
   // Otherwise, we assume that this may be an integer instruction, which comes
   // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
   const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+  // MemSize corresponding to Suffixes.  { 8, 16, 32, 64 }    { 32, 64, 80, 0 }
+  const char *MemSize = Base[0] != 'f' ? "\x08\x10\x20\x40" : "\x20\x40\x50\0";
 
   // Check for the various suffix matches.
   uint64_t ErrorInfoIgnore;
   FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings.
   unsigned Match[4];
 
+  // Some instruction like VPMULDQ is NOT the variant of VPMULD but a new one.
+  // So we should make sure the suffix matcher only works for memory variant
+  // that has the same size with the suffix.
+  // FIXME: This flag is a workaround for legacy instructions that didn't
+  // declare non suffix variant assembly.
+  bool HasVectorReg = false;
+  X86Operand *MemOp = nullptr;
+  for (const auto &Op : Operands) {
+    X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+    if (X86Op->isVectorReg())
+      HasVectorReg = true;
+    else if (X86Op->isMem()) {
+      MemOp = X86Op;
+      assert(MemOp->Mem.Size == 0 && "Memory size always 0 under ATT syntax");
+      // Have we found an unqualified memory operand,
+      // break. IA allows only one memory operand.
+      break;
+    }
+  }
+
   for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
     Tmp.back() = Suffixes[I];
-    Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
-                                MissingFeatures, MatchingInlineAsm,
-                                isParsingIntelSyntax());
-    // If this returned as a missing feature failure, remember that.
-    if (Match[I] == Match_MissingFeature)
-      ErrorInfoMissingFeatures = MissingFeatures;
+    if (MemOp)
+      MemOp->Mem.Size = MemSize[I];
+    Match[I] = Match_MnemonicFail;
+    if (MemOp || !HasVectorReg) {
+      Match[I] =
+          MatchInstruction(Operands, Inst, ErrorInfoIgnore, MissingFeatures,
+                           MatchingInlineAsm, isParsingIntelSyntax());
+      // If this returned as a missing feature failure, remember that.
+      if (Match[I] == Match_MissingFeature)
+        ErrorInfoMissingFeatures = MissingFeatures;
+    }
   }
 
   // Restore the old token.
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 3da8009762f33..fb5f3355532eb 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -456,6 +456,14 @@ struct X86Operand final : public MCParsedAsmOperand {
       X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
   }
 
+  bool isVectorReg() const {
+    return Kind == Register &&
+           (X86MCRegisterClasses[X86::VR64RegClassID].contains(getReg()) ||
+            X86MCRegisterClasses[X86::VR128XRegClassID].contains(getReg()) ||
+            X86MCRegisterClasses[X86::VR256XRegClassID].contains(getReg()) ||
+            X86MCRegisterClasses[X86::VR512RegClassID].contains(getReg()));
+  }
+
   bool isVK1Pair() const {
     return Kind == Register &&
       X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg());
diff --git a/llvm/test/MC/X86/avx512-err.s b/llvm/test/MC/X86/avx512-err.s
index 9d8183d8872b6..0d353a6c54981 100644
--- a/llvm/test/MC/X86/avx512-err.s
+++ b/llvm/test/MC/X86/avx512-err.s
@@ -14,3 +14,9 @@ cvtsd2sil  {rn-sae}, %xmm1, %eax
 
 // ERR: Expected an identifier after {
 cvtsd2sil  {{sae}, %xmm1, %eax
+
+// ERR: invalid instruction mnemonic 'vpmuld'
+vpmuld %xmm1, %xmm2, %xmm3
+
+// ERR: invalid instruction mnemonic 'maskmov'
+maskmov %mm1, %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s b/llvm/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
index bf49f18c3a81c..efa99fd32dd0c 100644
--- a/llvm/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
+++ b/llvm/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=500 -timeline < %s | FileCheck %s
 
-vpmuld %xmm0, %xmm0, %xmm1
+vpmuldq %xmm0, %xmm0, %xmm1
 vpaddd %xmm1, %xmm1, %xmm0
 vpaddd %xmm0, %xmm0, %xmm3
 
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/dependent-pmuld-paddd.s b/llvm/test/tools/llvm-mca/X86/BtVer2/dependent-pmuld-paddd.s
index 586aa73b15ab1..2804ef6977091 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/dependent-pmuld-paddd.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/dependent-pmuld-paddd.s
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=500 -timeline < %s | FileCheck %s
 
-vpmuld %xmm0, %xmm0, %xmm1
+vpmuldq %xmm0, %xmm0, %xmm1
 vpaddd %xmm1, %xmm1, %xmm0
 vpaddd %xmm0, %xmm0, %xmm3
 

From b4978b24445cdc33311bbdb661060f9d9229efe9 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 26 May 2020 22:36:58 -0700
Subject: [PATCH 186/770] [X86] Use SIMD_EXC to remove some let statements in
 tablegen. NFCI

---
 llvm/lib/Target/X86/X86InstrSSE.td | 35 ++++++++++++------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0bc0279162582..243ad6d8a2839 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1795,18 +1795,16 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                             SDNode OpNode, ValueType VT,
                             PatFrag ld_frag, string asm,
                             X86FoldableSchedWrite sched> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
   let isCommutable = 1 in
   def rr : SIi8<0xC2, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
-                Sched<[sched]>;
+                Sched<[sched]>, SIMD_EXC;
   def rm : SIi8<0xC2, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1),
                                          (ld_frag addr:$src2), timm:$cc))]>,
-                Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+                Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 }
 
 let isCodeGenOnly = 1 in {
@@ -1835,19 +1833,17 @@ let isCodeGenOnly = 1 in {
 multiclass sse12_cmp_scalar_int<Operand memop,
                          Intrinsic Int, string asm, X86FoldableSchedWrite sched,
                          PatFrags mem_frags> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
   def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                       (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
                         [(set VR128:$dst, (Int VR128:$src1,
                                                VR128:$src2, timm:$cc))]>,
-           Sched<[sched]>;
-let mayLoad = 1 in
+           Sched<[sched]>, SIMD_EXC;
+  let mayLoad = 1 in
   def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
                         [(set VR128:$dst, (Int VR128:$src1,
                                                (mem_frags addr:$src2), timm:$cc))]>,
-           Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+           Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 }
 
 // Aliases to match intrinsics which expect XMM operand(s).
@@ -1878,18 +1874,17 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                          ValueType vt, X86MemOperand x86memop,
                          PatFrag ld_frag, string OpcodeStr, Domain d,
                          X86FoldableSchedWrite sched = WriteFComX> {
-let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
-    ExeDomain = d in {
+  let ExeDomain = d in {
   def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
-          Sched<[sched]>;
-let mayLoad = 1 in
+          Sched<[sched]>, SIMD_EXC;
+  let mayLoad = 1 in
   def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1),
                                            (ld_frag addr:$src2)))]>,
-          Sched<[sched.Folded, sched.ReadAfterFold]>;
+          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 }
 }
 
@@ -1899,17 +1894,17 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                              PatFrags mem_frags, string OpcodeStr,
                              Domain d,
                              X86FoldableSchedWrite sched = WriteFComX> {
-let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in {
+let ExeDomain = d in {
   def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
-          Sched<[sched]>;
+          Sched<[sched]>, SIMD_EXC;
 let mayLoad = 1 in
   def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1),
                                            (mem_frags addr:$src2)))]>,
-          Sched<[sched.Folded, sched.ReadAfterFold]>;
+          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 }
 }
 
@@ -1961,18 +1956,16 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                             ValueType VT, string asm,
                             X86FoldableSchedWrite sched,
                             Domain d, PatFrag ld_frag> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
   let isCommutable = 1 in
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
              [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
-            Sched<[sched]>;
+            Sched<[sched]>, SIMD_EXC;
   def rmi : PIi8<0xC2, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
              [(set RC:$dst,
                (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
-            Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 }
 
 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,

From 84cf8ed8fd3f950b6e30225cae6f092da768cbe6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 26 May 2020 23:42:11 -0700
Subject: [PATCH 187/770] [X86] Lower sse_cmp_ss/sse2_cmp_sd intrinsics to
 X86ISD::FSETCC with vector types.

Isel matches that instead of the intrinsic, similar to what we do
for avx512.

Trying to move more intrinsics to target specific ISD opcodes.
Hoping to add DAG combines to shrink simple loads going into
scalar intrinsics that only read 32 or 64 bits.
---
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |  4 +
 llvm/lib/Target/X86/X86InstrInfo.td          |  3 -
 llvm/lib/Target/X86/X86InstrSSE.td           | 99 ++++++++------------
 llvm/lib/Target/X86/X86IntrinsicsInfo.h      |  2 +
 4 files changed, 43 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index d07474c534006..f3f7d17d9b3cc 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -61,7 +61,11 @@ def X86hadd    : SDNode<"X86ISD::HADD",      SDTIntBinOp>;
 def X86hsub    : SDNode<"X86ISD::HSUB",      SDTIntBinOp>;
 def X86comi    : SDNode<"X86ISD::COMI",      SDTX86FCmp>;
 def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86FCmp>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<0, 1>,
+                                      SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
 def X86cmps    : SDNode<"X86ISD::FSETCC",    SDTX86Cmps>;
+
 def X86pshufb  : SDNode<"X86ISD::PSHUFB",
                  SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 5a9d79203786c..7b5bfea06474f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -21,9 +21,6 @@ def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
 def SDTX86FCmp    : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>,
                                          SDTCisSameAs<1, 2>]>;
 
-def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
-//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
-
 def SDTX86Cmov    : SDTypeProfile<1, 4,
                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
                                    SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 243ad6d8a2839..15f0c8ef37dcd 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1792,83 +1792,58 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 
 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
-                            SDNode OpNode, ValueType VT,
+                            Operand memop, SDNode OpNode, ValueType VT,
                             PatFrag ld_frag, string asm,
-                            X86FoldableSchedWrite sched> {
-  let isCommutable = 1 in
-  def rr : SIi8<0xC2, MRMSrcReg,
-                (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
-                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
-                Sched<[sched]>, SIMD_EXC;
-  def rm : SIi8<0xC2, MRMSrcMem,
-                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
-                [(set RC:$dst, (OpNode (VT RC:$src1),
-                                         (ld_frag addr:$src2), timm:$cc))]>,
-                Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
-}
-
-let isCodeGenOnly = 1 in {
-  let ExeDomain = SSEPackedSingle in
-  defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
-                   "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-                   SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
-  let ExeDomain = SSEPackedDouble in
-  defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
-                   "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-                   SchedWriteFCmpSizes.PD.Scl>,
-                   XD, VEX_4V, VEX_LIG, VEX_WIG;
-
-  let Constraints = "$src1 = $dst" in {
-    let ExeDomain = SSEPackedSingle in
-    defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
-                    "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                    SchedWriteFCmpSizes.PS.Scl>, XS;
-    let ExeDomain = SSEPackedDouble in
-    defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
-                    "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                    SchedWriteFCmpSizes.PD.Scl>, XD;
-  }
-}
-
-multiclass sse12_cmp_scalar_int<Operand memop,
-                         Intrinsic Int, string asm, X86FoldableSchedWrite sched,
-                         PatFrags mem_frags> {
+                            X86FoldableSchedWrite sched,
+                            PatFrags mem_frags> {
   def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
-                      (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
-                        [(set VR128:$dst, (Int VR128:$src1,
-                                               VR128:$src2, timm:$cc))]>,
+                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
+                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
+                                              VR128:$src2, timm:$cc))]>,
            Sched<[sched]>, SIMD_EXC;
   let mayLoad = 1 in
   def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
-                      (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
-                        [(set VR128:$dst, (Int VR128:$src1,
-                                               (mem_frags addr:$src2), timm:$cc))]>,
+                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
+                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
+                                              (mem_frags addr:$src2), timm:$cc))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+
+  let isCodeGenOnly = 1 in {
+    let isCommutable = 1 in
+    def rr : SIi8<0xC2, MRMSrcReg,
+                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
+                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
+                  Sched<[sched]>, SIMD_EXC;
+    def rm : SIi8<0xC2, MRMSrcMem,
+                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
+                  [(set RC:$dst, (OpNode RC:$src1,
+                                         (ld_frag addr:$src2), timm:$cc))]>,
+                  Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+  }
 }
 
-// Aliases to match intrinsics which expect XMM operand(s).
 let ExeDomain = SSEPackedSingle in
-defm VCMPSS  : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
-                     "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-                     SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
-                     XS, VEX_4V, VEX_LIG, VEX_WIG;
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
+                 XS, VEX_4V, VEX_LIG, VEX_WIG;
 let ExeDomain = SSEPackedDouble in
-defm VCMPSD  : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
-                     "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-                     SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
-                     XD, VEX_4V, VEX_LIG, VEX_WIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
+                 XD, VEX_4V, VEX_LIG, VEX_WIG;
+
 let Constraints = "$src1 = $dst" in {
   let ExeDomain = SSEPackedSingle in
-  defm CMPSS  : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
-                       "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                       SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
+  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
   let ExeDomain = SSEPackedDouble in
-  defm CMPSD  : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
-                       "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                       SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
+  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
 }
 
-
 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                          ValueType vt, X86MemOperand x86memop,
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index e697059e11781..1c10c07abeee2 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1002,6 +1002,7 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(fma_vfmaddsub_ps,     INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(sse_cmp_ps,        INTR_TYPE_3OP, X86ISD::CMPP, 0),
+  X86_INTRINSIC_DATA(sse_cmp_ss,        INTR_TYPE_3OP, X86ISD::FSETCC, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse_comigt_ss,     COMI, X86ISD::COMI, ISD::SETGT),
@@ -1026,6 +1027,7 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse_ucomilt_ss,    COMI, X86ISD::UCOMI, ISD::SETLT),
   X86_INTRINSIC_DATA(sse_ucomineq_ss,   COMI, X86ISD::UCOMI, ISD::SETNE),
   X86_INTRINSIC_DATA(sse2_cmp_pd,       INTR_TYPE_3OP, X86ISD::CMPP, 0),
+  X86_INTRINSIC_DATA(sse2_cmp_sd,       INTR_TYPE_3OP, X86ISD::FSETCC, 0),
   X86_INTRINSIC_DATA(sse2_comieq_sd,    COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_comige_sd,    COMI, X86ISD::COMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_comigt_sd,    COMI, X86ISD::COMI, ISD::SETGT),

From de02a75e398415bad4df27b4547c25b896c8bf3b Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Mon, 25 May 2020 20:44:35 +0200
Subject: [PATCH 188/770] [PGO] Fix computation of function Hash

And bump its version number accordingly.

This is a patched recommit of 7c298c104bfe725d4315926a656263e8a5ac3054

The previous hash implementation incorrectly passed a uint64_t, which got converted
to a uint8_t, to finalize the hash computation. This led to different functions
having the same hash if they differed only in their remaining statements, which is
incorrect.

Added a new test case that trivially tests that a small function change is
reflected in the hash value.

Note that as this patch fixes the hash computation, it invalidates all hashes
computed before this patch is applied; this is why we bumped the version number.

Update profile data hash entries due to hash function update, except for binary
version, in which case we keep the buggy behavior for backward compatibility.

Differential Revision: https://reviews.llvm.org/D79961
---
 clang/docs/ReleaseNotes.rst                   |   4 +++
 clang/lib/CodeGen/CodeGenPGO.cpp              |  27 ++++++++++++------
 .../Inputs/c-counter-overflows.proftext       |   2 +-
 .../test/Profile/Inputs/c-general.profdata.v5 | Bin 0 -> 2376 bytes
 clang/test/Profile/Inputs/c-general.proftext  |  14 ++++-----
 .../Inputs/c-unprofiled-blocks.proftext       |   4 +--
 .../test/Profile/Inputs/cxx-rangefor.proftext |   2 +-
 clang/test/Profile/Inputs/cxx-throws.proftext |   2 +-
 .../Inputs/misexpect-switch-default.proftext  |   2 +-
 .../Inputs/misexpect-switch-nonconst.proftext |   2 +-
 .../Profile/Inputs/misexpect-switch.proftext  |   2 +-
 clang/test/Profile/c-collision.c              |  22 ++++++++++++++
 clang/test/Profile/c-general.c                |   1 +
 llvm/include/llvm/ProfileData/InstrProf.h     |   3 ++
 .../llvm/ProfileData/InstrProfData.inc        |   2 +-
 15 files changed, 65 insertions(+), 24 deletions(-)
 create mode 100644 clang/test/Profile/Inputs/c-general.profdata.v5
 create mode 100644 clang/test/Profile/c-collision.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c38ff0e367902..571b54904754f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -82,6 +82,10 @@ Non-comprehensive list of changes in this release
   linker. If the user links the program with the ``clang`` or ``clang-cl``
   drivers, the driver will pass this flag for them.
 
+- Clang's profile files generated through ``-fprofile-instr-generate`` are using
+  a fixed hashing algorithm that prevents some collisions when loading
+  out-of-date profile information. Clang can still read old profile files.
+
 New Compiler Flags
 ------------------
 
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 3c91a04d54642..e810f608ab787 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -52,9 +52,10 @@ void CodeGenPGO::setFuncName(llvm::Function *Fn) {
 enum PGOHashVersion : unsigned {
   PGO_HASH_V1,
   PGO_HASH_V2,
+  PGO_HASH_V3,
 
   // Keep this set to the latest hash version.
-  PGO_HASH_LATEST = PGO_HASH_V2
+  PGO_HASH_LATEST = PGO_HASH_V3
 };
 
 namespace {
@@ -122,7 +123,7 @@ class PGOHash {
     BinaryOperatorGE,
     BinaryOperatorEQ,
     BinaryOperatorNE,
-    // The preceding values are available with PGO_HASH_V2.
+    // The preceding values are available since PGO_HASH_V2.
 
     // Keep this last.  It's for the static assert that follows.
     LastHashType
@@ -144,7 +145,9 @@ static PGOHashVersion getPGOHashVersion(llvm::IndexedInstrProfReader *PGOReader,
                                         CodeGenModule &CGM) {
   if (PGOReader->getVersion() <= 4)
     return PGO_HASH_V1;
-  return PGO_HASH_V2;
+  if (PGOReader->getVersion() <= 5)
+    return PGO_HASH_V2;
+  return PGO_HASH_V3;
 }
 
 /// A RecursiveASTVisitor that fills a map of statements to PGO counters.
@@ -288,7 +291,7 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
         return PGOHash::BinaryOperatorLAnd;
       if (BO->getOpcode() == BO_LOr)
         return PGOHash::BinaryOperatorLOr;
-      if (HashVersion == PGO_HASH_V2) {
+      if (HashVersion >= PGO_HASH_V2) {
         switch (BO->getOpcode()) {
         default:
           break;
@@ -310,7 +313,7 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
     }
     }
 
-    if (HashVersion == PGO_HASH_V2) {
+    if (HashVersion >= PGO_HASH_V2) {
       switch (S->getStmtClass()) {
       default:
         break;
@@ -747,13 +750,21 @@ uint64_t PGOHash::finalize() {
     return Working;
 
   // Check for remaining work in Working.
-  if (Working)
-    MD5.update(Working);
+  if (Working) {
+    // Keep the buggy behavior from v1 and v2 for backward-compatibility. This
+    // is buggy because it truncates the uint64_t to a single uint8_t.
+    if (HashVersion < PGO_HASH_V3) {
+      MD5.update({(uint8_t)Working});
+    } else {
+      using namespace llvm::support;
+      uint64_t Swapped = endian::byte_swap<uint64_t, little>(Working);
+      MD5.update(llvm::makeArrayRef((uint8_t *)&Swapped, sizeof(Swapped)));
+    }
+  }
 
   // Finalize the MD5 and return the hash.
   llvm::MD5::MD5Result Result;
   MD5.final(Result);
-  using namespace llvm::support;
   return Result.low();
 }
 
diff --git a/clang/test/Profile/Inputs/c-counter-overflows.proftext b/clang/test/Profile/Inputs/c-counter-overflows.proftext
index b2e5dd1d77aea..4d0287c787051 100644
--- a/clang/test/Profile/Inputs/c-counter-overflows.proftext
+++ b/clang/test/Profile/Inputs/c-counter-overflows.proftext
@@ -1,5 +1,5 @@
 main
-10111551811706059223
+7779561829442898616
 8
 1
 68719476720
diff --git a/clang/test/Profile/Inputs/c-general.profdata.v5 b/clang/test/Profile/Inputs/c-general.profdata.v5
new file mode 100644
index 0000000000000000000000000000000000000000..435ef2b6ef1d8af7ec2bbdc75fd8321072323ac4
GIT binary patch
literal 2376
zcmb_eTSyd97(Qcbc2QFlD~;NOQVB~mMPk+5#1JVWNH)=S$Jy1Ln4Ot+MpB}ypcqtS
z8C|4SD(Io2rwAenD!RQWkv-%?5uw$K5<PSQt3Cg>#&M;W_T`)Z`>*H!&-vLY%?ul@
z>fyO8{;zx<=b=v{v-yN>LW$5XpfiaRX?Yx>N4!Wk_Z0b!DRZI@BLDW3BQAd^n@3c5
z8b)O^=O8ZtbuJ+NgFJqMVv;ineK)y<$g|Sqs6{T>Wf4a1rbNH8NzNghIFK6sLcdlJ
zUzD>b`jtTP8bVJc=g~(O@uSu8`7(EWzG@a;V$Kf_)q9gooj7R`Wl#*=uiC1i%bM3P
z17X=V!b2lm$!EYAsv?(|j4n&8>qg?$$xX+Dyi1IkGoxGEUzU2ZT<J9nNr$MIF?tSM
zNp?W=5;KOMC{I`AsC(kbe{8A9=%a`MIjnVh10AYuRWuc*)@OX9>oD`EB|@=F{~?n)
zb^#@}r{KA3^^v~&DC-uPSbJ4l2?ngnU{}|($T^lFL`HjneBg}Hi;4{a8F8h67#@&6
z9L9j~gA7a*>VO^UPhemBD;)LPnlmtLADOfmhGv*v%`i;MJ6znbZfyB=EDD?g!$n^q
z;{idQO^+&eobD({ytHZ;s(nf|>~0U4R?a|O^Lpjo5MC*g192o#o{fi{*jmW&L#rXf
zPeg1X*g;0Ka7c;0?&;0^@$o_uT3t>qmKrj(0B`khS<OIN$PpVXviZptSWohF^qp6Z
zUh6%7*^^Eh58<Cd24p?953G~#T_?V1j=yd-w?yb%a&!|SAo!KUW4b6Ifqj2<Y^==R
z{|2|uNtPj5)tAsW4g`dE<BMeX=E1G@VzSF8R0`Vy`tMDqtyMNywrs14x3yhYhHJSV
zQ$sj(A7Lf7e|+qbdFHd5?{LP@b-&?ll{L){hK=^NVBe>Svn@Rn9VlIlhdhW2lvvf4
zWxB7YdLJ@JsPGoG&CA!q$W+6f?nd=C8j09WvIK%BI<^$RWAg#Q1F=+q;DI;;5NAMs
w+{<w|pb)}ZavVhfLI1x+S$u**f)Wgi_>0&dB>yD&ApW2T{+Zam#M>nQ0?fy_r~m)}

literal 0
HcmV?d00001

diff --git a/clang/test/Profile/Inputs/c-general.proftext b/clang/test/Profile/Inputs/c-general.proftext
index c0d03b4b755ec..18ad2050fe1a0 100644
--- a/clang/test/Profile/Inputs/c-general.proftext
+++ b/clang/test/Profile/Inputs/c-general.proftext
@@ -7,7 +7,7 @@ simple_loops
 75
 
 conditionals
-4190663230902537370
+4904767535850050386
 11
 1
 100
@@ -22,7 +22,7 @@ conditionals
 100
 
 early_exits
-8265526549255474475
+2880354649761471549
 9
 1
 0
@@ -35,7 +35,7 @@ early_exits
 0
 
 jumps
-15872630527555456493
+15051420506203462683
 22
 1
 1
@@ -61,7 +61,7 @@ jumps
 9
 
 switches
-11892326508727782373
+43242458792028222
 19
 1
 1
@@ -84,7 +84,7 @@ switches
 0
 
 big_switch
-16933280399284440835
+13144136522122330070
 17
 1
 32
@@ -117,7 +117,7 @@ boolean_operators
 50
 
 boolop_loops
-11270260636676715317
+12402604614320574815
 9
 1
 50
@@ -137,7 +137,7 @@ conditional_operator
 1
 
 do_fallthrough
-6898770640283947069
+8714614136504380050
 4
 1
 10
diff --git a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
index ef7f653811fbe..d880663fed32d 100644
--- a/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
+++ b/clang/test/Profile/Inputs/c-unprofiled-blocks.proftext
@@ -1,5 +1,5 @@
 never_called
-5644096560937528444
+6820425066224770721
 9
 0
 0
@@ -17,7 +17,7 @@ main
 1
 
 dead_code
-9636018207904213947
+5254464978620792806
 10
 1
 0
diff --git a/clang/test/Profile/Inputs/cxx-rangefor.proftext b/clang/test/Profile/Inputs/cxx-rangefor.proftext
index b597292078598..d41205bbde147 100644
--- a/clang/test/Profile/Inputs/cxx-rangefor.proftext
+++ b/clang/test/Profile/Inputs/cxx-rangefor.proftext
@@ -1,5 +1,5 @@
 _Z9range_forv
-6169071350249721981
+8789831523895825398
 5
 1
 4
diff --git a/clang/test/Profile/Inputs/cxx-throws.proftext b/clang/test/Profile/Inputs/cxx-throws.proftext
index 32fcf5d50cd4c..043dea08c728f 100644
--- a/clang/test/Profile/Inputs/cxx-throws.proftext
+++ b/clang/test/Profile/Inputs/cxx-throws.proftext
@@ -1,5 +1,5 @@
 _Z6throwsv
-340120998528097520
+18172607911962830854
 9
 1
 100
diff --git a/clang/test/Profile/Inputs/misexpect-switch-default.proftext b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
index 7b2d59781a1db..533da91765234 100644
--- a/clang/test/Profile/Inputs/misexpect-switch-default.proftext
+++ b/clang/test/Profile/Inputs/misexpect-switch-default.proftext
@@ -1,6 +1,6 @@
 main
 # Func Hash:
-8712453512413296413
+8734802134600123338
 # Num Counters:
 9
 # Counter Values:
diff --git a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
index 52b7b70cab9a1..8e8db667d329d 100644
--- a/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
+++ b/clang/test/Profile/Inputs/misexpect-switch-nonconst.proftext
@@ -1,6 +1,6 @@
 main
 # Func Hash:
-1965403898329309329
+3721743393642630379
 # Num Counters:
 10
 # Counter Values:
diff --git a/clang/test/Profile/Inputs/misexpect-switch.proftext b/clang/test/Profile/Inputs/misexpect-switch.proftext
index ce4c96b3e3a61..ce41cd0552d3a 100644
--- a/clang/test/Profile/Inputs/misexpect-switch.proftext
+++ b/clang/test/Profile/Inputs/misexpect-switch.proftext
@@ -1,6 +1,6 @@
 main
 # Func Hash:
-1965403898329309329
+872687477373597607
 # Num Counters:
 9
 # Counter Values:
diff --git a/clang/test/Profile/c-collision.c b/clang/test/Profile/c-collision.c
new file mode 100644
index 0000000000000..fabecd752b4ef
--- /dev/null
+++ b/clang/test/Profile/c-collision.c
@@ -0,0 +1,22 @@
+// Test that a slight change in the code leads to a different hash.
+// RUN: %clang_cc1 -UEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-NOEXTRA
+// RUN: %clang_cc1 -DEXTRA -triple x86_64-unknown-linux-gnu -main-file-name c-collision.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s --check-prefix=CHECK-EXTRA
+
+// CHECK-NOEXTRA: @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 7156072912471487002,
+// CHECK-EXTRA:   @__profd_foo = private global { {{.*}} } { i64 6699318081062747564, i64 -4383447408116050035,
+
+extern int bar;
+void foo() {
+  if (bar) {
+  }
+  if (bar) {
+  }
+  if (bar) {
+    if (bar) {
+#ifdef EXTRA
+      if (bar) {
+      }
+#endif
+    }
+  }
+}
diff --git a/clang/test/Profile/c-general.c b/clang/test/Profile/c-general.c
index 22b4288a5fd69..a7f03e872881f 100644
--- a/clang/test/Profile/c-general.c
+++ b/clang/test/Profile/c-general.c
@@ -4,6 +4,7 @@
 
 // RUN: llvm-profdata merge %S/Inputs/c-general.proftext -o %t.profdata
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v5 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v3 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
 // Also check compatibility with older profiles.
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v1 | FileCheck -allow-deprecated-dag-overlap  -check-prefix=PGOUSE %s
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index cdd50d2d5ebca..62a0c6955708e 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -979,6 +979,9 @@ enum ProfVersion {
   Version4 = 4,
   // In this version, the frontend PGO stable hash algorithm defaults to V2.
   Version5 = 5,
+  // In this version, the frontend PGO stable hash algorithm got fixed and
+  // may produce hashes different from Version5.
+  Version6 = 6,
   // The current version is 5.
   CurrentVersion = INSTR_PROF_INDEX_VERSION
 };
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 5e5f4ff941f36..a6913527e67f0 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -657,7 +657,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 5
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 5
+#define INSTR_PROF_INDEX_VERSION 6
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 3
 

From 0b5d81e6bbad1656c2e059621948967aaeaa5702 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Thu, 21 May 2020 05:30:49 +0000
Subject: [PATCH 189/770] Automatically configure MLIR when flang is enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is more friendly than the "Unknown CMake command “mlir_tablegen”."
that would be issued instead.

Differential Revision: https://reviews.llvm.org/D80359
---
 llvm/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 06b8646ca37ba..b224393ac16a6 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -81,6 +81,10 @@ set(LLVM_ENABLE_PROJECTS "" CACHE STRING
 if( LLVM_ENABLE_PROJECTS STREQUAL "all" )
   set( LLVM_ENABLE_PROJECTS ${LLVM_ALL_PROJECTS})
 endif()
+if ("flang" IN_LIST LLVM_ENABLE_PROJECTS AND NOT "mlir" IN_LIST LLVM_ENABLE_PROJECTS)
+  message(STATUS "Enabling MLIR as a dependency to flang")
+  list(APPEND LLVM_ENABLE_PROJECTS "mlir")
+endif()
 
 # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the
 # `LLVM_ENABLE_PROJECTS` CMake cache variable.  This exists for

From 602d9b0afc77828f419869289b159a567c62ae81 Mon Sep 17 00:00:00 2001
From: Saiyedul Islam <Saiyedul.Islam@amd.com>
Date: Thu, 14 May 2020 06:09:04 +0000
Subject: [PATCH 190/770] [OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN
 architecture - Part 1

Summary:
Allow AMDGCN as a GPU offloading target for OpenMP during compiler
invocation and allow setting CUDAMode for it.

Originally authored by Greg Rodgers (@gregrodgers).

Reviewers: ronlieb, yaxunl, b-sumner, scchan, JonChesterfield, jdoerfert, sameerds, msearles, hliao, arsenm

Reviewed By: sameerds

Subscribers: sstefan1, jvesely, wdng, arsenm, guansong, dexonsmith, cfe-commits, llvm-commits, gregrodgers

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D79754
---
 clang/lib/AST/Decl.cpp                        |  9 +++++++
 clang/lib/Frontend/CompilerInvocation.cpp     | 12 +++++----
 clang/test/Driver/openmp-offload-gpu.c        | 21 +++++++++++++--
 .../OpenMP/amdgcn_device_function_call.cpp    | 27 +++++++++++++++++++
 .../OpenMP/target_parallel_no_exceptions.cpp  |  1 +
 llvm/include/llvm/ADT/Triple.h                |  3 +++
 6 files changed, 66 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/OpenMP/amdgcn_device_function_call.cpp

diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 27b3ae3ef00ec..e6800073ee58d 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3224,6 +3224,15 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const {
       !(BuiltinID == Builtin::BIprintf || BuiltinID == Builtin::BImalloc))
     return 0;
 
+  // As AMDGCN implementation of OpenMP does not have a device-side standard
+  // library, none of the predefined library functions except printf and malloc
+  // should be treated as a builtin i.e. 0 should be returned for them.
+  if (Context.getTargetInfo().getTriple().isAMDGCN() &&
+      Context.getLangOpts().OpenMPIsDevice &&
+      Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID) &&
+      !(BuiltinID == Builtin::BIprintf || BuiltinID == Builtin::BImalloc))
+    return 0;
+
   return BuiltinID;
 }
 
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index f98490cd9a114..1d820090f8109 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3109,7 +3109,8 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
 
   // Set the flag to prevent the implementation from emitting device exception
   // handling code for those requiring so.
-  if ((Opts.OpenMPIsDevice && T.isNVPTX()) || Opts.OpenCLCPlusPlus) {
+  if ((Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN())) ||
+      Opts.OpenCLCPlusPlus) {
     Opts.Exceptions = 0;
     Opts.CXXExceptions = 0;
   }
@@ -3143,6 +3144,7 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
             TT.getArch() == llvm::Triple::ppc64le ||
             TT.getArch() == llvm::Triple::nvptx ||
             TT.getArch() == llvm::Triple::nvptx64 ||
+            TT.getArch() == llvm::Triple::amdgcn ||
             TT.getArch() == llvm::Triple::x86 ||
             TT.getArch() == llvm::Triple::x86_64))
         Diags.Report(diag::err_drv_invalid_omp_target) << A->getValue(i);
@@ -3160,13 +3162,13 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
           << Opts.OMPHostIRFile;
   }
 
-  // Set CUDA mode for OpenMP target NVPTX if specified in options
-  Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && T.isNVPTX() &&
+  // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
+  Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
                         Args.hasArg(options::OPT_fopenmp_cuda_mode);
 
-  // Set CUDA mode for OpenMP target NVPTX if specified in options
+  // Force the full runtime for OpenMP target NVPTX/AMDGCN if specified
   Opts.OpenMPCUDAForceFullRuntime =
-      Opts.OpenMPIsDevice && T.isNVPTX() &&
+      Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
       Args.hasArg(options::OPT_fopenmp_cuda_force_full_runtime);
 
   // Record whether the __DEPRECATED define was requested.
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index dc4dbd1f37c97..6415f1d61b720 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -6,6 +6,7 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: powerpc-registered-target
 // REQUIRES: nvptx-registered-target
+// REQUIRES: amdgpu-registered-target
 
 /// ###########################################################################
 
@@ -254,24 +255,40 @@
 // RUN:   | FileCheck -check-prefix=CUDA_MODE %s
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-mode -fopenmp-cuda-mode 2>&1 \
 // RUN:   | FileCheck -check-prefix=CUDA_MODE %s
-// CUDA_MODE: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda"
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-mode 2>&1 \
+// RUN:   | FileCheck -check-prefix=CUDA_MODE %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-mode -fopenmp-cuda-mode 2>&1 \
+// RUN:   | FileCheck -check-prefix=CUDA_MODE %s
+// CUDA_MODE: clang{{.*}}"-cc1"{{.*}}"-triple" "{{nvptx64-nvidia-cuda|amdgcn-amd-amdhsa}}"
 // CUDA_MODE-SAME: "-fopenmp-cuda-mode"
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-mode 2>&1 \
 // RUN:   | FileCheck -check-prefix=NO_CUDA_MODE %s
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-mode -fno-openmp-cuda-mode 2>&1 \
 // RUN:   | FileCheck -check-prefix=NO_CUDA_MODE %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-mode 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_CUDA_MODE %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-mode -fno-openmp-cuda-mode 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_CUDA_MODE %s
 // NO_CUDA_MODE-NOT: "-{{fno-|f}}openmp-cuda-mode"
 
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-force-full-runtime 2>&1 \
 // RUN:   | FileCheck -check-prefix=FULL_RUNTIME %s
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-force-full-runtime -fopenmp-cuda-force-full-runtime 2>&1 \
 // RUN:   | FileCheck -check-prefix=FULL_RUNTIME %s
-// FULL_RUNTIME: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda"
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-force-full-runtime 2>&1 \
+// RUN:   | FileCheck -check-prefix=FULL_RUNTIME %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-force-full-runtime -fopenmp-cuda-force-full-runtime 2>&1 \
+// RUN:   | FileCheck -check-prefix=FULL_RUNTIME %s
+// FULL_RUNTIME: clang{{.*}}"-cc1"{{.*}}"-triple" "{{nvptx64-nvidia-cuda|amdgcn-amd-amdhsa}}"
 // FULL_RUNTIME-SAME: "-fopenmp-cuda-force-full-runtime"
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-force-full-runtime 2>&1 \
 // RUN:   | FileCheck -check-prefix=NO_FULL_RUNTIME %s
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-force-full-runtime -fno-openmp-cuda-force-full-runtime 2>&1 \
 // RUN:   | FileCheck -check-prefix=NO_FULL_RUNTIME %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-force-full-runtime 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_FULL_RUNTIME %s
+// RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-force-full-runtime -fno-openmp-cuda-force-full-runtime 2>&1 \
+// RUN:   | FileCheck -check-prefix=NO_FULL_RUNTIME %s
 // NO_FULL_RUNTIME-NOT: "-{{fno-|f}}openmp-cuda-force-full-runtime"
 
 // RUN:   %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-teams-reduction-recs-num=2048 2>&1 \
diff --git a/clang/test/OpenMP/amdgcn_device_function_call.cpp b/clang/test/OpenMP/amdgcn_device_function_call.cpp
new file mode 100644
index 0000000000000..443600072396b
--- /dev/null
+++ b/clang/test/OpenMP/amdgcn_device_function_call.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: llvm-dis < %t-ppc-host.bc | FileCheck %s -check-prefix=HOST
+
+// device side declarations
+#pragma omp declare target
+extern "C" float cosf(float __x);
+#pragma omp end declare target
+
+// host side declaration
+extern "C" float cosf(float __x);
+
+void test_amdgcn_openmp_device(float __x) {
+  // the default case where predefined library functions are treated as
+  // builtins on the host
+  // HOST: call float @llvm.cos.f32(float
+  __x = cosf(__x);
+
+#pragma omp target
+  {
+    // cosf should not be treated as builtin on device
+    // CHECK-NOT: call float @llvm.cos.f32(float
+    __x = cosf(__x);
+  }
+}
diff --git a/clang/test/OpenMP/target_parallel_no_exceptions.cpp b/clang/test/OpenMP/target_parallel_no_exceptions.cpp
index 95189a3ade2b3..ab1479c0eb5a4 100644
--- a/clang/test/OpenMP/target_parallel_no_exceptions.cpp
+++ b/clang/test/OpenMP/target_parallel_no_exceptions.cpp
@@ -1,6 +1,7 @@
 /// Make sure no exception messages are inclided in the llvm output.
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHK-EXCEPTION
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHK-EXCEPTION
 
 void test_increment() {
 #pragma omp target
diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h
index a00b3b2495cd0..fa437a57520aa 100644
--- a/llvm/include/llvm/ADT/Triple.h
+++ b/llvm/include/llvm/ADT/Triple.h
@@ -692,6 +692,9 @@ class Triple {
     return getArch() == Triple::nvptx || getArch() == Triple::nvptx64;
   }
 
+  /// Tests whether the target is AMDGCN
+  bool isAMDGCN() const { return getArch() == Triple::amdgcn; }
+
   bool isAMDGPU() const {
     return getArch() == Triple::r600 || getArch() == Triple::amdgcn;
   }

From fc44da746faab5c0ad20e9de8b8fca43b7c5f408 Mon Sep 17 00:00:00 2001
From: Daniil Suchkov <suc-daniil@yandex.ru>
Date: Mon, 25 May 2020 14:59:25 +0700
Subject: [PATCH 191/770] Add test exposing a bug in SimpleLoopUnswitch.

---
 ...dead-blocks-uses-in-unreachablel-blocks.ll | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll

diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll b/llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll
new file mode 100644
index 0000000000000..4dec9a3919aa9
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll
@@ -0,0 +1,33 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: opt < %s -passes='unswitch<nontrivial>' -disable-output
+; RUN: opt < %s -simple-loop-unswitch -enable-nontrivial-unswitch -disable-output
+
+
+; Make sure we don't crash due to a dangling use of %tmp2 in bb7.
+define void @test.use_in_dead_block(i1 %arg1, i1 %arg2) {
+bb1:
+  br label %bb2
+
+bb2:                                              ; preds = %bb4, %bb1
+  %tmp1 = phi i64 [ 0, %bb4 ], [ 42, %bb1 ]
+  br i1 %arg1, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb2
+  br i1 %arg2, label %bb5, label %bb4
+
+bb4:                                              ; preds = %bb3
+  %tmp2 = add i32 1, 1
+  br label %bb2
+
+bb5:                                             ; preds = %bb3
+  ret void
+
+bb6:                                             ; preds = %bb2
+  %tmp3 = add i64 %tmp1, 1
+  ret void
+
+bb7:                                             ; No predecessors!
+  %tmp4 = add i32 %tmp2, 1
+  ret void
+}

From dedaf3a2ac59548c70a0d54da7267bbb082782c0 Mon Sep 17 00:00:00 2001
From: "Kazushi (Jam) Marukawa" <marukawa@nec.com>
Date: Wed, 27 May 2020 09:39:39 +0200
Subject: [PATCH 192/770] [VE] Dynamic stack allocation

Summary:
This patch implements dynamic stack allocation for the VE target. Changes:
* compiler-rt: `__ve_grow_stack` to request stack allocation on the VE.
* VE: base pointer support, dynamic stack allocation.

Differential Revision: https://reviews.llvm.org/D79084
---
 .../cmake/Modules/CompilerRTUtils.cmake       |   3 +
 compiler-rt/cmake/base-config-ix.cmake        |   2 +
 compiler-rt/cmake/builtin-config-ix.cmake     |   8 +-
 compiler-rt/lib/builtins/CMakeLists.txt       |   6 +
 compiler-rt/lib/builtins/ve/grow_stack.S      |  31 ++++
 .../lib/builtins/ve/grow_stack_align.S        |  31 ++++
 llvm/lib/Target/VE/VECallingConv.td           |   3 +
 llvm/lib/Target/VE/VEFrameLowering.cpp        | 148 +++++++++---------
 llvm/lib/Target/VE/VEFrameLowering.h          |  22 +--
 llvm/lib/Target/VE/VEISelLowering.cpp         |  73 +++++++++
 llvm/lib/Target/VE/VEISelLowering.h           |   7 +-
 llvm/lib/Target/VE/VEInstrInfo.cpp            |  41 ++++-
 llvm/lib/Target/VE/VEInstrInfo.h              |   1 +
 llvm/lib/Target/VE/VEInstrInfo.td             |  11 ++
 llvm/lib/Target/VE/VERegisterInfo.cpp         |  14 +-
 llvm/lib/Target/VE/VESubtarget.h              |   2 +-
 llvm/test/CodeGen/VE/alloca.ll                |  25 +++
 llvm/test/CodeGen/VE/alloca_aligned.ll        |  29 ++++
 18 files changed, 367 insertions(+), 90 deletions(-)
 create mode 100644 compiler-rt/lib/builtins/ve/grow_stack.S
 create mode 100644 compiler-rt/lib/builtins/ve/grow_stack_align.S
 create mode 100644 llvm/test/CodeGen/VE/alloca.ll
 create mode 100644 llvm/test/CodeGen/VE/alloca_aligned.ll

diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake
index a83e916990d72..0a686e38ff886 100644
--- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake
+++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake
@@ -166,6 +166,7 @@ macro(detect_target_arch)
   check_symbol_exists(__sparcv9 "" __SPARCV9)
   check_symbol_exists(__wasm32__ "" __WEBASSEMBLY32)
   check_symbol_exists(__wasm64__ "" __WEBASSEMBLY64)
+  check_symbol_exists(__ve__ "" __VE)
   if(__ARM)
     add_default_target_arch(arm)
   elseif(__AARCH64)
@@ -200,6 +201,8 @@ macro(detect_target_arch)
     add_default_target_arch(wasm32)
   elseif(__WEBASSEMBLY64)
     add_default_target_arch(wasm64)
+  elseif(__VE)
+    add_default_target_arch(ve)
   endif()
 endmacro()
 
diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake
index 234cd7262b724..964dd598f1022 100644
--- a/compiler-rt/cmake/base-config-ix.cmake
+++ b/compiler-rt/cmake/base-config-ix.cmake
@@ -237,6 +237,8 @@ macro(test_targets)
       test_target_arch(wasm32 "" "--target=wasm32-unknown-unknown")
     elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "wasm64")
       test_target_arch(wasm64 "" "--target=wasm64-unknown-unknown")
+    elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "ve")
+      test_target_arch(ve "__ve__" "--target=ve-unknown-none")
     endif()
     set(COMPILER_RT_OS_SUFFIX "")
   endif()
diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 1bd7ad46df444..5f4275ae54d4c 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -37,6 +37,7 @@ set(SPARC sparc)
 set(SPARCV9 sparcv9)
 set(WASM32 wasm32)
 set(WASM64 wasm64)
+set(VE ve)
 
 if(APPLE)
   set(ARM64 arm64 arm64e)
@@ -44,8 +45,11 @@ if(APPLE)
   set(X86_64 x86_64 x86_64h)
 endif()
 
-set(ALL_BUILTIN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
-    ${HEXAGON} ${MIPS32} ${MIPS64} ${PPC64} ${RISCV32} ${RISCV64} ${SPARC} ${SPARCV9} ${WASM32} ${WASM64})
+set(ALL_BUILTIN_SUPPORTED_ARCH
+  ${X86} ${X86_64} ${ARM32} ${ARM64}
+  ${HEXAGON} ${MIPS32} ${MIPS64} ${PPC64}
+  ${RISCV32} ${RISCV64} ${SPARC} ${SPARCV9}
+  ${WASM32} ${WASM64} ${VE})
 
 include(CompilerRTUtils)
 include(CompilerRTDarwinUtils)
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index f63f06c3bfa20..f8431bdcf059f 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -573,6 +573,12 @@ set(wasm64_SOURCES
   ${GENERIC_SOURCES}
 )
 
+set(ve_SOURCES
+  ve/grow_stack.S
+  ve/grow_stack_align.S
+  ${GENERIC_TF_SOURCES}
+  ${GENERIC_SOURCES})
+
 add_custom_target(builtins)
 set_target_properties(builtins PROPERTIES FOLDER "Compiler-RT Misc")
 
diff --git a/compiler-rt/lib/builtins/ve/grow_stack.S b/compiler-rt/lib/builtins/ve/grow_stack.S
new file mode 100644
index 0000000000000..f403798495af6
--- /dev/null
+++ b/compiler-rt/lib/builtins/ve/grow_stack.S
@@ -0,0 +1,31 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "../assembly.h"
+
+// grow_stack routine
+// This routine is VE specific
+// https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v1.1.pdf
+
+// destroy %s62 and %s63 only
+
+#ifdef __ve__
+
+.text
+.p2align        4
+DEFINE_COMPILERRT_FUNCTION(__ve_grow_stack)
+        subu.l          %sp, %sp, %s0           # sp -= alloca size
+        and             %sp, -16, %sp           # align sp
+        brge.l.t        %sp, %sl, 1f
+        ld              %s63, 0x18(,%tp)        # load param area
+        lea             %s62, 0x13b             # syscall # of grow
+        shm.l           %s62, 0x0(%s63)         # stored at addr:0
+        shm.l           %sl, 0x8(%s63)          # old limit at addr:8
+        shm.l           %sp, 0x10(%s63)         # new limit at addr:16
+        monc
+1:
+        b.l             (,%lr)
+END_COMPILERRT_FUNCTION(__ve_grow_stack)
+
+#endif // __ve__
diff --git a/compiler-rt/lib/builtins/ve/grow_stack_align.S b/compiler-rt/lib/builtins/ve/grow_stack_align.S
new file mode 100644
index 0000000000000..19a1dfa8726c7
--- /dev/null
+++ b/compiler-rt/lib/builtins/ve/grow_stack_align.S
@@ -0,0 +1,31 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "../assembly.h"
+
+// grow_stack_align routine
+// This routine is VE specific
+// https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v1.1.pdf
+
+// destroy %s62 and %s63 only
+
+#ifdef __ve__
+
+.text
+.p2align        4
+DEFINE_COMPILERRT_FUNCTION(__ve_grow_stack_align)
+        subu.l          %sp, %sp, %s0           # sp -= alloca size
+        and             %sp, %sp, %s1           # align sp
+        brge.l.t        %sp, %sl, 1f
+        ld              %s63, 0x18(,%tp)        # load param area
+        lea             %s62, 0x13b             # syscall # of grow
+        shm.l           %s62, 0x0(%s63)         # stored at addr:0
+        shm.l           %sl, 0x8(%s63)          # old limit at addr:8
+        shm.l           %sp, 0x10(%s63)         # new limit at addr:16
+        monc
+1:
+        b.l             (,%lr)
+END_COMPILERRT_FUNCTION(__ve_grow_stack_align)
+
+#endif // __ve__
diff --git a/llvm/lib/Target/VE/VECallingConv.td b/llvm/lib/Target/VE/VECallingConv.td
index 5c32962658bbf..4f04dae884ab5 100644
--- a/llvm/lib/Target/VE/VECallingConv.td
+++ b/llvm/lib/Target/VE/VECallingConv.td
@@ -84,3 +84,6 @@ def RetCC_VE : CallingConv<[
 // Callee-saved registers
 def CSR : CalleeSavedRegs<(add (sequence "SX%u", 18, 33))>;
 def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+// PreserveAll (clobbers s62,s63) - used for ve_grow_stack
+def CSR_preserve_all : CalleeSavedRegs<(add (sequence "SX%u", 0, 61))>;
diff --git a/llvm/lib/Target/VE/VEFrameLowering.cpp b/llvm/lib/Target/VE/VEFrameLowering.cpp
index e6cd56285198d..8b10e6466123a 100644
--- a/llvm/lib/Target/VE/VEFrameLowering.cpp
+++ b/llvm/lib/Target/VE/VEFrameLowering.cpp
@@ -30,12 +30,13 @@ using namespace llvm;
 
 VEFrameLowering::VEFrameLowering(const VESubtarget &ST)
     : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(16), 0,
-                          Align(16)) {}
+                          Align(16)),
+      STI(ST) {}
 
 void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
-                                        int NumBytes,
+                                        uint64_t NumBytes,
                                         bool RequireFPUpdate) const {
 
   DebugLoc dl;
@@ -47,6 +48,7 @@ void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
   //    st %lr, 8(,%sp)
   //    st %got, 24(,%sp)
   //    st %plt, 32(,%sp)
+  //    st %s17, 40(,%sp) iff this function is using s17 as BP
   //    or %fp, 0, %sp
 
   BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
@@ -69,6 +71,12 @@ void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
       .addImm(0)
       .addImm(32)
       .addReg(VE::SX16);
+  if (hasBP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
+        .addReg(VE::SX11)
+        .addImm(0)
+        .addImm(40)
+        .addReg(VE::SX17);
   BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX9)
       .addReg(VE::SX11)
       .addImm(0);
@@ -77,7 +85,7 @@ void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
 void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
-                                        int NumBytes,
+                                        uint64_t NumBytes,
                                         bool RequireFPUpdate) const {
 
   DebugLoc dl;
@@ -86,6 +94,7 @@ void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
   // Insert following codes here as epilogue
   //
   //    or %sp, 0, %fp
+  //    ld %s17, 40(,%sp) iff this function is using s17 as BP
   //    ld %got, 32(,%sp)
   //    ld %plt, 24(,%sp)
   //    ld %lr, 8(,%sp)
@@ -94,6 +103,11 @@ void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
   BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX11)
       .addReg(VE::SX9)
       .addImm(0);
+  if (hasBP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX17)
+        .addReg(VE::SX11)
+        .addImm(0)
+        .addImm(40);
   BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX16)
       .addReg(VE::SX11)
       .addImm(0)
@@ -115,7 +129,8 @@ void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
 void VEFrameLowering::emitSPAdjustment(MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
-                                       int NumBytes) const {
+                                       int64_t NumBytes,
+                                       MaybeAlign MaybeAlign) const {
   DebugLoc dl;
   const VEInstrInfo &TII =
       *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
@@ -143,11 +158,17 @@ void VEFrameLowering::emitSPAdjustment(MachineFunction &MF,
       .addReg(VE::SX11)
       .addReg(VE::SX13)
       .addImm(Hi_32(NumBytes));
+
+  if (MaybeAlign) {
+    // and %sp, %sp, ~(Align-1)
+    BuildMI(MBB, MBBI, dl, TII.get(VE::ANDrm), VE::SX11)
+        .addReg(VE::SX11)
+        .addImm(M1(64 - Log2_64(MaybeAlign.valueOrOne().value())));
+  }
 }
 
 void VEFrameLowering::emitSPExtend(MachineFunction &MF, MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   int NumBytes) const {
+                                   MachineBasicBlock::iterator MBBI) const {
   DebugLoc dl;
   const VEInstrInfo &TII =
       *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
@@ -186,11 +207,8 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  const VESubtarget &Subtarget = MF.getSubtarget<VESubtarget>();
-  const VEInstrInfo &TII =
-      *static_cast<const VEInstrInfo *>(Subtarget.getInstrInfo());
-  const VERegisterInfo &RegInfo =
-      *static_cast<const VERegisterInfo *>(Subtarget.getRegisterInfo());
+  const VEInstrInfo &TII = *STI.getInstrInfo();
+  const VERegisterInfo &RegInfo = *STI.getRegisterInfo();
   MachineBasicBlock::iterator MBBI = MBB.begin();
   // Debug location must be unknown since the first debug location is used
   // to determine the end of the prologue.
@@ -209,30 +227,15 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
                        "(probably because it has a dynamic alloca).");
 
   // Get the number of bytes to allocate from the FrameInfo
-  int NumBytes = (int)MFI.getStackSize();
-  // The VE ABI requires a reserved 176-byte area in the user's stack, starting
-  // at %sp + 16. This is for the callee Register Save Area (RSA).
-  //
-  // We therefore need to add that offset to the total stack size
-  // after all the stack objects are placed by
-  // PrologEpilogInserter calculateFrameObjectOffsets. However, since the stack
-  // needs to be aligned *after* the extra size is added, we need to disable
-  // calculateFrameObjectOffsets's built-in stack alignment, by having
-  // targetHandlesStackFrameRounding return true.
-
-  // Add the extra call frame stack size, if needed. (This is the same
-  // code as in PrologEpilogInserter, but also gets disabled by
-  // targetHandlesStackFrameRounding)
-  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
-    NumBytes += MFI.getMaxCallFrameSize();
-
-  // Adds the VE subtarget-specific spill area to the stack
-  // size. Also ensures target-required alignment.
-  NumBytes = Subtarget.getAdjustedFrameSize(NumBytes);
+  uint64_t NumBytes = MFI.getStackSize();
+
+  // The VE ABI requires a reserved 176-byte area at the top of the
+  // stack, as described in VESubtarget.cpp, so we adjust the size here.
+  NumBytes = STI.getAdjustedFrameSize(NumBytes);
 
   // Finally, ensure that the size is sufficiently aligned for the
   // data on the stack.
-  NumBytes = alignTo(NumBytes, MFI.getMaxAlign().value());
+  NumBytes = alignTo(NumBytes, MFI.getMaxAlign());
 
   // Update stack size with corrected value.
   MFI.setStackSize(NumBytes);
@@ -241,16 +244,25 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
   emitPrologueInsns(MF, MBB, MBBI, NumBytes, true);
 
   // Emit stack adjust instructions
-  emitSPAdjustment(MF, MBB, MBBI, -NumBytes);
+  MaybeAlign RuntimeAlign =
+      NeedsStackRealignment ? MaybeAlign(MFI.getMaxAlign()) : None;
+  emitSPAdjustment(MF, MBB, MBBI, -(int64_t)NumBytes, RuntimeAlign);
+
+  if (hasBP(MF)) {
+    // Copy SP to BP.
+    BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX17)
+        .addReg(VE::SX11)
+        .addImm(0);
+  }
 
   // Emit stack extend instructions
-  emitSPExtend(MF, MBB, MBBI, -NumBytes);
+  emitSPExtend(MF, MBB, MBBI);
 
-  unsigned regFP = RegInfo.getDwarfRegNum(VE::SX9, true);
+  Register RegFP = RegInfo.getDwarfRegNum(VE::SX9, true);
 
   // Emit ".cfi_def_cfa_register 30".
   unsigned CFIIndex =
-      MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
+      MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, RegFP));
   BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
       .addCFIIndex(CFIIndex);
 
@@ -265,7 +277,7 @@ MachineBasicBlock::iterator VEFrameLowering::eliminateCallFramePseudoInstr(
     MachineBasicBlock::iterator I) const {
   if (!hasReservedCallFrame(MF)) {
     MachineInstr &MI = *I;
-    int Size = MI.getOperand(0).getImm();
+    int64_t Size = MI.getOperand(0).getImm();
     if (MI.getOpcode() == VE::ADJCALLSTACKDOWN)
       Size = -Size;
 
@@ -281,20 +293,17 @@ void VEFrameLowering::emitEpilogue(MachineFunction &MF,
   DebugLoc dl = MBBI->getDebugLoc();
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  int NumBytes = (int)MFI.getStackSize();
+  uint64_t NumBytes = MFI.getStackSize();
 
   // Emit Epilogue instructions to restore %lr
   emitEpilogueInsns(MF, MBB, MBBI, NumBytes, true);
 }
 
-bool VEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  // Reserve call frame if there are no variable sized objects on the stack.
-  return !MF.getFrameInfo().hasVarSizedObjects();
-}
-
 // hasFP - Return true if the specified function should have a dedicated frame
-// pointer register.  This is true if the function has variable sized allocas or
-// if frame pointer elimination is disabled.
+// pointer register.  This is true if the function has variable sized allocas
+// or if frame pointer elimination is disabled.  In the case of VE, we don't
+// implement an FP eliminator yet, but we return false from this function so
+// that generated code does not refer to fp.
 bool VEFrameLowering::hasFP(const MachineFunction &MF) const {
   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
 
@@ -304,44 +313,41 @@ bool VEFrameLowering::hasFP(const MachineFunction &MF) const {
          MFI.isFrameAddressTaken();
 }
 
+bool VEFrameLowering::hasBP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+  return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
+}
+
 int VEFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                             Register &FrameReg) const {
-  const VESubtarget &Subtarget = MF.getSubtarget<VESubtarget>();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  const VERegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const VERegisterInfo *RegInfo = STI.getRegisterInfo();
   const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
   bool isFixed = MFI.isFixedObjectIndex(FI);
 
-  // Addressable stack objects are accessed using neg. offsets from
-  // %fp, or positive offsets from %sp.
-  bool UseFP = true;
+  int64_t FrameOffset = MF.getFrameInfo().getObjectOffset(FI);
 
-  // VE uses FP-based references in general, even when "hasFP" is
-  // false. That function is rather a misnomer, because %fp is
-  // actually always available, unless isLeafProc.
   if (FuncInfo->isLeafProc()) {
     // If there's a leaf proc, all offsets need to be %sp-based,
     // because we haven't caused %fp to actually point to our frame.
-    UseFP = false;
-  } else if (isFixed) {
-    // Otherwise, argument access should always use %fp.
-    UseFP = true;
-  } else if (RegInfo->needsStackRealignment(MF)) {
-    // If there is dynamic stack realignment, all local object
-    // references need to be via %sp, to take account of the
-    // re-alignment.
-    UseFP = false;
+    FrameReg = VE::SX11; // %sp
+    return FrameOffset + MF.getFrameInfo().getStackSize();
   }
-
-  int64_t FrameOffset = MF.getFrameInfo().getObjectOffset(FI);
-
-  if (UseFP) {
-    FrameReg = RegInfo->getFrameRegister(MF);
-    return FrameOffset;
+  if (RegInfo->needsStackRealignment(MF) && !isFixed) {
+    // If there is dynamic stack realignment, all local object
+    // references need to be via %sp or %s17 (bp), to take account
+    // of the re-alignment.
+    if (hasBP(MF))
+      FrameReg = VE::SX17; // %bp
+    else
+      FrameReg = VE::SX11; // %sp
+    return FrameOffset + MF.getFrameInfo().getStackSize();
   }
-
-  FrameReg = VE::SX11; // %sp
-  return FrameOffset + MF.getFrameInfo().getStackSize();
+  // Finally, default to using %fp.
+  FrameReg = RegInfo->getFrameRegister(MF);
+  return FrameOffset;
 }
 
 bool VEFrameLowering::isLeafProc(MachineFunction &MF) const {
diff --git a/llvm/lib/Target/VE/VEFrameLowering.h b/llvm/lib/Target/VE/VEFrameLowering.h
index de0227e613bce..b548d663c5043 100644
--- a/llvm/lib/Target/VE/VEFrameLowering.h
+++ b/llvm/lib/Target/VE/VEFrameLowering.h
@@ -28,18 +28,23 @@ class VEFrameLowering : public TargetFrameLowering {
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
   void emitPrologueInsns(MachineFunction &MF, MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator MBBI, int NumBytes,
+                         MachineBasicBlock::iterator MBBI, uint64_t NumBytes,
                          bool RequireFPUpdate) const;
   void emitEpilogueInsns(MachineFunction &MF, MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator MBBI, int NumBytes,
+                         MachineBasicBlock::iterator MBBI, uint64_t NumBytes,
                          bool RequireFPUpdate) const;
 
   MachineBasicBlock::iterator
   eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I) const override;
 
-  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool hasBP(const MachineFunction &MF) const;
   bool hasFP(const MachineFunction &MF) const override;
+  // VE always reserves argument space for the call sites in a function
+  // immediately on entry to that function.
+  bool hasReservedCallFrame(const MachineFunction &MF) const override {
+    return true;
+  }
   void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                             RegScavenger *RS = nullptr) const override;
 
@@ -58,10 +63,8 @@ class VEFrameLowering : public TargetFrameLowering {
     return Offsets;
   }
 
-  /// targetHandlesStackFrameRounding - Returns true if the target is
-  /// responsible for rounding up the stack frame (probably at emitPrologue
-  /// time).
-  bool targetHandlesStackFrameRounding() const override { return true; }
+protected:
+  const VESubtarget &STI;
 
 private:
   // Returns true if MF is a leaf procedure.
@@ -69,11 +72,12 @@ class VEFrameLowering : public TargetFrameLowering {
 
   // Emits code for adjusting SP in function prologue/epilogue.
   void emitSPAdjustment(MachineFunction &MF, MachineBasicBlock &MBB,
-                        MachineBasicBlock::iterator MBBI, int NumBytes) const;
+                        MachineBasicBlock::iterator MBBI, int64_t NumBytes,
+                        MaybeAlign MayAlign = MaybeAlign()) const;
 
   // Emits code for extending SP in function prologue/epilogue.
   void emitSPExtend(MachineFunction &MF, MachineBasicBlock &MBB,
-                    MachineBasicBlock::iterator MBBI, int NumBytes) const;
+                    MachineBasicBlock::iterator MBBI) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 8c611f7f292c6..cbdf861307b34 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -583,6 +583,11 @@ VETargetLowering::VETargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
   /// } VAARG handling
 
+  /// Stack {
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+  /// } Stack
+
   /// Int Ops {
   for (MVT IntVT : {MVT::i32, MVT::i64}) {
     // VE has no REM or DIVREM operations.
@@ -641,6 +646,7 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
     TARGET_NODE_CASE(Lo)
     TARGET_NODE_CASE(Hi)
     TARGET_NODE_CASE(GETFUNPLT)
+    TARGET_NODE_CASE(GETSTACKTOP)
     TARGET_NODE_CASE(GETTLSADDR)
     TARGET_NODE_CASE(CALL)
     TARGET_NODE_CASE(RET_FLAG)
@@ -860,12 +866,79 @@ SDValue VETargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
 }
 
+SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // Generate following code.
+  //   (void)__ve_grow_stack(size); // or __ve_grow_stack_align(size, mask)
+  //   ret = GETSTACKTOP;        // pseudo instruction
+  SDLoc DL(Op);
+
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  MaybeAlign Alignment(Op.getConstantOperandVal(2));
+  EVT VT = Node->getValueType(0);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+
+  const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+  Align StackAlign = TFI.getStackAlign();
+  bool NeedsAlign = Alignment.valueOrOne() > StackAlign;
+
+  // Prepare arguments
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Node = Size;
+  Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+  Args.push_back(Entry);
+  if (NeedsAlign) {
+    Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
+    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Args.push_back(Entry);
+  }
+  Type *RetTy = Type::getVoidTy(*DAG.getContext());
+
+  EVT PtrVT = Op.getValueType();
+  SDValue Callee;
+  if (NeedsAlign) {
+    Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
+  } else {
+    Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
+  }
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL)
+      .setChain(Chain)
+      .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
+      .setDiscardResult(true);
+  std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
+  Chain = pair.second;
+  SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
+  if (NeedsAlign) {
+    Result = DAG.getNode(ISD::ADD, DL, VT, Result,
+                         DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
+    Result = DAG.getNode(ISD::AND, DL, VT, Result,
+                         DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
+  }
+  //  Chain = Result.getValue(1);
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+                             DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
+
+  SDValue Ops[2] = {Result, Chain};
+  return DAG.getMergeValues(Ops, DL);
+}
+
 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default:
     llvm_unreachable("Should not custom lower this!");
   case ISD::BlockAddress:
     return LowerBlockAddress(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return lowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::GlobalAddress:
     return LowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress:
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
index a3ead990bccf8..097960f05a830 100644
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -27,8 +27,10 @@ enum NodeType : unsigned {
   Hi,
   Lo, // Hi/Lo operations, typically on a global address.
 
-  GETFUNPLT,  // load function address through %plt insturction
-  GETTLSADDR, // load address for TLS access
+  GETFUNPLT,   // load function address through %plt instruction
+  GETTLSADDR,  // load address for TLS access
+  GETSTACKTOP, // retrieve address of stack top (first address of
+               // locals and temporaries)
 
   CALL,            // A call instruction.
   RET_FLAG,        // Return with a flag operand.
@@ -81,6 +83,7 @@ class VETargetLowering : public TargetLowering {
   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerToTLSGeneralDynamicModel(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   /// } Custom Lower
 
   SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 02a63f4aa3656..aa19c6ce0687c 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -25,7 +25,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define DEBUG_TYPE "ve"
+#define DEBUG_TYPE "ve-instr-info"
 
 using namespace llvm;
 
@@ -457,6 +457,9 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent(); // The pseudo instruction is gone now.
     return true;
   }
+  case VE::GETSTACKTOP: {
+    return expandGetStackTopPseudo(MI);
+  }
   }
   return false;
 }
@@ -464,8 +467,8 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 bool VEInstrInfo::expandExtendStackPseudo(MachineInstr &MI) const {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const VEInstrInfo &TII =
-      *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const VESubtarget &STI = MF.getSubtarget<VESubtarget>();
+  const VEInstrInfo &TII = *STI.getInstrInfo();
   DebugLoc dl = MBB.findDebugLoc(MI);
 
   // Create following instructions and multiple basic blocks.
@@ -544,3 +547,35 @@ bool VEInstrInfo::expandExtendStackPseudo(MachineInstr &MI) const {
   MI.eraseFromParent(); // The pseudo instruction is gone now.
   return true;
 }
+
+bool VEInstrInfo::expandGetStackTopPseudo(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const VESubtarget &STI = MF.getSubtarget<VESubtarget>();
+  const VEInstrInfo &TII = *STI.getInstrInfo();
+  DebugLoc DL = MBB->findDebugLoc(MI);
+
+  // Create following instruction
+  //
+  //   dst = %sp + target specific frame + the size of parameter area
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const VEFrameLowering &TFL = *STI.getFrameLowering();
+
+  // The VE ABI requires a reserved 176 bytes area at the top
+  // of stack as described in VESubtarget.cpp.  So, we adjust it here.
+  unsigned NumBytes = STI.getAdjustedFrameSize(0);
+
+  // Also adds the size of parameter area.
+  if (MFI.adjustsStack() && TFL.hasReservedCallFrame(MF))
+    NumBytes += MFI.getMaxCallFrameSize();
+
+  BuildMI(*MBB, MI, DL, TII.get(VE::LEArii))
+      .addDef(MI.getOperand(0).getReg())
+      .addReg(VE::SX11)
+      .addImm(0)
+      .addImm(NumBytes);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return true;
+}
diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h
index 4e28279a6675e..7b6662df1d605 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.h
+++ b/llvm/lib/Target/VE/VEInstrInfo.h
@@ -81,6 +81,7 @@ class VEInstrInfo : public VEGenInstrInfo {
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
   bool expandExtendStackPseudo(MachineInstr &MI) const;
+  bool expandGetStackTopPseudo(MachineInstr &MI) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 87c8015c775b7..c7815efb8c71e 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -414,6 +414,9 @@ def GetTLSAddr : SDNode<"VEISD::GETTLSADDR", SDT_SPCall,
                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                          SDNPVariadic]>;
 
+// GETSTACKTOP
+def GetStackTop : SDNode<"VEISD::GETSTACKTOP", SDTNone,
+                        [SDNPHasChain, SDNPSideEffect]>;
 
 
 //===----------------------------------------------------------------------===//
@@ -1398,6 +1401,14 @@ def EXTEND_STACK_GUARD : Pseudo<(outs), (ins),
                                 "# EXTEND STACK GUARD",
                                 []>;
 
+// Dynamic stack allocation yields a call to __ve_grow_stack for VE targets.
+// These calls are needed to probe the stack when allocating beyond
+// %s8 (%sl - stack limit).
+
+let Uses = [SX11], hasSideEffects = 1 in
+def GETSTACKTOP : Pseudo<(outs I64:$dst), (ins),
+                         "# GET STACK TOP",
+                         [(set iPTR:$dst, (GetStackTop))]>;
 // SETCC pattern matches
 //
 //   CMP  %tmp, lhs, rhs     ; compare lhs and rhs
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
index b0ddc956d7cc5..5783a8df69d24 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -34,12 +34,22 @@ VERegisterInfo::VERegisterInfo() : VEGenRegisterInfo(VE::SX10) {}
 
 const MCPhysReg *
 VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  return CSR_SaveList;
+  switch (MF->getFunction().getCallingConv()) {
+  default:
+    return CSR_SaveList;
+  case CallingConv::PreserveAll:
+    return CSR_preserve_all_SaveList;
+  }
 }
 
 const uint32_t *VERegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                      CallingConv::ID CC) const {
-  return CSR_RegMask;
+  switch (CC) {
+  default:
+    return CSR_RegMask;
+  case CallingConv::PreserveAll:
+    return CSR_preserve_all_RegMask;
+  }
 }
 
 const uint32_t *VERegisterInfo::getNoPreservedMask() const {
diff --git a/llvm/lib/Target/VE/VESubtarget.h b/llvm/lib/Target/VE/VESubtarget.h
index e9637cc16023b..f3a2c206162e9 100644
--- a/llvm/lib/Target/VE/VESubtarget.h
+++ b/llvm/lib/Target/VE/VESubtarget.h
@@ -42,7 +42,7 @@ class VESubtarget : public VEGenSubtargetInfo {
               const TargetMachine &TM);
 
   const VEInstrInfo *getInstrInfo() const override { return &InstrInfo; }
-  const TargetFrameLowering *getFrameLowering() const override {
+  const VEFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
   const VERegisterInfo *getRegisterInfo() const override {
diff --git a/llvm/test/CodeGen/VE/alloca.ll b/llvm/test/CodeGen/VE/alloca.ll
new file mode 100644
index 0000000000000..a4d349fefd0a4
--- /dev/null
+++ b/llvm/test/CodeGen/VE/alloca.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
+
+declare void @bar(i8*, i64)
+
+; Function Attrs: nounwind
+define void @test(i64 %n) {
+; CHECK-LABEL: test:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    or %s1, 0, %s0
+; CHECK-NEXT:    lea %s0, 15(, %s0)
+; CHECK-NEXT:    and %s0, -16, %s0
+; CHECK-NEXT:    lea %s2, __ve_grow_stack@lo
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lea.sl %s12, __ve_grow_stack@hi(, %s2)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    lea %s0, 240(, %s11)
+; CHECK-NEXT:    lea %s2, bar@lo
+; CHECK-NEXT:    and %s2, %s2, (32)0
+; CHECK-NEXT:    lea.sl %s12, bar@hi(, %s2)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %dyna = alloca i8, i64 %n, align 8
+  call void @bar(i8* %dyna, i64 %n)
+  ret void
+}
diff --git a/llvm/test/CodeGen/VE/alloca_aligned.ll b/llvm/test/CodeGen/VE/alloca_aligned.ll
new file mode 100644
index 0000000000000..81cdcb56f792b
--- /dev/null
+++ b/llvm/test/CodeGen/VE/alloca_aligned.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
+
+declare void @bar(i8*, i64)
+
+; Function Attrs: nounwind
+define void @test(i64 %n) {
+; CHECK-LABEL: test:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    or %s2, 0, %s0
+; CHECK-NEXT:    lea %s0, 15(, %s0)
+; CHECK-NEXT:    and %s0, -16, %s0
+; CHECK-NEXT:    lea %s1, __ve_grow_stack_align@lo
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea.sl %s12, __ve_grow_stack_align@hi(, %s1)
+; CHECK-NEXT:    or %s1, -32, (0)1
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    lea %s0, 240(, %s11)
+; CHECK-NEXT:    lea %s0, 31(, %s0)
+; CHECK-NEXT:    and %s0, -32, %s0
+; CHECK-NEXT:    lea %s1, bar@lo
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea.sl %s12, bar@hi(, %s1)
+; CHECK-NEXT:    or %s1, 0, %s2
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %dyna = alloca i8, i64 %n, align 32
+  call void @bar(i8* %dyna, i64 %n)
+  ret void
+}

From a1dfd6d828ac4f8e11e8013b952f0ef080890dcf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Wed, 27 May 2020 01:20:30 -0700
Subject: [PATCH 193/770] [X86] Add helper function to reduce some code
 duplication when shrinking a vector load to a vzext_load.

There's more code for calling CombineTo and replacing the nodes
that I'd like to share, but it's complicated by the getNode call
in the middle that needs to be specific to each opcode.

While there, also make sure we recursively delete the load
we're replacing. It eventually gets removed by a RemoveDeadNodes
call at the end of DAG combine, but we should be more eager about
it. We were inconsistently doing this in some places but not all.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 73 +++++++++++--------------
 1 file changed, 31 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6bf61af00590c..e086c65c40cb5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33596,6 +33596,21 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
   return N;
 }
 
+// Helper to look for a normal load that can be narrowed into a vzload with the
+// specified VT and memory VT. Returns SDValue() on failure.
+static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
+                                  SelectionDAG &DAG) {
+  // Can't if the load is volatile or atomic.
+  if (!LN->isSimple())
+    return SDValue();
+
+  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+  SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+  return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
+                                 LN->getPointerInfo(), LN->getOriginalAlign(),
+                                 LN->getMemOperand()->getFlags());
+}
+
 // Attempt to match a combined shuffle mask against supported unary shuffle
 // instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
@@ -35598,13 +35613,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     if (VT == MVT::v2f64 && Src.hasOneUse() &&
         ISD::isNormalLoad(Src.getNode())) {
       LoadSDNode *LN = cast<LoadSDNode>(Src);
-      // Unless the load is volatile or atomic.
-      if (LN->isSimple()) {
-        SDVTList Tys = DAG.getVTList(MVT::v2f64, MVT::Other);
-        SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
-        SDValue VZLoad = DAG.getMemIntrinsicNode(
-            X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::f64, LN->getPointerInfo(),
-            LN->getOriginalAlign(), LN->getMemOperand()->getFlags());
+      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
         SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
         DCI.CombineTo(N.getNode(), Movddup);
         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
@@ -35786,7 +35795,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
         SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
         SDValue BcastLd = DAG.getMemIntrinsicNode(
             X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
-            LN->getPointerInfo(), LN->getAlign(),
+            LN->getPointerInfo(), LN->getOriginalAlign(),
             LN->getMemOperand()->getFlags());
         DCI.CombineTo(N.getNode(), BcastLd);
         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
@@ -35804,13 +35813,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     // the load is volatile.
     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
       auto *LN = cast<LoadSDNode>(N0);
-      if (LN->isSimple()) {
-        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-        SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
-        SDValue VZLoad = DAG.getMemIntrinsicNode(
-            X86ISD::VZEXT_LOAD, DL, Tys, Ops, VT.getVectorElementType(),
-            LN->getPointerInfo(), LN->getAlign(),
-            LN->getMemOperand()->getFlags());
+      if (SDValue VZLoad =
+              narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
         DCI.CombineTo(N.getNode(), VZLoad);
         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
         DCI.recursivelyDeleteUnusedNodes(LN);
@@ -44541,21 +44545,16 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
     LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
-    // Unless the load is volatile or atomic.
-    if (LN->isSimple()) {
+    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+    MVT MemVT = MVT::getIntegerVT(NumBits);
+    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
       SDLoc dl(N);
-      unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
-      MVT MemVT = MVT::getIntegerVT(NumBits);
-      MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
-      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
-      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
-      SDValue VZLoad = DAG.getMemIntrinsicNode(
-          X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(),
-          LN->getOriginalAlign(), LN->getMemOperand()->getFlags());
       SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
                                     DAG.getBitcast(InVT, VZLoad));
       DCI.CombineTo(N, Convert);
       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      DCI.recursivelyDeleteUnusedNodes(LN);
       return SDValue(N, 0);
     }
   }
@@ -44575,21 +44574,16 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
     LoadSDNode *LN = cast<LoadSDNode>(In);
-    // Unless the load is volatile or atomic.
-    if (LN->isSimple()) {
+    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+    MVT MemVT = MVT::getFloatingPointVT(NumBits);
+    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
       SDLoc dl(N);
-      unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
-      MVT MemVT = MVT::getFloatingPointVT(NumBits);
-      MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
-      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
-      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
-      SDValue VZLoad = DAG.getMemIntrinsicNode(
-          X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(),
-          LN->getOriginalAlign(), LN->getMemOperand()->getFlags());
       SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
                                     DAG.getBitcast(InVT, VZLoad));
       DCI.CombineTo(N, Convert);
       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      DCI.recursivelyDeleteUnusedNodes(LN);
       return SDValue(N, 0);
     }
   }
@@ -44665,18 +44659,13 @@ static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
     // Convert a full vector load into vzload when not all bits are needed.
     if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
       LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
-      // Unless the load is volatile or atomic.
-      if (LN->isSimple()) {
+      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
         SDLoc dl(N);
-        SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
-        SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
-        SDValue VZLoad = DAG.getMemIntrinsicNode(
-            X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, LN->getPointerInfo(),
-            LN->getOriginalAlign(), LN->getMemOperand()->getFlags());
         SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
                                       DAG.getBitcast(MVT::v8i16, VZLoad));
         DCI.CombineTo(N, Convert);
         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+        DCI.recursivelyDeleteUnusedNodes(LN);
         return SDValue(N, 0);
       }
     }

From 65030821d4a6af94b84a33e66a40c08ca26f1526 Mon Sep 17 00:00:00 2001
From: Djordje Todorovic <djordje.todorovic@syrmia.com>
Date: Wed, 27 May 2020 09:42:15 +0200
Subject: [PATCH 194/770] [NFC][Debugify] Format the CheckModuleDebugify output

This fixes the output of the check-debugify option.
Without the patch an example of running the option:

$ opt -check-debugify test.ll -S -o testDebugify.ll
CheckModuleDebugifySkipping module without debugify metadata

After the patch:

$ opt -check-debugify test.ll -S -o testDebugify.ll
CheckModuleDebugify: Skipping module without debugify metadata

Differential Revision: https://reviews.llvm.org/D80553
---
 llvm/lib/Transforms/Utils/Debugify.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 19c73f3840fc6..7c178aba5ad24 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -293,7 +293,7 @@ bool checkDebugifyMetadata(Module &M,
   // Skip modules without debugify metadata.
   NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify");
   if (!NMD) {
-    dbg() << Banner << "Skipping module without debugify metadata\n";
+    dbg() << Banner << ": Skipping module without debugify metadata\n";
     return false;
   }
 

From 84c643358691b8057199e8c8597428ad0d960786 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Wed, 27 May 2020 12:21:19 +0300
Subject: [PATCH 195/770] [DebugInfo] - Fix typo in comment. NFC.

I forgot to address this bit when landing D80476.
---
 llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 51dc54e49fcc9..fe24f3942ffb2 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -304,7 +304,7 @@ constexpr uint64_t getCIEId(bool IsDWARF64, bool IsEH) {
 }
 
 void CIE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
-  // A CIE with a zero length is a terminator entry in the .eh_frame sextion.
+  // A CIE with a zero length is a terminator entry in the .eh_frame section.
   if (IsEH && Length == 0) {
     OS << format("%08" PRIx64, Offset) << " ZERO terminator\n";
     return;

From b101c6251a9bce8dc11f47bce70ee169e9fe5bfe Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 01:42:28 -0700
Subject: [PATCH 196/770] [StackSafety] Ignore some use of values

We should ignore a value used in a MemTransferInst
as anything other than the src/dst argument.
---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     |  4 ++--
 .../Analysis/StackSafetyAnalysis/local.ll     |  3 +--
 .../Analysis/StackSafetyAnalysis/memintrin.ll | 22 +++++++++++++++++--
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 1581ca80726b1..9c937b0b1ddaa 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -256,10 +256,10 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
     const MemIntrinsic *MI, const Use &U, Value *Base) {
   if (auto MTI = dyn_cast<MemTransferInst>(MI)) {
     if (MTI->getRawSource() != U && MTI->getRawDest() != U)
-      return getRange(0, 1);
+      return ConstantRange::getEmpty(PointerSize);
   } else {
     if (MI->getRawDest() != U)
-      return getRange(0, 1);
+      return ConstantRange::getEmpty(PointerSize);
   }
   auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize);
   if (!SE.isSCEVable(MI->getLength()->getType()))
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/local.ll b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
index f749dc07bb768..eeb7ccbe42d3e 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/local.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
@@ -356,11 +356,10 @@ for.cond.cleanup:
   ret i8 %add
 }
 
-; FIXME: we don't understand that %sz in the memset call is limited to 128 by the preceding check.
 define dso_local void @SizeCheck(i32 %sz) {
 ; CHECK-LABEL: @SizeCheck{{$}}
 ; CHECK-NEXT: args uses:
-; CHECK-NEXT: sz[]: [0,1){{$}}
+; CHECK-NEXT: sz[]: empty-set{{$}}
 ; CHECK-NEXT: allocas uses:
 ; CHECK-NEXT: x1[128]: [0,4294967295){{$}}
 ; CHECK-NOT: ]:
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll b/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
index 87c92aca35388..aec124600b542 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
@@ -53,7 +53,7 @@ entry:
 define void @MemsetNonConst(i32 %size) {
 ; CHECK-LABEL: MemsetNonConst dso_preemptable{{$}}
 ; CHECK-NEXT: args uses:
-; CHECK-NEXT: size[]: [0,1){{$}}
+; CHECK-NEXT: size[]: empty-set{{$}}
 ; CHECK-NEXT: allocas uses:
 ; CHECK-NEXT: x[4]: [0,4294967295){{$}}
 ; CHECK-NOT: ]:
@@ -69,7 +69,7 @@ entry:
 define void @MemsetNonConstInBounds(i1 zeroext %z) {
 ; CHECK-LABEL: MemsetNonConstInBounds dso_preemptable{{$}}
 ; CHECK-NEXT: args uses:
-; CHECK-NEXT: z[]: [0,1){{$}}
+; CHECK-NEXT: z[]: empty-set{{$}}
 ; CHECK-NEXT: allocas uses:
 ; CHECK-NEXT: x[4]: [0,4294967295){{$}}
 ; CHECK-NOT: ]:
@@ -81,6 +81,24 @@ entry:
   ret void
 }
 
+define void @MemsetNonConstSize() {
+; CHECK-LABEL: MemsetNonConstSize dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT: x[4]: [0,4294967295){{$}}
+; CHECK-NEXT: y[4]: empty-set{{$}}
+; CHECK-NOT: ]:
+entry:
+  %x = alloca i32, align 4
+  %y = alloca i32, align 4
+  %x1 = bitcast i32* %x to i8*
+  %xint = ptrtoint i32* %x to i32
+  %yint = ptrtoint i32* %y to i32
+  %d = sub i32 %xint, %yint
+  call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 %d, i1 false)
+  ret void
+}
+
 define void @MemcpyInBounds() {
 ; CHECK-LABEL: MemcpyInBounds dso_preemptable{{$}}
 ; CHECK-NEXT: args uses:

From 06a07dd6080c72ca886cc7bb21beef2a372d94cf Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 01:43:27 -0700
Subject: [PATCH 197/770] [StackSafety] Fix formatting in the test

---
 llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll b/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
index aec124600b542..f6e3fedc50f48 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/memintrin.ll
@@ -32,7 +32,7 @@ define void @VolatileMemsetInBounds() {
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4
-    %x1 = bitcast i32* %x to i8*
+  %x1 = bitcast i32* %x to i8*
   call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 4, i1 true)
   ret void
 }
@@ -45,7 +45,7 @@ define void @MemsetOutOfBounds() {
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4
-    %x1 = bitcast i32* %x to i8*
+  %x1 = bitcast i32* %x to i8*
   call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 5, i1 false)
   ret void
 }
@@ -59,7 +59,7 @@ define void @MemsetNonConst(i32 %size) {
 ; CHECK-NOT: ]:
 entry:
   %x = alloca i32, align 4
-    %x1 = bitcast i32* %x to i8*
+  %x1 = bitcast i32* %x to i8*
   call void @llvm.memset.p0i8.i32(i8* %x1, i8 42, i32 %size, i1 false)
   ret void
 }

From f6383643d9e84a139f68cbe19fa16d4969d20d5c Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 02:45:43 -0700
Subject: [PATCH 198/770] [StackSafety] Bailout on some function calls

Don't miss values used in calls outside the regular argument list.
---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp       |  6 ++++++
 llvm/test/Analysis/StackSafetyAnalysis/local.ll | 12 ++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 9c937b0b1ddaa..a44732613c0fa 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -345,12 +345,18 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, UseInfo &US) {
         assert(isa<Function>(Callee) || isa<GlobalAlias>(Callee));
 
         auto B = CB.arg_begin(), E = CB.arg_end();
+        int Found = 0;
         for (auto A = B; A != E; ++A) {
           if (A->get() == V) {
+            ++Found;
             ConstantRange Offset = offsetFrom(UI, Ptr);
             US.Calls.emplace_back(Callee, A - B, Offset);
           }
         }
+        if (!Found) {
+          US.updateRange(UnknownRange);
+          return false;
+        }
 
         break;
       }
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/local.ll b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
index eeb7ccbe42d3e..5dee4b52a2e41 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/local.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/local.ll
@@ -412,3 +412,15 @@ entry:
   %val = load %zerosize_type, %zerosize_type* %p, align 4
   ret void
 }
+
+define void @OperandBundle() {
+; CHECK-LABEL: @OperandBundle dso_preemptable{{$}}
+; CHECK-NEXT: args uses:
+; CHECK-NEXT: allocas uses:
+; CHECK-NEXT:   a[4]: full-set
+; CHECK-NOT: ]:
+entry:
+  %a = alloca i32, align 4
+  call void @LeakAddress() ["unknown"(i32* %a)]
+  ret void
+}

From f2fad3f703aa20cc7b452bdf1605cb46eb960653 Mon Sep 17 00:00:00 2001
From: Konstantin Schwarz <konstantin.schwarz@hightec-rt.com>
Date: Sat, 23 May 2020 13:26:09 +0200
Subject: [PATCH 199/770] [GlobalISel][InlineAsm] Add missing EarlyClobber flag
 to inline asm output operands

Summary:
Previously, we only added early-clobber flags to the 'group' immediate flag operand
of an inline asm operand.
However, we also have to add the EarlyClobber flag to the MachineOperand itself.

This fixes PR46028

Reviewers: arsenm, leonardchan

Reviewed By: arsenm, leonardchan

Subscribers: phosek, wdng, rovka, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80467
---
 .../lib/CodeGen/GlobalISel/InlineAsmLowering.cpp |  3 ++-
 .../GlobalISel/irtranslator-inline-asm.ll        | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index 363b4b59ec544..3ac52b8e3e736 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -379,7 +379,8 @@ bool InlineAsmLowering::lowerInlineAsm(
 
         for (Register Reg : OpInfo.Regs) {
           Inst.addReg(Reg,
-                      RegState::Define | getImplRegState(Reg.isPhysical()));
+                      RegState::Define | getImplRegState(Reg.isPhysical()) |
+                          (OpInfo.isEarlyClobber ? RegState::EarlyClobber : 0));
         }
 
         // Remember this output operand for later processing
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll
index 18540f1b8313f..f1be1011fa865 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll
@@ -23,6 +23,22 @@ define void @asm_simple_register_clobber() {
   ret void
 }
 
+define i64 @asm_register_early_clobber() {
+  ; CHECK-LABEL: name: asm_register_early_clobber
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   INLINEASM &"mov $0, 7; mov $1, 7", 1 /* sideeffect attdialect */, 1441803 /* regdef-ec:GPR64common */, def early-clobber %0, 1441803 /* regdef-ec:GPR64common */, def early-clobber %1, !0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s64) = COPY %0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s64) = COPY %1
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+  ; CHECK:   $x0 = COPY [[ADD]](s64)
+  ; CHECK:   RET_ReallyLR implicit $x0
+  call { i64, i64 } asm sideeffect "mov $0, 7; mov $1, 7", "=&r,=&r"(), !srcloc !0
+  %asmresult = extractvalue { i64, i64 } %1, 0
+  %asmresult1 = extractvalue { i64, i64 } %1, 1
+  %add = add i64 %asmresult, %asmresult1
+  ret i64 %add
+}
+
 define i32 @test_specific_register_output() nounwind ssp {
   ; CHECK-LABEL: name: test_specific_register_output
   ; CHECK: bb.1.entry:

From 410667f1b74c614d9382f180d29f5aa1e42cc5c9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 11:05:55 +0100
Subject: [PATCH 200/770] [X86][SSE] Convert PTEST to MOVMSK for allsign bits
 vector results

If we are using PTEST to check 'allsign bits' vector elements we can use MOVMSK to extract the signbits directly and perform the comparison on the scalar value.

For vXi16 cases, as we don't have a MOVMSK for this type, we must mask each signbit out of a PMOVMSKB v2Xi8 result, which folds into the TEST comparison.

If this allows us to remove a vector op (via the SimplifyMultipleUseDemandedBits call) this is consistently faster than a PTEST (https://godbolt.org/z/ziJUst).

I'm investigating whether we ever get regressions without the SimplifyMultipleUseDemandedBits call, even if this means we don't remove a vector op, but that has exposed some other poor codegen issues that I'm still investigating and would have to wait for a later patch.

Suggested on PR42035 to avoid unnecessary ashr(x,bw-1)/pcmpgt(0,x) sign splat patterns feeding into ptest.

Differential Revision: https://reviews.llvm.org/D80563
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 37 +++++++++++++++++++++++--
 llvm/test/CodeGen/X86/combine-ptest.ll  | 20 ++++++-------
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e086c65c40cb5..d70b5a7f3a227 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40079,7 +40079,8 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
 /// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
 /// to avoid the inversion.
 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
-                              SelectionDAG &DAG) {
+                              SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
   // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
   if (EFLAGS.getOpcode() != X86ISD::PTEST &&
       EFLAGS.getOpcode() != X86ISD::TESTP)
@@ -40141,6 +40142,9 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
 
     if (Op0 == Op1) {
       SDValue BC = peekThroughBitcasts(Op0);
+      EVT BCVT = BC.getValueType();
+      assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
+             "Unexpected vector type");
 
       // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
       if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
@@ -40156,6 +40160,35 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
                            DAG.getBitcast(OpVT, BC.getOperand(0)),
                            DAG.getBitcast(OpVT, BC.getOperand(1)));
       }
+
+      // If every element is an all-sign value, see if we can use MOVMSK to
+      // more efficiently extract the sign bits and compare that.
+      // TODO: Handle TESTC with comparison inversion.
+      // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
+      // MOVMSK combines to make sure its never worse than PTEST?
+      unsigned EltBits = BCVT.getScalarSizeInBits();
+      if (DAG.ComputeNumSignBits(BC) == EltBits) {
+        assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
+        APInt SignMask = APInt::getSignMask(EltBits);
+        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+        if (SDValue Res =
+                TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
+          // For vXi16 cases we need to use pmovmksb and extract every other
+          // sign bit.
+          SDLoc DL(EFLAGS);
+          if (EltBits == 16) {
+            MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
+            Res = DAG.getBitcast(MovmskVT, Res);
+            Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+            Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
+                              DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+          } else {
+            Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+          }
+          return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
+                             DAG.getConstant(0, DL, MVT::i32));
+        }
+      }
     }
 
     // TESTZ(-1,X) == TESTZ(X,X)
@@ -40183,7 +40216,7 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
     return R;
 
-  if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG))
+  if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
     return R;
 
   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
index 2928023c7fc2a..975440cf8297a 100644
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -299,16 +299,15 @@ start:
 }
 
 ;
-; TODO: testz(ashr(X,bw-1),-1) -> movmsk(X)
+; testz(ashr(X,bw-1),-1) -> movmsk(X)
 ;
 
 define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {
 ; CHECK-LABEL: ptestz_v2i64_signbits:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    vmovmskpd %xmm0, %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = ashr <2 x i64> %c, <i64 63, i64 63>
@@ -334,8 +333,8 @@ define i32 @ptestz_v8i32_signbits(<8 x i32> %c, i32 %a, i32 %b) {
 ; AVX2-LABEL: ptestz_v8i32_signbits:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
-; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    vmovmskps %ymm0, %ecx
+; AVX2-NEXT:    testl %ecx, %ecx
 ; AVX2-NEXT:    cmovnel %esi, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -351,8 +350,8 @@ define i32 @ptestz_v8i16_signbits(<8 x i16> %c, i32 %a, i32 %b) {
 ; CHECK-LABEL: ptestz_v8i16_signbits:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpsraw $15, %xmm0, %xmm0
-; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    vpmovmskb %xmm0, %ecx
+; CHECK-NEXT:    testl $43690, %ecx # imm = 0xAAAA
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = ashr <8 x i16> %c, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -380,9 +379,8 @@ define i32 @ptestz_v32i8_signbits(<32 x i8> %c, i32 %a, i32 %b) {
 ; AVX2-LABEL: ptestz_v32i8_signbits:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    testl %ecx, %ecx
 ; AVX2-NEXT:    cmovnel %esi, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq

From 35963f6d8519d7384c9040d629cbb4cf6ff96de8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 11:26:14 +0100
Subject: [PATCH 201/770] VPlanValue.h - reduce unnecessary includes to forward
 declarations. NFC.

---
 llvm/lib/Transforms/Vectorize/VPlanValue.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index efad124930f4a..f73505d0279af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -23,16 +23,14 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 
 // Forward declarations.
-class VPUser;
-
+class raw_ostream;
+class Value;
 class VPSlotTracker;
+class VPUser;
 
 // This is the base class of the VPlan Def/Use graph, used for modeling the data
 // flow into, within and out of the VPlan. VPValues can stand for live-ins

From 019bd6485c52a62c008eacfdf0d13a26ca6b0a6f Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Wed, 27 May 2020 12:39:13 +0200
Subject: [PATCH 202/770] [lldb] Don't complete ObjCInterfaceDecls in
 ClangExternalASTSourceCallbacks::FindExternalVisibleDeclsByName

Summary:
For ObjCInterfaceDecls, LLDB iterates over the `methods` of the interface in FindExternalVisibleDeclsByName
since commit ef423a3ba57045f80b0fcafce72121449a8b54d4 .
However, when LLDB calls `oid->methods()` in that function, Clang will pull in all declarations in the current
DeclContext from the current ExternalASTSource (which is again, `ClangExternalASTSourceCallbacks`). The
reason for that is that `methods()` is just a wrapper for `decls()` which is supposed to provide a list of *all*
(both currently loaded and external) decls in the DeclContext.

However, `ClangExternalASTSourceCallbacks::FindExternalLexicalDecls` doesn't implement support for ObjCInterfaceDecl,
so we don't actually add any declarations and just mark the ObjCInterfaceDecl as having no ExternalLexicalStorage.

As LLDB uses the ExternalLexicalStorage to see if it can complete a type with the ExternalASTSource, this causes
LLDB to think our class can't be completed any further by the ExternalASTSource,
and from then on it will no longer make any CompleteType/FindExternalLexicalDecls calls to that decl. This essentially
renders those types unusable in the expression parser as they will always be considered incomplete.

This patch just changes the call to `methods` (which is just a `decls()` wrapper), to some ad-hoc `noload_methods`
call which is wrapping `noload_decls()`. `noload_decls()` won't trigger any calls to the ExternalASTSource, so
this prevents ExternalLexicalStorage from being set to false.

The test for this is just adding a method to an ObjC interface. Before this patch, this unset the ExternalLexicalStorage
flag and put the interface into the state described above.

In a normal user session this situation was triggered by setting a breakpoint in a method of some ObjC class. This
caused LLDB to create the MethodDecl for that specific method and put it into the ObjCInterfaceDecl.
Also `ObjCLanguageRuntime::LookupInCompleteClassCache` needs to be unable to resolve the type to
an actual definition when the breakpoint is set (I'm not sure how exactly this can happen, but we just
found no Type instance that had the `TypePayloadClang::IsCompleteObjCClass` flag set in its payload in
the situation where this happens. This however doesn't seem to be a regression as logic wasn't changed
from what I can see).

The module-ownership.mm test had to be changed as the only reason why the ObjC interface in that test had
its ExternalLexicalStorage flag set to false was because of this unintended side effect. What actually happens
in the test is that ExternalLexicalStorage is first set to false in `DWARFASTParserClang::CompleteTypeFromDWARF`
when we try to complete the `SomeClass` interface, but then the flag is set back to true once we add
the last ivar of `SomeClass` (see `SetMemberOwningModule` in `TypeSystemClang.cpp` which is called
when we add the ivar). I'll fix the code for that in a follow-up patch.

I think some of the code here needs some rethinking. LLDB and Clang shouldn't infer anything about the ExternalASTSource
and its ability to complete the current type from the `ExternalLexicalStorage` flag. We probably should
also actually provide any declarations when we get asked for the lexical decls of an ObjCInterfaceDecl. But both of those
changes are bigger (and most likely would cause us to eagerly complete more types), so those will be follow up patches
and this patch just brings us back to the state before commit ef423a3ba57045f80b0fcafce72121449a8b54d4 .

Fixes rdar://63584164

Reviewers: aprantl, friss, shafik

Reviewed By: aprantl, shafik

Subscribers: arphaman, abidh, JDevlieghere

Differential Revision: https://reviews.llvm.org/D80556
---
 .../Clang/ClangExternalASTSourceCallbacks.cpp |  3 +-
 .../SymbolFile/DWARF/module-ownership.mm      |  2 +-
 lldb/unittests/Symbol/TestTypeSystemClang.cpp | 34 +++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp
index e4054b441d55a..390afb458b5aa 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp
@@ -53,7 +53,8 @@ bool ClangExternalASTSourceCallbacks::FindExternalVisibleDeclsByName(
   // Objective-C methods are not added into the LookupPtr when they originate
   // from an external source. SetExternalVisibleDeclsForName() adds them.
   if (auto *oid = llvm::dyn_cast<clang::ObjCInterfaceDecl>(DC)) {
-    for (auto *omd : oid->methods())
+    clang::ObjCContainerDecl::method_range noload_methods(oid->noload_decls());
+    for (auto *omd : noload_methods)
       if (omd->getDeclName() == Name)
         decls.push_back(omd);
   }
diff --git a/lldb/test/Shell/SymbolFile/DWARF/module-ownership.mm b/lldb/test/Shell/SymbolFile/DWARF/module-ownership.mm
index b20e08024b9b8..311fd34d40e83 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/module-ownership.mm
+++ b/lldb/test/Shell/SymbolFile/DWARF/module-ownership.mm
@@ -46,7 +46,7 @@ @implementation SomeClass {
 // RUN: lldb-test symbols -dump-clang-ast -find type --language=ObjC++ \
 // RUN:   -compiler-context 'Module:A,Struct:SomeClass' %t.o \
 // RUN:   | FileCheck %s --check-prefix=CHECK-OBJC
-// CHECK-OBJC: ObjCInterfaceDecl {{.*}} imported in A SomeClass
+// CHECK-OBJC: ObjCInterfaceDecl {{.*}} imported in A <undeserialized declarations> SomeClass
 // CHECK-OBJC-NEXT: |-ObjCIvarDecl
 // CHECK-OBJC-NEXT: |-ObjCMethodDecl 0x[[NUMBER:[0-9a-f]+]]{{.*}} imported in A
 // CHECK-OBJC-NEXT: `-ObjCPropertyDecl {{.*}} imported in A number 'int' readonly
diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
index c67168ba5f567..bd7eb14d45330 100644
--- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp
+++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
@@ -688,3 +688,37 @@ TEST_F(TestTypeSystemClang, TestNotDeletingUserCopyCstrDueToMoveCStr) {
   auto *record = llvm::cast<CXXRecordDecl>(ClangUtil::GetAsTagDecl(t));
   EXPECT_TRUE(record->hasUserDeclaredCopyConstructor());
 }
+
+TEST_F(TestTypeSystemClang, AddMethodToObjCObjectType) {
+  // Create an interface decl and mark it as having external storage.
+  CompilerType c = m_ast->CreateObjCClass("A", m_ast->GetTranslationUnitDecl(),
+                                          OptionalClangModuleID(),
+                                          /*IsForwardDecl*/ false,
+                                          /*IsInternal*/ false);
+  ObjCInterfaceDecl *interface = m_ast->GetAsObjCInterfaceDecl(c);
+  m_ast->SetHasExternalStorage(c.GetOpaqueQualType(), true);
+  EXPECT_TRUE(interface->hasExternalLexicalStorage());
+
+  // Add a method to the interface.
+  std::vector<CompilerType> args;
+  CompilerType func_type =
+      m_ast->CreateFunctionType(m_ast->GetBasicType(lldb::eBasicTypeInt),
+                                args.data(), args.size(), /*variadic*/ false,
+                                /*quals*/ 0, clang::CallingConv::CC_C);
+  bool variadic = false;
+  bool artificial = false;
+  bool objc_direct = false;
+  clang::ObjCMethodDecl *method = TypeSystemClang::AddMethodToObjCObjectType(
+      c, "-[A foo]", func_type, lldb::eAccessPublic, artificial, variadic,
+      objc_direct);
+  ASSERT_NE(method, nullptr);
+
+  // The interface decl should still have external lexical storage.
+  EXPECT_TRUE(interface->hasExternalLexicalStorage());
+
+  // Test some properties of the created ObjCMethodDecl.
+  EXPECT_FALSE(method->isVariadic());
+  EXPECT_TRUE(method->isImplicit());
+  EXPECT_FALSE(method->isDirectMethod());
+  EXPECT_EQ(method->getDeclName().getObjCSelector().getAsString(), "foo");
+}

From fc98447af65f5a51d3b62a7e76a056d2556be59d Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Tue, 19 May 2020 15:31:25 +0300
Subject: [PATCH 203/770] [llvm-readobj] - Do not skip building of the GNU hash
 table histogram.

When the `--elf-hash-histogram` is used, the code first tries to build
a histogram for the .hash table and then for the .gnu.hash table.

The problem is that the dumper might return early when it is unable to, or does
not need to, build a histogram for the .hash table.

This patch reorders the code slightly to fix the issue and adds a test case.

Differential revision: https://reviews.llvm.org/D80204
---
 .../llvm-readobj/ELF/hash-histogram.test      | 59 +++++++++++++++++++
 llvm/tools/llvm-readobj/ELFDumper.cpp         | 44 +++++++-------
 2 files changed, 83 insertions(+), 20 deletions(-)

diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test
index f7551b481a863..5447bffceec30 100644
--- a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test
+++ b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test
@@ -207,3 +207,62 @@ ProgramHeaders:
     Sections:
       - Section: .hash
       - Section: .dynamic
+
+## Check we dump a histogram for the .gnu.hash table even when the .hash table is skipped.
+
+## Case A: the .hash table has no data to build histogram and it is skipped.
+##         (NBUCKET == 0x1 is a no-op: it does not change the number of buckets described with the "Bucket" key).
+# RUN: yaml2obj --docnum=5 -DNBUCKET=0x1 %s -o %t5.o
+# RUN: llvm-readelf --elf-hash-histogram %t5.o 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=GNU-HASH --implicit-check-not="Histogram"
+
+## Case B: the .hash table has a broken nbucket field. We report a warning
+##         and skip dumping of the .hash table.
+# RUN: yaml2obj --docnum=5 -DNBUCKET=0xffffffff %s -o %t6.o
+# RUN: llvm-readelf --elf-hash-histogram %t6.o 2>&1 | \
+# RUN:   FileCheck %s -DFILE=%t6.o --check-prefixes=WARN,GNU-HASH
+
+# WARN:     warning: '[[FILE]]': the hash table at offset 0x78 goes past the end of the file (0x350), nbucket = 4294967295, nchain = 1
+# GNU-HASH: Histogram for `.gnu.hash' bucket list length (total of 1 buckets)
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_DYN
+  Machine: EM_X86_64
+Sections:
+  - Name:    .hash
+    Type:    SHT_HASH
+    Flags:   [ SHF_ALLOC ]
+    Bucket:  [ 0 ]
+    NBucket: [[NBUCKET]]
+    Chain:   [ 0 ]
+  - Name:  .gnu.hash
+    Type:  SHT_GNU_HASH
+    Flags: [ SHF_ALLOC ]
+    Header:
+      SymNdx: 0x1
+      Shift2: 0x0
+    BloomFilter: [ 0x0 ]
+    HashBuckets: [ 0x00000001, 0x00000004, 0x00000000 ]
+    HashValues:  [ 0x0B887388 ]
+  - Name:  .dynamic
+    Type:  SHT_DYNAMIC
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Entries:
+      - Tag:   DT_HASH
+        Value: 0x0
+      - Tag:   DT_GNU_HASH
+## sizeof(.hash) == 0x28.
+        Value: 0x28
+      - Tag:   DT_NULL
+        Value: 0x0
+DynamicSymbols:
+  - Name: foo
+ProgramHeaders:
+  - Type:  PT_LOAD
+    Sections:
+      - Section: .hash
+      - Section: .gnu.hash
+      - Section: .dynamic
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 1bcf488c15f8f..c9d45061320dd 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -4562,15 +4562,11 @@ void GNUStyle<ELFT>::printVersionDependencySection(const ELFFile<ELFT> *Obj,
 // Additionally cumulative coverage of symbols for each set of buckets.
 template <class ELFT>
 void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
-  // Print histogram for .hash section
-  if (const Elf_Hash *HashTable = this->dumper()->getHashTable()) {
-    if (!checkHashTable(Obj, HashTable, this->FileName))
-      return;
-
-    size_t NBucket = HashTable->nbucket;
-    size_t NChain = HashTable->nchain;
-    ArrayRef<Elf_Word> Buckets = HashTable->buckets();
-    ArrayRef<Elf_Word> Chains = HashTable->chains();
+  auto PrintHashHist = [&](const Elf_Hash &HashTable) {
+    size_t NBucket = HashTable.nbucket;
+    size_t NChain = HashTable.nchain;
+    ArrayRef<Elf_Word> Buckets = HashTable.buckets();
+    ArrayRef<Elf_Word> Chains = HashTable.chains();
     size_t TotalSyms = 0;
     // If hash table is correct, we have at least chains with 0 length
     size_t MaxChain = 1;
@@ -4604,7 +4600,7 @@ void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
     if (!TotalSyms)
       return;
 
-    std::vector<size_t> Count(MaxChain, 0) ;
+    std::vector<size_t> Count(MaxChain, 0);
     // Count how long is the chain for each bucket
     for (size_t B = 0; B < NBucket; B++)
       ++Count[ChainLen[B]];
@@ -4619,17 +4615,16 @@ void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
                    (Count[I] * 100.0) / NBucket,
                    (CumulativeNonZero * 100.0) / TotalSyms);
     }
-  }
+  };
 
-  // Print histogram for .gnu.hash section
-  if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable()) {
-    size_t NBucket = GnuHashTable->nbuckets;
-    ArrayRef<Elf_Word> Buckets = GnuHashTable->buckets();
+  auto PrintGnuHashHist = [&](const Elf_GnuHash &GnuHashTable) {
+    size_t NBucket = GnuHashTable.nbuckets;
+    ArrayRef<Elf_Word> Buckets = GnuHashTable.buckets();
     unsigned NumSyms = this->dumper()->dynamic_symbols().size();
     if (!NumSyms)
       return;
-    ArrayRef<Elf_Word> Chains = GnuHashTable->values(NumSyms);
-    size_t Symndx = GnuHashTable->symndx;
+    ArrayRef<Elf_Word> Chains = GnuHashTable.values(NumSyms);
+    size_t Symndx = GnuHashTable.symndx;
     size_t TotalSyms = 0;
     size_t MaxChain = 1;
     size_t CumulativeNonZero = 0;
@@ -4655,7 +4650,7 @@ void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
     if (!TotalSyms)
       return;
 
-    std::vector<size_t> Count(MaxChain, 0) ;
+    std::vector<size_t> Count(MaxChain, 0);
     for (size_t B = 0; B < NBucket; B++)
       ++Count[ChainLen[B]];
     // Print Number of buckets with each chain lengths and their cumulative
@@ -4663,13 +4658,22 @@ void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
     OS << "Histogram for `.gnu.hash' bucket list length (total of " << NBucket
        << " buckets)\n"
        << " Length  Number     % of total  Coverage\n";
-    for (size_t I = 0; I <MaxChain; I++) {
+    for (size_t I = 0; I < MaxChain; I++) {
       CumulativeNonZero += Count[I] * I;
       OS << format("%7lu  %-10lu (%5.1f%%)     %5.1f%%\n", I, Count[I],
                    (Count[I] * 100.0) / NBucket,
                    (CumulativeNonZero * 100.0) / TotalSyms);
     }
-  }
+  };
+
+  // Print histogram for the .hash section.
+  if (const Elf_Hash *HashTable = this->dumper()->getHashTable())
+    if (checkHashTable(Obj, HashTable, this->FileName))
+      PrintHashHist(*HashTable);
+
+  // Print histogram for the .gnu.hash section.
+  if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable())
+    PrintGnuHashHist(*GnuHashTable);
 }
 
 template <class ELFT>

From 8062602810fed6fe377deabe8abd563a0c5d1809 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 11:22:13 +0100
Subject: [PATCH 204/770] DOTGraphTraitsPass.h - remove unnecessary includes.
 NFC.

---
 llvm/include/llvm/Analysis/DOTGraphTraitsPass.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
index c9e8df5db1c20..ecf54cd8a680d 100644
--- a/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
+++ b/llvm/include/llvm/Analysis/DOTGraphTraitsPass.h
@@ -14,8 +14,6 @@
 #define LLVM_ANALYSIS_DOTGRAPHTRAITSPASS_H
 
 #include "llvm/Analysis/CFGPrinter.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/FileSystem.h"
 
 namespace llvm {
 

From 1e9462a201c3a09612e7fe8d56a0be0829e99dcf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 11:51:08 +0100
Subject: [PATCH 205/770] ArchiveWriter.h - remove unnecessary includes. NFC.

---
 llvm/include/llvm/Object/ArchiveWriter.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/include/llvm/Object/ArchiveWriter.h b/llvm/include/llvm/Object/ArchiveWriter.h
index 9e6daf2da36e9..274ffd90c05aa 100644
--- a/llvm/include/llvm/Object/ArchiveWriter.h
+++ b/llvm/include/llvm/Object/ArchiveWriter.h
@@ -13,10 +13,7 @@
 #ifndef LLVM_OBJECT_ARCHIVEWRITER_H
 #define LLVM_OBJECT_ARCHIVEWRITER_H
 
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Object/Archive.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 
 namespace llvm {
 

From d804b334ed0f1c88b90ab028541582e35ba3c172 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Tue, 26 May 2020 13:58:20 +0300
Subject: [PATCH 206/770] [llvm-readelf] - Split
 GNUStyle<ELFT>::printHashHistogram. NFC.

As was mentioned in review comments for D80204,
`printHashHistogram` has 2 lambdas that are probably too large
and deserve splitting into member functions.

This patch does it.

Differential revision: https://reviews.llvm.org/D80546
---
 llvm/tools/llvm-readobj/ELFDumper.cpp    | 220 ++++++++++++-----------
 llvm/tools/llvm-readobj/ObjDumper.h      |   2 +-
 llvm/tools/llvm-readobj/llvm-readobj.cpp |   2 +-
 3 files changed, 114 insertions(+), 110 deletions(-)

diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index c9d45061320dd..83132869cc2c2 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -230,7 +230,7 @@ template <typename ELFT> class ELFDumper : public ObjDumper {
 
   void printStackMap() const override;
 
-  void printHashHistogram() override;
+  void printHashHistograms() override;
 
   void printCGProfile() override;
   void printAddrsig() override;
@@ -742,7 +742,7 @@ template <typename ELFT> class DumpStyle {
                                              const Elf_Shdr *Sec) = 0;
   virtual void printVersionDependencySection(const ELFFile<ELFT> *Obj,
                                              const Elf_Shdr *Sec) = 0;
-  virtual void printHashHistogram(const ELFFile<ELFT> *Obj) = 0;
+  virtual void printHashHistograms(const ELFFile<ELFT> *Obj) = 0;
   virtual void printCGProfile(const ELFFile<ELFT> *Obj) = 0;
   virtual void printAddrsig(const ELFFile<ELFT> *Obj) = 0;
   virtual void printNotes(const ELFFile<ELFT> *Obj) = 0;
@@ -811,7 +811,7 @@ template <typename ELFT> class GNUStyle : public DumpStyle<ELFT> {
                                      const Elf_Shdr *Sec) override;
   void printVersionDependencySection(const ELFFile<ELFT> *Obj,
                                      const Elf_Shdr *Sec) override;
-  void printHashHistogram(const ELFFile<ELFT> *Obj) override;
+  void printHashHistograms(const ELFFile<ELFT> *Obj) override;
   void printCGProfile(const ELFFile<ELFT> *Obj) override;
   void printAddrsig(const ELFFile<ELFT> *Obj) override;
   void printNotes(const ELFFile<ELFT> *Obj) override;
@@ -823,6 +823,9 @@ template <typename ELFT> class GNUStyle : public DumpStyle<ELFT> {
   void printMipsABIFlags(const ELFObjectFile<ELFT> *Obj) override;
 
 private:
+  void printHashHistogram(const Elf_Hash &HashTable);
+  void printGnuHashHistogram(const Elf_GnuHash &GnuHashTable);
+
   struct Field {
     std::string Str;
     unsigned Column;
@@ -932,7 +935,7 @@ template <typename ELFT> class LLVMStyle : public DumpStyle<ELFT> {
                                      const Elf_Shdr *Sec) override;
   void printVersionDependencySection(const ELFFile<ELFT> *Obj,
                                      const Elf_Shdr *Sec) override;
-  void printHashHistogram(const ELFFile<ELFT> *Obj) override;
+  void printHashHistograms(const ELFFile<ELFT> *Obj) override;
   void printCGProfile(const ELFFile<ELFT> *Obj) override;
   void printAddrsig(const ELFFile<ELFT> *Obj) override;
   void printNotes(const ELFFile<ELFT> *Obj) override;
@@ -2287,8 +2290,8 @@ template <class ELFT> void ELFDumper<ELFT>::printHashSymbols() {
   ELFDumperStyle->printHashSymbols(ObjF->getELFFile());
 }
 
-template <class ELFT> void ELFDumper<ELFT>::printHashHistogram() {
-  ELFDumperStyle->printHashHistogram(ObjF->getELFFile());
+template <class ELFT> void ELFDumper<ELFT>::printHashHistograms() {
+  ELFDumperStyle->printHashHistograms(ObjF->getELFFile());
 }
 
 template <class ELFT> void ELFDumper<ELFT>::printCGProfile() {
@@ -4556,124 +4559,125 @@ void GNUStyle<ELFT>::printVersionDependencySection(const ELFFile<ELFT> *Obj,
   OS << '\n';
 }
 
-// Hash histogram shows  statistics of how efficient the hash was for the
-// dynamic symbol table. The table shows number of hash buckets for different
-// lengths of chains as absolute number and percentage of the total buckets.
-// Additionally cumulative coverage of symbols for each set of buckets.
 template <class ELFT>
-void GNUStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
-  auto PrintHashHist = [&](const Elf_Hash &HashTable) {
-    size_t NBucket = HashTable.nbucket;
-    size_t NChain = HashTable.nchain;
-    ArrayRef<Elf_Word> Buckets = HashTable.buckets();
-    ArrayRef<Elf_Word> Chains = HashTable.chains();
-    size_t TotalSyms = 0;
-    // If hash table is correct, we have at least chains with 0 length
-    size_t MaxChain = 1;
-    size_t CumulativeNonZero = 0;
-
-    if (NChain == 0 || NBucket == 0)
-      return;
+void GNUStyle<ELFT>::printHashHistogram(const Elf_Hash &HashTable) {
+  size_t NBucket = HashTable.nbucket;
+  size_t NChain = HashTable.nchain;
+  ArrayRef<Elf_Word> Buckets = HashTable.buckets();
+  ArrayRef<Elf_Word> Chains = HashTable.chains();
+  size_t TotalSyms = 0;
+  // If hash table is correct, we have at least chains with 0 length
+  size_t MaxChain = 1;
+  size_t CumulativeNonZero = 0;
+
+  if (NChain == 0 || NBucket == 0)
+    return;
 
-    std::vector<size_t> ChainLen(NBucket, 0);
-    // Go over all buckets and and note chain lengths of each bucket (total
-    // unique chain lengths).
-    for (size_t B = 0; B < NBucket; B++) {
-      std::vector<bool> Visited(NChain);
-      for (size_t C = Buckets[B]; C < NChain; C = Chains[C]) {
-        if (C == ELF::STN_UNDEF)
-          break;
-        if (Visited[C]) {
-          reportWarning(
-              createError(".hash section is invalid: bucket " + Twine(C) +
-                          ": a cycle was detected in the linked chain"),
-              this->FileName);
-          break;
-        }
-        Visited[C] = true;
-        if (MaxChain <= ++ChainLen[B])
-          MaxChain++;
+  std::vector<size_t> ChainLen(NBucket, 0);
+  // Go over all buckets and and note chain lengths of each bucket (total
+  // unique chain lengths).
+  for (size_t B = 0; B < NBucket; B++) {
+    std::vector<bool> Visited(NChain);
+    for (size_t C = Buckets[B]; C < NChain; C = Chains[C]) {
+      if (C == ELF::STN_UNDEF)
+        break;
+      if (Visited[C]) {
+        reportWarning(createError(".hash section is invalid: bucket " +
+                                  Twine(C) +
+                                  ": a cycle was detected in the linked chain"),
+                      this->FileName);
+        break;
       }
-      TotalSyms += ChainLen[B];
-    }
-
-    if (!TotalSyms)
-      return;
-
-    std::vector<size_t> Count(MaxChain, 0);
-    // Count how long is the chain for each bucket
-    for (size_t B = 0; B < NBucket; B++)
-      ++Count[ChainLen[B]];
-    // Print Number of buckets with each chain lengths and their cumulative
-    // coverage of the symbols
-    OS << "Histogram for bucket list length (total of " << NBucket
-       << " buckets)\n"
-       << " Length  Number     % of total  Coverage\n";
-    for (size_t I = 0; I < MaxChain; I++) {
-      CumulativeNonZero += Count[I] * I;
-      OS << format("%7lu  %-10lu (%5.1f%%)     %5.1f%%\n", I, Count[I],
-                   (Count[I] * 100.0) / NBucket,
-                   (CumulativeNonZero * 100.0) / TotalSyms);
+      Visited[C] = true;
+      if (MaxChain <= ++ChainLen[B])
+        MaxChain++;
     }
-  };
+    TotalSyms += ChainLen[B];
+  }
 
-  auto PrintGnuHashHist = [&](const Elf_GnuHash &GnuHashTable) {
-    size_t NBucket = GnuHashTable.nbuckets;
-    ArrayRef<Elf_Word> Buckets = GnuHashTable.buckets();
-    unsigned NumSyms = this->dumper()->dynamic_symbols().size();
-    if (!NumSyms)
-      return;
-    ArrayRef<Elf_Word> Chains = GnuHashTable.values(NumSyms);
-    size_t Symndx = GnuHashTable.symndx;
-    size_t TotalSyms = 0;
-    size_t MaxChain = 1;
-    size_t CumulativeNonZero = 0;
+  if (!TotalSyms)
+    return;
 
-    if (Chains.empty() || NBucket == 0)
-      return;
+  std::vector<size_t> Count(MaxChain, 0);
+  // Count how long is the chain for each bucket
+  for (size_t B = 0; B < NBucket; B++)
+    ++Count[ChainLen[B]];
+  // Print Number of buckets with each chain lengths and their cumulative
+  // coverage of the symbols
+  OS << "Histogram for bucket list length (total of " << NBucket
+     << " buckets)\n"
+     << " Length  Number     % of total  Coverage\n";
+  for (size_t I = 0; I < MaxChain; I++) {
+    CumulativeNonZero += Count[I] * I;
+    OS << format("%7lu  %-10lu (%5.1f%%)     %5.1f%%\n", I, Count[I],
+                 (Count[I] * 100.0) / NBucket,
+                 (CumulativeNonZero * 100.0) / TotalSyms);
+  }
+}
 
-    std::vector<size_t> ChainLen(NBucket, 0);
+template <class ELFT>
+void GNUStyle<ELFT>::printGnuHashHistogram(const Elf_GnuHash &GnuHashTable) {
+  size_t NBucket = GnuHashTable.nbuckets;
+  ArrayRef<Elf_Word> Buckets = GnuHashTable.buckets();
+  unsigned NumSyms = this->dumper()->dynamic_symbols().size();
+  if (!NumSyms)
+    return;
+  ArrayRef<Elf_Word> Chains = GnuHashTable.values(NumSyms);
+  size_t Symndx = GnuHashTable.symndx;
+  size_t TotalSyms = 0;
+  size_t MaxChain = 1;
+  size_t CumulativeNonZero = 0;
 
-    for (size_t B = 0; B < NBucket; B++) {
-      if (!Buckets[B])
-        continue;
-      size_t Len = 1;
-      for (size_t C = Buckets[B] - Symndx;
-           C < Chains.size() && (Chains[C] & 1) == 0; C++)
-        if (MaxChain < ++Len)
-          MaxChain++;
-      ChainLen[B] = Len;
-      TotalSyms += Len;
-    }
-    MaxChain++;
+  if (Chains.empty() || NBucket == 0)
+    return;
 
-    if (!TotalSyms)
-      return;
+  std::vector<size_t> ChainLen(NBucket, 0);
+  for (size_t B = 0; B < NBucket; B++) {
+    if (!Buckets[B])
+      continue;
+    size_t Len = 1;
+    for (size_t C = Buckets[B] - Symndx;
+         C < Chains.size() && (Chains[C] & 1) == 0; C++)
+      if (MaxChain < ++Len)
+        MaxChain++;
+    ChainLen[B] = Len;
+    TotalSyms += Len;
+  }
+  MaxChain++;
 
-    std::vector<size_t> Count(MaxChain, 0);
-    for (size_t B = 0; B < NBucket; B++)
-      ++Count[ChainLen[B]];
-    // Print Number of buckets with each chain lengths and their cumulative
-    // coverage of the symbols
-    OS << "Histogram for `.gnu.hash' bucket list length (total of " << NBucket
-       << " buckets)\n"
-       << " Length  Number     % of total  Coverage\n";
-    for (size_t I = 0; I < MaxChain; I++) {
-      CumulativeNonZero += Count[I] * I;
-      OS << format("%7lu  %-10lu (%5.1f%%)     %5.1f%%\n", I, Count[I],
-                   (Count[I] * 100.0) / NBucket,
-                   (CumulativeNonZero * 100.0) / TotalSyms);
-    }
-  };
+  if (!TotalSyms)
+    return;
 
+  std::vector<size_t> Count(MaxChain, 0);
+  for (size_t B = 0; B < NBucket; B++)
+    ++Count[ChainLen[B]];
+  // Print Number of buckets with each chain lengths and their cumulative
+  // coverage of the symbols
+  OS << "Histogram for `.gnu.hash' bucket list length (total of " << NBucket
+     << " buckets)\n"
+     << " Length  Number     % of total  Coverage\n";
+  for (size_t I = 0; I < MaxChain; I++) {
+    CumulativeNonZero += Count[I] * I;
+    OS << format("%7lu  %-10lu (%5.1f%%)     %5.1f%%\n", I, Count[I],
+                 (Count[I] * 100.0) / NBucket,
+                 (CumulativeNonZero * 100.0) / TotalSyms);
+  }
+}
+
+// Hash histogram shows statistics of how efficient the hash was for the
+// dynamic symbol table. The table shows the number of hash buckets for
+// different lengths of chains as an absolute number and percentage of the total
+// buckets, and the cumulative coverage of symbols for each set of buckets.
+template <class ELFT>
+void GNUStyle<ELFT>::printHashHistograms(const ELFFile<ELFT> *Obj) {
   // Print histogram for the .hash section.
   if (const Elf_Hash *HashTable = this->dumper()->getHashTable())
     if (checkHashTable(Obj, HashTable, this->FileName))
-      PrintHashHist(*HashTable);
+      printHashHistogram(*HashTable);
 
   // Print histogram for the .gnu.hash section.
   if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable())
-    PrintGnuHashHist(*GnuHashTable);
+    printGnuHashHistogram(*GnuHashTable);
 }
 
 template <class ELFT>
@@ -6417,7 +6421,7 @@ void LLVMStyle<ELFT>::printVersionDependencySection(const ELFFile<ELFT> *Obj,
 }
 
 template <class ELFT>
-void LLVMStyle<ELFT>::printHashHistogram(const ELFFile<ELFT> *Obj) {
+void LLVMStyle<ELFT>::printHashHistograms(const ELFFile<ELFT> *Obj) {
   W.startLine() << "Hash Histogram not implemented!\n";
 }
 
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index a0f69edc90719..57477606d6e8e 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -64,7 +64,7 @@ class ObjDumper {
   virtual void printLoadName() {}
   virtual void printVersionInfo() {}
   virtual void printGroupSections() {}
-  virtual void printHashHistogram() {}
+  virtual void printHashHistograms() {}
   virtual void printCGProfile() {}
   virtual void printAddrsig() {}
   virtual void printNotes() {}
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index 85409389eccd9..ab2b320c1f82c 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -502,7 +502,7 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer,
     if (opts::SectionGroups)
       Dumper->printGroupSections();
     if (opts::HashHistogram)
-      Dumper->printHashHistogram();
+      Dumper->printHashHistograms();
     if (opts::CGProfile)
       Dumper->printCGProfile();
     if (opts::Addrsig)

From 706b22e3e446621b20befe1094c26e4eda133bc9 Mon Sep 17 00:00:00 2001
From: Daniil Suchkov <suc-daniil@yandex.ru>
Date: Mon, 25 May 2020 15:06:12 +0700
Subject: [PATCH 207/770] [SimpleLoopUnswitch] Drop uses of instructions before
 block deletion

Currently, if instructions defined in a block are used in unreachable
blocks and SimpleLoopUnswitch attempts to delete the block, it triggers
the assertion "Uses remain when a value is destroyed!".
This patch fixes that by replacing all uses of instructions from BB with
undef values before BB is deleted.

Reviewed By: asbirlea

Differential Revision: https://reviews.llvm.org/D80551
---
 llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp            | 5 +++++
 .../dead-blocks-uses-in-unreachablel-blocks.ll               | 1 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 914fadc40d74f..6c6d6ca9cf656 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1574,6 +1574,11 @@ static void deleteDeadBlocksFromLoop(Loop &L,
     // Check that the dominator tree has already been updated.
     assert(!DT.getNode(BB) && "Should already have cleared domtree!");
     LI.changeLoopFor(BB, nullptr);
+    // Drop all uses of the instructions to make sure we won't have dangling
+    // uses in other blocks.
+    for (auto &I : *BB)
+      if (!I.use_empty())
+        I.replaceAllUsesWith(UndefValue::get(I.getType()));
     BB->dropAllReferences();
   }
 
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll b/llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll
index 4dec9a3919aa9..2756e102a67ac 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/dead-blocks-uses-in-unreachablel-blocks.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; REQUIRES: asserts
 ; RUN: opt < %s -passes='unswitch<nontrivial>' -disable-output
 ; RUN: opt < %s -simple-loop-unswitch -enable-nontrivial-unswitch -disable-output

From 259abfc7cbc11cd98c05b1eb8e4b3fb6a9664bc0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 27 May 2020 12:19:22 +0100
Subject: [PATCH 208/770] [LAA] We only need pointer checks if there are
 non-zero checks (NFC).

If it turns out that we can do runtime checks, but there are no
runtime checks to generate, set RtCheck.Need to false.

This can happen if we can prove statically that the pointers passed in
to canCheckPtrAtRT do not alias. This should not change any results, but
allows us to skip some work and assert that runtime checks are
generated, if LAA indicates that runtime checks are required.

Reviewers: anemet, Ayal

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D79969
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 26 ++++++++++++-------
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 +--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 014cabd88fed6..6e1217f0476dc 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -701,12 +701,14 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                      ScalarEvolution *SE, Loop *TheLoop,
                                      const ValueToValueMap &StridesMap,
                                      bool ShouldCheckWrap) {
+  if (!IsRTCheckAnalysisNeeded)
+    return true;
+
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   bool CanDoRT = true;
 
-  bool NeedRTCheck = false;
-  if (!IsRTCheckAnalysisNeeded) return true;
+  RtCheck.Need = false;
 
   bool IsDepCheckNeeded = isDependencyCheckNeeded();
 
@@ -747,10 +749,10 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     // check them.  But there is no need to checks if there is only one
     // dependence set for this alias set.
     //
-    // Note that this function computes CanDoRT and NeedRTCheck independently.
-    // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
-    // for which we couldn't find the bounds but we don't actually need to emit
-    // any checks so it does not matter.
+    // Note that this function computes CanDoRT and RtCheck.Need independently.
+    // For example CanDoRT=false, RtCheck.Need=false means that we have a
+    // pointer for which we couldn't find the bounds but we don't actually need
+    // to emit any checks so it does not matter.
     bool NeedsAliasSetRTCheck = false;
     if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2))
       NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 ||
@@ -773,7 +775,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     }
 
     CanDoRT &= CanDoAliasSetRT;
-    NeedRTCheck |= NeedsAliasSetRTCheck;
+    RtCheck.Need |= NeedsAliasSetRTCheck;
     ++ASId;
   }
 
@@ -807,15 +809,19 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     }
   }
 
-  if (NeedRTCheck && CanDoRT)
+  if (RtCheck.Need && CanDoRT)
     RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
 
   LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
                     << " pointer comparisons.\n");
 
-  RtCheck.Need = NeedRTCheck;
+  // If we can do run-time checks, but there are no checks, no runtime checks
+  // are needed. This can happen when all pointers point to the same underlying
+  // object for example.
+  if (CanDoRT)
+    RtCheck.Need = RtCheck.getNumberOfChecks() != 0;
 
-  bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
+  bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
   if (!CanDoRTIfNeeded)
     RtCheck.reset();
   return CanDoRTIfNeeded;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index df1529a2f7b9e..8d52ddc5b3b5d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2795,8 +2795,8 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   std::tie(FirstCheckInst, MemRuntimeCheck) =
       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
-  if (!MemRuntimeCheck)
-    return;
+  assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
+                            "claimed checks are required");
 
   if (MemCheckBlock->getParent()->hasOptSize()) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&

From 2d0389821e0c6371823198d3a5b1f032138a40bb Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 27 May 2020 12:39:45 +0100
Subject: [PATCH 209/770] Revert "[LAA] We only need pointer checks if there
 are non-zero checks (NFC)."

This reverts commit 259abfc7cbc11cd98c05b1eb8e4b3fb6a9664bc0.

Reverting this, as I missed a case where we return without setting
RtCheck.Need.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 26 +++++++------------
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 +--
 2 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 6e1217f0476dc..014cabd88fed6 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -701,14 +701,12 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                      ScalarEvolution *SE, Loop *TheLoop,
                                      const ValueToValueMap &StridesMap,
                                      bool ShouldCheckWrap) {
-  if (!IsRTCheckAnalysisNeeded)
-    return true;
-
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   bool CanDoRT = true;
 
-  RtCheck.Need = false;
+  bool NeedRTCheck = false;
+  if (!IsRTCheckAnalysisNeeded) return true;
 
   bool IsDepCheckNeeded = isDependencyCheckNeeded();
 
@@ -749,10 +747,10 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     // check them.  But there is no need to checks if there is only one
     // dependence set for this alias set.
     //
-    // Note that this function computes CanDoRT and RtCheck.Need independently.
-    // For example CanDoRT=false, RtCheck.Need=false means that we have a
-    // pointer for which we couldn't find the bounds but we don't actually need
-    // to emit any checks so it does not matter.
+    // Note that this function computes CanDoRT and NeedRTCheck independently.
+    // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
+    // for which we couldn't find the bounds but we don't actually need to emit
+    // any checks so it does not matter.
     bool NeedsAliasSetRTCheck = false;
     if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2))
       NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 ||
@@ -775,7 +773,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     }
 
     CanDoRT &= CanDoAliasSetRT;
-    RtCheck.Need |= NeedsAliasSetRTCheck;
+    NeedRTCheck |= NeedsAliasSetRTCheck;
     ++ASId;
   }
 
@@ -809,19 +807,15 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     }
   }
 
-  if (RtCheck.Need && CanDoRT)
+  if (NeedRTCheck && CanDoRT)
     RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
 
   LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
                     << " pointer comparisons.\n");
 
-  // If we can do run-time checks, but there are no checks, no runtime checks
-  // are needed. This can happen when all pointers point to the same underlying
-  // object for example.
-  if (CanDoRT)
-    RtCheck.Need = RtCheck.getNumberOfChecks() != 0;
+  RtCheck.Need = NeedRTCheck;
 
-  bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
+  bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
   if (!CanDoRTIfNeeded)
     RtCheck.reset();
   return CanDoRTIfNeeded;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8d52ddc5b3b5d..df1529a2f7b9e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2795,8 +2795,8 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   std::tie(FirstCheckInst, MemRuntimeCheck) =
       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
-  assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
-                            "claimed checks are required");
+  if (!MemRuntimeCheck)
+    return;
 
   if (MemCheckBlock->getParent()->hasOptSize()) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&

From 9b507b2127f116f29437e04a187cdca70ae9aa33 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 27 May 2020 12:41:32 +0100
Subject: [PATCH 210/770] [LAA] We only need pointer checks if there are
 non-zero checks (NFC).

If it turns out that we can do runtime checks, but there are no
runtime checks to generate, set RtCheck.Need to false.

This can happen if we can prove statically that the pointers passed in
to canCheckPtrAtRT do not alias. This should not change any results, but
allows us to skip some work and assert that runtime checks are
generated, if LAA indicates that runtime checks are required.

Reviewers: anemet, Ayal

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D79969

Note: This is a recommit of 259abfc7cbc11cd98c05b1eb8e4b3fb6a9664bc0,
with some suggested renaming.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 21 +++++++++++--------
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 ++--
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 014cabd88fed6..4cb2db58100bb 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -705,7 +705,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
   // to place a runtime bound check.
   bool CanDoRT = true;
 
-  bool NeedRTCheck = false;
+  bool MayNeedRTCheck = false;
   if (!IsRTCheckAnalysisNeeded) return true;
 
   bool IsDepCheckNeeded = isDependencyCheckNeeded();
@@ -747,10 +747,10 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     // check them.  But there is no need to checks if there is only one
     // dependence set for this alias set.
     //
-    // Note that this function computes CanDoRT and NeedRTCheck independently.
-    // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
-    // for which we couldn't find the bounds but we don't actually need to emit
-    // any checks so it does not matter.
+    // Note that this function computes CanDoRT and MayNeedRTCheck
+    // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
+    // we have a pointer for which we couldn't find the bounds but we don't
+    // actually need to emit any checks so it does not matter.
     bool NeedsAliasSetRTCheck = false;
     if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2))
       NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 ||
@@ -773,7 +773,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     }
 
     CanDoRT &= CanDoAliasSetRT;
-    NeedRTCheck |= NeedsAliasSetRTCheck;
+    MayNeedRTCheck |= NeedsAliasSetRTCheck;
     ++ASId;
   }
 
@@ -807,15 +807,18 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
     }
   }
 
-  if (NeedRTCheck && CanDoRT)
+  if (MayNeedRTCheck && CanDoRT)
     RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
 
   LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
                     << " pointer comparisons.\n");
 
-  RtCheck.Need = NeedRTCheck;
+  // If we can do run-time checks, but there are no checks, no runtime checks
+  // are needed. This can happen when all pointers point to the same underlying
+  // object for example.
+  RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
 
-  bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
+  bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
   if (!CanDoRTIfNeeded)
     RtCheck.reset();
   return CanDoRTIfNeeded;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index df1529a2f7b9e..8d52ddc5b3b5d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2795,8 +2795,8 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   std::tie(FirstCheckInst, MemRuntimeCheck) =
       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
-  if (!MemRuntimeCheck)
-    return;
+  assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
+                            "claimed checks are required");
 
   if (MemCheckBlock->getParent()->hasOptSize()) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&

From 6e1eff785892edb75948f3c0a18e01ef8fbe2619 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Tue, 26 May 2020 11:58:23 +0000
Subject: [PATCH 211/770] [NFC] Updating tests

Summary:
Updating IR now that alignment is explicitly set.
This is a prerequisite to D80276.

Reviewers: efriedma

Subscribers: llvm-commits, craig.topper

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80549
---
 .../X86/interleaved-accesses-64bits-avx.ll    | 13 +++++++++---
 .../InterleavedAccess/X86/interleavedLoad.ll  | 20 +++++++++----------
 .../InterleavedAccess/X86/interleavedStore.ll |  8 ++++----
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
index 1a48be2e7cd82..c475f176ae6a6 100644
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
@@ -5,7 +5,7 @@
 
 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 ; CHECK-LABEL: @load_factorf64_4(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* %ptr to <4 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* [[PTR:%.*]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 1
@@ -40,7 +40,7 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 
 define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; CHECK-LABEL: @load_factori64_4(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i64>* %ptr to <4 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i64>* [[PTR:%.*]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 1
@@ -75,7 +75,7 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 
 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
 ; CHECK-LABEL: @load_factorf64_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* %ptr to <4 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* [[PTR:%.*]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 1
@@ -228,6 +228,13 @@ define void @store_factorf64_4_arbitraryMask(<16 x double>* %ptr, <16 x double>
 @a = local_unnamed_addr global <4 x double> zeroinitializer, align 32
 ; Function Attrs: norecurse nounwind readonly uwtable
 define <4 x double> @test_unhandled(<4 x double> %b) {
+; CHECK-LABEL: @test_unhandled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* @a, align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 0, i32 0>
+; CHECK-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
 entry:
   %0 = load <4 x double>, <4 x double>* @a, align 32
   %1 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
index 39f4f757d02af..9705b438fa390 100644
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
@@ -6,17 +6,17 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 ; AVX2-LABEL: @interleaved_load_vf32_i8_stride3(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>*
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 128
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 128
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 128
 ; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
-; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 128
 ; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]]
+; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 128
 ; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
-; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]]
+; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 128
 ; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX2-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -48,11 +48,11 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
 ; AVX2-LABEL: @interleaved_load_vf16_i8_stride3(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>*
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 64
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 64
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 64
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
@@ -79,7 +79,7 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
 
 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
 ; AVX2-LABEL: @interleaved_load_vf8_i8_stride3(
-; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]]
+; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]], align 32
 ; AVX2-NEXT:    [[V1:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 ; AVX2-NEXT:    [[V2:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
 ; AVX2-NEXT:    [[V3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
index 11a5283a7b553..ec8f4d6e93555 100644
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
@@ -25,7 +25,7 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP14]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> [[TMP16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> [[TMP18]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
-; CHECK-NEXT:    store <128 x i8> [[TMP19]], <128 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <128 x i8> [[TMP19]], <128 x i8>* [[P:%.*]], align 128
 ; CHECK-NEXT:    ret void
 ;
   %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -54,7 +54,7 @@ define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP12]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP14]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    store <64 x i8> [[TMP15]], <64 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <64 x i8> [[TMP15]], <64 x i8>* [[P:%.*]], align 64
 ; CHECK-NEXT:    ret void
 ;
 %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -77,7 +77,7 @@ define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -232,7 +232,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <64 x i8> [[TMP23]], <64 x i8> [[TMP24]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> [[TMP26]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
 ; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <128 x i8> [[TMP27]], <128 x i8> [[TMP28]], <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
-; CHECK-NEXT:    store <256 x i8> [[TMP29]], <256 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <256 x i8> [[TMP29]], <256 x i8>* [[P:%.*]], align 256
 ; CHECK-NEXT:    ret void
 ;
 %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>

From 18bb1f1067028fbeaf92774e640bd865c53e1ce1 Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Wed, 27 May 2020 14:04:39 +0200
Subject: [PATCH 212/770] [lldb] Fix a potential bug that may cause assert
 failure in CommandObject::CheckRequirements

Summary: `CommandObject::CheckRequirements` requires `m_exe_ctx` to be cleaned
up between commands. `HandleOptionCompletion` returning without cleaning up
`m_exe_ctx` could cause an assert failure in a later `CheckRequirements`.

Reviewers: teemperor, JDevlieghere

Reviewed By: teemperor

Tags: #lldb

Differential Revision: https://reviews.llvm.org/D80447
---
 lldb/source/Interpreter/CommandObject.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp
index f1f17dbd66ef4..ddf1f5511ecd7 100644
--- a/lldb/source/Interpreter/CommandObject.cpp
+++ b/lldb/source/Interpreter/CommandObject.cpp
@@ -18,6 +18,7 @@
 #include "lldb/Core/Address.h"
 #include "lldb/Interpreter/Options.h"
 #include "lldb/Utility/ArchSpec.h"
+#include "llvm/ADT/ScopeExit.h"
 
 // These are for the Sourcename completers.
 // FIXME: Make a separate file for the completers.
@@ -269,6 +270,7 @@ void CommandObject::Cleanup() {
 void CommandObject::HandleCompletion(CompletionRequest &request) {
 
   m_exe_ctx = m_interpreter.GetExecutionContext();
+  auto reset_ctx = llvm::make_scope_exit([this]() { Cleanup(); });
 
   // Default implementation of WantsCompletion() is !WantsRawCommandString().
   // Subclasses who want raw command string but desire, for example, argument
@@ -296,8 +298,6 @@ void CommandObject::HandleCompletion(CompletionRequest &request) {
     // If we got here, the last word is not an option or an option argument.
     HandleArgumentCompletion(request, opt_element_vector);
   }
-
-  m_exe_ctx.Clear();
 }
 
 bool CommandObject::HelpTextContainsWord(llvm::StringRef search_word,

From c7593b0f0d28f6b7f9fa4557ce73197a49b37799 Mon Sep 17 00:00:00 2001
From: Victor Campos <victor.campos@arm.com>
Date: Tue, 26 May 2020 13:28:33 +0100
Subject: [PATCH 213/770] [ARM] Fix rewrite of frame index in Thumb2's address
 mode i8s4

Summary:
In Thumb2's frame index rewriting process, the address mode i8s4, which
is used by LDRD and STRD instructions, is handled by taking the
immediate offset operand and multiplying it by 4.

This behaviour is wrong, however. In this specific address mode, the
MachineInstr's immediate operand is already in the expected form. As a
consequence, multiplying it once more by 4 yields a flawed offset value,
four times greater than it should be.

Differential Revision: https://reviews.llvm.org/D80557
---
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp       |  2 +-
 .../Thumb2/frame-index-addrmode-t2i8s4.mir    | 44 +++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/frame-index-addrmode-t2i8s4.mir

diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 17b2c35847510..48c6b47f21545 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -634,7 +634,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
       assert((Offset & OffsetMask) == 0 && "Can't encode this offset!");
       (void)OffsetMask; // squash unused-variable warning at -NDEBUG
     } else if (AddrMode == ARMII::AddrModeT2_i8s4) {
-      Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+      Offset += MI.getOperand(FrameRegIdx + 1).getImm();
       NumBits = 8 + 2;
       // MCInst operand expects already scaled value.
       Scale = 1;
diff --git a/llvm/test/CodeGen/Thumb2/frame-index-addrmode-t2i8s4.mir b/llvm/test/CodeGen/Thumb2/frame-index-addrmode-t2i8s4.mir
new file mode 100644
index 0000000000000..0d246ffba7228
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/frame-index-addrmode-t2i8s4.mir
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - %s -mtriple=thumbv8.1m.main-none-none-eabi -run-pass=prologepilog | FileCheck %s
+--- |
+  ; Function Attrs: noinline nounwind optnone
+  define dso_local i64 @f() #0 {
+  entry:
+    %a = alloca [10 x i64], align 8
+    %arrayidx = getelementptr inbounds [10 x i64], [10 x i64]* %a, i32 0, i32 1
+    store volatile i64 1, i64* %arrayidx, align 8
+    %arrayidx1 = getelementptr inbounds [10 x i64], [10 x i64]* %a, i32 0, i32 1
+    %0 = load volatile i64, i64* %arrayidx1, align 8
+    ret i64 %0
+  }
+
+...
+---
+name:            f
+alignment:       2
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    8
+  maxCallFrameSize: 0
+  localFrameSize:  80
+stack:
+  - { id: 0, name: a, size: 80, alignment: 8, local-offset: -80 }
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: f
+    ; CHECK: $sp = frame-setup tSUBspi $sp, 20, 14 /* CC::al */, $noreg
+    ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 80
+    ; CHECK: renamable $r0 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK: renamable $r1 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK: t2STRDi8 killed $r1, killed $r0, $sp, 8, 14 /* CC::al */, $noreg :: (volatile store 8 into %ir.arrayidx)
+    ; CHECK: $r0, $r1 = t2LDRDi8 $sp, 8, 14 /* CC::al */, $noreg :: (volatile dereferenceable load 8 from %ir.arrayidx1)
+    ; CHECK: $sp = frame-destroy tADDspi $sp, 20, 14 /* CC::al */, $noreg
+    ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0, implicit $r1
+    renamable $r0 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    renamable $r1 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+    t2STRDi8 killed $r1, killed $r0, %stack.0.a, 8, 14 /* CC::al */, $noreg :: (volatile store 8 into %ir.arrayidx)
+    $r0, $r1 = t2LDRDi8 %stack.0.a, 8, 14 /* CC::al */, $noreg :: (volatile dereferenceable load 8 from %ir.arrayidx1)
+    tBX_RET 14 /* CC::al */, $noreg, implicit $r0, implicit $r1
+
+...

From 763bc2305797c980a4f4fa2f6314ed78a010678d Mon Sep 17 00:00:00 2001
From: Gongyu Deng <gy_deng@icloud.com>
Date: Wed, 27 May 2020 14:06:28 +0200
Subject: [PATCH 214/770] [lldb] Tab completion for process plugin name

Summary:

1. Added tab completion to `process launch -p`, `process attach -P`, `process
connect -p`;

2. Bound the plugin name common completion as the default completion for
`eArgTypePlugin` arguments.

Reviewers: teemperor, JDevlieghere

Tags: #lldb

Differential Revision: https://reviews.llvm.org/D79929
---
 lldb/include/lldb/Core/PluginManager.h        |  3 +
 .../lldb/Interpreter/CommandCompletions.h     |  7 ++-
 lldb/source/Commands/CommandCompletions.cpp   |  8 +++
 lldb/source/Commands/CommandObjectProcess.cpp | 60 ++++++++++---------
 lldb/source/Core/PluginManager.cpp            |  8 +++
 lldb/source/Interpreter/CommandObject.cpp     |  2 +-
 .../completion/TestCompletion.py              |  7 +++
 7 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index 4cae597d37320..5e0c9395dae03 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -243,6 +243,9 @@ class PluginManager {
 
   static const char *GetProcessPluginDescriptionAtIndex(uint32_t idx);
 
+  static void AutoCompleteProcessName(llvm::StringRef partial_name,
+                                      CompletionRequest &request);
+
   // ScriptInterpreter
   static bool RegisterPlugin(ConstString name, const char *description,
                              lldb::ScriptLanguage script_lang,
diff --git a/lldb/include/lldb/Interpreter/CommandCompletions.h b/lldb/include/lldb/Interpreter/CommandCompletions.h
index dc2bf841620d1..39d1c98eaa39e 100644
--- a/lldb/include/lldb/Interpreter/CommandCompletions.h
+++ b/lldb/include/lldb/Interpreter/CommandCompletions.h
@@ -36,10 +36,11 @@ class CommandCompletions {
     eVariablePathCompletion = (1u << 8),
     eRegisterCompletion = (1u << 9),
     eBreakpointCompletion = (1u << 10),
+    eProcessPluginCompletion = (1u << 11),
     // This item serves two purposes.  It is the last element in the enum, so
     // you can add custom enums starting from here in your Option class. Also
     // if you & in this bit the base code will not process the option.
-    eCustomCompletion = (1u << 11)
+    eCustomCompletion = (1u << 12)
   };
 
   static bool InvokeCommonCompletionCallbacks(
@@ -89,6 +90,10 @@ class CommandCompletions {
 
   static void Breakpoints(CommandInterpreter &interpreter,
                           CompletionRequest &request, SearchFilter *searcher);
+
+  static void ProcessPluginNames(CommandInterpreter &interpreter,
+                                 CompletionRequest &request,
+                                 SearchFilter *searcher);
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Commands/CommandCompletions.cpp b/lldb/source/Commands/CommandCompletions.cpp
index d4e4f6a5ebb5a..11198f68490de 100644
--- a/lldb/source/Commands/CommandCompletions.cpp
+++ b/lldb/source/Commands/CommandCompletions.cpp
@@ -58,6 +58,7 @@ bool CommandCompletions::InvokeCommonCompletionCallbacks(
       {eVariablePathCompletion, CommandCompletions::VariablePath},
       {eRegisterCompletion, CommandCompletions::Registers},
       {eBreakpointCompletion, CommandCompletions::Breakpoints},
+      {eProcessPluginCompletion, CommandCompletions::ProcessPluginNames},
       {eNoCompletion, nullptr} // This one has to be last in the list.
   };
 
@@ -582,3 +583,10 @@ void CommandCompletions::Breakpoints(CommandInterpreter &interpreter,
     request.TryCompleteCurrentArg(std::to_string(bp->GetID()), bp_info);
   }
 }
+
+void CommandCompletions::ProcessPluginNames(CommandInterpreter &interpreter,
+                                            CompletionRequest &request,
+                                            SearchFilter *searcher) {
+  PluginManager::AutoCompleteProcessName(request.GetCursorArgumentPrefix(),
+                                         request);
+}
\ No newline at end of file
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index 043765a0c09cf..4f591b53aaa6e 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -325,34 +325,38 @@ class CommandObjectProcessAttach : public CommandObjectProcessLaunchOrAttach {
       int opt_arg_pos = opt_element_vector[opt_element_index].opt_arg_pos;
       int opt_defs_index = opt_element_vector[opt_element_index].opt_defs_index;
 
-      // We are only completing the name option for now...
-
-      // Are we in the name?
-      if (GetDefinitions()[opt_defs_index].short_option != 'n')
-        return;
-
-      // Look to see if there is a -P argument provided, and if so use that
-      // plugin, otherwise use the default plugin.
-
-      const char *partial_name = nullptr;
-      partial_name = request.GetParsedLine().GetArgumentAtIndex(opt_arg_pos);
-
-      PlatformSP platform_sp(interpreter.GetPlatform(true));
-      if (!platform_sp)
-        return;
-      ProcessInstanceInfoList process_infos;
-      ProcessInstanceInfoMatch match_info;
-      if (partial_name) {
-        match_info.GetProcessInfo().GetExecutableFile().SetFile(
-            partial_name, FileSpec::Style::native);
-        match_info.SetNameMatchType(NameMatch::StartsWith);
-      }
-      platform_sp->FindProcesses(match_info, process_infos);
-      const size_t num_matches = process_infos.size();
-      if (num_matches == 0)
-        return;
-      for (size_t i = 0; i < num_matches; ++i) {
-        request.AddCompletion(process_infos[i].GetNameAsStringRef());
+      switch (GetDefinitions()[opt_defs_index].short_option) {
+      case 'n': {
+        // Look to see if there is a -P argument provided, and if so use that
+        // plugin, otherwise use the default plugin.
+
+        const char *partial_name = nullptr;
+        partial_name = request.GetParsedLine().GetArgumentAtIndex(opt_arg_pos);
+
+        PlatformSP platform_sp(interpreter.GetPlatform(true));
+        if (!platform_sp)
+          return;
+        ProcessInstanceInfoList process_infos;
+        ProcessInstanceInfoMatch match_info;
+        if (partial_name) {
+          match_info.GetProcessInfo().GetExecutableFile().SetFile(
+              partial_name, FileSpec::Style::native);
+          match_info.SetNameMatchType(NameMatch::StartsWith);
+        }
+        platform_sp->FindProcesses(match_info, process_infos);
+        const size_t num_matches = process_infos.size();
+        if (num_matches == 0)
+          return;
+        for (size_t i = 0; i < num_matches; ++i) {
+          request.AddCompletion(process_infos[i].GetNameAsStringRef());
+        }
+      } break;
+
+      case 'P':
+        CommandCompletions::InvokeCommonCompletionCallbacks(
+            interpreter, CommandCompletions::eProcessPluginCompletion, request,
+            nullptr);
+        break;
       }
     }
 
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index 2a1f094534d92..3545ef66cc383 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -830,6 +830,14 @@ PluginManager::GetProcessCreateCallbackForPluginName(ConstString name) {
   return GetProcessInstances().GetCallbackForName(name);
 }
 
+void PluginManager::AutoCompleteProcessName(llvm::StringRef name,
+                                            CompletionRequest &request) {
+  for (const auto &instance : GetProcessInstances().GetInstances()) {
+    if (instance.name.GetStringRef().startswith(name))
+      request.AddCompletion(instance.name.GetCString(), instance.description);
+  }
+}
+
 #pragma mark ScriptInterpreter
 
 struct ScriptInterpreterInstance
diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp
index ddf1f5511ecd7..4cadaa373d14e 100644
--- a/lldb/source/Interpreter/CommandObject.cpp
+++ b/lldb/source/Interpreter/CommandObject.cpp
@@ -1080,7 +1080,7 @@ CommandObject::ArgumentTableEntry CommandObject::g_arguments_data[] = {
     { eArgTypePermissionsNumber, "perms-numeric", CommandCompletions::eNoCompletion, { nullptr, false }, "Permissions given as an octal number (e.g. 755)." },
     { eArgTypePermissionsString, "perms=string", CommandCompletions::eNoCompletion, { nullptr, false }, "Permissions given as a string value (e.g. rw-r-xr--)." },
     { eArgTypePid, "pid", CommandCompletions::eNoCompletion, { nullptr, false }, "The process ID number." },
-    { eArgTypePlugin, "plugin", CommandCompletions::eNoCompletion, { nullptr, false }, "Help text goes here." },
+    { eArgTypePlugin, "plugin", CommandCompletions::eProcessPluginCompletion, { nullptr, false }, "Help text goes here." },
     { eArgTypeProcessName, "process-name", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of the process." },
     { eArgTypePythonClass, "python-class", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of a Python class." },
     { eArgTypePythonFunction, "python-function", CommandCompletions::eNoCompletion, { nullptr, false }, "The name of a Python function." },
diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py
index a53ade542f461..7c674dc872ed2 100644
--- a/lldb/test/API/functionalities/completion/TestCompletion.py
+++ b/lldb/test/API/functionalities/completion/TestCompletion.py
@@ -85,6 +85,13 @@ def test_process_launch_arch(self):
                               ['mips',
                                'arm64'])
 
+    def test_process_plugin_completion(self):
+        subcommands = ['attach -P', 'connect -p', 'launch -p']
+
+        for subcommand in subcommands:
+            self.complete_from_to('process ' + subcommand + ' mac',
+                                  'process ' + subcommand + ' mach-o-core')
+
     def test_process_signal(self):
         # The tab completion for "process signal"  won't work without a running process.
         self.complete_from_to('process signal ',

From 5b84ee4f61419b9a911ce75b4bc1c5cc7de1d0d6 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Wed, 20 May 2020 07:37:15 +0000
Subject: [PATCH 215/770] [Alignment] Fix misaligned interleaved loads

Summary: Tentatively fixing https://bugs.llvm.org/show_bug.cgi?id=45957

Reviewers: craig.topper, nlopes

Subscribers: hiraditya, llvm-commits, RKSimon, jdoerfert, efriedma

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80276
---
 llvm/lib/Target/X86/X86InterleavedAccess.cpp  | 26 ++++++++++++-------
 .../InterleavedAccess/X86/interleavedLoad.ll  | 14 +++++-----
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index de84874f715dd..72a37a9ddeb9e 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -150,7 +150,7 @@ bool X86InterleavedAccessGroup::isSupported() const {
   // We support shuffle represents stride 4 for byte type with size of
   // WideInstSize.
   if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
-     return true;
+    return true;
 
   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
@@ -211,13 +211,20 @@ void X86InterleavedAccessGroup::decompose(
     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
   }
   // Generate N loads of T type.
+  assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() &&
+         "VecBaseTy's size must be a multiple of 8");
+  const Align FirstAlignment = LI->getAlign();
+  const Align SubsequentAlignment = commonAlignment(
+      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
+  Align Alignment = FirstAlignment;
   for (unsigned i = 0; i < NumLoads; i++) {
     // TODO: Support inbounds GEP.
     Value *NewBasePtr =
         Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
     Instruction *NewLoad =
-        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlign());
+        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
     DecomposedVectors.push_back(NewLoad);
+    Alignment = SubsequentAlignment;
   }
 }
 
@@ -255,7 +262,7 @@ static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &Out, int LowOffset,
                             int HighOffset) {
   assert(VT.getSizeInBits() >= 256 &&
-    "This function doesn't accept width smaller then 256");
+         "This function doesn't accept width smaller then 256");
   unsigned NumOfElm = VT.getVectorNumElements();
   for (unsigned i = 0; i < Mask.size(); i++)
     Out.push_back(Mask[i] + LowOffset);
@@ -289,7 +296,7 @@ static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
   if (VecElems == 16) {
     for (unsigned i = 0; i < Stride; i++)
       TransposedMatrix[i] = Builder.CreateShuffleVector(
-        Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
     return;
   }
 
@@ -298,20 +305,19 @@ static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
 
   for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
     genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
-      (i + 1) / Stride * 16);
+                    (i + 1) / Stride * 16);
     Temp[i / 2] = Builder.CreateShuffleVector(
-      Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
+        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
     OptimizeShuf.clear();
   }
 
   if (VecElems == 32) {
     std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
     return;
-  }
-  else
+  } else
     for (unsigned i = 0; i < Stride; i++)
       TransposedMatrix[i] =
-      Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
+          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
 }
 
 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
@@ -682,7 +688,7 @@ void X86InterleavedAccessGroup::interleave8bitStride3(
 
   unsigned NumOfElm = VT.getVectorNumElements();
   group2Shuffle(VT, GroupSize, VPShuf);
-  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
+  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
 }
 
 void X86InterleavedAccessGroup::transpose_4x4(
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
index 9705b438fa390..aa682ddcb3814 100644
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
@@ -8,15 +8,15 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 128
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 128
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 128
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
 ; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
-; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 128
+; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 16
 ; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 128
+; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 16
 ; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
-; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 128
+; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 16
 ; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX2-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -50,9 +50,9 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 64
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 64
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 64
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 2, i32 5, i32 8, i32 11, i32 14, i32 1, i32 4, i32 7, i32 10, i32 13>

From 63f927b17a1ce18cb922c441ffc0691a71d550b8 Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Wed, 27 May 2020 13:20:05 +0100
Subject: [PATCH 216/770] Update release notes with porting guide for AST
 Matchers

---
 clang/docs/ReleaseNotes.rst | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 571b54904754f..8f9dc81ec0b30 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -259,6 +259,36 @@ AST Matchers
   uses ``IgnoreUnlessSpelledInSource`` by default.  The mode can be changed
   using ``set traversal AsIs`` in the ``clang-query`` environment.
 
+  As this change requires downstream tools which use AST Matchers to adapt
+  to the new default, the following porting guide may be useful to the
+  maintainers of such tools.
+
+  Note that although there are many steps below, only the first is
+  non-optional. The steps are intentionally extremely granular to facilitate
+  understanding of the guide itself. It is reasonable to do some of the
+  steps at the same time if you understand the guide:
+
+  1. Use ``(your ASTContext instance).getParentMapContext().setTraversalKind(TK_AsIs)``
+     to restore the previous behavior for your tool.  All further steps in
+     this porting guide are optional.
+  2. Wrap your existing matcher expressions with ``traverse(TK_AsIs, ...)``
+     before passing them to ``ASTMatchFinder::addMatcher``.
+  3. Remove ``(your ASTContext instance).getParentMapContext().setTraversalKind(TK_AsIs)``
+     from your tool so that the default behavior of your tool matches the
+     default behavior of upstream clang. This is made possible by wrapping
+     your matchers in ``traverse(TK_AsIs, ...)`` from step (2).
+  4. Audit your matcher expressions and remove ``traverse(TK_AsIs, ...)``
+     where not needed.
+  5. Audit your matcher expressions and remove calls to ``ignoring*()``
+     matchers where not needed.
+  6. Audit your matcher expressions and consider whether the matcher is
+     better using the ``TK_AsIs`` mode or if it can be better expressed in
+     the default mode. For example, some matchers explicitly match
+     ``has(implicitCastExpr(has(...)))``. Such matchers are sometimes
+     written by authors who were unaware of the existence of the
+     ``ignoring*()`` matchers.
+
+
 clang-format
 ------------
 

From 0508fb45dfbc3ffde6bacc1e52177f3972a3eb99 Mon Sep 17 00:00:00 2001
From: Ties Stuij <ties.stuij@arm.com>
Date: Wed, 27 May 2020 12:44:00 +0100
Subject: [PATCH 217/770] [CodeGen][BFloat] Add bfloat MVT type

Summary:
This patch adds BFloat MVT support. It also adds fixed and scalable vector MVT
types for BFloat.

This patch is part of a series that adds support for the Bfloat16 extension of the Armv8.6-a architecture, as
detailed here:

https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

The bfloat type and its properties are specified in the Arm Architecture
Reference Manual:

https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile

Reviewers: aemerson, huntergr, craig.topper, fpetrogalli, sdesmalen, LukeGeeson, ostannard

Reviewed By: ostannard

Subscribers: LukeGeeson, pbarrio, dschuff, kristof.beyls, hiraditya, aheejin, jdoerfert, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79706
---
 llvm/include/llvm/CodeGen/SelectionDAG.h     |   1 +
 llvm/include/llvm/CodeGen/ValueTypes.td      | 295 ++++++++-------
 llvm/include/llvm/IR/Intrinsics.td           |   1 +
 llvm/include/llvm/Support/MachineValueType.h | 371 +++++++++++--------
 llvm/lib/CodeGen/ValueTypes.cpp              |  15 +
 llvm/utils/TableGen/CodeGenTarget.cpp        |  10 +
 6 files changed, 393 insertions(+), 300 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 8b794d98d81f8..462d9f91c4f15 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1547,6 +1547,7 @@ class SelectionDAG {
     switch (VT.getScalarType().getSimpleVT().SimpleTy) {
     default: llvm_unreachable("Unknown FP format");
     case MVT::f16:     return APFloat::IEEEhalf();
+    case MVT::bf16:    return APFloat::BFloat();
     case MVT::f32:     return APFloat::IEEEsingle();
     case MVT::f64:     return APFloat::IEEEdouble();
     case MVT::f80:     return APFloat::x87DoubleExtended();
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 2ec0ed7ce3bdb..e08a33a50df68 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -25,148 +25,159 @@ def i16    : ValueType<16 ,  4>;   // 16-bit integer value
 def i32    : ValueType<32 ,  5>;   // 32-bit integer value
 def i64    : ValueType<64 ,  6>;   // 64-bit integer value
 def i128   : ValueType<128,  7>;   // 128-bit integer value
-def f16    : ValueType<16 ,  8>;   // 16-bit floating point value
-def f32    : ValueType<32 ,  9>;   // 32-bit floating point value
-def f64    : ValueType<64 , 10>;   // 64-bit floating point value
-def f80    : ValueType<80 , 11>;   // 80-bit floating point value
-def f128   : ValueType<128, 12>;   // 128-bit floating point value
-def ppcf128: ValueType<128, 13>;   // PPC 128-bit floating point value
-
-def v1i1   : ValueType<1 ,  14>;   //   1 x i1 vector value
-def v2i1   : ValueType<2 ,  15>;   //   2 x i1 vector value
-def v4i1   : ValueType<4 ,  16>;   //   4 x i1 vector value
-def v8i1   : ValueType<8 ,  17>;   //   8 x i1 vector value
-def v16i1  : ValueType<16,  18>;   //  16 x i1 vector value
-def v32i1  : ValueType<32 , 19>;   //  32 x i1 vector value
-def v64i1  : ValueType<64 , 20>;   //  64 x i1 vector value
-def v128i1 : ValueType<128, 21>;   // 128 x i1 vector value
-def v256i1 : ValueType<256, 22>;   // 256 x i1 vector value
-def v512i1 : ValueType<512, 23>;   // 512 x i1 vector value
-def v1024i1: ValueType<1024,24>;   //1024 x i1 vector value
-
-def v1i8   : ValueType<8,   25>;   //  1 x i8  vector value
-def v2i8   : ValueType<16 , 26>;   //  2 x i8  vector value
-def v4i8   : ValueType<32 , 27>;   //  4 x i8  vector value
-def v8i8   : ValueType<64 , 28>;   //  8 x i8  vector value
-def v16i8  : ValueType<128, 29>;   // 16 x i8  vector value
-def v32i8  : ValueType<256, 30>;   // 32 x i8  vector value
-def v64i8  : ValueType<512, 31>;   // 64 x i8  vector value
-def v128i8 : ValueType<1024,32>;   //128 x i8  vector value
-def v256i8 : ValueType<2048,33>;   //256 x i8  vector value
-
-def v1i16  : ValueType<16 , 34>;   //  1 x i16 vector value
-def v2i16  : ValueType<32 , 35>;   //  2 x i16 vector value
-def v3i16  : ValueType<48 , 36>;   //  3 x i16 vector value
-def v4i16  : ValueType<64 , 37>;   //  4 x i16 vector value
-def v8i16  : ValueType<128, 38>;   //  8 x i16 vector value
-def v16i16 : ValueType<256, 39>;   // 16 x i16 vector value
-def v32i16 : ValueType<512, 40>;   // 32 x i16 vector value
-def v64i16 : ValueType<1024,41>;   // 64 x i16 vector value
-def v128i16: ValueType<2048,42>;   //128 x i16 vector value
-
-def v1i32    : ValueType<32 , 43>;   //  1 x i32 vector value
-def v2i32    : ValueType<64 , 44>;   //  2 x i32 vector value
-def v3i32    : ValueType<96 , 45>;   //  3 x i32 vector value
-def v4i32    : ValueType<128, 46>;   //  4 x i32 vector value
-def v5i32    : ValueType<160, 47>;   //  5 x i32 vector value
-def v8i32    : ValueType<256, 48>;   //  8 x i32 vector value
-def v16i32   : ValueType<512, 49>;   // 16 x i32 vector value
-def v32i32   : ValueType<1024,50>;   // 32 x i32 vector value
-def v64i32   : ValueType<2048,51>;   // 64 x i32 vector value
-def v128i32  : ValueType<4096,52>;   // 128 x i32 vector value
-def v256i32  : ValueType<8182,53>;   // 256 x i32 vector value
-def v512i32  : ValueType<16384,54>;  // 512 x i32 vector value
-def v1024i32 : ValueType<32768,55>;  // 1024 x i32 vector value
-def v2048i32 : ValueType<65536,56>;  // 2048 x i32 vector value
-
-def v1i64  : ValueType<64 , 57>;   //  1 x i64 vector value
-def v2i64  : ValueType<128, 58>;   //  2 x i64 vector value
-def v4i64  : ValueType<256, 59>;   //  4 x i64 vector value
-def v8i64  : ValueType<512, 60>;   //  8 x i64 vector value
-def v16i64 : ValueType<1024,61>;   // 16 x i64 vector value
-def v32i64 : ValueType<2048,62>;   // 32 x i64 vector value
-
-def v1i128 : ValueType<128, 63>;   //  1 x i128 vector value
-
-def v2f16    : ValueType<32 , 64>;    //    2 x f16 vector value
-def v3f16    : ValueType<48 , 65>;    //    3 x f16 vector value
-def v4f16    : ValueType<64 , 66>;    //    4 x f16 vector value
-def v8f16    : ValueType<128, 67>;    //    8 x f16 vector value
-def v16f16   : ValueType<256, 68>;    //    8 x f16 vector value
-def v32f16   : ValueType<512, 69>;    //    8 x f16 vector value
-def v1f32    : ValueType<32 , 70>;    //    1 x f32 vector value
-def v2f32    : ValueType<64 , 71>;    //    2 x f32 vector value
-def v3f32    : ValueType<96 , 72>;    //    3 x f32 vector value
-def v4f32    : ValueType<128, 73>;    //    4 x f32 vector value
-def v5f32    : ValueType<160, 74>;    //    5 x f32 vector value
-def v8f32    : ValueType<256, 75>;    //    8 x f32 vector value
-def v16f32   : ValueType<512,  76>;   //   16 x f32 vector value
-def v32f32   : ValueType<1024, 77>;   //   32 x f32 vector value
-def v64f32   : ValueType<2048, 78>;   //   64 x f32 vector value
-def v128f32  : ValueType<4096, 79>;   //  128 x f32 vector value
-def v256f32  : ValueType<8182, 80>;   //  256 x f32 vector value
-def v512f32  : ValueType<16384, 81>;  //  512 x f32 vector value
-def v1024f32 : ValueType<32768, 82>;  // 1024 x f32 vector value
-def v2048f32 : ValueType<65536, 83>;  // 2048 x f32 vector value
-def v1f64    : ValueType<64, 84>;     //    1 x f64 vector value
-def v2f64    : ValueType<128, 85>;    //    2 x f64 vector value
-def v4f64    : ValueType<256, 86>;    //    4 x f64 vector value
-def v8f64    : ValueType<512, 87>;    //    8 x f64 vector value
-def v16f64   : ValueType<1024, 88>;   //   16 x f64 vector value
-
-def nxv1i1  : ValueType<1,   89>;  // n x  1 x i1  vector value
-def nxv2i1  : ValueType<2,   90>;  // n x  2 x i1  vector value
-def nxv4i1  : ValueType<4,   91>;  // n x  4 x i1  vector value
-def nxv8i1  : ValueType<8,   92>;  // n x  8 x i1  vector value
-def nxv16i1 : ValueType<16,  93>;  // n x 16 x i1  vector value
-def nxv32i1 : ValueType<32,  94>;  // n x 32 x i1  vector value
-
-def nxv1i8  : ValueType<8,    95>;  // n x  1 x i8  vector value
-def nxv2i8  : ValueType<16,   96>;  // n x  2 x i8  vector value
-def nxv4i8  : ValueType<32,   97>;  // n x  4 x i8  vector value
-def nxv8i8  : ValueType<64,   98>;  // n x  8 x i8  vector value
-def nxv16i8 : ValueType<128,  99>;  // n x 16 x i8  vector value
-def nxv32i8 : ValueType<256, 100>;  // n x 32 x i8  vector value
-
-def nxv1i16 : ValueType<16,  101>; // n x  1 x i16 vector value
-def nxv2i16 : ValueType<32,  102>; // n x  2 x i16 vector value
-def nxv4i16 : ValueType<64,  103>; // n x  4 x i16 vector value
-def nxv8i16 : ValueType<128, 104>; // n x  8 x i16 vector value
-def nxv16i16: ValueType<256, 105>; // n x 16 x i16 vector value
-def nxv32i16: ValueType<512, 106>; // n x 32 x i16 vector value
-
-def nxv1i32 : ValueType<32,  107>; // n x  1 x i32 vector value
-def nxv2i32 : ValueType<64,  108>; // n x  2 x i32 vector value
-def nxv4i32 : ValueType<128, 109>; // n x  4 x i32 vector value
-def nxv8i32 : ValueType<256, 110>; // n x  8 x i32 vector value
-def nxv16i32: ValueType<512, 111>; // n x 16 x i32 vector value
-def nxv32i32: ValueType<1024,112>; // n x 32 x i32 vector value
-
-def nxv1i64 : ValueType<64,  113>; // n x  1 x i64 vector value
-def nxv2i64 : ValueType<128, 114>; // n x  2 x i64 vector value
-def nxv4i64 : ValueType<256, 115>; // n x  4 x i64 vector value
-def nxv8i64 : ValueType<512, 116>; // n x  8 x i64 vector value
-def nxv16i64: ValueType<1024,117>; // n x 16 x i64 vector value
-def nxv32i64: ValueType<2048,118>; // n x 32 x i64 vector value
-
-def nxv2f16  : ValueType<32 , 119>; // n x  2 x f16 vector value
-def nxv4f16  : ValueType<64 , 120>; // n x  4 x f16 vector value
-def nxv8f16  : ValueType<128, 121>; // n x  8 x f16 vector value
-def nxv1f32  : ValueType<32 , 122>; // n x  1 x f32 vector value
-def nxv2f32  : ValueType<64 , 123>; // n x  2 x f32 vector value
-def nxv4f32  : ValueType<128, 124>; // n x  4 x f32 vector value
-def nxv8f32  : ValueType<256, 125>; // n x  8 x f32 vector value
-def nxv16f32 : ValueType<512, 126>; // n x 16 x f32 vector value
-def nxv1f64  : ValueType<64,  127>; // n x  1 x f64 vector value
-def nxv2f64  : ValueType<128, 128>; // n x  2 x f64 vector value
-def nxv4f64  : ValueType<256, 129>; // n x  4 x f64 vector value
-def nxv8f64  : ValueType<512, 130>; // n x  8 x f64 vector value
-
-def x86mmx : ValueType<64 , 131>;   // X86 MMX value
-def FlagVT : ValueType<0  , 132>;   // Pre-RA sched glue
-def isVoid : ValueType<0  , 133>;   // Produces no value
-def untyped: ValueType<8  , 134>;   // Produces an untyped value
-def exnref : ValueType<0  , 135>;   // WebAssembly's exnref type
+
+def bf16   : ValueType<16 ,  8>;   // 16-bit brain floating point value
+def f16    : ValueType<16 ,  9>;   // 16-bit floating point value
+def f32    : ValueType<32 , 10>;   // 32-bit floating point value
+def f64    : ValueType<64 , 11>;   // 64-bit floating point value
+def f80    : ValueType<80 , 12>;   // 80-bit floating point value
+def f128   : ValueType<128, 13>;   // 128-bit floating point value
+def ppcf128: ValueType<128, 14>;   // PPC 128-bit floating point value
+
+def v1i1   : ValueType<1 ,  15>;   //   1 x i1 vector value
+def v2i1   : ValueType<2 ,  16>;   //   2 x i1 vector value
+def v4i1   : ValueType<4 ,  17>;   //   4 x i1 vector value
+def v8i1   : ValueType<8 ,  18>;   //   8 x i1 vector value
+def v16i1  : ValueType<16,  19>;   //  16 x i1 vector value
+def v32i1  : ValueType<32 , 20>;   //  32 x i1 vector value
+def v64i1  : ValueType<64 , 21>;   //  64 x i1 vector value
+def v128i1 : ValueType<128, 22>;   // 128 x i1 vector value
+def v256i1 : ValueType<256, 23>;   // 256 x i1 vector value
+def v512i1 : ValueType<512, 24>;   // 512 x i1 vector value
+def v1024i1: ValueType<1024,25>;   //1024 x i1 vector value
+
+def v1i8   : ValueType<8,   26>;   //  1 x i8  vector value
+def v2i8   : ValueType<16 , 27>;   //  2 x i8  vector value
+def v4i8   : ValueType<32 , 28>;   //  4 x i8  vector value
+def v8i8   : ValueType<64 , 29>;   //  8 x i8  vector value
+def v16i8  : ValueType<128, 30>;   // 16 x i8  vector value
+def v32i8  : ValueType<256, 31>;   // 32 x i8  vector value
+def v64i8  : ValueType<512, 32>;   // 64 x i8  vector value
+def v128i8 : ValueType<1024,33>;   //128 x i8  vector value
+def v256i8 : ValueType<2048,34>;   //256 x i8  vector value
+
+def v1i16  : ValueType<16 , 35>;   //  1 x i16 vector value
+def v2i16  : ValueType<32 , 36>;   //  2 x i16 vector value
+def v3i16  : ValueType<48 , 37>;   //  3 x i16 vector value
+def v4i16  : ValueType<64 , 38>;   //  4 x i16 vector value
+def v8i16  : ValueType<128, 39>;   //  8 x i16 vector value
+def v16i16 : ValueType<256, 40>;   // 16 x i16 vector value
+def v32i16 : ValueType<512, 41>;   // 32 x i16 vector value
+def v64i16 : ValueType<1024,42>;   // 64 x i16 vector value
+def v128i16: ValueType<2048,43>;   //128 x i16 vector value
+
+def v1i32    : ValueType<32 , 44>;   //  1 x i32 vector value
+def v2i32    : ValueType<64 , 45>;   //  2 x i32 vector value
+def v3i32    : ValueType<96 , 46>;   //  3 x i32 vector value
+def v4i32    : ValueType<128, 47>;   //  4 x i32 vector value
+def v5i32    : ValueType<160, 48>;   //  5 x i32 vector value
+def v8i32    : ValueType<256, 49>;   //  8 x i32 vector value
+def v16i32   : ValueType<512, 50>;   // 16 x i32 vector value
+def v32i32   : ValueType<1024,51>;   // 32 x i32 vector value
+def v64i32   : ValueType<2048,52>;   // 64 x i32 vector value
+def v128i32  : ValueType<4096,53>;   // 128 x i32 vector value
+def v256i32  : ValueType<8192,54>;   // 256 x i32 vector value
+def v512i32  : ValueType<16384,55>;  // 512 x i32 vector value
+def v1024i32 : ValueType<32768,56>;  // 1024 x i32 vector value
+def v2048i32 : ValueType<65536,57>;  // 2048 x i32 vector value
+
+def v1i64  : ValueType<64 , 58>;   //  1 x i64 vector value
+def v2i64  : ValueType<128, 59>;   //  2 x i64 vector value
+def v4i64  : ValueType<256, 60>;   //  4 x i64 vector value
+def v8i64  : ValueType<512, 61>;   //  8 x i64 vector value
+def v16i64 : ValueType<1024,62>;   // 16 x i64 vector value
+def v32i64 : ValueType<2048,63>;   // 32 x i64 vector value
+
+def v1i128 : ValueType<128, 64>;   //  1 x i128 vector value
+
+def v2f16    : ValueType<32 , 65>;    //    2 x f16 vector value
+def v3f16    : ValueType<48 , 66>;    //    3 x f16 vector value
+def v4f16    : ValueType<64 , 67>;    //    4 x f16 vector value
+def v8f16    : ValueType<128, 68>;    //    8 x f16 vector value
+def v16f16   : ValueType<256, 69>;    //   16 x f16 vector value
+def v32f16   : ValueType<512, 70>;    //   32 x f16 vector value
+def v2bf16   : ValueType<32 , 71>;    //    2 x bf16 vector value
+def v3bf16   : ValueType<48 , 72>;    //    3 x bf16 vector value
+def v4bf16   : ValueType<64 , 73>;    //    4 x bf16 vector value
+def v8bf16   : ValueType<128, 74>;    //    8 x bf16 vector value
+def v16bf16  : ValueType<256, 75>;    //   16 x bf16 vector value
+def v32bf16  : ValueType<512, 76>;    //   32 x bf16 vector value
+def v1f32    : ValueType<32 , 77>;    //    1 x f32 vector value
+def v2f32    : ValueType<64 , 78>;    //    2 x f32 vector value
+def v3f32    : ValueType<96 , 79>;    //    3 x f32 vector value
+def v4f32    : ValueType<128, 80>;    //    4 x f32 vector value
+def v5f32    : ValueType<160, 81>;    //    5 x f32 vector value
+def v8f32    : ValueType<256, 82>;    //    8 x f32 vector value
+def v16f32   : ValueType<512,  83>;   //   16 x f32 vector value
+def v32f32   : ValueType<1024, 84>;   //   32 x f32 vector value
+def v64f32   : ValueType<2048, 85>;   //   64 x f32 vector value
+def v128f32  : ValueType<4096, 86>;   //  128 x f32 vector value
+def v256f32  : ValueType<8192, 87>;   //  256 x f32 vector value
+def v512f32  : ValueType<16384, 88>;  //  512 x f32 vector value
+def v1024f32 : ValueType<32768, 89>;  // 1024 x f32 vector value
+def v2048f32 : ValueType<65536, 90>;  // 2048 x f32 vector value
+def v1f64    : ValueType<64, 91>;     //    1 x f64 vector value
+def v2f64    : ValueType<128, 92>;    //    2 x f64 vector value
+def v4f64    : ValueType<256, 93>;    //    4 x f64 vector value
+def v8f64    : ValueType<512, 94>;    //    8 x f64 vector value
+def v16f64   : ValueType<1024, 95>;   //   16 x f64 vector value
+
+def nxv1i1  : ValueType<1,   96>;  // n x  1 x i1  vector value
+def nxv2i1  : ValueType<2,   97>;  // n x  2 x i1  vector value
+def nxv4i1  : ValueType<4,   98>;  // n x  4 x i1  vector value
+def nxv8i1  : ValueType<8,   99>;  // n x  8 x i1  vector value
+def nxv16i1 : ValueType<16, 100>;  // n x 16 x i1  vector value
+def nxv32i1 : ValueType<32, 101>;  // n x 32 x i1  vector value
+
+def nxv1i8  : ValueType<8,   102>;  // n x  1 x i8  vector value
+def nxv2i8  : ValueType<16,  103>;  // n x  2 x i8  vector value
+def nxv4i8  : ValueType<32,  104>;  // n x  4 x i8  vector value
+def nxv8i8  : ValueType<64,  105>;  // n x  8 x i8  vector value
+def nxv16i8 : ValueType<128, 106>;  // n x 16 x i8  vector value
+def nxv32i8 : ValueType<256, 107>;  // n x 32 x i8  vector value
+
+def nxv1i16 : ValueType<16,  108>; // n x  1 x i16 vector value
+def nxv2i16 : ValueType<32,  109>; // n x  2 x i16 vector value
+def nxv4i16 : ValueType<64,  110>; // n x  4 x i16 vector value
+def nxv8i16 : ValueType<128, 111>; // n x  8 x i16 vector value
+def nxv16i16: ValueType<256, 112>; // n x 16 x i16 vector value
+def nxv32i16: ValueType<512, 113>; // n x 32 x i16 vector value
+
+def nxv1i32 : ValueType<32,  114>; // n x  1 x i32 vector value
+def nxv2i32 : ValueType<64,  115>; // n x  2 x i32 vector value
+def nxv4i32 : ValueType<128, 116>; // n x  4 x i32 vector value
+def nxv8i32 : ValueType<256, 117>; // n x  8 x i32 vector value
+def nxv16i32: ValueType<512, 118>; // n x 16 x i32 vector value
+def nxv32i32: ValueType<1024,119>; // n x 32 x i32 vector value
+
+def nxv1i64 : ValueType<64,  120>; // n x  1 x i64 vector value
+def nxv2i64 : ValueType<128, 121>; // n x  2 x i64 vector value
+def nxv4i64 : ValueType<256, 122>; // n x  4 x i64 vector value
+def nxv8i64 : ValueType<512, 123>; // n x  8 x i64 vector value
+def nxv16i64: ValueType<1024,124>; // n x 16 x i64 vector value
+def nxv32i64: ValueType<2048,125>; // n x 32 x i64 vector value
+
+def nxv2f16  : ValueType<32 , 126>; // n x  2 x f16 vector value
+def nxv4f16  : ValueType<64 , 127>; // n x  4 x f16 vector value
+def nxv8f16  : ValueType<128, 128>; // n x  8 x f16 vector value
+def nxv2bf16 : ValueType<32 , 129>; // n x  2 x bf16 vector value
+def nxv4bf16 : ValueType<64 , 130>; // n x  4 x bf16 vector value
+def nxv8bf16 : ValueType<128, 131>; // n x  8 x bf16 vector value
+def nxv1f32  : ValueType<32 , 132>; // n x  1 x f32 vector value
+def nxv2f32  : ValueType<64 , 133>; // n x  2 x f32 vector value
+def nxv4f32  : ValueType<128, 134>; // n x  4 x f32 vector value
+def nxv8f32  : ValueType<256, 135>; // n x  8 x f32 vector value
+def nxv16f32 : ValueType<512, 136>; // n x 16 x f32 vector value
+def nxv1f64  : ValueType<64,  137>; // n x  1 x f64 vector value
+def nxv2f64  : ValueType<128, 138>; // n x  2 x f64 vector value
+def nxv4f64  : ValueType<256, 139>; // n x  4 x f64 vector value
+def nxv8f64  : ValueType<512, 140>; // n x  8 x f64 vector value
+
+def x86mmx : ValueType<64 , 141>;   // X86 MMX value
+def FlagVT : ValueType<0  , 142>;   // Pre-RA sched glue
+def isVoid : ValueType<0  , 143>;   // Produces no value
+def untyped: ValueType<8  , 144>;   // Produces an untyped value
+def exnref : ValueType<0  , 145>;      // WebAssembly's exnref type
 def token  : ValueType<0  , 248>;   // TokenTy
 def MetadataVT: ValueType<0, 249>;  // Metadata
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 7bfb25b0ed7dd..33961767e1c06 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -214,6 +214,7 @@ def llvm_i16_ty        : LLVMType<i16>;
 def llvm_i32_ty        : LLVMType<i32>;
 def llvm_i64_ty        : LLVMType<i64>;
 def llvm_half_ty       : LLVMType<f16>;
+def llvm_bfloat_ty     : LLVMType<bf16>;
 def llvm_float_ty      : LLVMType<f32>;
 def llvm_double_ty     : LLVMType<f64>;
 def llvm_f80_ty        : LLVMType<f80>;
diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h
index 224353c5047fd..93683eb7dcf74 100644
--- a/llvm/include/llvm/Support/MachineValueType.h
+++ b/llvm/include/llvm/Support/MachineValueType.h
@@ -47,100 +47,107 @@ namespace llvm {
       FIRST_INTEGER_VALUETYPE = i1,
       LAST_INTEGER_VALUETYPE  = i128,
 
-      f16            =   8,   // This is a 16 bit floating point value
-      f32            =   9,   // This is a 32 bit floating point value
-      f64            =  10,   // This is a 64 bit floating point value
-      f80            =  11,   // This is a 80 bit floating point value
-      f128           =  12,   // This is a 128 bit floating point value
-      ppcf128        =  13,   // This is a PPC 128-bit floating point value
-
-      FIRST_FP_VALUETYPE = f16,
+      bf16           =   8,   // This is a 16 bit brain floating point value
+      f16            =   9,   // This is a 16 bit floating point value
+      f32            =  10,   // This is a 32 bit floating point value
+      f64            =  11,   // This is a 64 bit floating point value
+      f80            =  12,   // This is a 80 bit floating point value
+      f128           =  13,   // This is a 128 bit floating point value
+      ppcf128        =  14,   // This is a PPC 128-bit floating point value
+
+      FIRST_FP_VALUETYPE = bf16,
       LAST_FP_VALUETYPE  = ppcf128,
 
-      v1i1           =  14,   //    1 x i1
-      v2i1           =  15,   //    2 x i1
-      v4i1           =  16,   //    4 x i1
-      v8i1           =  17,   //    8 x i1
-      v16i1          =  18,   //   16 x i1
-      v32i1          =  19,   //   32 x i1
-      v64i1          =  20,   //   64 x i1
-      v128i1         =  21,   //  128 x i1
-      v256i1         =  22,   //  256 x i1
-      v512i1         =  23,   //  512 x i1
-      v1024i1        =  24,   // 1024 x i1
-
-      v1i8           =  25,   //  1 x i8
-      v2i8           =  26,   //  2 x i8
-      v4i8           =  27,   //  4 x i8
-      v8i8           =  28,   //  8 x i8
-      v16i8          =  29,   // 16 x i8
-      v32i8          =  30,   // 32 x i8
-      v64i8          =  31,   // 64 x i8
-      v128i8         =  32,   //128 x i8
-      v256i8         =  33,   //256 x i8
-
-      v1i16          =  34,   //  1 x i16
-      v2i16          =  35,   //  2 x i16
-      v3i16          =  36,   //  3 x i16
-      v4i16          =  37,   //  4 x i16
-      v8i16          =  38,   //  8 x i16
-      v16i16         =  39,   // 16 x i16
-      v32i16         =  40,   // 32 x i16
-      v64i16         =  41,   // 64 x i16
-      v128i16        =  42,   //128 x i16
-
-      v1i32          =  43,   //    1 x i32
-      v2i32          =  44,   //    2 x i32
-      v3i32          =  45,   //    3 x i32
-      v4i32          =  46,   //    4 x i32
-      v5i32          =  47,   //    5 x i32
-      v8i32          =  48,   //    8 x i32
-      v16i32         =  49,   //   16 x i32
-      v32i32         =  50,   //   32 x i32
-      v64i32         =  51,   //   64 x i32
-      v128i32        =  52,   //  128 x i32
-      v256i32        =  53,   //  256 x i32
-      v512i32        =  54,   //  512 x i32
-      v1024i32       =  55,   // 1024 x i32
-      v2048i32       =  56,   // 2048 x i32
-
-      v1i64          =  57,   //  1 x i64
-      v2i64          =  58,   //  2 x i64
-      v4i64          =  59,   //  4 x i64
-      v8i64          =  60,   //  8 x i64
-      v16i64         =  61,   // 16 x i64
-      v32i64         =  62,   // 32 x i64
-
-      v1i128         =  63,   //  1 x i128
+      v1i1           =  15,   //    1 x i1
+      v2i1           =  16,   //    2 x i1
+      v4i1           =  17,   //    4 x i1
+      v8i1           =  18,   //    8 x i1
+      v16i1          =  19,   //   16 x i1
+      v32i1          =  20,   //   32 x i1
+      v64i1          =  21,   //   64 x i1
+      v128i1         =  22,   //  128 x i1
+      v256i1         =  23,   //  256 x i1
+      v512i1         =  24,   //  512 x i1
+      v1024i1        =  25,   // 1024 x i1
+
+      v1i8           =  26,   //  1 x i8
+      v2i8           =  27,   //  2 x i8
+      v4i8           =  28,   //  4 x i8
+      v8i8           =  29,   //  8 x i8
+      v16i8          =  30,   // 16 x i8
+      v32i8          =  31,   // 32 x i8
+      v64i8          =  32,   // 64 x i8
+      v128i8         =  33,   //128 x i8
+      v256i8         =  34,   //256 x i8
+
+      v1i16          =  35,   //  1 x i16
+      v2i16          =  36,   //  2 x i16
+      v3i16          =  37,   //  3 x i16
+      v4i16          =  38,   //  4 x i16
+      v8i16          =  39,   //  8 x i16
+      v16i16         =  40,   // 16 x i16
+      v32i16         =  41,   // 32 x i16
+      v64i16         =  42,   // 64 x i16
+      v128i16        =  43,   //128 x i16
+
+      v1i32          =  44,   //    1 x i32
+      v2i32          =  45,   //    2 x i32
+      v3i32          =  46,   //    3 x i32
+      v4i32          =  47,   //    4 x i32
+      v5i32          =  48,   //    5 x i32
+      v8i32          =  49,   //    8 x i32
+      v16i32         =  50,   //   16 x i32
+      v32i32         =  51,   //   32 x i32
+      v64i32         =  52,   //   64 x i32
+      v128i32        =  53,   //  128 x i32
+      v256i32        =  54,   //  256 x i32
+      v512i32        =  55,   //  512 x i32
+      v1024i32       =  56,   // 1024 x i32
+      v2048i32       =  57,   // 2048 x i32
+
+      v1i64          =  58,   //  1 x i64
+      v2i64          =  59,   //  2 x i64
+      v4i64          =  60,   //  4 x i64
+      v8i64          =  61,   //  8 x i64
+      v16i64         =  62,   // 16 x i64
+      v32i64         =  63,   // 32 x i64
+
+      v1i128         =  64,   //  1 x i128
 
       FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
       LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128,
 
-      v2f16          =  64,   //    2 x f16
-      v3f16          =  65,   //    3 x f16
-      v4f16          =  66,   //    4 x f16
-      v8f16          =  67,   //    8 x f16
-      v16f16         =  68,   //   16 x f16
-      v32f16         =  69,   //   32 x f16
-      v1f32          =  70,   //    1 x f32
-      v2f32          =  71,   //    2 x f32
-      v3f32          =  72,   //    3 x f32
-      v4f32          =  73,   //    4 x f32
-      v5f32          =  74,   //    5 x f32
-      v8f32          =  75,   //    8 x f32
-      v16f32         =  76,   //   16 x f32
-      v32f32         =  77,   //   32 x f32
-      v64f32         =  78,   //   64 x f32
-      v128f32        =  79,   //  128 x f32
-      v256f32        =  80,   //  256 x f32
-      v512f32        =  81,   //  512 x f32
-      v1024f32       =  82,   // 1024 x f32
-      v2048f32       =  83,   // 2048 x f32
-      v1f64          =  84,   //    1 x f64
-      v2f64          =  85,   //    2 x f64
-      v4f64          =  86,   //    4 x f64
-      v8f64          =  87,   //    8 x f64
-      v16f64         =  88,   //   16 x f64
+      v2f16          =  65,   //    2 x f16
+      v3f16          =  66,   //    3 x f16
+      v4f16          =  67,   //    4 x f16
+      v8f16          =  68,   //    8 x f16
+      v16f16         =  69,   //   16 x f16
+      v32f16         =  70,   //   32 x f16
+      v2bf16         =  71,   //    2 x bf16
+      v3bf16         =  72,   //    3 x bf16
+      v4bf16         =  73,   //    4 x bf16
+      v8bf16         =  74,   //    8 x bf16
+      v16bf16        =  75,   //   16 x bf16
+      v32bf16        =  76,   //   32 x bf16
+      v1f32          =  77,   //    1 x f32
+      v2f32          =  78,   //    2 x f32
+      v3f32          =  79,   //    3 x f32
+      v4f32          =  80,   //    4 x f32
+      v5f32          =  81,   //    5 x f32
+      v8f32          =  82,   //    8 x f32
+      v16f32         =  83,   //   16 x f32
+      v32f32         =  84,   //   32 x f32
+      v64f32         =  85,   //   64 x f32
+      v128f32        =  86,   //  128 x f32
+      v256f32        =  87,   //  256 x f32
+      v512f32        =  88,   //  512 x f32
+      v1024f32       =  89,   // 1024 x f32
+      v2048f32       =  90,   // 2048 x f32
+      v1f64          =  91,   //    1 x f64
+      v2f64          =  92,   //    2 x f64
+      v4f64          =  93,   //    4 x f64
+      v8f64          =  94,   //    8 x f64
+      v16f64         =  95,   //   16 x f64
 
       FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v2f16,
       LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v16f64,
@@ -148,56 +155,59 @@ namespace llvm {
       FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
       LAST_FIXEDLEN_VECTOR_VALUETYPE = v16f64,
 
-      nxv1i1         =  89,   // n x  1 x i1
-      nxv2i1         =  90,   // n x  2 x i1
-      nxv4i1         =  91,   // n x  4 x i1
-      nxv8i1         =  92,   // n x  8 x i1
-      nxv16i1        =  93,   // n x 16 x i1
-      nxv32i1        =  94,   // n x 32 x i1
-
-      nxv1i8         =  95,   // n x  1 x i8
-      nxv2i8         =  96,   // n x  2 x i8
-      nxv4i8         =  97,   // n x  4 x i8
-      nxv8i8         =  98,   // n x  8 x i8
-      nxv16i8        =  99,   // n x 16 x i8
-      nxv32i8        =  100,  // n x 32 x i8
-
-      nxv1i16        =  101,  // n x  1 x i16
-      nxv2i16        =  102,  // n x  2 x i16
-      nxv4i16        =  103,  // n x  4 x i16
-      nxv8i16        =  104,  // n x  8 x i16
-      nxv16i16       =  105,  // n x 16 x i16
-      nxv32i16       =  106,  // n x 32 x i16
-
-      nxv1i32        =  107,  // n x  1 x i32
-      nxv2i32        =  108,  // n x  2 x i32
-      nxv4i32        =  109,  // n x  4 x i32
-      nxv8i32        =  110,  // n x  8 x i32
-      nxv16i32       =  111,  // n x 16 x i32
-      nxv32i32       =  112,  // n x 32 x i32
-
-      nxv1i64        =  113,  // n x  1 x i64
-      nxv2i64        =  114,  // n x  2 x i64
-      nxv4i64        =  115,  // n x  4 x i64
-      nxv8i64        =  116,  // n x  8 x i64
-      nxv16i64       =  117,  // n x 16 x i64
-      nxv32i64       =  118,  // n x 32 x i64
+      nxv1i1         =  96,   // n x  1 x i1
+      nxv2i1         =  97,   // n x  2 x i1
+      nxv4i1         =  98,   // n x  4 x i1
+      nxv8i1         =  99,   // n x  8 x i1
+      nxv16i1        = 100,   // n x 16 x i1
+      nxv32i1        = 101,   // n x 32 x i1
+
+      nxv1i8         = 102,   // n x  1 x i8
+      nxv2i8         = 103,   // n x  2 x i8
+      nxv4i8         = 104,   // n x  4 x i8
+      nxv8i8         = 105,   // n x  8 x i8
+      nxv16i8        = 106,   // n x 16 x i8
+      nxv32i8        = 107,   // n x 32 x i8
+
+      nxv1i16        = 108,  // n x  1 x i16
+      nxv2i16        = 109,  // n x  2 x i16
+      nxv4i16        = 110,  // n x  4 x i16
+      nxv8i16        = 111,  // n x  8 x i16
+      nxv16i16       = 112,  // n x 16 x i16
+      nxv32i16       = 113,  // n x 32 x i16
+
+      nxv1i32        = 114,  // n x  1 x i32
+      nxv2i32        = 115,  // n x  2 x i32
+      nxv4i32        = 116,  // n x  4 x i32
+      nxv8i32        = 117,  // n x  8 x i32
+      nxv16i32       = 118,  // n x 16 x i32
+      nxv32i32       = 119,  // n x 32 x i32
+
+      nxv1i64        = 120,  // n x  1 x i64
+      nxv2i64        = 121,  // n x  2 x i64
+      nxv4i64        = 122,  // n x  4 x i64
+      nxv8i64        = 123,  // n x  8 x i64
+      nxv16i64       = 124,  // n x 16 x i64
+      nxv32i64       = 125,  // n x 32 x i64
 
       FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
       LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64,
 
-      nxv2f16        =  119,  // n x  2 x f16
-      nxv4f16        =  120,  // n x  4 x f16
-      nxv8f16        =  121,  // n x  8 x f16
-      nxv1f32        =  122,  // n x  1 x f32
-      nxv2f32        =  123,  // n x  2 x f32
-      nxv4f32        =  124,  // n x  4 x f32
-      nxv8f32        =  125,  // n x  8 x f32
-      nxv16f32       =  126,  // n x 16 x f32
-      nxv1f64        =  127,  // n x  1 x f64
-      nxv2f64        =  128,  // n x  2 x f64
-      nxv4f64        =  129,  // n x  4 x f64
-      nxv8f64        =  130,  // n x  8 x f64
+      nxv2f16        = 126,  // n x  2 x f16
+      nxv4f16        = 127,  // n x  4 x f16
+      nxv8f16        = 128,  // n x  8 x f16
+      nxv2bf16       = 129,  // n x  2 x bf16
+      nxv4bf16       = 130,  // n x  4 x bf16
+      nxv8bf16       = 131,  // n x  8 x bf16
+      nxv1f32        = 132,  // n x  1 x f32
+      nxv2f32        = 133,  // n x  2 x f32
+      nxv4f32        = 134,  // n x  4 x f32
+      nxv8f32        = 135,  // n x  8 x f32
+      nxv16f32       = 136,  // n x 16 x f32
+      nxv1f64        = 137,  // n x  1 x f64
+      nxv2f64        = 138,  // n x  2 x f64
+      nxv4f64        = 139,  // n x  4 x f64
+      nxv8f64        = 140,  // n x  8 x f64
 
       FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv2f16,
       LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
@@ -208,20 +218,20 @@ namespace llvm {
       FIRST_VECTOR_VALUETYPE = v1i1,
       LAST_VECTOR_VALUETYPE  = nxv8f64,
 
-      x86mmx         =  131,   // This is an X86 MMX value
+      x86mmx         = 141,   // This is an X86 MMX value
 
-      Glue           =  132,   // This glues nodes together during pre-RA sched
+      Glue           = 142,   // This glues nodes together during pre-RA sched
 
-      isVoid         =  133,   // This has no value
+      isVoid         = 143,   // This has no value
 
-      Untyped        =  134,   // This value takes a register, but has
+      Untyped        = 144,   // This value takes a register, but has
                                // unspecified type.  The register class
                                // will be determined by the opcode.
 
-      exnref         =  135,   // WebAssembly's exnref type
+      exnref         = 145,   // WebAssembly's exnref type
 
       FIRST_VALUETYPE = 1,     // This is always the beginning of the list.
-      LAST_VALUETYPE =  136,   // This always remains at the end of the list.
+      LAST_VALUETYPE = 146,   // This always remains at the end of the list.
 
       // This is the current maximum for LAST_VALUETYPE.
       // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -333,17 +343,19 @@ namespace llvm {
 
     /// Return true if this is a 32-bit vector type.
     bool is32BitVector() const {
-      return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8  ||
-              SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 ||
-              SimpleTy == MVT::v2f16 || SimpleTy == MVT::v1f32);
+      return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8   ||
+              SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32  ||
+              SimpleTy == MVT::v2f16 || SimpleTy == MVT::v2bf16 ||
+              SimpleTy == MVT::v1f32);
     }
 
     /// Return true if this is a 64-bit vector type.
     bool is64BitVector() const {
-      return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8  ||
-              SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 ||
-              SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 ||
-              SimpleTy == MVT::v2f32 || SimpleTy == MVT::v1f64);
+      return (SimpleTy == MVT::v64i1  || SimpleTy == MVT::v8i8  ||
+              SimpleTy == MVT::v4i16  || SimpleTy == MVT::v2i32 ||
+              SimpleTy == MVT::v1i64  || SimpleTy == MVT::v4f16 ||
+              SimpleTy == MVT::v4bf16 ||SimpleTy == MVT::v2f32  ||
+              SimpleTy == MVT::v1f64);
     }
 
     /// Return true if this is a 128-bit vector type.
@@ -351,24 +363,26 @@ namespace llvm {
       return (SimpleTy == MVT::v128i1 || SimpleTy == MVT::v16i8  ||
               SimpleTy == MVT::v8i16  || SimpleTy == MVT::v4i32  ||
               SimpleTy == MVT::v2i64  || SimpleTy == MVT::v1i128 ||
-              SimpleTy == MVT::v8f16  || SimpleTy == MVT::v4f32  ||
-              SimpleTy == MVT::v2f64);
+              SimpleTy == MVT::v8f16  || SimpleTy == MVT::v8bf16 ||
+              SimpleTy == MVT::v4f32  || SimpleTy == MVT::v2f64);
     }
 
     /// Return true if this is a 256-bit vector type.
     bool is256BitVector() const {
-      return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v8f32 ||
-              SimpleTy == MVT::v4f64  || SimpleTy == MVT::v32i8 ||
-              SimpleTy == MVT::v16i16 || SimpleTy == MVT::v8i32 ||
-              SimpleTy == MVT::v4i64  || SimpleTy == MVT::v256i1);
+      return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v16bf16 ||
+              SimpleTy == MVT::v8f32  || SimpleTy == MVT::v4f64   ||
+              SimpleTy == MVT::v32i8  || SimpleTy == MVT::v16i16  ||
+              SimpleTy == MVT::v8i32  || SimpleTy == MVT::v4i64   ||
+              SimpleTy == MVT::v256i1);
     }
 
     /// Return true if this is a 512-bit vector type.
     bool is512BitVector() const {
-      return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v16f32 ||
-              SimpleTy == MVT::v8f64  || SimpleTy == MVT::v512i1 ||
-              SimpleTy == MVT::v64i8  || SimpleTy == MVT::v32i16 ||
-              SimpleTy == MVT::v16i32 || SimpleTy == MVT::v8i64);
+      return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v32bf16 ||
+              SimpleTy == MVT::v16f32 || SimpleTy == MVT::v8f64   ||
+              SimpleTy == MVT::v512i1 || SimpleTy == MVT::v64i8   ||
+              SimpleTy == MVT::v32i16 || SimpleTy == MVT::v16i32  ||
+              SimpleTy == MVT::v8i64);
     }
 
     /// Return true if this is a 1024-bit vector type.
@@ -515,6 +529,15 @@ namespace llvm {
       case nxv2f16:
       case nxv4f16:
       case nxv8f16: return f16;
+      case v2bf16:
+      case v3bf16:
+      case v4bf16:
+      case v8bf16:
+      case v16bf16:
+      case v32bf16:
+      case nxv2bf16:
+      case nxv4bf16:
+      case nxv8bf16: return bf16;
       case v1f32:
       case v2f32:
       case v3f32:
@@ -578,6 +601,7 @@ namespace llvm {
       case v32i32:
       case v32i64:
       case v32f16:
+      case v32bf16:
       case v32f32:
       case nxv32i1:
       case nxv32i8:
@@ -590,6 +614,7 @@ namespace llvm {
       case v16i32:
       case v16i64:
       case v16f16:
+      case v16bf16:
       case v16f32:
       case v16f64:
       case nxv16i1:
@@ -604,6 +629,7 @@ namespace llvm {
       case v8i32:
       case v8i64:
       case v8f16:
+      case v8bf16:
       case v8f32:
       case v8f64:
       case nxv8i1:
@@ -612,6 +638,7 @@ namespace llvm {
       case nxv8i32:
       case nxv8i64:
       case nxv8f16:
+      case nxv8bf16:
       case nxv8f32:
       case nxv8f64: return 8;
       case v5i32:
@@ -622,6 +649,7 @@ namespace llvm {
       case v4i32:
       case v4i64:
       case v4f16:
+      case v4bf16:
       case v4f32:
       case v4f64:
       case nxv4i1:
@@ -630,11 +658,13 @@ namespace llvm {
       case nxv4i32:
       case nxv4i64:
       case nxv4f16:
+      case nxv4bf16:
       case nxv4f32:
       case nxv4f64: return 4;
       case v3i16:
       case v3i32:
       case v3f16:
+      case v3bf16:
       case v3f32: return 3;
       case v2i1:
       case v2i8:
@@ -642,6 +672,7 @@ namespace llvm {
       case v2i32:
       case v2i64:
       case v2f16:
+      case v2bf16:
       case v2f32:
       case v2f64:
       case nxv2i1:
@@ -650,6 +681,7 @@ namespace llvm {
       case nxv2i32:
       case nxv2i64:
       case nxv2f16:
+      case nxv2bf16:
       case nxv2f32:
       case nxv2f64: return 2;
       case v1i1:
@@ -712,6 +744,7 @@ namespace llvm {
       case nxv8i1: return TypeSize::Scalable(8);
       case i16 :
       case f16:
+      case bf16:
       case v16i1:
       case v2i8:
       case v1i16: return TypeSize::Fixed(16);
@@ -724,6 +757,7 @@ namespace llvm {
       case v4i8:
       case v2i16:
       case v2f16:
+      case v2bf16:
       case v1f32:
       case v1i32: return TypeSize::Fixed(32);
       case nxv32i1:
@@ -731,9 +765,11 @@ namespace llvm {
       case nxv2i16:
       case nxv1i32:
       case nxv2f16:
+      case nxv2bf16:
       case nxv1f32: return TypeSize::Scalable(32);
       case v3i16:
-      case v3f16: return TypeSize::Fixed(48);
+      case v3f16:
+      case v3bf16: return TypeSize::Fixed(48);
       case x86mmx:
       case f64 :
       case i64 :
@@ -743,6 +779,7 @@ namespace llvm {
       case v2i32:
       case v1i64:
       case v4f16:
+      case v4bf16:
       case v2f32:
       case v1f64: return TypeSize::Fixed(64);
       case nxv8i8:
@@ -750,6 +787,7 @@ namespace llvm {
       case nxv2i32:
       case nxv1i64:
       case nxv4f16:
+      case nxv4bf16:
       case nxv2f32:
       case nxv1f64: return TypeSize::Scalable(64);
       case f80 :  return TypeSize::Fixed(80);
@@ -765,6 +803,7 @@ namespace llvm {
       case v2i64:
       case v1i128:
       case v8f16:
+      case v8bf16:
       case v4f32:
       case v2f64: return TypeSize::Fixed(128);
       case nxv16i8:
@@ -772,6 +811,7 @@ namespace llvm {
       case nxv4i32:
       case nxv2i64:
       case nxv8f16:
+      case nxv8bf16:
       case nxv4f32:
       case nxv2f64: return TypeSize::Scalable(128);
       case v5i32:
@@ -782,6 +822,7 @@ namespace llvm {
       case v8i32:
       case v4i64:
       case v16f16:
+      case v16bf16:
       case v8f32:
       case v4f64: return TypeSize::Fixed(256);
       case nxv32i8:
@@ -796,6 +837,7 @@ namespace llvm {
       case v16i32:
       case v8i64:
       case v32f16:
+      case v32bf16:
       case v16f32:
       case v8f64: return TypeSize::Fixed(512);
       case nxv32i16:
@@ -993,6 +1035,14 @@ namespace llvm {
         if (NumElements == 16) return MVT::v16f16;
         if (NumElements == 32) return MVT::v32f16;
         break;
+      case MVT::bf16:
+        if (NumElements == 2)  return MVT::v2bf16;
+        if (NumElements == 3)  return MVT::v3bf16;
+        if (NumElements == 4)  return MVT::v4bf16;
+        if (NumElements == 8)  return MVT::v8bf16;
+        if (NumElements == 16) return MVT::v16bf16;
+        if (NumElements == 32) return MVT::v32bf16;
+        break;
       case MVT::f32:
         if (NumElements == 1)    return MVT::v1f32;
         if (NumElements == 2)    return MVT::v2f32;
@@ -1069,6 +1119,11 @@ namespace llvm {
           if (NumElements == 4)  return MVT::nxv4f16;
           if (NumElements == 8)  return MVT::nxv8f16;
           break;
+        case MVT::bf16:
+          if (NumElements == 2)  return MVT::nxv2bf16;
+          if (NumElements == 4)  return MVT::nxv4bf16;
+          if (NumElements == 8)  return MVT::nxv8bf16;
+          break;
         case MVT::f32:
           if (NumElements == 1)  return MVT::nxv1f32;
           if (NumElements == 2)  return MVT::nxv2f32;
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index e24ad844a62c1..2b97e9d83dd0a 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -147,6 +147,7 @@ std::string EVT::getEVTString() const {
     if (isFloatingPoint())
       return "f" + utostr(getSizeInBits());
     llvm_unreachable("Invalid EVT!");
+  case MVT::bf16:    return "bf16";
   case MVT::ppcf128: return "ppcf128";
   case MVT::isVoid:  return "isVoid";
   case MVT::Other:   return "ch";
@@ -174,6 +175,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
   case MVT::i64:     return Type::getInt64Ty(Context);
   case MVT::i128:    return IntegerType::get(Context, 128);
   case MVT::f16:     return Type::getHalfTy(Context);
+  case MVT::bf16:     return Type::getBFloatTy(Context);
   case MVT::f32:     return Type::getFloatTy(Context);
   case MVT::f64:     return Type::getDoubleTy(Context);
   case MVT::f80:     return Type::getX86_FP80Ty(Context);
@@ -236,6 +238,12 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
   case MVT::v8f16:   return VectorType::get(Type::getHalfTy(Context), 8);
   case MVT::v16f16:  return VectorType::get(Type::getHalfTy(Context), 16);
   case MVT::v32f16:  return VectorType::get(Type::getHalfTy(Context), 32);
+  case MVT::v2bf16:  return VectorType::get(Type::getBFloatTy(Context), 2);
+  case MVT::v3bf16:  return VectorType::get(Type::getBFloatTy(Context), 3);
+  case MVT::v4bf16:  return VectorType::get(Type::getBFloatTy(Context), 4);
+  case MVT::v8bf16:  return VectorType::get(Type::getBFloatTy(Context), 8);
+  case MVT::v16bf16: return VectorType::get(Type::getBFloatTy(Context), 16);
+  case MVT::v32bf16: return VectorType::get(Type::getBFloatTy(Context), 32);
   case MVT::v1f32:   return VectorType::get(Type::getFloatTy(Context), 1);
   case MVT::v2f32:   return VectorType::get(Type::getFloatTy(Context), 2);
   case MVT::v3f32:   return VectorType::get(Type::getFloatTy(Context), 3);
@@ -321,6 +329,12 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
     return VectorType::get(Type::getHalfTy(Context), 4, /*Scalable=*/ true);
   case MVT::nxv8f16:
     return VectorType::get(Type::getHalfTy(Context), 8, /*Scalable=*/ true);
+  case MVT::nxv2bf16:
+    return VectorType::get(Type::getBFloatTy(Context), 2, /*Scalable=*/ true);
+  case MVT::nxv4bf16:
+    return VectorType::get(Type::getBFloatTy(Context), 4, /*Scalable=*/ true);
+  case MVT::nxv8bf16:
+    return VectorType::get(Type::getBFloatTy(Context), 8, /*Scalable=*/ true);
   case MVT::nxv1f32:
     return VectorType::get(Type::getFloatTy(Context), 1, /*Scalable=*/ true);
   case MVT::nxv2f32:
@@ -356,6 +370,7 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
   case Type::IntegerTyID:
     return getIntegerVT(cast<IntegerType>(Ty)->getBitWidth());
   case Type::HalfTyID:      return MVT(MVT::f16);
+  case Type::BFloatTyID:    return MVT(MVT::bf16);
   case Type::FloatTyID:     return MVT(MVT::f32);
   case Type::DoubleTyID:    return MVT(MVT::f64);
   case Type::X86_FP80TyID:  return MVT(MVT::f80);
diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp
index e0470e4266f86..282e62cf838e0 100644
--- a/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -69,6 +69,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::fAny:     return "MVT::fAny";
   case MVT::vAny:     return "MVT::vAny";
   case MVT::f16:      return "MVT::f16";
+  case MVT::bf16:     return "MVT::bf16";
   case MVT::f32:      return "MVT::f32";
   case MVT::f64:      return "MVT::f64";
   case MVT::f80:      return "MVT::f80";
@@ -132,6 +133,12 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::v8f16:    return "MVT::v8f16";
   case MVT::v16f16:   return "MVT::v16f16";
   case MVT::v32f16:   return "MVT::v32f16";
+  case MVT::v2bf16:   return "MVT::v2bf16";
+  case MVT::v3bf16:   return "MVT::v3bf16";
+  case MVT::v4bf16:   return "MVT::v4bf16";
+  case MVT::v8bf16:   return "MVT::v8bf16";
+  case MVT::v16bf16:  return "MVT::v16bf16";
+  case MVT::v32bf16:  return "MVT::v32bf16";
   case MVT::v1f32:    return "MVT::v1f32";
   case MVT::v2f32:    return "MVT::v2f32";
   case MVT::v3f32:    return "MVT::v3f32";
@@ -182,6 +189,9 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::nxv2f16:  return "MVT::nxv2f16";
   case MVT::nxv4f16:  return "MVT::nxv4f16";
   case MVT::nxv8f16:  return "MVT::nxv8f16";
+  case MVT::nxv2bf16:  return "MVT::nxv2bf16";
+  case MVT::nxv4bf16:  return "MVT::nxv4bf16";
+  case MVT::nxv8bf16:  return "MVT::nxv8bf16";
   case MVT::nxv1f32:  return "MVT::nxv1f32";
   case MVT::nxv2f32:  return "MVT::nxv2f32";
   case MVT::nxv4f32:  return "MVT::nxv4f32";

From ae07fabf6a705b7eb91e801d7735bda4a319567c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 12:39:36 +0100
Subject: [PATCH 218/770] ObjCARCInstKind.h - remove unused includes. NFC.

---
 llvm/include/llvm/Analysis/ObjCARCInstKind.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ObjCARCInstKind.h b/llvm/include/llvm/Analysis/ObjCARCInstKind.h
index dc6093a7b86c5..84565b9315c78 100644
--- a/llvm/include/llvm/Analysis/ObjCARCInstKind.h
+++ b/llvm/include/llvm/Analysis/ObjCARCInstKind.h
@@ -9,8 +9,6 @@
 #ifndef LLVM_ANALYSIS_OBJCARCINSTKIND_H
 #define LLVM_ANALYSIS_OBJCARCINSTKIND_H
 
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Instructions.h"
 
 namespace llvm {

From 0865d41492a7f2e8ca8ab70cb3baa121b747e9a7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 13:10:37 +0100
Subject: [PATCH 219/770] ObjectFile.h - reduce unnecessary includes to forward
 declarations. NFC.

Fix SubtargetFeature.h include dependency in XCOFFObjectFile.cpp
---
 llvm/include/llvm/Object/ObjectFile.h | 3 +--
 llvm/lib/Object/XCOFFObjectFile.cpp   | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Object/ObjectFile.h b/llvm/include/llvm/Object/ObjectFile.h
index 4d51430ffaf73..8e8937201716b 100644
--- a/llvm/include/llvm/Object/ObjectFile.h
+++ b/llvm/include/llvm/Object/ObjectFile.h
@@ -18,13 +18,11 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/BinaryFormat/Magic.h"
-#include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Object/Error.h"
 #include "llvm/Object/SymbolicFile.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <cassert>
 #include <cstdint>
@@ -34,6 +32,7 @@
 namespace llvm {
 
 class ARMAttributeParser;
+class SubtargetFeatures;
 
 namespace object {
 
diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp
index 4d2b7f3372bcc..d41afc8bdc24d 100644
--- a/llvm/lib/Object/XCOFFObjectFile.cpp
+++ b/llvm/lib/Object/XCOFFObjectFile.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Object/XCOFFObjectFile.h"
+#include "llvm/MC/SubtargetFeature.h"
 #include <cstddef>
 #include <cstring>
 

From 2ee4ec6b6f6d0571288db69b824f6773717d2cf7 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 27 May 2020 08:29:09 -0400
Subject: [PATCH 220/770] [IR] add set function for FMF 'contract'

This was missed when the flag was added with D31164.
---
 llvm/include/llvm/IR/Instruction.h | 5 +++++
 llvm/lib/IR/Instruction.cpp        | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index b15ececc677cc..4722d509f3700 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -396,6 +396,11 @@ class Instruction : public User,
   /// this flag.
   void setHasAllowReciprocal(bool B);
 
+  /// Set or clear the allow-contract flag on this instruction, which must be
+  /// an operator which supports this flag. See LangRef.html for the meaning of
+  /// this flag.
+  void setHasAllowContract(bool B);
+
   /// Set or clear the approximate-math-functions flag on this instruction,
   /// which must be an operator which supports this flag. See LangRef.html for
   /// the meaning of this flag.
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 645f7c0944d53..23e8332f7d094 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -198,6 +198,11 @@ void Instruction::setHasAllowReciprocal(bool B) {
   cast<FPMathOperator>(this)->setHasAllowReciprocal(B);
 }
 
+void Instruction::setHasAllowContract(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasAllowContract(B);
+}
+
 void Instruction::setHasApproxFunc(bool B) {
   assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
   cast<FPMathOperator>(this)->setHasApproxFunc(B);

From 833996cef1381115b0077ab5694e189463f5d02e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 19 May 2020 10:16:45 -0400
Subject: [PATCH 221/770] AMDGPU: Fix backwards s_cselect_* operands

The vector equivalent has backwards operands, but the scalar version
does not. The passes that use these hooks aren't enabled by default,
so this doesn't really change anything.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp       | 40 +++++++++++++-------
 llvm/test/CodeGen/AMDGPU/early-if-convert.ll |  6 +--
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1d0397afd771f..fd1da238a8761 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2348,14 +2348,17 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
 
   if (DstSize == 32) {
-    unsigned SelOp = Pred == SCC_TRUE ?
-      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
-
-    // Instruction's operands are backwards from what is expected.
-    MachineInstr *Select =
-      BuildMI(MBB, I, DL, get(SelOp), DstReg)
-      .addReg(FalseReg)
-      .addReg(TrueReg);
+    MachineInstr *Select;
+    if (Pred == SCC_TRUE) {
+      Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
+        .addReg(TrueReg)
+        .addReg(FalseReg);
+    } else {
+      // Instruction's operands are backwards from what is expected.
+      Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
+        .addReg(FalseReg)
+        .addReg(TrueReg);
+    }
 
     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
     return;
@@ -2364,8 +2367,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
   if (DstSize == 64 && Pred == SCC_TRUE) {
     MachineInstr *Select =
       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
-      .addReg(FalseReg)
-      .addReg(TrueReg);
+      .addReg(TrueReg)
+      .addReg(FalseReg);
 
     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
     return;
@@ -2416,10 +2419,19 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
 
     unsigned SubIdx = SubIndices[Idx];
 
-    MachineInstr *Select =
-      BuildMI(MBB, I, DL, get(SelOp), DstElt)
-      .addReg(FalseReg, 0, SubIdx)
-      .addReg(TrueReg, 0, SubIdx);
+    MachineInstr *Select;
+    if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
+      Select =
+        BuildMI(MBB, I, DL, get(SelOp), DstElt)
+        .addReg(FalseReg, 0, SubIdx)
+        .addReg(TrueReg, 0, SubIdx);
+    } else {
+      Select =
+        BuildMI(MBB, I, DL, get(SelOp), DstElt)
+        .addReg(TrueReg, 0, SubIdx)
+        .addReg(FalseReg, 0, SubIdx);
+    }
+
     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
     fixImplicitOperands(*Select);
 
diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
index 66d5411cd978a..41ed70bff1601 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -246,7 +246,7 @@ endif:
 ; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
-; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
+; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
 entry:
   %v = load i32, i32 addrspace(4)* %in
@@ -362,7 +362,7 @@ endif:
 
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
+; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
 define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
@@ -383,7 +383,7 @@ done:
 ; GCN-LABEL: {{^}}ifcvt_undef_scc:
 ; GCN: {{^}}; %bb.0:
 ; GCN-NEXT: s_load_dwordx2
-; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
+; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
 define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   br i1 undef, label %else, label %if

From 70d4a202995315c77d7daec9f332a6ceda84efc9 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 27 May 2020 12:54:29 +0100
Subject: [PATCH 222/770] [UnJ] Update LI for inner nested loops

This makes sure to correctly register the loop info of the children
of unroll-and-jammed loops. It re-uses some code from the unroller for
registering subloops.

Differential Revision: https://reviews.llvm.org/D80619
---
 .../lib/Transforms/Utils/LoopUnrollAndJam.cpp | 12 +--
 .../Transforms/LoopUnrollAndJam/innerloop.ll  | 97 +++++++++++++++++++
 2 files changed, 103 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnrollAndJam/innerloop.ll

diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index e8aac1233292a..8ac6b0894d1c8 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -366,29 +366,29 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
     SmallVector<BasicBlock *, 8> NewBlocks;
     // Maps Blocks[It] -> Blocks[It-1]
     DenseMap<Value *, Value *> PrevItValueMap;
+    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+    NewLoops[L] = L;
+    NewLoops[SubLoop] = SubLoop;
 
     for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
       ValueToValueMapTy VMap;
       BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
       Header->getParent()->getBasicBlockList().push_back(New);
 
-      if (ForeBlocks.count(*BB)) {
-        L->addBasicBlockToLoop(New, *LI);
+      // Tell LI about New.
+      addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
 
+      if (ForeBlocks.count(*BB)) {
         if (*BB == ForeBlocksFirst[0])
           ForeBlocksFirst.push_back(New);
         if (*BB == ForeBlocksLast[0])
           ForeBlocksLast.push_back(New);
       } else if (SubLoopBlocks.count(*BB)) {
-        SubLoop->addBasicBlockToLoop(New, *LI);
-
         if (*BB == SubLoopBlocksFirst[0])
           SubLoopBlocksFirst.push_back(New);
         if (*BB == SubLoopBlocksLast[0])
           SubLoopBlocksLast.push_back(New);
       } else if (AftBlocks.count(*BB)) {
-        L->addBasicBlockToLoop(New, *LI);
-
         if (*BB == AftBlocksFirst[0])
           AftBlocksFirst.push_back(New);
         if (*BB == AftBlocksLast[0])
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/innerloop.ll b/llvm/test/Transforms/LoopUnrollAndJam/innerloop.ll
new file mode 100644
index 0000000000000..d84260650c67d
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/innerloop.ll
@@ -0,0 +1,97 @@
+; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -verify-loop-info < %s -S | FileCheck %s
+; RUN: opt -passes='unroll-and-jam,verify<loops>' -allow-unroll-and-jam < %s -S | FileCheck %s
+
+; Check that the newly created loops do not fail to be added to LI
+; This test deliberately disables UnJ on the middle loop, performing it instead on the
+; outer of 3 nested loops. The (new) inner loops need to be added to LI.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @test() {
+; CHECK-LABEL: test
+; CHECK:       for.cond17.preheader:
+; CHECK:    br label %for.cond20.preheader
+; CHECK:       for.cond20.preheader:
+; CHECK:    br label %for.cond23.preheader
+; CHECK:       for.cond23.preheader:
+; CHECK:    br label %for.body25
+; CHECK:       for.body25:
+; CHECK:    br i1 [[CMP24:%.*]], label %for.body25, label %for.inc45
+; CHECK:       for.inc45:
+; CHECK:    br label %for.body25.1
+; CHECK:       for.inc48:
+; CHECK:    br i1 [[CMP18_3:%.*]], label %for.cond20.preheader, label %for.end50
+; CHECK:       for.end50:
+; CHECK:    ret i32 0
+; CHECK:       for.body25.1:
+; CHECK:    br i1 [[CMP24_1:%.*]], label %for.body25.1, label %for.inc45.1
+; CHECK:       for.inc45.1:
+; CHECK:    br label %for.body25.2
+; CHECK:       for.body25.2:
+; CHECK:    br i1 [[CMP24_2:%.*]], label %for.body25.2, label %for.inc45.2
+; CHECK:       for.inc45.2:
+; CHECK:    br label %for.body25.3
+; CHECK:       for.body25.3:
+; CHECK:    br i1 [[CMP24_3:%.*]], label %for.body25.3, label %for.inc45.3
+; CHECK:       for.inc45.3:
+; CHECK:    br i1 [[CMP21_3:%.*]], label %for.cond23.preheader, label %for.inc48
+;
+entry:
+  %A = alloca [8 x [8 x i32]], align 16
+  %B = alloca [8 x [8 x i32]], align 16
+  %C = alloca [8 x [8 x i32]], align 16
+  br label %for.cond17.preheader
+
+for.cond17.preheader:                             ; preds = %for.inc14
+  br label %for.cond20.preheader
+
+for.cond20.preheader:                             ; preds = %for.cond17.preheader, %for.inc48
+  %i.13 = phi i32 [ 0, %for.cond17.preheader ], [ %inc49, %for.inc48 ]
+  br label %for.cond23.preheader
+
+for.cond23.preheader:                             ; preds = %for.cond20.preheader, %for.inc45
+  %j.12 = phi i32 [ 0, %for.cond20.preheader ], [ %inc46, %for.inc45 ]
+  br label %for.body25
+
+for.body25:                                       ; preds = %for.cond23.preheader, %for.body25
+  %k.01 = phi i32 [ 0, %for.cond23.preheader ], [ %inc43, %for.body25 ]
+  %idxprom26 = zext i32 %i.13 to i64
+  %idxprom28 = zext i32 %j.12 to i64
+  %arrayidx29 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %C, i64 0, i64 %idxprom26, i64 %idxprom28
+  %0 = load i32, i32* %arrayidx29, align 4
+  %idxprom30 = zext i32 %i.13 to i64
+  %idxprom32 = zext i32 %k.01 to i64
+  %arrayidx33 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %A, i64 0, i64 %idxprom30, i64 %idxprom32
+  %1 = load i32, i32* %arrayidx33, align 4
+  %idxprom34 = zext i32 %k.01 to i64
+  %idxprom36 = zext i32 %j.12 to i64
+  %arrayidx37 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %B, i64 0, i64 %idxprom34, i64 %idxprom36
+  %2 = load i32, i32* %arrayidx37, align 4
+  %mul = mul nsw i32 %1, %2
+  %add = add nsw i32 %0, %mul
+  %idxprom38 = zext i32 %i.13 to i64
+  %idxprom40 = zext i32 %j.12 to i64
+  %arrayidx41 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %C, i64 0, i64 %idxprom38, i64 %idxprom40
+  store i32 %add, i32* %arrayidx41, align 4
+  %inc43 = add nuw nsw i32 %k.01, 1
+  %cmp24 = icmp ult i32 %k.01, 7
+  br i1 %cmp24, label %for.body25, label %for.inc45
+
+for.inc45:                                        ; preds = %for.body25
+  %inc46 = add nuw nsw i32 %j.12, 1
+  %cmp21 = icmp ult i32 %j.12, 7
+  br i1 %cmp21, label %for.cond23.preheader, label %for.inc48, !llvm.loop !7
+
+for.inc48:                                        ; preds = %for.inc45
+  %inc49 = add nuw nsw i32 %i.13, 1
+  %cmp18 = icmp ult i32 %i.13, 7
+  br i1 %cmp18, label %for.cond20.preheader, label %for.end50, !llvm.loop !5
+
+for.end50:                                        ; preds = %for.inc48
+  ret i32 0
+}
+
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.unroll_and_jam.disable"}

From ad5d319ee85d31ee2b1ca5c29b3a10b340513fec Mon Sep 17 00:00:00 2001
From: Ties Stuij <ties.stuij@arm.com>
Date: Wed, 27 May 2020 14:00:33 +0100
Subject: [PATCH 223/770] [IR][BFloat] add BFloat IR intrinsics support

Summary:
This patch is part of a series that adds support for the Bfloat16 extension of
the Armv8.6-a architecture, as detailed here:

https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

The bfloat type, and its properties are specified in the Arm Architecture
Reference Manual:

https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile

Reviewers: scanon, fpetrogalli, sdesmalen, craig.topper, LukeGeeson

Reviewed By: fpetrogalli

Subscribers: LukeGeeson, pbarrio, kristof.beyls, hiraditya, jdoerfert, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79707
---
 llvm/include/llvm/IR/Intrinsics.h         | 1 +
 llvm/include/llvm/IR/Intrinsics.td        | 3 +++
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 1 +
 llvm/lib/IR/Function.cpp                  | 8 +++++++-
 llvm/utils/TableGen/IntrinsicEmitter.cpp  | 4 +++-
 5 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index ca2bc26e50b38..a44a9a4a70b0f 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -106,6 +106,7 @@ namespace Intrinsic {
       Token,
       Metadata,
       Half,
+      BFloat,
       Float,
       Double,
       Quad,
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 33961767e1c06..a2553cdeec6c2 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -280,6 +280,9 @@ def llvm_v1i128_ty     : LLVMType<v1i128>;   //  1 x i128
 def llvm_v2f16_ty      : LLVMType<v2f16>;    //  2 x half (__fp16)
 def llvm_v4f16_ty      : LLVMType<v4f16>;    //  4 x half (__fp16)
 def llvm_v8f16_ty      : LLVMType<v8f16>;    //  8 x half (__fp16)
+def llvm_v2bf16_ty     : LLVMType<v2bf16>;   //  2 x bfloat (__bf16)
+def llvm_v4bf16_ty     : LLVMType<v4bf16>;   //  4 x bfloat (__bf16)
+def llvm_v8bf16_ty     : LLVMType<v8bf16>;   //  8 x bfloat (__bf16)
 def llvm_v1f32_ty      : LLVMType<v1f32>;    //  1 x float
 def llvm_v2f32_ty      : LLVMType<v2f32>;    //  2 x float
 def llvm_v4f32_ty      : LLVMType<v4f32>;    //  4 x float
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 28a5a16c5a664..384e3209f5f5f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -783,6 +783,7 @@ def llvm_nxv16i8_ty : LLVMType<nxv16i8>;
 def llvm_nxv4i32_ty : LLVMType<nxv4i32>;
 def llvm_nxv2i64_ty : LLVMType<nxv2i64>;
 def llvm_nxv8f16_ty : LLVMType<nxv8f16>;
+def llvm_nxv8bf16_ty : LLVMType<nxv8bf16>;
 def llvm_nxv4f32_ty : LLVMType<nxv4f32>;
 def llvm_nxv2f64_ty : LLVMType<nxv2f64>;
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index c7eec1f77fd99..1259468adc548 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -747,7 +747,8 @@ enum IIT_Info {
   IIT_SUBDIVIDE2_ARG = 44,
   IIT_SUBDIVIDE4_ARG = 45,
   IIT_VEC_OF_BITCASTS_TO_INT = 46,
-  IIT_V128  = 47
+  IIT_V128 = 47,
+  IIT_BF16 = 48
 };
 
 static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
@@ -782,6 +783,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
   case IIT_F16:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Half, 0));
     return;
+  case IIT_BF16:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::BFloat, 0));
+    return;
   case IIT_F32:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Float, 0));
     return;
@@ -1005,6 +1009,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   case IITDescriptor::Token: return Type::getTokenTy(Context);
   case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
   case IITDescriptor::Half: return Type::getHalfTy(Context);
+  case IITDescriptor::BFloat: return Type::getBFloatTy(Context);
   case IITDescriptor::Float: return Type::getFloatTy(Context);
   case IITDescriptor::Double: return Type::getDoubleTy(Context);
   case IITDescriptor::Quad: return Type::getFP128Ty(Context);
@@ -1183,6 +1188,7 @@ static bool matchIntrinsicType(
     case IITDescriptor::Token: return !Ty->isTokenTy();
     case IITDescriptor::Metadata: return !Ty->isMetadataTy();
     case IITDescriptor::Half: return !Ty->isHalfTy();
+    case IITDescriptor::BFloat: return !Ty->isBFloatTy();
     case IITDescriptor::Float: return !Ty->isFloatTy();
     case IITDescriptor::Double: return !Ty->isDoubleTy();
     case IITDescriptor::Quad: return !Ty->isFP128Ty();
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index b0ac385c19390..f05fd9fd39fe2 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -245,7 +245,8 @@ enum IIT_Info {
   IIT_SUBDIVIDE2_ARG = 44,
   IIT_SUBDIVIDE4_ARG = 45,
   IIT_VEC_OF_BITCASTS_TO_INT = 46,
-  IIT_V128  = 47
+  IIT_V128 = 47,
+  IIT_BF16 = 48
 };
 
 static void EncodeFixedValueType(MVT::SimpleValueType VT,
@@ -266,6 +267,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
   switch (VT) {
   default: PrintFatalError("unhandled MVT in intrinsic!");
   case MVT::f16: return Sig.push_back(IIT_F16);
+  case MVT::bf16: return Sig.push_back(IIT_BF16);
   case MVT::f32: return Sig.push_back(IIT_F32);
   case MVT::f64: return Sig.push_back(IIT_F64);
   case MVT::f128: return Sig.push_back(IIT_F128);

From 4ab03e62fd040efdbde4b6c310e5abbda5363abd Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Thu, 21 May 2020 17:29:18 +0300
Subject: [PATCH 224/770] [llvm-readobj] - Do not crash when an invalid
 .eh_frame_hdr is dumped using --unwind.

When the p_offset/p_filesz of the PT_GNU_EH_FRAME is invalid
(e.g. larger than the file size), llvm-readobj might crash.

This patch fixes the issue. I've introduced the `ELFFile<ELFT>::getSegmentContents`
method, which is very similar to `ELFFile<ELFT>::getSectionContentsAsArray`.

Differential revision: https://reviews.llvm.org/D80380
---
 llvm/include/llvm/Object/ELF.h               | 32 ++++++++++
 llvm/test/tools/llvm-readobj/ELF/unwind.test | 61 ++++++++++++++++++++
 llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h  | 30 +++++-----
 3 files changed, 107 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index 18cc7abe1cd61..15d473805ecce 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -64,6 +64,17 @@ std::string getSecIndexForError(const ELFFile<ELFT> *Obj,
   return "[unknown index]";
 }
 
+template <class ELFT>
+std::string getPhdrIndexForError(const ELFFile<ELFT> *Obj,
+                                 const typename ELFT::Phdr *Phdr) {
+  auto Headers = Obj->program_headers();
+  if (Headers)
+    return ("[index " + Twine(Phdr - &Headers->front()) + "]").str();
+  // See comment in the getSecIndexForError() above.
+  llvm::consumeError(Headers.takeError());
+  return "[unknown index]";
+}
+
 static inline Error defaultWarningHandler(const Twine &Msg) {
   return createError(Msg);
 }
@@ -299,6 +310,7 @@ class ELFFile {
   template <typename T>
   Expected<ArrayRef<T>> getSectionContentsAsArray(const Elf_Shdr *Sec) const;
   Expected<ArrayRef<uint8_t>> getSectionContents(const Elf_Shdr *Sec) const;
+  Expected<ArrayRef<uint8_t>> getSegmentContents(const Elf_Phdr *Phdr) const;
 };
 
 using ELF32LEFile = ELFFile<ELF32LE>;
@@ -422,6 +434,26 @@ ELFFile<ELFT>::getSectionContentsAsArray(const Elf_Shdr *Sec) const {
   return makeArrayRef(Start, Size / sizeof(T));
 }
 
+template <class ELFT>
+Expected<ArrayRef<uint8_t>>
+ELFFile<ELFT>::getSegmentContents(const Elf_Phdr *Phdr) const {
+  uintX_t Offset = Phdr->p_offset;
+  uintX_t Size = Phdr->p_filesz;
+
+  if (std::numeric_limits<uintX_t>::max() - Offset < Size)
+    return createError("program header " + getPhdrIndexForError(this, Phdr) +
+                       " has a p_offset (0x" + Twine::utohexstr(Offset) +
+                       ") + p_filesz (0x" + Twine::utohexstr(Size) +
+                       ") that cannot be represented");
+  if (Offset + Size > Buf.size())
+    return createError("program header  " + getPhdrIndexForError(this, Phdr) +
+                       " has a p_offset (0x" + Twine::utohexstr(Offset) +
+                       ") + p_filesz (0x" + Twine::utohexstr(Size) +
+                       ") that is greater than the file size (0x" +
+                       Twine::utohexstr(Buf.size()) + ")");
+  return makeArrayRef(base() + Offset, Size);
+}
+
 template <class ELFT>
 Expected<ArrayRef<uint8_t>>
 ELFFile<ELFT>::getSectionContents(const Elf_Shdr *Sec) const {
diff --git a/llvm/test/tools/llvm-readobj/ELF/unwind.test b/llvm/test/tools/llvm-readobj/ELF/unwind.test
index 466c6a6a75178..2fe673d806e2e 100644
--- a/llvm/test/tools/llvm-readobj/ELF/unwind.test
+++ b/llvm/test/tools/llvm-readobj/ELF/unwind.test
@@ -262,3 +262,64 @@ Sections:
     Type:    SHT_PROGBITS
 ## Length is set to 0xFF, though the actual section length is 4.
     Content: "FF000000"
+
+## Check we report an error when we can't read the content of the PT_GNU_EH_FRAME segment.
+
+## Case A: test we report an error when the p_offset of the PT_GNU_EH_FRAME
+##         is invalid (goes past the end of the file).
+
+# RUN: yaml2obj --docnum=4 %s -o %t4 -DOFFSET=0xffff0000 -DSIZE=0x1 -DBITS=32
+# RUN: not llvm-readobj --unwind %t4 2>&1 \
+# RUN:   | FileCheck %s -DFILE=%t4 --check-prefix=BROKEN-CONTENT -DOFFSET=0xffff0000 -DSIZE=0x1
+
+# RUN: yaml2obj --docnum=4 %s -o %t5 -DOFFSET=0x1 -DSIZE=0xffff0000 -DBITS=32
+# RUN: not llvm-readobj --unwind %t5 2>&1 \
+# RUN:   | FileCheck %s -DFILE=%t5 --check-prefix=BROKEN-CONTENT -DOFFSET=0x1 -DSIZE=0xffff0000
+
+# BROKEN-CONTENT:      EHFrameHeader {
+# BROKEN-CONTENT-NEXT:   Address: 0x0
+# BROKEN-CONTENT-NEXT:   Offset:  [[OFFSET]]
+# BROKEN-CONTENT-NEXT:   Size:    [[SIZE]]
+# BROKEN-CONTENT-NEXT:   Corresponding Section:
+# BROKEN-CONTENT-NEXT: error: '[[FILE]]': program header  [index 0] has a p_offset ([[OFFSET]]) + p_filesz ([[SIZE]]) that is greater than the file size (0xe0)
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS[[BITS]]
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_NONE
+ProgramHeaders:
+  - Type: PT_GNU_EH_FRAME
+    MemSize:  [[SIZE]]
+    FileSize: [[SIZE]]
+    Offset:   [[OFFSET]]
+    Sections: []
+
+## Case B: test we report an error when the file size of the PT_GNU_EH_FRAME
+##         is invalid (goes past the end of the file).
+# RUN: yaml2obj --docnum=4 %s -o %t6 -DOFFSET=0x100 -DSIZE=0xffff0000 -DBITS=32
+# RUN: not llvm-readobj --unwind %t6 2>&1 \
+# RUN:   | FileCheck %s -DFILE=%t6 --check-prefix=BROKEN-CONTENT -DOFFSET=0x100 -DSIZE=0xffff0000
+
+## Case C: test we report an error when the offset + the file size of the PT_GNU_EH_FRAME is so large a
+##         value that it overflows the platform address size type.
+
+# RUN: yaml2obj --docnum=4 %s -o %t7 -DOFFSET=0x1 -DSIZE=0xffffffff -DBITS=32
+# RUN: not llvm-readobj --unwind %t7 2>&1 | FileCheck %s -DFILE=%t7 --check-prefix=BROKEN-CONTENT2 -DOFFSET=0x1 -DSIZE=0xffffffff
+
+# RUN: yaml2obj --docnum=4 %s -o %t8 -DOFFSET=0xffffffff -DSIZE=0x1 -DBITS=32
+# RUN: not llvm-readobj --unwind %t8 2>&1 | FileCheck %s -DFILE=%t8 --check-prefix=BROKEN-CONTENT2 -DOFFSET=0xffffffff -DSIZE=0x1
+
+# RUN: yaml2obj --docnum=4 %s -o %t9 -DOFFSET=0x1 -DSIZE=0xffffffffffffffff -DBITS=64
+# RUN: not llvm-readelf --unwind %t9 2>&1 | FileCheck %s -DFILE=%t9 --check-prefix=BROKEN-CONTENT2 -DOFFSET=0x1 -DSIZE=0xffffffffffffffff
+
+# RUN: yaml2obj --docnum=4 %s -o %t10 -DOFFSET=0xffffffffffffffff -DSIZE=0x1 -DBITS=64
+# RUN: not llvm-readelf --unwind %t10 2>&1 | FileCheck %s -DFILE=%t10 --check-prefix=BROKEN-CONTENT2 -DOFFSET=0xffffffffffffffff -DSIZE=0x1
+
+# BROKEN-CONTENT2:      EHFrameHeader {
+# BROKEN-CONTENT2-NEXT:   Address: 0x0
+# BROKEN-CONTENT2-NEXT:   Offset: [[OFFSET]]
+# BROKEN-CONTENT2-NEXT:   Size:   [[SIZE]]
+# BROKEN-CONTENT2-NEXT:   Corresponding Section:
+# BROKEN-CONTENT2-NEXT: error: '[[FILE]]': program header [index 0] has a p_offset ([[OFFSET]]) + p_filesz ([[SIZE]]) that cannot be represented
diff --git a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
index 00a0b691b76b7..01800aba7cbe1 100644
--- a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
+++ b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -33,8 +33,7 @@ class PrinterContext {
   ScopedPrinter &W;
   const object::ELFObjectFile<ELFT> *ObjF;
 
-  void printEHFrameHdr(uint64_t Offset, uint64_t Address, uint64_t Size) const;
-
+  void printEHFrameHdr(const typename ELFT::Phdr *EHFramePHdr) const;
   void printEHFrame(const typename ELFT::Shdr *EHFrameShdr) const;
 
 public:
@@ -60,7 +59,6 @@ findSectionByAddress(const object::ELFObjectFile<ELFT> *ObjF, uint64_t Addr) {
 template <typename ELFT>
 void PrinterContext<ELFT>::printUnwindInformation() const {
   const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
-  const typename ELFT::Phdr *EHFramePhdr = nullptr;
 
   auto PHs = Obj->program_headers();
   if (Error E = PHs.takeError())
@@ -68,19 +66,15 @@ void PrinterContext<ELFT>::printUnwindInformation() const {
 
   for (const auto &Phdr : *PHs) {
     if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) {
-      EHFramePhdr = &Phdr;
       if (Phdr.p_memsz != Phdr.p_filesz)
         reportError(object::createError(
                         "p_memsz does not match p_filesz for GNU_EH_FRAME"),
                     ObjF->getFileName());
+      printEHFrameHdr(&Phdr);
       break;
     }
   }
 
-  if (EHFramePhdr)
-    printEHFrameHdr(EHFramePhdr->p_offset, EHFramePhdr->p_vaddr,
-                    EHFramePhdr->p_memsz);
-
   auto Sections = Obj->sections();
   if (Error E = Sections.takeError())
     reportError(std::move(E), ObjF->getFileName());
@@ -96,16 +90,16 @@ void PrinterContext<ELFT>::printUnwindInformation() const {
 }
 
 template <typename ELFT>
-void PrinterContext<ELFT>::printEHFrameHdr(uint64_t EHFrameHdrOffset,
-                                           uint64_t EHFrameHdrAddress,
-                                           uint64_t EHFrameHdrSize) const {
+void PrinterContext<ELFT>::printEHFrameHdr(const typename ELFT::Phdr *EHFramePHdr) const {
   DictScope L(W, "EHFrameHeader");
+  uint64_t EHFrameHdrAddress = EHFramePHdr->p_vaddr;
   W.startLine() << format("Address: 0x%" PRIx64 "\n", EHFrameHdrAddress);
-  W.startLine() << format("Offset: 0x%" PRIx64 "\n", EHFrameHdrOffset);
-  W.startLine() << format("Size: 0x%" PRIx64 "\n", EHFrameHdrSize);
+  W.startLine() << format("Offset: 0x%" PRIx64 "\n", (uint64_t)EHFramePHdr->p_offset);
+  W.startLine() << format("Size: 0x%" PRIx64 "\n", (uint64_t)EHFramePHdr->p_memsz);
 
   const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
-  const auto *EHFrameHdrShdr = findSectionByAddress(ObjF, EHFrameHdrAddress);
+  const typename ELFT::Shdr *EHFrameHdrShdr =
+      findSectionByAddress(ObjF, EHFramePHdr->p_vaddr);
   if (EHFrameHdrShdr) {
     auto SectionName = Obj->getSectionName(EHFrameHdrShdr);
     if (Error E = SectionName.takeError())
@@ -114,7 +108,11 @@ void PrinterContext<ELFT>::printEHFrameHdr(uint64_t EHFrameHdrOffset,
     W.printString("Corresponding Section", *SectionName);
   }
 
-  DataExtractor DE(makeArrayRef(Obj->base() + EHFrameHdrOffset, EHFrameHdrSize),
+  Expected<ArrayRef<uint8_t>> Content = Obj->getSegmentContents(EHFramePHdr);
+  if (!Content)
+    reportError(Content.takeError(), ObjF->getFileName());
+
+  DataExtractor DE(*Content,
                    ELFT::TargetEndianness == support::endianness::little,
                    ELFT::Is64Bits ? 8 : 4);
 
@@ -154,7 +152,7 @@ void PrinterContext<ELFT>::printEHFrameHdr(uint64_t EHFrameHdrOffset,
 
   unsigned NumEntries = 0;
   uint64_t PrevPC = 0;
-  while (Offset + 8 <= EHFrameHdrSize && NumEntries < FDECount) {
+  while (Offset + 8 <= EHFramePHdr->p_memsz && NumEntries < FDECount) {
     DictScope D(W, std::string("entry ")  + std::to_string(NumEntries));
 
     auto InitialPC = DE.getSigned(&Offset, 4) + EHFrameHdrAddress;

From 5ee902bb5f3a843230f45dcd7b8101de71da7c83 Mon Sep 17 00:00:00 2001
From: Jinsong Ji <jji@us.ibm.com>
Date: Wed, 27 May 2020 03:39:59 +0000
Subject: [PATCH 225/770] [compiler-rt][asan] Add noinline to use-after-scope
 testcases

Some testcases are unexpectedly passing with NPM.
This is because the target functions are inlined in NPM.

I think we should add noinline attribute to keep these test points.

Reviewed By: vitalybuka

Differential Revision: https://reviews.llvm.org/D79648
---
 compiler-rt/test/asan/TestCases/use-after-scope-dtor-order.cpp | 2 +-
 compiler-rt/test/asan/TestCases/use-after-scope-temp.cpp       | 2 +-
 compiler-rt/test/asan/TestCases/use-after-scope-temp2.cpp      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/asan/TestCases/use-after-scope-dtor-order.cpp b/compiler-rt/test/asan/TestCases/use-after-scope-dtor-order.cpp
index 8d4f772c0eabc..43c17106010b7 100644
--- a/compiler-rt/test/asan/TestCases/use-after-scope-dtor-order.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-scope-dtor-order.cpp
@@ -5,7 +5,7 @@
 
 struct IntHolder {
   explicit IntHolder(int *val = 0) : val_(val) { }
-  ~IntHolder() {
+  __attribute__((noinline)) ~IntHolder() {
     printf("Value: %d\n", *val_);  // BOOM
     // CHECK: ERROR: AddressSanitizer: stack-use-after-scope
     // CHECK:  #0 0x{{.*}} in IntHolder::~IntHolder{{.*}}.cpp:[[@LINE-2]]
diff --git a/compiler-rt/test/asan/TestCases/use-after-scope-temp.cpp b/compiler-rt/test/asan/TestCases/use-after-scope-temp.cpp
index b97f312b4abab..2cfc7cee215a9 100644
--- a/compiler-rt/test/asan/TestCases/use-after-scope-temp.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-scope-temp.cpp
@@ -8,7 +8,7 @@ struct IntHolder {
 
 const IntHolder *saved;
 
-void save(const IntHolder &holder) {
+__attribute__((noinline)) void save(const IntHolder &holder) {
   saved = &holder;
 }
 
diff --git a/compiler-rt/test/asan/TestCases/use-after-scope-temp2.cpp b/compiler-rt/test/asan/TestCases/use-after-scope-temp2.cpp
index 99e4f2505ff3b..3e6f52a3942ee 100644
--- a/compiler-rt/test/asan/TestCases/use-after-scope-temp2.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-scope-temp2.cpp
@@ -3,7 +3,7 @@
 
 
 struct IntHolder {
-  const IntHolder& Self() const {
+  __attribute__((noinline)) const IntHolder &Self() const {
     return *this;
   }
   int val = 3;

From cadb7ccf2cebcaa2d546db77223bde3d69a162af Mon Sep 17 00:00:00 2001
From: Alex Zinenko <zinenko@google.com>
Date: Mon, 25 May 2020 18:55:41 +0200
Subject: [PATCH 226/770] [mlir] SCF: provide function_ref builders for IfOp

Now that OpBuilder is available in `build` functions, it becomes possible to
populate the "then" and "else" regions directly when building the "if"
operation. This is desirable in more structured forms of builders, especially
when conditionals are mixed with loops. Provide new `build` APIs taking
callbacks for body constructors, similarly to scf::ForOp, and replace more
clunky edsc::BlockBuilder uses with these. The original APIs remain available
and go through the new implementation.

Differential Revision: https://reviews.llvm.org/D80527
---
 mlir/include/mlir/Dialect/SCF/EDSC/Builders.h | 10 +++
 mlir/include/mlir/Dialect/SCF/SCF.h           |  2 +
 mlir/include/mlir/Dialect/SCF/SCFOps.td       | 13 +++-
 .../Conversion/VectorToSCF/VectorToSCF.cpp    | 70 +++++++++----------
 mlir/lib/Dialect/SCF/EDSC/Builders.cpp        | 48 +++++++++++++
 mlir/lib/Dialect/SCF/SCF.cpp                  | 44 +++++++++---
 6 files changed, 141 insertions(+), 46 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h b/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
index fa72bd623b259..607ea439d63a5 100644
--- a/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
+++ b/mlir/include/mlir/Dialect/SCF/EDSC/Builders.h
@@ -82,6 +82,16 @@ scf::ValueVector loopNestBuilder(
     Value lb, Value ub, Value step, ValueRange iterArgInitValues,
     function_ref<scf::ValueVector(Value, ValueRange)> fun = nullptr);
 
+/// Adapters for building if conditions using the builder and the location
+/// stored in ScopedContext. 'thenBody' is mandatory, 'elseBody' can be omitted
+/// if the condition should not have an 'else' part.
+ValueRange
+conditionBuilder(TypeRange results, Value condition,
+                 function_ref<scf::ValueVector()> thenBody,
+                 function_ref<scf::ValueVector()> elseBody = nullptr);
+ValueRange conditionBuilder(Value condition, function_ref<void()> thenBody,
+                            function_ref<void()> elseBody = nullptr);
+
 } // namespace edsc
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/SCF/SCF.h b/mlir/include/mlir/Dialect/SCF/SCF.h
index 570e8a31bce46..3974b58cbfbba 100644
--- a/mlir/include/mlir/Dialect/SCF/SCF.h
+++ b/mlir/include/mlir/Dialect/SCF/SCF.h
@@ -24,6 +24,8 @@
 namespace mlir {
 namespace scf {
 
+void buildTerminatedBody(OpBuilder &builder, Location loc);
+
 #include "mlir/Dialect/SCF/SCFOpsDialect.h.inc"
 
 #define GET_OP_CLASSES
diff --git a/mlir/include/mlir/Dialect/SCF/SCFOps.td b/mlir/include/mlir/Dialect/SCF/SCFOps.td
index 84f1645840033..a57d862d44ff7 100644
--- a/mlir/include/mlir/Dialect/SCF/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/SCFOps.td
@@ -238,7 +238,18 @@ def IfOp : SCF_Op<"if",
     OpBuilder<"OpBuilder &builder, OperationState &result, "
               "Value cond, bool withElseRegion">,
     OpBuilder<"OpBuilder &builder, OperationState &result, "
-              "TypeRange resultTypes, Value cond, bool withElseRegion">
+              "TypeRange resultTypes, Value cond, bool withElseRegion">,
+    OpBuilder<
+        "OpBuilder &builder, OperationState &result, TypeRange resultTypes, "
+        "Value cond, "
+        "function_ref<void(OpBuilder &, Location)> thenBuilder "
+        "    = buildTerminatedBody, "
+        "function_ref<void(OpBuilder &, Location)> elseBuilder = nullptr">,
+    OpBuilder<
+        "OpBuilder &builder, OperationState &result, Value cond, "
+        "function_ref<void(OpBuilder &, Location)> thenBuilder "
+        "    = buildTerminatedBody, "
+        "function_ref<void(OpBuilder &, Location)> elseBuilder = nullptr">
   ];
 
   let extraClassDeclaration = [{
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
index 9868a14c21651..8c72800819a5b 100644
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -235,39 +235,38 @@ LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
       SmallVector<Type, 1> resultType;
       if (options.unroll)
         resultType.push_back(vectorType);
-      auto ifOp = ScopedContext::getBuilderRef().create<scf::IfOp>(
-          ScopedContext::getLocation(), resultType, inBoundsCondition,
-          /*withElseRegion=*/true);
-
-      // 3.a. If in-bounds, progressively lower to a 1-D transfer read.
-      BlockBuilder(&ifOp.thenRegion().front(), Append())([&] {
-        Value vector = load1DVector(majorIvsPlusOffsets);
-        // 3.a.i. If `options.unroll` is true, insert the 1-D vector in the
-        // aggregate. We must yield and merge with the `else` branch.
-        if (options.unroll) {
-          vector = vector_insert(vector, result, majorIvs);
-          (loop_yield(vector));
-          return;
-        }
-        // 3.a.ii. Otherwise, just go through the temporary `alloc`.
-        std_store(vector, alloc, majorIvs);
-      });
-
-      // 3.b. If not in-bounds, splat a 1-D vector.
-      BlockBuilder(&ifOp.elseRegion().front(), Append())([&] {
-        Value vector = std_splat(minorVectorType, xferOp.padding());
-        // 3.a.i. If `options.unroll` is true, insert the 1-D vector in the
-        // aggregate. We must yield and merge with the `then` branch.
-        if (options.unroll) {
-          vector = vector_insert(vector, result, majorIvs);
-          (loop_yield(vector));
-          return;
-        }
-        // 3.b.ii. Otherwise, just go through the temporary `alloc`.
-        std_store(vector, alloc, majorIvs);
-      });
+
+      // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
+      // splat a 1-D vector.
+      ValueRange ifResults = conditionBuilder(
+          resultType, inBoundsCondition,
+          [&]() -> scf::ValueVector {
+            Value vector = load1DVector(majorIvsPlusOffsets);
+            // 3.a. If `options.unroll` is true, insert the 1-D vector in the
+            // aggregate. We must yield and merge with the `else` branch.
+            if (options.unroll) {
+              vector = vector_insert(vector, result, majorIvs);
+              return {vector};
+            }
+            // 3.b. Otherwise, just go through the temporary `alloc`.
+            std_store(vector, alloc, majorIvs);
+            return {};
+          },
+          [&]() -> scf::ValueVector {
+            Value vector = std_splat(minorVectorType, xferOp.padding());
+            // 3.c. If `options.unroll` is true, insert the 1-D vector in the
+            // aggregate. We must yield and merge with the `then` branch.
+            if (options.unroll) {
+              vector = vector_insert(vector, result, majorIvs);
+              return {vector};
+            }
+            // 3.d. Otherwise, just go through the temporary `alloc`.
+            std_store(vector, alloc, majorIvs);
+            return {};
+          });
+
       if (!resultType.empty())
-        result = *ifOp.results().begin();
+        result = *ifResults.begin();
     } else {
       // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read.
       Value loaded1D = load1DVector(majorIvsPlusOffsets);
@@ -336,11 +335,8 @@ LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
     if (inBoundsCondition) {
       // 2.a. If the condition is not null, we need an IfOp, to write
       // conditionally. Progressively lower to a 1-D transfer write.
-      auto ifOp = ScopedContext::getBuilderRef().create<scf::IfOp>(
-          ScopedContext::getLocation(), TypeRange{}, inBoundsCondition,
-          /*withElseRegion=*/false);
-      BlockBuilder(&ifOp.thenRegion().front(),
-                   Append())([&] { emitTransferWrite(majorIvsPlusOffsets); });
+      conditionBuilder(inBoundsCondition,
+                       [&] { emitTransferWrite(majorIvsPlusOffsets); });
     } else {
       // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write.
       emitTransferWrite(majorIvsPlusOffsets);
diff --git a/mlir/lib/Dialect/SCF/EDSC/Builders.cpp b/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
index 4ce701c1d7f9a..090c72fcd91f0 100644
--- a/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
+++ b/mlir/lib/Dialect/SCF/EDSC/Builders.cpp
@@ -159,3 +159,51 @@ mlir::scf::ValueVector mlir::edsc::loopNestBuilder(
                                 iterArgInitValues.end());
       });
 }
+
+static std::function<void(OpBuilder &, Location)>
+wrapIfBody(function_ref<scf::ValueVector()> body, TypeRange expectedTypes) {
+  (void)expectedTypes;
+  return [=](OpBuilder &builder, Location loc) {
+    ScopedContext context(builder, loc);
+    scf::ValueVector returned = body();
+    assert(ValueRange(returned).getTypes() == expectedTypes &&
+           "'if' body builder returned values of unexpected type");
+    builder.create<scf::YieldOp>(loc, returned);
+  };
+}
+
+ValueRange
+mlir::edsc::conditionBuilder(TypeRange results, Value condition,
+                             function_ref<scf::ValueVector()> thenBody,
+                             function_ref<scf::ValueVector()> elseBody) {
+  assert(ScopedContext::getContext() && "EDSC ScopedContext not set up");
+  assert(thenBody && "thenBody is mandatory");
+
+  auto ifOp = ScopedContext::getBuilderRef().create<scf::IfOp>(
+      ScopedContext::getLocation(), results, condition,
+      wrapIfBody(thenBody, results), wrapIfBody(elseBody, results));
+  return ifOp.getResults();
+}
+
+static std::function<void(OpBuilder &, Location)>
+wrapZeroResultIfBody(function_ref<void()> body) {
+  return [=](OpBuilder &builder, Location loc) {
+    ScopedContext context(builder, loc);
+    body();
+    builder.create<scf::YieldOp>(loc);
+  };
+}
+
+ValueRange mlir::edsc::conditionBuilder(Value condition,
+                                        function_ref<void()> thenBody,
+                                        function_ref<void()> elseBody) {
+  assert(ScopedContext::getContext() && "EDSC ScopedContext not set up");
+  assert(thenBody && "thenBody is mandatory");
+
+  ScopedContext::getBuilderRef().create<scf::IfOp>(
+      ScopedContext::getLocation(), condition, wrapZeroResultIfBody(thenBody),
+      elseBody ? llvm::function_ref<void(OpBuilder &, Location)>(
+                     wrapZeroResultIfBody(elseBody))
+               : llvm::function_ref<void(OpBuilder &, Location)>(nullptr));
+  return {};
+}
diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp
index edcb5aafbe4e5..e7c890c17841f 100644
--- a/mlir/lib/Dialect/SCF/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/SCF.cpp
@@ -35,6 +35,11 @@ SCFDialect::SCFDialect(MLIRContext *context)
       >();
 }
 
+/// Default callback for IfOp builders. Inserts a yield without arguments.
+void mlir::scf::buildTerminatedBody(OpBuilder &builder, Location loc) {
+  builder.create<scf::YieldOp>(loc);
+}
+
 //===----------------------------------------------------------------------===//
 // ForOp
 //===----------------------------------------------------------------------===//
@@ -338,20 +343,43 @@ void IfOp::build(OpBuilder &builder, OperationState &result, Value cond,
 
 void IfOp::build(OpBuilder &builder, OperationState &result,
                  TypeRange resultTypes, Value cond, bool withElseRegion) {
+  auto addTerminator = [&](OpBuilder &nested, Location loc) {
+    if (resultTypes.empty())
+      IfOp::ensureTerminator(*nested.getInsertionBlock()->getParent(), nested,
+                             loc);
+  };
+
+  build(builder, result, resultTypes, cond, addTerminator,
+        withElseRegion ? addTerminator
+                       : function_ref<void(OpBuilder &, Location)>());
+}
+
+void IfOp::build(OpBuilder &builder, OperationState &result,
+                 TypeRange resultTypes, Value cond,
+                 function_ref<void(OpBuilder &, Location)> thenBuilder,
+                 function_ref<void(OpBuilder &, Location)> elseBuilder) {
+  assert(thenBuilder && "the builder callback for 'then' must be present");
+
   result.addOperands(cond);
   result.addTypes(resultTypes);
 
+  OpBuilder::InsertionGuard guard(builder);
   Region *thenRegion = result.addRegion();
-  thenRegion->push_back(new Block());
-  if (resultTypes.empty())
-    IfOp::ensureTerminator(*thenRegion, builder, result.location);
+  builder.createBlock(thenRegion);
+  thenBuilder(builder, result.location);
 
   Region *elseRegion = result.addRegion();
-  if (withElseRegion) {
-    elseRegion->push_back(new Block());
-    if (resultTypes.empty())
-      IfOp::ensureTerminator(*elseRegion, builder, result.location);
-  }
+  if (!elseBuilder)
+    return;
+
+  builder.createBlock(elseRegion);
+  elseBuilder(builder, result.location);
+}
+
+void IfOp::build(OpBuilder &builder, OperationState &result, Value cond,
+                 function_ref<void(OpBuilder &, Location)> thenBuilder,
+                 function_ref<void(OpBuilder &, Location)> elseBuilder) {
+  build(builder, result, TypeRange(), cond, thenBuilder, elseBuilder);
 }
 
 static LogicalResult verify(IfOp op) {

From 42eba9b40b25cceeb3e6d432047c5ef99d4a7b50 Mon Sep 17 00:00:00 2001
From: Ties Stuij <ties.stuij@arm.com>
Date: Wed, 27 May 2020 14:59:54 +0100
Subject: [PATCH 227/770] [AArch64][BFloat] basic AArch64 bfloat support

Summary:
This patch adds the bfloat type to the AArch64 backend:
- adds it as part of the FPR16 register class
- adds bfloat calling conventions
- as f16 is no longer the only FPR16 type, we need to constrain a number of
  instruction patterns using FPR16Op to help out the TableGen type inferrer

This patch is part of a series implementing the Bfloat16 extension of the
Armv8.6-a architecture, as detailed here:

https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

The bfloat type and its properties are specified in the Arm Architecture
Reference Manual:

https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile

Reviewers: t.p.northover, c-rhodes, fpetrogalli, sdesmalen, ostannard, LukeGeeson, ab

Reviewed By: fpetrogalli

Subscribers: pbarrio, LukeGeeson, kristof.beyls, hiraditya, danielkiss, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79709
---
 .../AArch64/AArch64CallingConvention.td       | 59 +++++++++++--------
 .../Target/AArch64/AArch64ISelLowering.cpp    |  8 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td | 38 ++++++------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  2 +-
 .../lib/Target/AArch64/AArch64RegisterInfo.td | 16 +++--
 5 files changed, 71 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 6eb9ba4864621..eed87946dab9e 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -33,9 +33,9 @@ def CC_AArch64_AAPCS : CallingConv<[
 
   // Big endian vectors must be passed as if they were 1-element vectors so that
   // their lanes are in a consistent order.
-  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8],
                          CCBitConvertToType<f64>>>,
-  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8],
                          CCBitConvertToType<f128>>>,
 
   // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter.
@@ -75,10 +75,10 @@ def CC_AArch64_AAPCS : CallingConv<[
   CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
 
   CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
-            nxv2f32, nxv4f32, nxv2f64],
+            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
   CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
-            nxv2f32, nxv4f32, nxv2f64],
+            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCPassIndirect<i64>>,
 
   CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -102,22 +102,24 @@ def CC_AArch64_AAPCS : CallingConv<[
                                           [W0, W1, W2, W3, W4, W5, W6, W7]>>,
   CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
-  CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
+  CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>,
   CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
 
@@ -132,9 +134,9 @@ def RetCC_AArch64_AAPCS : CallingConv<[
 
   // Big endian vectors must be passed as if they were 1-element vectors so that
   // their lanes are in a consistent order.
-  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8],
                          CCBitConvertToType<f64>>>,
-  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8],
                          CCBitConvertToType<f128>>>,
 
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -144,18 +146,20 @@ def RetCC_AArch64_AAPCS : CallingConv<[
                                           [W0, W1, W2, W3, W4, W5, W6, W7]>>,
   CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
       CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                               [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
       CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
-            nxv2f32, nxv4f32, nxv2f64],
+            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
 
   CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -165,7 +169,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
 // Vararg functions on windows pass floats in integer registers
 let Entry = 1 in
 def CC_AArch64_Win64_VarArg : CallingConv<[
-  CCIfType<[f16, f32], CCPromoteToType<f64>>,
+  CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
   CCIfType<[f64], CCBitConvertToType<i64>>,
   CCDelegateTo<CC_AArch64_AAPCS>
 ]>;
@@ -219,19 +223,22 @@ def CC_AArch64_DarwinPCS : CallingConv<[
                                           [W0, W1, W2, W3, W4, W5, W6, W7]>>,
   CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
   CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
-  CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
+  CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16",
+  CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
 
   // Re-demote pointers to 32-bits so we don't end up storing 64-bit
@@ -239,9 +246,9 @@ def CC_AArch64_DarwinPCS : CallingConv<[
   CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
   CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
 
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
 
@@ -255,14 +262,14 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
 
   // Handle all scalar types as either i64 or f64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
-  CCIfType<[f16, f32],     CCPromoteToType<f64>>,
+  CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
 
   // Everything is on the stack.
   // i128 is split to two i64s, and its stack alignment is 16 bytes.
   CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
-  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
 
@@ -275,16 +282,16 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
 
   // Handle all scalar types as either i32 or f32.
   CCIfType<[i8, i16], CCPromoteToType<i32>>,
-  CCIfType<[f16],     CCPromoteToType<f32>>,
+  CCIfType<[f16, bf16], CCPromoteToType<f32>>,
 
   // Everything is on the stack.
   // i128 is split to two i64s, and its stack alignment is 16 bytes.
   CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
   CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
-  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
            CCAssignToStack<16, 16>>
 ]>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5eb9b7463411f..187f133669e66 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -132,6 +132,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   if (Subtarget->hasFPARMv8()) {
     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
@@ -148,6 +149,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     addDRTypeForNEON(MVT::v1i64);
     addDRTypeForNEON(MVT::v1f64);
     addDRTypeForNEON(MVT::v4f16);
+    addDRTypeForNEON(MVT::v4bf16);
 
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
@@ -156,6 +158,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
     addQRTypeForNEON(MVT::v8f16);
+    addQRTypeForNEON(MVT::v8bf16);
   }
 
   if (Subtarget->hasSVE()) {
@@ -174,6 +177,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
+    addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
+    addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
+    addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
     addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
@@ -3578,6 +3584,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
         RC = &AArch64::GPR64RegClass;
       else if (RegVT == MVT::f16)
         RC = &AArch64::FPR16RegClass;
+      else if (RegVT == MVT::bf16)
+        RC = &AArch64::FPR16RegClass;
       else if (RegVT == MVT::f32)
         RC = &AArch64::FPR32RegClass;
       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index a06394a2898d9..713bf0bf3cade 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -4447,14 +4447,14 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
            SDPatternOperator OpN> {
   // Unscaled half-precision to 32-bit
   def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
-                                     [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+                                     [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> {
     let Inst{31} = 0; // 32-bit GPR flag
     let Predicates = [HasFullFP16];
   }
 
   // Unscaled half-precision to 64-bit
   def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
-                                     [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+                                     [(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> {
     let Inst{31} = 1; // 64-bit GPR flag
     let Predicates = [HasFullFP16];
   }
@@ -4489,7 +4489,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
   // Scaled half-precision to 32-bit
   def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
                               fixedpoint_f16_i32, asm,
-              [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+              [(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn),
                                           fixedpoint_f16_i32:$scale)))]> {
     let Inst{31} = 0; // 32-bit GPR flag
     let scale{5} = 1;
@@ -4499,7 +4499,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
   // Scaled half-precision to 64-bit
   def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
                               fixedpoint_f16_i64, asm,
-              [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+              [(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn),
                                           fixedpoint_f16_i64:$scale)))]> {
     let Inst{31} = 1; // 64-bit GPR flag
     let Predicates = [HasFullFP16];
@@ -4615,7 +4615,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
 
   // Scaled
   def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
-                             [(set FPR16:$Rd,
+                             [(set (f16 FPR16:$Rd),
                                    (fdiv (node GPR32:$Rn),
                                          fixedpoint_f16_i32:$scale))]> {
     let Inst{31} = 0; // 32-bit GPR flag
@@ -4643,7 +4643,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
   }
 
   def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
-                             [(set FPR16:$Rd,
+                             [(set (f16 FPR16:$Rd),
                                    (fdiv (node GPR64:$Rn),
                                          fixedpoint_f16_i64:$scale))]> {
     let Inst{31} = 1; // 64-bit GPR flag
@@ -4816,7 +4816,7 @@ class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
 multiclass FPConversion<string asm> {
   // Double-precision to Half-precision
   def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
-                             [(set FPR16:$Rd, (any_fpround FPR64:$Rn))]>;
+                             [(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>;
 
   // Double-precision to Single-precision
   def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
@@ -4824,11 +4824,11 @@ multiclass FPConversion<string asm> {
 
   // Half-precision to Double-precision
   def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
-                             [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>;
+                             [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
 
   // Half-precision to Single-precision
   def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
-                             [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>;
+                             [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
 
   // Single-precision to Double-precision
   def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
@@ -4836,7 +4836,7 @@ multiclass FPConversion<string asm> {
 
   // Single-precision to Half-precision
   def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
-                             [(set FPR16:$Rd, (any_fpround FPR32:$Rn))]>;
+                             [(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>;
 }
 
 //---
@@ -4938,7 +4938,7 @@ multiclass TwoOperandFPData<bits<4> opcode, string asm,
 
 multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
   def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
-                  [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> {
+                  [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> {
     let Inst{23-22} = 0b11; // 16-bit size flag
     let Predicates = [HasFullFP16];
   }
@@ -4980,7 +4980,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub,
 multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
                               SDPatternOperator node> {
   def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
-            [(set FPR16:$Rd,
+            [(set (f16 FPR16:$Rd),
                   (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
     let Inst{23-22} = 0b11; // 16-bit size flag
     let Predicates = [HasFullFP16];
@@ -5042,7 +5042,7 @@ multiclass FPComparison<bit signalAllNans, string asm,
                         SDPatternOperator OpNode = null_frag> {
   let Defs = [NZCV] in {
   def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
-      [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> {
+      [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> {
     let Inst{23-22} = 0b11;
     let Predicates = [HasFullFP16];
   }
@@ -6742,7 +6742,7 @@ multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
       [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
     let Predicates = [HasNEON, HasFullFP16] in {
     def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
-      [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+      [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>;
     } // Predicates = [HasNEON, HasFullFP16]
   }
 
@@ -6949,7 +6949,7 @@ multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
                                 [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
   let Predicates = [HasNEON, HasFullFP16] in {
   def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
-                                [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+                                [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
   }
 }
 
@@ -7091,10 +7091,10 @@ multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
   let Predicates = [HasNEON, HasFullFP16] in {
   def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
                                    asm, ".4h",
-        [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+        [(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>;
   def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
                                    asm, ".8h",
-        [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+        [(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>;
   } // Predicates = [HasNEON, HasFullFP16]
   def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
                                    asm, ".4s",
@@ -8095,7 +8095,7 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
   def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
                            (AArch64dup (f16 FPR16Op_lo:$Rm)))),
             (!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR16Op_lo:$Rm, hsub), (i64 0))>;
+                (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
 
   def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
                            (AArch64duplane16 (v8f16 V128_lo:$Rm),
@@ -8105,7 +8105,7 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
   def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
                            (AArch64dup (f16 FPR16Op_lo:$Rm)))),
             (!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR16Op_lo:$Rm, hsub), (i64 0))>;
+                (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
 
   def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn),
                          (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 03ac81e2462b4..07bca441529ee 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2529,7 +2529,7 @@ defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
                     [(set FPR8Op:$Rt,
                           (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
 defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
-                    [(set FPR16Op:$Rt,
+                    [(set (f16 FPR16Op:$Rt),
                           (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
 defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
                     [(set (f32 FPR32Op:$Rt),
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 93b6aa0cdb7f2..bd05c56009a1d 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -422,18 +422,20 @@ def Q31   : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
 def FPR8  : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
   let Size = 8;
 }
-def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
   let Size = 16;
 }
+
 def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> {
   let Size = 16;
 }
 def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
 def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
-                                    v1i64, v4f16],
-                                    64, (sequence "D%u", 0, 31)>;
+                                      v1i64, v4f16, v4bf16],
+                                     64, (sequence "D%u", 0, 31)>;
 def FPR64_lo : RegisterClass<"AArch64",
-                             [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
+                             [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32,
+                              v1f64],
                              64, (trunc FPR64, 16)>;
 
 // We don't (yet) have an f128 legal type, so don't use that here. We
@@ -441,13 +443,14 @@ def FPR64_lo : RegisterClass<"AArch64",
 // that here.
 def FPR128 : RegisterClass<"AArch64",
                            [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
-                            v8f16],
+                            v8f16, v8bf16],
                            128, (sequence "Q%u", 0, 31)>;
 
 // The lower 16 vector registers.  Some instructions can only take registers
 // in this range.
 def FPR128_lo : RegisterClass<"AArch64",
-                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
+                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16,
+                               v8bf16],
                               128, (trunc FPR128, 16)>;
 
 // Pairs, triples, and quads of 64-bit vector registers.
@@ -876,6 +879,7 @@ def PPR3b64  : PPRRegOp<"d", PPRAsmOp3b64,  ElementSizeD, PPR_3b>;
 class ZPRClass<int lastreg> : RegisterClass<"AArch64",
                                             [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
                                              nxv2f16, nxv4f16, nxv8f16,
+                                             nxv2bf16, nxv4bf16, nxv8bf16,
                                              nxv2f32, nxv4f32,
                                              nxv2f64],
                                             128, (sequence "Z%u", 0, lastreg)> {

From 4408eeed0ff191304121c11168aa1db861cccb97 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Sat, 23 May 2020 17:14:08 +0200
Subject: [PATCH 228/770] tsan: fix false positives in AcquireGlobal

Add ThreadClock::global_acquire_, which is the last time another thread
has done a global acquire of this thread's clock.

It helps to avoid the problem described in:
https://github.com/golang/go/issues/39186
See test/tsan/java_finalizer2.cpp for a regression test.
Note the failure is _extremely_ hard to hit, so if you are trying
to reproduce it, you may want to run something like:
$ go get golang.org/x/tools/cmd/stress
$ stress -p=64 ./a.out

The crux of the problem is roughly as follows.
A number of O(1) optimizations in the clocks algorithm assume proper
transitive cumulative propagation of clock values. The AcquireGlobal
operation may produce an inconsistent non-linearizable view of
thread clocks. Namely, it may acquire a later value from a thread
with a higher ID, but fail to acquire an earlier value from a thread
with a lower ID. If a thread that executed AcquireGlobal then releases
to a sync clock, it will spoil the sync clock with the inconsistent
values. If another thread later releases to the sync clock, the optimized
algorithm may break.

The exact sequence of events that leads to the failure.
- thread 1 executes AcquireGlobal
- thread 1 acquires value 1 for thread 2
- thread 2 increments clock to 2
- thread 2 releases to sync object 1
- thread 3 at time 1
- thread 3 acquires from sync object 1
- thread 3 increments clock to 2
- thread 1 acquires value 2 for thread 3
- thread 1 releases to sync object 2
- sync object 2 clock has 1 for thread 2 and 2 for thread 3
- thread 3 releases to sync object 2
- thread 3 sees value 2 in the clock for itself
  and decides that it has already released to the clock
  and did not acquire anything from other threads after that
  (the last_acquire_ check in release operation)
- thread 3 does not update the value for thread 2 in the clock from 1 to 2
- thread 4 acquires from sync object 2
- thread 4 detects a false race with thread 2
  as it should have been synchronized with thread 2 up to time 2,
  but because of the broken clock it is now synchronized only up to time 1

The global_acquire_ value helps to prevent this scenario.
Namely, thread 3 will not trust any own clock values up to global_acquire_
for the purposes of the last_acquire_ optimization.

Reviewed-in: https://reviews.llvm.org/D80474
Reported-by: nvanbenschoten (Nathan VanBenschoten)
---
 compiler-rt/lib/tsan/rtl/tsan_clock.cpp     | 15 +++-
 compiler-rt/lib/tsan/rtl/tsan_clock.h       | 57 ++++++++++++++
 compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp |  4 +-
 compiler-rt/test/tsan/java_finalizer2.cpp   | 82 +++++++++++++++++++++
 4 files changed, 154 insertions(+), 4 deletions(-)
 create mode 100644 compiler-rt/test/tsan/java_finalizer2.cpp

diff --git a/compiler-rt/lib/tsan/rtl/tsan_clock.cpp b/compiler-rt/lib/tsan/rtl/tsan_clock.cpp
index acbcf804194aa..c91b29cb22b47 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_clock.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_clock.cpp
@@ -115,13 +115,14 @@ static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
 ThreadClock::ThreadClock(unsigned tid, unsigned reused)
     : tid_(tid)
     , reused_(reused + 1)  // 0 has special meaning
+    , last_acquire_()
+    , global_acquire_()
     , cached_idx_()
     , cached_size_()
     , cached_blocks_() {
   CHECK_LT(tid, kMaxTidInClock);
   CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
   nclk_ = tid_ + 1;
-  last_acquire_ = 0;
   internal_memset(clk_, 0, sizeof(clk_));
 }
 
@@ -247,7 +248,7 @@ void ThreadClock::release(ClockCache *c, SyncClock *dst) {
   // Check if we had not acquired anything from other threads
   // since the last release on dst. If so, we need to update
   // only dst->elem(tid_).
-  if (dst->elem(tid_).epoch > last_acquire_) {
+  if (!HasAcquiredAfterRelease(dst)) {
     UpdateCurrentThread(c, dst);
     if (dst->release_store_tid_ != tid_ ||
         dst->release_store_reused_ != reused_)
@@ -318,7 +319,7 @@ void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
 
   if (dst->release_store_tid_ == tid_ &&
       dst->release_store_reused_ == reused_ &&
-      dst->elem(tid_).epoch > last_acquire_) {
+      !HasAcquiredAfterRelease(dst)) {
     CPP_STAT_INC(StatClockStoreFast);
     UpdateCurrentThread(c, dst);
     return;
@@ -400,6 +401,14 @@ bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
   return true;
 }
 
+// Checks whether the current thread has acquired anything
+// from other clocks after releasing to dst (directly or indirectly).
+bool ThreadClock::HasAcquiredAfterRelease(const SyncClock *dst) const {
+  const u64 my_epoch = dst->elem(tid_).epoch;
+  return my_epoch <= last_acquire_ ||
+      my_epoch <= atomic_load_relaxed(&global_acquire_);
+}
+
 // Sets a single element in the vector clock.
 // This function is called only from weird places like AcquireGlobal.
 void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
diff --git a/compiler-rt/lib/tsan/rtl/tsan_clock.h b/compiler-rt/lib/tsan/rtl/tsan_clock.h
index c66431f54ee4c..736cdae06ba21 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_clock.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_clock.h
@@ -139,6 +139,7 @@ class ThreadClock {
   void acq_rel(ClockCache *c, SyncClock *dst);
   void ReleaseStore(ClockCache *c, SyncClock *dst);
   void ResetCached(ClockCache *c);
+  void NoteGlobalAcquire(u64 v);
 
   void DebugReset();
   void DebugDump(int(*printf)(const char *s, ...));
@@ -151,6 +152,53 @@ class ThreadClock {
   // Current thread time when it acquired something from other threads.
   u64 last_acquire_;
 
+  // Last time another thread has done a global acquire of this thread's clock.
+  // It helps to avoid problem described in:
+  // https://github.com/golang/go/issues/39186
+  // See test/tsan/java_finalizer2.cpp for a regression test.
+  // Note the failure is _extremely_ hard to hit, so if you are trying
+  // to reproduce it, you may want to run something like:
+  // $ go get golang.org/x/tools/cmd/stress
+  // $ stress -p=64 ./a.out
+  //
+  // The crux of the problem is roughly as follows.
+  // A number of O(1) optimizations in the clocks algorithm assume proper
+  // transitive cumulative propagation of clock values. The AcquireGlobal
+  // operation may produce an inconsistent non-linearizable view of
+  // thread clocks. Namely, it may acquire a later value from a thread
+  // with a higher ID, but fail to acquire an earlier value from a thread
+  // with a lower ID. If a thread that executed AcquireGlobal then releases
+  // to a sync clock, it will spoil the sync clock with the inconsistent
+  // values. If another thread later releases to the sync clock, the optimized
+  // algorithm may break.
+  //
+  // The exact sequence of events that leads to the failure.
+  // - thread 1 executes AcquireGlobal
+  // - thread 1 acquires value 1 for thread 2
+  // - thread 2 increments clock to 2
+  // - thread 2 releases to sync object 1
+  // - thread 3 at time 1
+  // - thread 3 acquires from sync object 1
+  // - thread 3 increments clock to 2
+  // - thread 1 acquires value 2 for thread 3
+  // - thread 1 releases to sync object 2
+  // - sync object 2 clock has 1 for thread 2 and 2 for thread 3
+  // - thread 3 releases to sync object 2
+  // - thread 3 sees value 2 in the clock for itself
+  //   and decides that it has already released to the clock
+  //   and did not acquire anything from other threads after that
+  //   (the last_acquire_ check in release operation)
+  // - thread 3 does not update the value for thread 2 in the clock from 1 to 2
+  // - thread 4 acquires from sync object 2
+  // - thread 4 detects a false race with thread 2
+  //   as it should have been synchronized with thread 2 up to time 2,
+  //   but because of the broken clock it is now synchronized only up to time 1
+  //
+  // The global_acquire_ value helps to prevent this scenario.
+  // Namely, thread 3 will not trust any own clock values up to global_acquire_
+  // for the purposes of the last_acquire_ optimization.
+  atomic_uint64_t global_acquire_;
+
   // Cached SyncClock (without dirty entries and release_store_tid_).
   // We reuse it for subsequent store-release operations without intervening
   // acquire operations. Since it is shared (and thus constant), clock value
@@ -165,6 +213,7 @@ class ThreadClock {
   u64 clk_[kMaxTidInClock];  // Fixed size vector clock.
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
+  bool HasAcquiredAfterRelease(const SyncClock *dst) const;
   void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
 };
 
@@ -186,6 +235,14 @@ ALWAYS_INLINE uptr ThreadClock::size() const {
   return nclk_;
 }
 
+ALWAYS_INLINE void ThreadClock::NoteGlobalAcquire(u64 v) {
+  // Here we rely on the fact that AcquireGlobal is protected by
+  // ThreadRegistryLock, thus only one thread at a time executes it
+  // and values passed to this function should not go backwards.
+  CHECK_LE(atomic_load_relaxed(&global_acquire_), v);
+  atomic_store_relaxed(&global_acquire_, v);
+}
+
 ALWAYS_INLINE SyncClock::Iter SyncClock::begin() {
   return Iter(this);
 }
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
index bca194f064b40..ebd0d72218188 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
@@ -415,8 +415,10 @@ static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) {
   ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
   ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
   u64 epoch = tctx->epoch1;
-  if (tctx->status == ThreadStatusRunning)
+  if (tctx->status == ThreadStatusRunning) {
     epoch = tctx->thr->fast_state.epoch();
+    tctx->thr->clock.NoteGlobalAcquire(epoch);
+  }
   thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch);
 }
 
diff --git a/compiler-rt/test/tsan/java_finalizer2.cpp b/compiler-rt/test/tsan/java_finalizer2.cpp
new file mode 100644
index 0000000000000..f2590f7c40b9d
--- /dev/null
+++ b/compiler-rt/test/tsan/java_finalizer2.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+// Regression test for https://github.com/golang/go/issues/39186
+#include "java.h"
+#include <string.h>
+
+struct Heap {
+  uint64_t data;
+  uint64_t ready;
+  uint64_t finalized;
+  uint64_t wg;
+  pthread_barrier_t barrier_finalizer;
+  pthread_barrier_t barrier_ballast;
+};
+
+void *Thread1(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  __tsan_java_finalize();
+  __atomic_fetch_add(&heap->wg, 1, __ATOMIC_RELEASE);
+  __atomic_store_n(&heap->finalized, 1, __ATOMIC_RELAXED);
+  return 0;
+}
+
+void *Thread2(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  heap->data = 1;
+  __atomic_store_n(&heap->ready, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void *Thread3(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  while (__atomic_load_n(&heap->ready, __ATOMIC_ACQUIRE) != 1)
+    pthread_yield();
+  while (__atomic_load_n(&heap->finalized, __ATOMIC_RELAXED) != 1)
+    pthread_yield();
+  __atomic_fetch_add(&heap->wg, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void *Ballast(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_ballast);
+  return 0;
+}
+
+int main() {
+  Heap* heap = (Heap*)calloc(sizeof(Heap), 1);
+  __tsan_java_init((jptr)heap, sizeof(*heap));
+  __tsan_java_alloc((jptr)heap, sizeof(*heap));
+  // Ballast threads merely make the bug a bit easier to trigger.
+  const int kBallastThreads = 100;
+  pthread_barrier_init(&heap->barrier_finalizer, 0, 4);
+  pthread_barrier_init(&heap->barrier_ballast, 0, kBallastThreads + 1);
+  pthread_t th[3];
+  pthread_create(&th[0], 0, Thread1, heap);
+  pthread_create(&th[1], 0, Thread2, heap);
+  pthread_t ballast[kBallastThreads];
+  for (int i = 0; i < kBallastThreads; i++)
+    pthread_create(&ballast[i], 0, Ballast, heap);
+  pthread_create(&th[2], 0, Thread3, heap);
+  pthread_barrier_wait(&heap->barrier_ballast);
+  for (int i = 0; i < kBallastThreads; i++)
+    pthread_join(ballast[i], 0);
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  while (__atomic_load_n(&heap->wg, __ATOMIC_ACQUIRE) != 2)
+    pthread_yield();
+  if (heap->data != 1)
+    exit(printf("no data\n"));
+  for (int i = 0; i < 3; i++)
+    pthread_join(th[i], 0);
+  pthread_barrier_destroy(&heap->barrier_ballast);
+  pthread_barrier_destroy(&heap->barrier_finalizer);
+  __tsan_java_free((jptr)heap, sizeof(*heap));
+  fprintf(stderr, "DONE\n");
+  return __tsan_java_fini();
+}
+
+// CHECK-NOT: WARNING: ThreadSanitizer: data race
+// CHECK: DONE

From 78bd0c0e5e8fbbfbb9f827bdd1f83f91ed3437fa Mon Sep 17 00:00:00 2001
From: Ties Stuij <ties.stuij@arm.com>
Date: Wed, 27 May 2020 15:27:47 +0100
Subject: [PATCH 229/770] [AArch64][BFloat] add BFloat instruction support for
 AArch64

Summary:
Add support for lowering various BFloat related SelDAG nodes:
- load/store (ldrh/strh)
- concat
- dup/duplane
- bitconvert/bitcast
- insert_subvector/insert_subreg

This patch is part of a series implementing the Bfloat16 extension of the
Armv8.6-a architecture, as detailed here:

https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

The bfloat type, and its properties are specified in the Arm Architecture
Reference Manual:

https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile

Reviewers: ab, t.p.northover, john.brawn, fpetrogalli, sdesmalen, LukeGeeson

Reviewed By: fpetrogalli

Subscribers: LukeGeeson, pbarrio, kristof.beyls, hiraditya, danielkiss, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79712
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |   2 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |  41 ++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 161 +++++++++++++
 .../CodeGen/AArch64/bf16-vector-bitcast.ll    | 218 ++++++++++++++++++
 .../CodeGen/AArch64/bf16-vector-shuffle.ll    | 165 +++++++++++++
 llvm/test/CodeGen/AArch64/bf16.ll             |  38 +++
 6 files changed, 608 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
 create mode 100644 llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
 create mode 100644 llvm/test/CodeGen/AArch64/bf16.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index c4f260145afff..f2c1727fd0676 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1321,6 +1321,8 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
     }
   } else if (VT == MVT::f16) {
     Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
+  } else if (VT == MVT::bf16) {
+    Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
   } else if (VT == MVT::f32) {
     Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
   } else if (VT == MVT::f64 || VT.is64BitVector()) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 187f133669e66..579905d748eac 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -622,6 +622,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::BITCAST, MVT::i16, Custom);
   setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
 
   // Indexed loads and stores are supported.
   for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -633,6 +634,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setIndexedLoadAction(im, MVT::f64, Legal);
     setIndexedLoadAction(im, MVT::f32, Legal);
     setIndexedLoadAction(im, MVT::f16, Legal);
+    setIndexedLoadAction(im, MVT::bf16, Legal);
     setIndexedStoreAction(im, MVT::i8, Legal);
     setIndexedStoreAction(im, MVT::i16, Legal);
     setIndexedStoreAction(im, MVT::i32, Legal);
@@ -640,6 +642,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setIndexedStoreAction(im, MVT::f64, Legal);
     setIndexedStoreAction(im, MVT::f32, Legal);
     setIndexedStoreAction(im, MVT::f16, Legal);
+    setIndexedStoreAction(im, MVT::bf16, Legal);
   }
 
   // Trap.
@@ -2818,7 +2821,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
 }
 
 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
-  if (Op.getValueType() != MVT::f16)
+  EVT OpVT = Op.getValueType();
+  if (OpVT != MVT::f16 && OpVT != MVT::bf16)
     return SDValue();
 
   assert(Op.getOperand(0).getValueType() == MVT::i16);
@@ -2827,7 +2831,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
   Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
   Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
   return SDValue(
-      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
                          DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
       0);
 }
@@ -3582,9 +3586,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
         RC = &AArch64::GPR32RegClass;
       else if (RegVT == MVT::i64)
         RC = &AArch64::GPR64RegClass;
-      else if (RegVT == MVT::f16)
-        RC = &AArch64::FPR16RegClass;
-      else if (RegVT == MVT::bf16)
+      else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
         RC = &AArch64::FPR16RegClass;
       else if (RegVT == MVT::f32)
         RC = &AArch64::FPR32RegClass;
@@ -5279,8 +5281,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
                        Cmp);
   }
 
-  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
-         LHS.getValueType() == MVT::f64);
+  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
+         LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
 
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
   // clean.  Some of them require two branches to implement.
@@ -7305,7 +7307,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
     // vrev <4 x i16> -> REV32
     if (VT.getVectorElementType() == MVT::i16 ||
-        VT.getVectorElementType() == MVT::f16)
+        VT.getVectorElementType() == MVT::f16 ||
+        VT.getVectorElementType() == MVT::bf16)
       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
     // vrev <4 x i8> -> REV16
     assert(VT.getVectorElementType() == MVT::i8);
@@ -7318,7 +7321,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
     unsigned Opcode;
     if (EltTy == MVT::i8)
       Opcode = AArch64ISD::DUPLANE8;
-    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
+    else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
       Opcode = AArch64ISD::DUPLANE16;
     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
       Opcode = AArch64ISD::DUPLANE32;
@@ -7425,7 +7428,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
 static unsigned getDUPLANEOp(EVT EltType) {
   if (EltType == MVT::i8)
     return AArch64ISD::DUPLANE8;
-  if (EltType == MVT::i16 || EltType == MVT::f16)
+  if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
     return AArch64ISD::DUPLANE16;
   if (EltType == MVT::i32 || EltType == MVT::f32)
     return AArch64ISD::DUPLANE32;
@@ -7661,6 +7664,7 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
     SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
     break;
   case MVT::f16:
+  case MVT::bf16:
   case MVT::f32:
   case MVT::f64:
     // Fine as is
@@ -8367,8 +8371,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
     if (VT.getVectorElementType().isFloatingPoint()) {
       SmallVector<SDValue, 8> Ops;
       EVT EltTy = VT.getVectorElementType();
-      assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
-              "Unsupported floating-point vector type");
+      assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
+               EltTy == MVT::f64) && "Unsupported floating-point vector type");
       LLVM_DEBUG(
           dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
                     "BITCASTS, and try again\n");
@@ -8487,11 +8491,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
-      VT == MVT::v8f16)
+      VT == MVT::v8f16 || VT == MVT::v8bf16)
     return Op;
 
   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+      VT != MVT::v4bf16)
     return SDValue();
 
   // For V64 types, we perform insertion by expanding the value
@@ -8521,11 +8526,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
-      VT == MVT::v8f16)
+      VT == MVT::v8f16 || VT == MVT::v8bf16)
     return Op;
 
   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+      VT != MVT::v4bf16)
     return SDValue();
 
   // For V64 types, we perform extraction by expanding the value
@@ -13690,7 +13696,8 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
   SDLoc DL(N);
   SDValue Op = N->getOperand(0);
 
-  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+  if (N->getValueType(0) != MVT::i16 ||
+      (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
     return;
 
   Op = SDValue(
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 07bca441529ee..713229261562f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2329,6 +2329,10 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
                  [(set (f128 FPR128Op:$Rt),
                        (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
 
+// bf16 load pattern
+def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+
 // For regular load, we do not have any alignment requirement.
 // Thus, it is safe to directly map the vector loads with interesting
 // addressing modes.
@@ -2974,6 +2978,11 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1,  "strb",
                                     (am_indexed8 GPR64sp:$Rn,
                                                  uimm12s1:$offset))]>;
 
+// bf16 store pattern
+def : Pat<(store (bf16 FPR16Op:$Rt),
+                 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+          (STRHui FPR16:$Rt, GPR64sp:$Rn, uimm12s2:$offset)>;
+
 let AddedComplexity = 10 in {
 
 // Match all store 64 bits width whose type is compatible with FPR64
@@ -4776,6 +4785,7 @@ multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
 defm : ExtPat<v8i8, v16i8, 8>;
 defm : ExtPat<v4i16, v8i16, 4>;
 defm : ExtPat<v4f16, v8f16, 4>;
+defm : ExtPat<v4bf16, v8bf16, 4>;
 defm : ExtPat<v2i32, v4i32, 2>;
 defm : ExtPat<v2f32, v4f32, 2>;
 defm : ExtPat<v1i64, v2i64, 1>;
@@ -4897,16 +4907,29 @@ def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
           (v4f16 (DUPv4i16lane
             (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
             (i64 0)))>;
+def : Pat<(v4bf16 (AArch64dup (bf16 FPR16:$Rn))),
+          (v4bf16 (DUPv4i16lane
+            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+            (i64 0)))>;
 def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
           (v8f16 (DUPv8i16lane
             (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
             (i64 0)))>;
+def : Pat<(v8bf16 (AArch64dup (bf16 FPR16:$Rn))),
+          (v8bf16 (DUPv8i16lane
+            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+            (i64 0)))>;
 
 def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
           (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
 def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
           (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
 
+def : Pat<(v4bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)),
+          (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)),
+          (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
+
 def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
           (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
 def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
@@ -5022,6 +5045,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
 def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
           (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
 
+def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+          (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+          (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
 def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
             (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
                                   (i32 FPR32:$Rn), ssub))>;
@@ -5038,6 +5066,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
 def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
           (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
 
+def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+          (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+          (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
 def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
           (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
 def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
@@ -5063,6 +5096,23 @@ def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
             (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
             (i64 0))>;
 
+def : Pat<(v4bf16 (vector_insert (v4bf16 V64:$Rn),
+            (bf16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi16lane
+              (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+              VectorIndexS:$imm,
+              (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+              (i64 0)),
+            dsub)>;
+
+def : Pat<(v8bf16 (vector_insert (v8bf16 V128:$Rn),
+            (bf16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+          (INSvi16lane
+            V128:$Rn, VectorIndexH:$imm,
+            (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+            (i64 0))>;
+
 def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
             (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
           (EXTRACT_SUBREG
@@ -5144,6 +5194,7 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
 }
 
 defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>;
 defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
 defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
 
@@ -5157,6 +5208,9 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
           (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
 def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
           (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+def : Pat<(vector_extract (v8bf16 V128:$Rn), 0),
+          (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+
 
 def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
           (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
@@ -5164,6 +5218,8 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
           (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
 def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
           (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx),
+          (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
 
 // All concat_vectors operations are canonicalised to act on i64 vectors for
 // AArch64. In the general case we need an instruction, which had just as well be
@@ -5179,6 +5235,7 @@ def : ConcatPat<v4i32, v2i32>;
 def : ConcatPat<v4f32, v2f32>;
 def : ConcatPat<v8i16, v4i16>;
 def : ConcatPat<v8f16, v4f16>;
+def : ConcatPat<v8bf16, v4bf16>;
 def : ConcatPat<v16i8, v8i8>;
 
 // If the high lanes are undef, though, we can just ignore them:
@@ -6620,6 +6677,7 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
 def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6627,12 +6685,14 @@ def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
 
 def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6640,6 +6700,7 @@ def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
 def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
 def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6656,6 +6717,7 @@ def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
@@ -6664,6 +6726,7 @@ def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6672,6 +6735,7 @@ def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6680,6 +6744,7 @@ def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
 def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6691,6 +6756,7 @@ def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
 def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
 
 def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -6699,6 +6765,7 @@ def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
 def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
 
 let Predicates = [IsLE] in {
@@ -6706,6 +6773,7 @@ def : Pat<(v8i8  (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 
 def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
@@ -6716,6 +6784,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
           (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
 def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
           (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
 def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
           (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
 def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
@@ -6730,6 +6800,8 @@ def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
                  (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
 def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
                  (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4bf16 (bitconvert GPR64:$Xn)),
+                  (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
 def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
                  (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
 
@@ -6741,6 +6813,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
           (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
 def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
           (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
 def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
           (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
 }
@@ -6770,6 +6844,7 @@ def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
@@ -6781,6 +6856,8 @@ def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))),
                              (v1i64 (REV64v8i8 FPR64:$src))>;
 def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
                              (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))),
+                             (v1i64 (REV64v4i16 FPR64:$src))>;
 def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
                              (v1i64 (REV64v2i32 FPR64:$src))>;
 }
@@ -6794,6 +6871,7 @@ def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), (v2i32 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
@@ -6808,6 +6886,8 @@ def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
                              (v2i32 (REV64v2i32 FPR64:$src))>;
 def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
                              (v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))),
+                             (v2i32 (REV32v4i16 FPR64:$src))>;
 }
 def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
 
@@ -6834,6 +6914,7 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
                              (v4i16 (REV64v4i16 FPR64:$src))>;
 }
 def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v4bf16 FPR64:$src))), (v4i16 FPR64:$src)>;
 
 let Predicates = [IsLE] in {
 def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
@@ -6842,6 +6923,13 @@ def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))), (v4f16 FPR64:$src)>;
 def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))), (v4f16 FPR64:$src)>;
 def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
 def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+
+def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v8i8  FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (f64   FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
@@ -6856,8 +6944,22 @@ def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
                              (v4f16 (REV32v4i16 FPR64:$src))>;
 def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
                              (v4f16 (REV64v4i16 FPR64:$src))>;
+
+def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))),
+                             (v4bf16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))),
+                             (v4bf16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v8i8  FPR64:$src))),
+                             (v4bf16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (f64   FPR64:$src))),
+                             (v4bf16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))),
+                             (v4bf16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))),
+                             (v4bf16 (REV64v4i16 FPR64:$src))>;
 }
 def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
 
 let Predicates = [IsLE] in {
 def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))), (v8i8  FPR64:$src)>;
@@ -6867,6 +6969,7 @@ def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v4f16 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4bf16 FPR64:$src))), (v8i8  FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))),
@@ -6883,6 +6986,8 @@ def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))),
                              (v8i8 (REV64v8i8 FPR64:$src))>;
 def : Pat<(v8i8  (bitconvert (v4f16 FPR64:$src))),
                              (v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v4bf16 FPR64:$src))),
+                             (v8i8 (REV16v8i8 FPR64:$src))>;
 }
 
 let Predicates = [IsLE] in {
@@ -6891,6 +6996,7 @@ def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))), (f64   FPR64:$src)>;
 def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))), (f64   FPR64:$src)>;
 def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))), (f64   FPR64:$src)>;
 def : Pat<(f64   (bitconvert (v4f16 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4bf16 FPR64:$src))), (f64   FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))),
@@ -6903,6 +7009,8 @@ def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))),
                              (f64 (REV64v8i8 FPR64:$src))>;
 def : Pat<(f64   (bitconvert (v4f16 FPR64:$src))),
                              (f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v4bf16 FPR64:$src))),
+                             (f64 (REV64v4i16 FPR64:$src))>;
 }
 def : Pat<(f64   (bitconvert (v1i64 FPR64:$src))), (f64   FPR64:$src)>;
 def : Pat<(f64   (bitconvert (v1f64 FPR64:$src))), (f64   FPR64:$src)>;
@@ -6913,6 +7021,7 @@ def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), (v1f64 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
@@ -6925,6 +7034,8 @@ def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
                              (v1f64 (REV64v2i32 FPR64:$src))>;
 def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
                              (v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))),
+                             (v1f64 (REV64v4i16 FPR64:$src))>;
 }
 def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v1f64 (bitconvert (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
@@ -6936,6 +7047,7 @@ def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), (v2f32 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
@@ -6950,6 +7062,8 @@ def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))),
                              (v2f32 (REV64v2i32 FPR64:$src))>;
 def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
                              (v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))),
+                             (v2f32 (REV32v4i16 FPR64:$src))>;
 }
 def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
 
@@ -6960,6 +7074,7 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
 def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
 def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
 def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), (f128 FPR128:$src)>;
 def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
@@ -6974,6 +7089,9 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
 def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
                             (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
                                             (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                            (REV64v8i16 FPR128:$src), (i32 8)))>;
 def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
                             (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
 def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
@@ -6989,6 +7107,7 @@ def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
 }
@@ -7002,6 +7121,8 @@ def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
                              (v2f64 (REV64v8i16 FPR128:$src))>;
 def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
                              (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))),
+                             (v2f64 (REV64v8i16 FPR128:$src))>;
 def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
                              (v2f64 (REV64v16i8 FPR128:$src))>;
 def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
@@ -7013,6 +7134,7 @@ let Predicates = [IsLE] in {
 def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -7025,6 +7147,8 @@ def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
                              (v4f32 (REV32v8i16 FPR128:$src))>;
 def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
                              (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))),
+                             (v4f32 (REV32v8i16 FPR128:$src))>;
 def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
                              (v4f32 (REV32v16i8 FPR128:$src))>;
 def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
@@ -7041,6 +7165,7 @@ def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), (v2i64 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))),
@@ -7056,6 +7181,8 @@ def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
                              (v2i64 (REV64v4i32 FPR128:$src))>;
 def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
                              (v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))),
+                             (v2i64 (REV64v8i16 FPR128:$src))>;
 }
 def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
 
@@ -7066,6 +7193,7 @@ def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), (v4i32 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))),
@@ -7082,6 +7210,8 @@ def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
                              (v4i32 (REV64v4i32 FPR128:$src))>;
 def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
                              (v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))),
+                             (v4i32 (REV32v8i16 FPR128:$src))>;
 }
 def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
 
@@ -7110,6 +7240,7 @@ def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
                              (v8i16 (REV32v8i16 FPR128:$src))>;
 }
 def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8bf16 FPR128:$src))), (v8i16 FPR128:$src)>;
 
 let Predicates = [IsLE] in {
 def : Pat<(v8f16 (bitconvert (f128  FPR128:$src))), (v8f16 FPR128:$src)>;
@@ -7118,6 +7249,13 @@ def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
 def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
 def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
 def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+
+def : Pat<(v8bf16 (bitconvert (f128  FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v8f16 (bitconvert (f128  FPR128:$src))),
@@ -7134,8 +7272,24 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
                              (v8f16 (REV64v8i16 FPR128:$src))>;
 def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
                              (v8f16 (REV32v8i16 FPR128:$src))>;
+
+def : Pat<(v8bf16 (bitconvert (f128  FPR128:$src))),
+                             (v8bf16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                              (REV64v8i16 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))),
+                             (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))),
+                             (v8bf16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))),
+                             (v8bf16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))),
+                             (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))),
+                             (v8bf16 (REV32v8i16 FPR128:$src))>;
 }
 def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
 
 let Predicates = [IsLE] in {
 def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -7145,6 +7299,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), (v16i8 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))),
@@ -7163,6 +7318,8 @@ def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
                              (v16i8 (REV32v16i8 FPR128:$src))>;
 def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
                              (v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))),
+                             (v16i8 (REV16v16i8 FPR128:$src))>;
 }
 
 def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
@@ -7173,6 +7330,8 @@ def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
            (EXTRACT_SUBREG V128:$Rn, dsub)>;
 def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
            (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4bf16 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
 def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
            (EXTRACT_SUBREG V128:$Rn, dsub)>;
 def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
@@ -7204,6 +7363,8 @@ multiclass InsertSubvectorUndef<ValueType Ty> {
             (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
   def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
             (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+  def : Pat<(insert_subvector undef, (v4bf16 FPR64:$src), (Ty 0)),
+            (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
   def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
 }
diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
new file mode 100644
index 0000000000000..d59f1351b3698
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
@@ -0,0 +1,218 @@
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <4 x i16> @v4bf16_to_v4i16(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v4i16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4i16 = bitcast <4 x bfloat> %a to <4 x i16> ; 64-bit reinterpretation, no value conversion
+  ret <4 x i16> %as_v4i16
+}
+
+define <2 x i32> @v4bf16_to_v2i32(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v2i32:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v2i32 = bitcast <4 x bfloat> %a to <2 x i32> ; 64-bit reinterpretation, no value conversion
+  ret <2 x i32> %as_v2i32
+}
+
+define <1 x i64> @v4bf16_to_v1i64(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v1i64:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v1i64 = bitcast <4 x bfloat> %a to <1 x i64> ; 64-bit reinterpretation, no value conversion
+  ret <1 x i64> %as_v1i64
+}
+
+define i64 @v4bf16_to_i64(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_i64:
+; CHECK-NEXT: fmov x0, d1
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_i64 = bitcast <4 x bfloat> %a to i64 ; vector bits returned as a scalar integer
+  ret i64 %as_i64
+}
+
+define <2 x float> @v4bf16_to_v2float(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v2float:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v2f32 = bitcast <4 x bfloat> %a to <2 x float> ; 64-bit reinterpretation, no value conversion
+  ret <2 x float> %as_v2f32
+}
+
+define <1 x double> @v4bf16_to_v1double(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v1double:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v1f64 = bitcast <4 x bfloat> %a to <1 x double> ; 64-bit reinterpretation, no value conversion
+  ret <1 x double> %as_v1f64
+}
+
+define double @v4bf16_to_double(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_double:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_f64 = bitcast <4 x bfloat> %a to double ; vector bits returned as a scalar double
+  ret double %as_f64
+}
+
+
+define <4 x bfloat> @v4i16_to_v4bf16(float, <4 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4bf16 = bitcast <4 x i16> %a to <4 x bfloat> ; 64-bit reinterpretation, no value conversion
+  ret <4 x bfloat> %as_v4bf16
+}
+
+define <4 x bfloat> @v2i32_to_v4bf16(float, <2 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4bf16 = bitcast <2 x i32> %a to <4 x bfloat> ; 64-bit reinterpretation, no value conversion
+  ret <4 x bfloat> %as_v4bf16
+}
+
+define <4 x bfloat> @v1i64_to_v4bf16(float, <1 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4bf16 = bitcast <1 x i64> %a to <4 x bfloat> ; 64-bit reinterpretation, no value conversion
+  ret <4 x bfloat> %as_v4bf16
+}
+
+define <4 x bfloat> @i64_to_v4bf16(float, i64 %a) nounwind {
+; CHECK-LABEL: i64_to_v4bf16:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4bf16 = bitcast i64 %a to <4 x bfloat> ; scalar integer bits returned as a bfloat vector
+  ret <4 x bfloat> %as_v4bf16
+}
+
+define <4 x bfloat> @v2float_to_v4bf16(float, <2 x float> %a) nounwind {
+; CHECK-LABEL: v2float_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4bf16 = bitcast <2 x float> %a to <4 x bfloat> ; 64-bit reinterpretation, no value conversion
+  ret <4 x bfloat> %as_v4bf16
+}
+
+define <4 x bfloat> @v1double_to_v4bf16(float, <1 x double> %a) nounwind {
+; CHECK-LABEL: v1double_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4bf16 = bitcast <1 x double> %a to <4 x bfloat> ; 64-bit reinterpretation, no value conversion
+  ret <4 x bfloat> %as_v4bf16
+}
+
+define <4 x bfloat> @double_to_v4bf16(float, double %a) nounwind {
+; CHECK-LABEL: double_to_v4bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4bf16 = bitcast double %a to <4 x bfloat> ; scalar double bits returned as a bfloat vector
+  ret <4 x bfloat> %as_v4bf16
+}
+
+define <8 x i16> @v8bf16_to_v8i16(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v8i16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v8i16 = bitcast <8 x bfloat> %a to <8 x i16> ; 128-bit reinterpretation, no value conversion
+  ret <8 x i16> %as_v8i16
+}
+
+define <4 x i32> @v8bf16_to_v4i32(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v4i32:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4i32 = bitcast <8 x bfloat> %a to <4 x i32> ; 128-bit reinterpretation, no value conversion
+  ret <4 x i32> %as_v4i32
+}
+
+define <2 x i64> @v8bf16_to_v2i64(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v2i64:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v2i64 = bitcast <8 x bfloat> %a to <2 x i64> ; 128-bit reinterpretation, no value conversion
+  ret <2 x i64> %as_v2i64
+}
+
+define <4 x float> @v8bf16_to_v4float(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v4float:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v4f32 = bitcast <8 x bfloat> %a to <4 x float> ; 128-bit reinterpretation, no value conversion
+  ret <4 x float> %as_v4f32
+}
+
+define <2 x double> @v8bf16_to_v2double(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v2double:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v2f64 = bitcast <8 x bfloat> %a to <2 x double> ; 128-bit reinterpretation, no value conversion
+  ret <2 x double> %as_v2f64
+}
+
+define <8 x bfloat> @v8i16_to_v8bf16(float, <8 x i16> %a) nounwind {
+; CHECK-LABEL: v8i16_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v8bf16 = bitcast <8 x i16> %a to <8 x bfloat> ; 128-bit reinterpretation, no value conversion
+  ret <8 x bfloat> %as_v8bf16
+}
+
+define <8 x bfloat> @v4i32_to_v8bf16(float, <4 x i32> %a) nounwind {
+; CHECK-LABEL: v4i32_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v8bf16 = bitcast <4 x i32> %a to <8 x bfloat> ; 128-bit reinterpretation, no value conversion
+  ret <8 x bfloat> %as_v8bf16
+}
+
+define <8 x bfloat> @v2i64_to_v8bf16(float, <2 x i64> %a) nounwind {
+; CHECK-LABEL: v2i64_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v8bf16 = bitcast <2 x i64> %a to <8 x bfloat> ; 128-bit reinterpretation, no value conversion
+  ret <8 x bfloat> %as_v8bf16
+}
+
+define <8 x bfloat> @v4float_to_v8bf16(float, <4 x float> %a) nounwind {
+; CHECK-LABEL: v4float_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v8bf16 = bitcast <4 x float> %a to <8 x bfloat> ; 128-bit reinterpretation, no value conversion
+  ret <8 x bfloat> %as_v8bf16
+}
+
+define <8 x bfloat> @v2double_to_v8bf16(float, <2 x double> %a) nounwind {
+; CHECK-LABEL: v2double_to_v8bf16:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+start: ; the leading unnamed float is never read; only %a is used
+  %as_v8bf16 = bitcast <2 x double> %a to <8 x bfloat> ; 128-bit reinterpretation, no value conversion
+  ret <8 x bfloat> %as_v8bf16
+}
diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
new file mode 100644
index 0000000000000..df078030e08b1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-none-eabi | FileCheck %s
+
+; bfloat16x4_t test_vcreate_bf16(uint64_t a) { return vcreate_bf16(a); }
+define <4 x bfloat> @test_vcreate_bf16(i64 %a) nounwind {
+; CHECK-LABEL: test_vcreate_bf16:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast i64 %a to <4 x bfloat>
+  ret <4 x bfloat> %0
+}
+
+; bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) { return vdup_n_bf16(v); }
+define <4 x bfloat> @test_vdup_n_bf16(bfloat %v) nounwind {
+; CHECK-LABEL: test_vdup_n_bf16:
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
+entry:
+  %vecinit.i = insertelement <4 x bfloat> undef, bfloat %v, i32 0
+  %vecinit3.i = shufflevector <4 x bfloat> %vecinit.i, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  ret <4 x bfloat> %vecinit3.i
+}
+
+; bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { return vdupq_n_bf16(v); }
+define <8 x bfloat> @test_vdupq_n_bf16(bfloat %v) nounwind {
+; CHECK-LABEL: test_vdupq_n_bf16:
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
+entry:
+  %vecinit.i = insertelement <8 x bfloat> undef, bfloat %v, i32 0
+  %vecinit7.i = shufflevector <8 x bfloat> %vecinit.i, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit7.i
+}
+
+; bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { return vdup_lane_bf16(v, 1); }
+define <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdup_lane_bf16:
+; CHECK-NEXT:    dup v0.4h, v0.h[1]
+; CHECK-NEXT:    ret
+entry:
+  %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x bfloat> %lane
+}
+
+; bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { return vdupq_lane_bf16(v, 1); }
+define <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdupq_lane_bf16:
+; CHECK-NEXT:    dup v0.8h, v0.h[1]
+; CHECK-NEXT:    ret
+entry:
+  %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x bfloat> %lane
+}
+
+; bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { return vdup_laneq_bf16(v, 7); }
+define <4 x bfloat> @test_vdup_laneq_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdup_laneq_bf16:
+; CHECK-NEXT:    dup v0.4h, v0.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x bfloat> %lane
+}
+
+; bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) { return vdupq_laneq_bf16(v, 7); }
+define <8 x bfloat> @test_vdupq_laneq_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vdupq_laneq_bf16:
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x bfloat> %lane
+}
+
+; bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) { return vcombine_bf16(low, high); }
+define <8 x bfloat> @test_vcombine_bf16(<4 x bfloat> %low, <4 x bfloat> %high) nounwind {
+; CHECK-LABEL: test_vcombine_bf16:
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %shuffle.i = shufflevector <4 x bfloat> %low, <4 x bfloat> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x bfloat> %shuffle.i
+}
+
+; bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) { return vget_high_bf16(a); }
+define <4 x bfloat> @test_vget_high_bf16(<8 x bfloat> %a) nounwind {
+; CHECK-LABEL: test_vget_high_bf16:
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
+entry:
+  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x bfloat> %shuffle.i
+}
+
+; bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) { return vget_low_bf16(a); }
+define <4 x bfloat> @test_vget_low_bf16(<8 x bfloat> %a) nounwind {
+; CHECK-LABEL: test_vget_low_bf16:
+; CHECK-NEXT:    ret
+entry:
+  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle.i
+}
+
+; bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) { return vget_lane_bf16(v, 1); }
+define bfloat @test_vget_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vget_lane_bf16:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    ret
+entry:
+  %vget_lane = extractelement <4 x bfloat> %v, i32 1
+  ret bfloat %vget_lane
+}
+
+; bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) { return vgetq_lane_bf16(v, 7); }
+define bfloat @test_vgetq_lane_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vgetq_lane_bf16:
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vgetq_lane = extractelement <8 x bfloat> %v, i32 7
+  ret bfloat %vgetq_lane
+}
+
+; bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) { return vset_lane_bf16(a, v, 1); }
+define <4 x bfloat> @test_vset_lane_bf16(bfloat %a, <4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vset_lane_bf16:
+; CHECK-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+entry:
+  %vset_lane = insertelement <4 x bfloat> %v, bfloat %a, i32 1
+  ret <4 x bfloat> %vset_lane
+}
+
+; bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) { return vsetq_lane_bf16(a, v, 7); }
+define <8 x bfloat> @test_vsetq_lane_bf16(bfloat %a, <8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vsetq_lane_bf16:
+; CHECK-NEXT:    mov v1.h[7], v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+entry:
+  %vset_lane = insertelement <8 x bfloat> %v, bfloat %a, i32 7
+  ret <8 x bfloat> %vset_lane
+}
+
+; bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) { return vduph_lane_bf16(v, 1); }
+define bfloat @test_vduph_lane_bf16(<4 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vduph_lane_bf16:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    ret
+entry:
+  %vget_lane = extractelement <4 x bfloat> %v, i32 1
+  ret bfloat %vget_lane
+}
+
+; bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) { return vduph_laneq_bf16(v, 7); }
+define bfloat @test_vduph_laneq_bf16(<8 x bfloat> %v) nounwind {
+; CHECK-LABEL: test_vduph_laneq_bf16:
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vgetq_lane = extractelement <8 x bfloat> %v, i32 7
+  ret bfloat %vgetq_lane
+}
diff --git a/llvm/test/CodeGen/AArch64/bf16.ll b/llvm/test/CodeGen/AArch64/bf16.ll
new file mode 100644
index 0000000000000..f1464eae5c9f5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -asm-verbose=0 -mtriple=arm64-eabi | FileCheck %s
+; RUN: llc < %s -asm-verbose=0 -mtriple=aarch64-eabi | FileCheck %s
+
+; test argument passing and simple load/store
+
+define bfloat @test_load(bfloat* %p) nounwind {
+; CHECK-LABEL: test_load:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load bfloat, bfloat* %p, align 16
+  ret bfloat %tmp1
+}
+
+define <4 x bfloat> @test_vec_load(<4 x bfloat>* %p) nounwind {
+; CHECK-LABEL: test_vec_load:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+  %tmp1 = load <4 x bfloat>, <4 x bfloat>* %p, align 16
+  ret <4 x bfloat> %tmp1
+}
+
+define void @test_store(bfloat* %a, bfloat %b) nounwind {
+; CHECK-LABEL: test_store:
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+  store bfloat %b, bfloat* %a, align 16
+  ret void
+}
+
+; Simple store of v4bf16
+define void @test_vec_store(<4 x bfloat>* %a, <4 x bfloat> %b) nounwind {
+; CHECK-LABEL: test_vec_store:
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+  store <4 x bfloat> %b, <4 x bfloat>* %a, align 16
+  ret void
+}

From 559845f8fe53fabb22f9a392e8d34761df250c72 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 27 May 2020 09:37:57 -0500
Subject: [PATCH 230/770] Revert "[PowerPC] Add support for -mcpu=pwr10 in both
 clang and llvm"

This reverts commit 7eb666b1556b86503f2f386bf921186cdbb2d22a.
---
 clang/lib/Basic/Targets/PPC.cpp               | 41 +++++---------
 clang/lib/Basic/Targets/PPC.h                 | 43 +++++++--------
 clang/lib/Driver/ToolChains/Arch/PPC.cpp      | 20 +++----
 clang/test/Misc/target-invalid-cpu-note.c     |  2 +-
 clang/test/Preprocessor/init-ppc64.c          | 18 ------
 llvm/lib/Support/Host.cpp                     |  1 -
 llvm/lib/Target/PowerPC/PPC.td                | 27 ++-------
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  3 -
 llvm/lib/Target/PowerPC/PPCSubtarget.cpp      |  1 -
 llvm/lib/Target/PowerPC/PPCSubtarget.h        | 55 +++++++++----------
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |  9 +--
 llvm/test/CodeGen/PowerPC/check-cpu.ll        |  6 +-
 12 files changed, 78 insertions(+), 148 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index 231f94b66f5fd..81c13a8104e8a 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -151,8 +151,6 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("_ARCH_PWR8");
   if (ArchDefs & ArchDefinePwr9)
     Builder.defineMacro("_ARCH_PWR9");
-  if (ArchDefs & ArchDefinePwr10)
-    Builder.defineMacro("_ARCH_PWR10");
   if (ArchDefs & ArchDefineA2)
     Builder.defineMacro("_ARCH_A2");
   if (ArchDefs & ArchDefineA2q) {
@@ -265,51 +263,41 @@ bool PPCTargetInfo::initFeatureMap(
                             .Case("pwr7", true)
                             .Case("pwr8", true)
                             .Case("pwr9", true)
-                            .Case("pwr10", true)
                             .Case("ppc64", true)
                             .Case("ppc64le", true)
                             .Default(false);
 
   Features["qpx"] = (CPU == "a2q");
-  Features["power9-vector"] = llvm::StringSwitch<bool>(CPU)
-                                  .Case("pwr10", true)
-                                  .Case("pwr9", true)
-                                  .Default(false);
+  Features["power9-vector"] = (CPU == "pwr9");
   Features["crypto"] = llvm::StringSwitch<bool>(CPU)
                            .Case("ppc64le", true)
-                           .Case("pwr10", true)
                            .Case("pwr9", true)
                            .Case("pwr8", true)
                            .Default(false);
   Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
                                   .Case("ppc64le", true)
-                                  .Case("pwr10", true)
                                   .Case("pwr9", true)
                                   .Case("pwr8", true)
                                   .Default(false);
   Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
                            .Case("ppc64le", true)
-                           .Case("pwr10", true)
                            .Case("pwr9", true)
                            .Case("pwr8", true)
                            .Case("pwr7", true)
                            .Default(false);
   Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
                            .Case("ppc64le", true)
-                           .Case("pwr10", true)
                            .Case("pwr9", true)
                            .Case("pwr8", true)
                            .Case("pwr7", true)
                            .Default(false);
   Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
                                 .Case("ppc64le", true)
-                                .Case("pwr10", true)
                                 .Case("pwr9", true)
                                 .Case("pwr8", true)
                                 .Default(false);
   Features["vsx"] = llvm::StringSwitch<bool>(CPU)
                         .Case("ppc64le", true)
-                        .Case("pwr10", true)
                         .Case("pwr9", true)
                         .Case("pwr8", true)
                         .Case("pwr7", true)
@@ -325,10 +313,10 @@ bool PPCTargetInfo::initFeatureMap(
                         .Case("e500", true)
                         .Default(false);
 
-  // Future CPU should include all of the features of Power 10 as well as any
+  // Future CPU should include all of the features of Power 9 as well as any
   // additional features (yet to be determined) specific to it.
   if (CPU == "future") {
-    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
+    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
     addFutureSpecificFeatures(Features);
   }
 
@@ -475,17 +463,18 @@ ArrayRef<TargetInfo::AddlRegName> PPCTargetInfo::getGCCAddlRegNames() const {
 }
 
 static constexpr llvm::StringLiteral ValidCPUNames[] = {
-    {"generic"},     {"440"},     {"450"},     {"601"},       {"602"},
-    {"603"},         {"603e"},    {"603ev"},   {"604"},       {"604e"},
-    {"620"},         {"630"},     {"g3"},      {"7400"},      {"g4"},
-    {"7450"},        {"g4+"},     {"750"},     {"8548"},      {"970"},
-    {"g5"},          {"a2"},      {"a2q"},     {"e500"},      {"e500mc"},
-    {"e5500"},       {"power3"},  {"pwr3"},    {"power4"},    {"pwr4"},
-    {"power5"},      {"pwr5"},    {"power5x"}, {"pwr5x"},     {"power6"},
-    {"pwr6"},        {"power6x"}, {"pwr6x"},   {"power7"},    {"pwr7"},
-    {"power8"},      {"pwr8"},    {"power9"},  {"pwr9"},      {"power10"},
-    {"pwr10"},       {"powerpc"}, {"ppc"},     {"powerpc64"}, {"ppc64"},
-    {"powerpc64le"}, {"ppc64le"}, {"future"}};
+    {"generic"},   {"440"},       {"450"},         {"601"},         {"602"},
+    {"603"},       {"603e"},      {"603ev"},       {"604"},         {"604e"},
+    {"620"},       {"630"},       {"g3"},          {"7400"},        {"g4"},
+    {"7450"},      {"g4+"},       {"750"},         {"8548"},        {"970"},
+    {"g5"},        {"a2"},        {"a2q"},         {"e500"},        {"e500mc"},
+    {"e5500"},     {"power3"},    {"pwr3"},        {"power4"},      {"pwr4"},
+    {"power5"},    {"pwr5"},      {"power5x"},     {"pwr5x"},       {"power6"},
+    {"pwr6"},      {"power6x"},   {"pwr6x"},       {"power7"},      {"pwr7"},
+    {"power8"},    {"pwr8"},      {"power9"},      {"pwr9"},        {"powerpc"},
+    {"ppc"},       {"powerpc64"}, {"ppc64"},       {"powerpc64le"}, {"ppc64le"},
+    {"future"}
+};
 
 bool PPCTargetInfo::isValidCPUName(StringRef Name) const {
   return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames);
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 3feda1853547f..7c19a96a99c74 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -43,13 +43,13 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
     ArchDefinePwr7 = 1 << 11,
     ArchDefinePwr8 = 1 << 12,
     ArchDefinePwr9 = 1 << 13,
-    ArchDefinePwr10 = 1 << 14,
-    ArchDefineFuture = 1 << 15,
-    ArchDefineA2 = 1 << 16,
-    ArchDefineA2q = 1 << 17,
-    ArchDefineE500 = 1 << 18
+    ArchDefineFuture = 1 << 14,
+    ArchDefineA2 = 1 << 15,
+    ArchDefineA2q = 1 << 16,
+    ArchDefineE500 = 1 << 17
   } ArchDefineTypes;
 
+
   ArchDefineTypes ArchDefs = ArchDefineNone;
   static const Builtin::Info BuiltinInfo[];
   static const char *const GCCRegNames[];
@@ -119,20 +119,20 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
               .Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q)
               .Cases("power3", "pwr3", ArchDefinePpcgr)
               .Cases("power4", "pwr4",
-                     ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+                    ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power5", "pwr5",
-                     ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                         ArchDefinePpcsq)
+                    ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                        ArchDefinePpcsq)
               .Cases("power5x", "pwr5x",
-                     ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
-                         ArchDefinePpcgr | ArchDefinePpcsq)
+                    ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
+                        ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power6", "pwr6",
-                     ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
-                         ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+                    ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
+                        ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power6x", "pwr6x",
-                     ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
-                         ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                         ArchDefinePpcsq)
+                    ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
+                        ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                        ArchDefinePpcsq)
               .Cases("power7", "pwr7",
                      ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
                          ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
@@ -146,16 +146,11 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                      ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
                          ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
                          ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
-              .Cases("power10", "pwr10",
-                     ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 |
-                         ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
-                         ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                         ArchDefinePpcsq)
               .Case("future",
-                    ArchDefineFuture | ArchDefinePwr10 | ArchDefinePwr9 |
-                        ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
-                        ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
-                        ArchDefinePpcgr | ArchDefinePpcsq)
+                    ArchDefineFuture | ArchDefinePwr9 | ArchDefinePwr8 |
+                        ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
+                        ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                        ArchDefinePpcsq)
               .Cases("8548", "e500", ArchDefineE500)
               .Default(ArchDefineNone);
     }
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
index 144e276a6bd87..e5130a9485de7 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
@@ -70,7 +70,6 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
         .Case("power7", "pwr7")
         .Case("power8", "pwr8")
         .Case("power9", "pwr9")
-        .Case("power10", "pwr10")
         .Case("future", "future")
         .Case("pwr3", "pwr3")
         .Case("pwr4", "pwr4")
@@ -81,7 +80,6 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
         .Case("pwr7", "pwr7")
         .Case("pwr8", "pwr8")
         .Case("pwr9", "pwr9")
-        .Case("pwr10", "pwr10")
         .Case("powerpc", "ppc")
         .Case("powerpc64", "ppc64")
         .Case("powerpc64le", "ppc64le")
@@ -93,16 +91,14 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
 
 const char *ppc::getPPCAsmModeForCPU(StringRef Name) {
   return llvm::StringSwitch<const char *>(Name)
-      .Case("pwr7", "-mpower7")
-      .Case("power7", "-mpower7")
-      .Case("pwr8", "-mpower8")
-      .Case("power8", "-mpower8")
-      .Case("ppc64le", "-mpower8")
-      .Case("pwr9", "-mpower9")
-      .Case("power9", "-mpower9")
-      .Case("pwr10", "-mpower10")
-      .Case("power10", "-mpower10")
-      .Default("-many");
+        .Case("pwr7", "-mpower7")
+        .Case("power7", "-mpower7")
+        .Case("pwr8", "-mpower8")
+        .Case("power8", "-mpower8")
+        .Case("ppc64le", "-mpower8")
+        .Case("pwr9", "-mpower9")
+        .Case("power9", "-mpower9")
+        .Default("-many");
 }
 
 void ppc::getPPCTargetFeatures(const Driver &D, const llvm::Triple &Triple,
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 5c571fb458ec5..9f036c94c3f8e 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -81,7 +81,7 @@
 // PPC-SAME: 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750,
 // PPC-SAME: 8548, 970, g5, a2, a2q, e500, e500mc, e5500, power3, pwr3, power4,
 // PPC-SAME: pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x,
-// PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, powerpc64,
+// PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, powerpc, ppc, powerpc64,
 // PPC-SAME: ppc64, powerpc64le, ppc64le, future
 
 // RUN: not %clang_cc1 -triple mips--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix MIPS
diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c
index ed8601636554e..b24f8eb7050be 100644
--- a/clang/test/Preprocessor/init-ppc64.c
+++ b/clang/test/Preprocessor/init-ppc64.c
@@ -627,30 +627,12 @@
 // PPCPOWER9:#define _ARCH_PWR7 1
 // PPCPOWER9:#define _ARCH_PWR9 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr10 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER10 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power10 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER10 %s
-//
-// PPCPOWER10:#define _ARCH_PPC 1
-// PPCPOWER10:#define _ARCH_PPC64 1
-// PPCPOWER10:#define _ARCH_PPCGR 1
-// PPCPOWER10:#define _ARCH_PPCSQ 1
-// PPCPOWER10:#define _ARCH_PWR10 1
-// PPCPOWER10:#define _ARCH_PWR4 1
-// PPCPOWER10:#define _ARCH_PWR5 1
-// PPCPOWER10:#define _ARCH_PWR5X 1
-// PPCPOWER10:#define _ARCH_PWR6 1
-// PPCPOWER10-NOT:#define _ARCH_PWR6X 1
-// PPCPOWER10:#define _ARCH_PWR7 1
-// PPCPOWER10:#define _ARCH_PWR8 1
-// PPCPOWER10:#define _ARCH_PWR9 1
-//
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu future -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCFUTURE %s
 //
 // PPCFUTURE:#define _ARCH_PPC 1
 // PPCFUTURE:#define _ARCH_PPC64 1
 // PPCFUTURE:#define _ARCH_PPCGR 1
 // PPCFUTURE:#define _ARCH_PPCSQ 1
-// PPCFUTURE:#define _ARCH_PWR10 1
 // PPCFUTURE:#define _ARCH_PWR4 1
 // PPCFUTURE:#define _ARCH_PWR5 1
 // PPCFUTURE:#define _ARCH_PWR5X 1
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index da68464c4a3d9..d9b3cac5e8dc0 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -142,7 +142,6 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
       .Case("POWER8E", "pwr8")
       .Case("POWER8NVL", "pwr8")
       .Case("POWER9", "pwr9")
-      .Case("POWER10", "pwr10")
       // FIXME: If we get a simulator or machine with the capabilities of
       // mcpu=future, we should revisit this and add the name reported by the
       // simulator/machine.
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index a6c7868f6ac25..1d1f11e498c20 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -51,7 +51,6 @@ def DirectivePwr6x
 def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">;
 def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">;
 def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">;
-def DirectivePwr10: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR10", "">;
 def DirectivePwrFuture
     : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">;
 
@@ -206,9 +205,6 @@ def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
 def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
                                      "true",
                                      "Enable instructions added in ISA 3.0.">;
-def FeatureISA3_1 : SubtargetFeature<"isa-v31-instructions", "IsISA3_1",
-                                     "true",
-                                     "Enable instructions added in ISA 3.1.">;
 def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
                                         "Enable POWER9 Altivec instructions",
                                         [FeatureISA3_0, FeatureP8Altivec]>;
@@ -332,25 +328,14 @@ def ProcessorFeatures {
   list<SubtargetFeature> P9Features =
     !listconcat(P9InheritableFeatures, P9SpecificFeatures);
 
-  // Power10
-  // For P10 CPU we assume that all of the existing features from Power9
-  // still exist with the exception of those we know are Power9 specific.
-  list<SubtargetFeature> P10AdditionalFeatures =
-    [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
-     FeaturePCRelativeMemops];
-  list<SubtargetFeature> P10SpecificFeatures = [];
-  list<SubtargetFeature> P10InheritableFeatures =
-    !listconcat(P9InheritableFeatures, P10AdditionalFeatures);
-  list<SubtargetFeature> P10Features =
-    !listconcat(P10InheritableFeatures, P10SpecificFeatures);
-
   // Future
-  // For future CPU we assume that all of the existing features from Power10
-  // still exist with the exception of those we know are Power10 specific.
+  // For future CPU we assume that all of the existing features from Power 9
+  // still exist with the exception of those we know are Power 9 specific.
   list<SubtargetFeature> FutureAdditionalFeatures = [];
-  list<SubtargetFeature> FutureSpecificFeatures = [];
+  list<SubtargetFeature> FutureSpecificFeatures =
+    [FeaturePrefixInstrs, FeaturePCRelativeMemops];
   list<SubtargetFeature> FutureInheritableFeatures =
-    !listconcat(P10InheritableFeatures, FutureAdditionalFeatures);
+    !listconcat(P9InheritableFeatures, FutureAdditionalFeatures);
   list<SubtargetFeature> FutureFeatures =
     !listconcat(FutureInheritableFeatures, FutureSpecificFeatures);
 }
@@ -555,8 +540,6 @@ def : ProcessorModel<"pwr6x", G5Model,
 def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
 def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
 def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
-// No scheduler model yet.
-def : ProcessorModel<"pwr10", NoSchedModel, ProcessorFeatures.P10Features>;
 // No scheduler model for future CPU.
 def : ProcessorModel<"future", NoSchedModel,
                   ProcessorFeatures.FutureFeatures>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 53f9ac678c7b7..42df83831113a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1306,7 +1306,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   case PPC::DIR_PWR7:
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
-  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE:
     setPrefLoopAlignment(Align(16));
     setPrefFunctionAlignment(Align(16));
@@ -14914,7 +14913,6 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
   case PPC::DIR_PWR7:
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
-  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE: {
     if (!ML)
       break;
@@ -16105,7 +16103,6 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
       // vector        7       2      2
       return true;
     case PPC::DIR_PWR9:
-    case PPC::DIR_PWR10:
     case PPC::DIR_PWR_FUTURE:
       //  type        mul     add    shl
       // scalar        5       2      2
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 2f332715d8cac..cfc54df13f792 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -115,7 +115,6 @@ void PPCSubtarget::initializeEnvironment() {
   HasAddiLoadFusion = false;
   HasAddisLoadFusion = false;
   IsISA3_0 = false;
-  IsISA3_1 = false;
   UseLongCalls = false;
   SecurePlt = false;
   VectorsUseTwoUnits = false;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index bfe39814e4cc8..be1143f903e8b 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -34,33 +34,32 @@ class StringRef;
 
 namespace PPC {
   // -m directive values.
-enum {
-  DIR_NONE,
-  DIR_32,
-  DIR_440,
-  DIR_601,
-  DIR_602,
-  DIR_603,
-  DIR_7400,
-  DIR_750,
-  DIR_970,
-  DIR_A2,
-  DIR_E500,
-  DIR_E500mc,
-  DIR_E5500,
-  DIR_PWR3,
-  DIR_PWR4,
-  DIR_PWR5,
-  DIR_PWR5X,
-  DIR_PWR6,
-  DIR_PWR6X,
-  DIR_PWR7,
-  DIR_PWR8,
-  DIR_PWR9,
-  DIR_PWR10,
-  DIR_PWR_FUTURE,
-  DIR_64
-};
+  enum {
+    DIR_NONE,
+    DIR_32,
+    DIR_440,
+    DIR_601,
+    DIR_602,
+    DIR_603,
+    DIR_7400,
+    DIR_750,
+    DIR_970,
+    DIR_A2,
+    DIR_E500,
+    DIR_E500mc,
+    DIR_E5500,
+    DIR_PWR3,
+    DIR_PWR4,
+    DIR_PWR5,
+    DIR_PWR5X,
+    DIR_PWR6,
+    DIR_PWR6X,
+    DIR_PWR7,
+    DIR_PWR8,
+    DIR_PWR9,
+    DIR_PWR_FUTURE,
+    DIR_64
+  };
 }
 
 class GlobalValue;
@@ -139,7 +138,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool HasAddiLoadFusion;
   bool HasAddisLoadFusion;
   bool IsISA3_0;
-  bool IsISA3_1;
   bool UseLongCalls;
   bool SecurePlt;
   bool VectorsUseTwoUnits;
@@ -310,7 +308,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool hasHTM() const { return HasHTM; }
   bool hasFloat128() const { return HasFloat128; }
   bool isISA3_0() const { return IsISA3_0; }
-  bool isISA3_1() const { return IsISA3_1; }
   bool useLongCalls() const { return UseLongCalls; }
   bool hasFusion() const { return HasFusion; }
   bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 46c5335a558f4..a41c6b41a991b 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -651,12 +651,11 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
   if (CacheLineSize.getNumOccurrences() > 0)
     return CacheLineSize;
 
-  // Starting with P7 we have a cache line size of 128.
+  // On P7, P8 or P9 we have a cache line size of 128.
   unsigned Directive = ST->getCPUDirective();
   // Assume that Future CPU has the same cache line size as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
-      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
-      Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
     return 128;
 
   // On other processors return a default of 64 bytes.
@@ -688,11 +687,9 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // For P7 and P8, floating-point instructions have a 6-cycle latency and
   // there are two execution units, so unroll by 12x for latency hiding.
   // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
-  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
   // Assume that future is the same as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
-      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
-      Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
     return 12;
 
   // For most things, modern systems have two execution units (and
diff --git a/llvm/test/CodeGen/PowerPC/check-cpu.ll b/llvm/test/CodeGen/PowerPC/check-cpu.ll
index 132be3058216b..baa39024ebe8d 100644
--- a/llvm/test/CodeGen/PowerPC/check-cpu.ll
+++ b/llvm/test/CodeGen/PowerPC/check-cpu.ll
@@ -2,13 +2,9 @@
 ; RUN:     -mcpu=future < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:     -mcpu=future < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN:     -mcpu=power10 < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
-; RUN:     -mcpu=pwr10 < %s | FileCheck %s
 
 
-; Test -mcpu=[pwr10|future] is recognized on PowerPC.
+; Test mcpu=future that should be recognized on PowerPC.
 
 ; CHECK-NOT: is not a recognized processor for this target
 ; CHECK:     .text

From b5b00877221ec7817b9de9cd65571e1c05e80145 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 15:50:31 +0100
Subject: [PATCH 231/770] SpecialCaseList.h - reduce unnecessary includes to
 forward declarations. NFC.

Remove Regex forward declaration as we already require the Regex.h include.

Add missing VirtualFileSystem.h include to dependent source files.
---
 clang/lib/Driver/SanitizerArgs.cpp          | 2 +-
 clang/lib/Driver/XRayArgs.cpp               | 2 +-
 llvm/include/llvm/Support/SpecialCaseList.h | 8 +++++---
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 35e982a502ef6..9beca156e93e5 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -14,10 +14,10 @@
 #include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SpecialCaseList.h"
 #include "llvm/Support/TargetParser.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include <memory>
 
 using namespace clang;
diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp
index 54c15685d3898..f233267b49846 100644
--- a/clang/lib/Driver/XRayArgs.cpp
+++ b/clang/lib/Driver/XRayArgs.cpp
@@ -13,10 +13,10 @@
 #include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/VirtualFileSystem.h"
 
 using namespace clang;
 using namespace clang::driver;
diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index 5b5b7f6124d68..330e96a7b9acb 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -52,18 +52,20 @@
 #define LLVM_SUPPORT_SPECIALCASELIST_H
 
 #include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/TrigramIndex.h"
-#include "llvm/Support/VirtualFileSystem.h"
+#include <memory>
 #include <string>
 #include <vector>
 
 namespace llvm {
 class MemoryBuffer;
-class Regex;
 class StringRef;
 
+namespace vfs {
+class FileSystem;
+};
+
 class SpecialCaseList {
 public:
   /// Parses the special case list entries from files. On failure, returns

From 461af57de78155ee5d1dc1969b81dd019d228538 Mon Sep 17 00:00:00 2001
From: Eduardo Caldas <ecaldas@google.com>
Date: Wed, 27 May 2020 15:27:59 +0200
Subject: [PATCH 232/770] Add support for UnaryOperator in SyntaxTree

Reviewers: gribozavr2

Reviewed By: gribozavr2

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80624
---
 clang/include/clang/Tooling/Syntax/Nodes.h  |  48 ++++++
 clang/lib/Tooling/Syntax/BuildTree.cpp      |  19 +++
 clang/lib/Tooling/Syntax/Nodes.cpp          |  18 +++
 clang/unittests/Tooling/Syntax/TreeTest.cpp | 157 +++++++++++++++++++-
 4 files changed, 241 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h
index 5db99d4b9e350..e240becbf883a 100644
--- a/clang/include/clang/Tooling/Syntax/Nodes.h
+++ b/clang/include/clang/Tooling/Syntax/Nodes.h
@@ -40,6 +40,8 @@ enum class NodeKind : uint16_t {
 
   // Expressions.
   UnknownExpression,
+  PrefixUnaryOperatorExpression,
+  PostfixUnaryOperatorExpression,
   BinaryOperatorExpression,
 
   // Statements.
@@ -105,6 +107,8 @@ enum class NodeRole : uint8_t {
   BodyStatement,
 
   // Roles specific to particular node kinds.
+  UnaryOperatorExpression_operatorToken,
+  UnaryOperatorExpression_operand,
   BinaryOperatorExpression_leftHandSide,
   BinaryOperatorExpression_operatorToken,
   BinaryOperatorExpression_rightHandSide,
@@ -162,6 +166,50 @@ class UnknownExpression final : public Expression {
   }
 };
 
+/// An abstract class for prefix and postfix unary operators.
+class UnaryOperatorExpression : public Expression {
+public:
+  UnaryOperatorExpression(NodeKind K) : Expression(K) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::PrefixUnaryOperatorExpression ||
+           N->kind() == NodeKind::PostfixUnaryOperatorExpression;
+  }
+  syntax::Leaf *operatorToken();
+  syntax::Expression *operand();
+};
+
+/// <operator> <operand>
+///
+/// For example:
+///   +a          -b
+///   !c          not c
+///   ~d          compl d
+///   *e          &f
+///   ++h         --h
+///   __real i    __imag i
+class PrefixUnaryOperatorExpression final : public UnaryOperatorExpression {
+public:
+  PrefixUnaryOperatorExpression()
+      : UnaryOperatorExpression(NodeKind::PrefixUnaryOperatorExpression) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::PrefixUnaryOperatorExpression;
+  }
+};
+
+/// <operand> <operator>
+///
+/// For example:
+///   a++
+///   b--
+class PostfixUnaryOperatorExpression final : public UnaryOperatorExpression {
+public:
+  PostfixUnaryOperatorExpression()
+      : UnaryOperatorExpression(NodeKind::PostfixUnaryOperatorExpression) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::PostfixUnaryOperatorExpression;
+  }
+};
+
 /// <lhs> <operator> <rhs>
 ///
 /// For example:
diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp
index 8fee44cdbf10d..60c6b3f88509d 100644
--- a/clang/lib/Tooling/Syntax/BuildTree.cpp
+++ b/clang/lib/Tooling/Syntax/BuildTree.cpp
@@ -608,6 +608,25 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
     return true;
   }
 
+  bool WalkUpFromUnaryOperator(UnaryOperator *S) {
+    Builder.markChildToken(
+        S->getOperatorLoc(),
+        syntax::NodeRole::UnaryOperatorExpression_operatorToken);
+    Builder.markExprChild(S->getSubExpr(),
+                          syntax::NodeRole::UnaryOperatorExpression_operand);
+
+    if (S->isPostfix())
+      Builder.foldNode(Builder.getExprRange(S),
+                       new (allocator()) syntax::PostfixUnaryOperatorExpression,
+                       S);
+    else
+      Builder.foldNode(Builder.getExprRange(S),
+                       new (allocator()) syntax::PrefixUnaryOperatorExpression,
+                       S);
+
+    return true;
+  }
+
   bool WalkUpFromBinaryOperator(BinaryOperator *S) {
     Builder.markExprChild(
         S->getLHS(), syntax::NodeRole::BinaryOperatorExpression_leftHandSide);
diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp
index 84c0143db81d3..f0f1a8ed13c24 100644
--- a/clang/lib/Tooling/Syntax/Nodes.cpp
+++ b/clang/lib/Tooling/Syntax/Nodes.cpp
@@ -18,6 +18,10 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) {
     return OS << "TranslationUnit";
   case NodeKind::UnknownExpression:
     return OS << "UnknownExpression";
+  case NodeKind::PrefixUnaryOperatorExpression:
+    return OS << "PrefixUnaryOperatorExpression";
+  case NodeKind::PostfixUnaryOperatorExpression:
+    return OS << "PostfixUnaryOperatorExpression";
   case NodeKind::BinaryOperatorExpression:
     return OS << "BinaryOperatorExpression";
   case NodeKind::UnknownStatement:
@@ -112,6 +116,10 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeRole R) {
     return OS << "IfStatement_elseKeyword";
   case syntax::NodeRole::IfStatement_elseStatement:
     return OS << "IfStatement_elseStatement";
+  case syntax::NodeRole::UnaryOperatorExpression_operatorToken:
+    return OS << "UnaryOperatorExpression_operatorToken";
+  case syntax::NodeRole::UnaryOperatorExpression_operand:
+    return OS << "UnaryOperatorExpression_operand";
   case syntax::NodeRole::BinaryOperatorExpression_leftHandSide:
     return OS << "BinaryOperatorExpression_leftHandSide";
   case syntax::NodeRole::BinaryOperatorExpression_operatorToken:
@@ -155,6 +163,16 @@ syntax::Expression *syntax::BinaryOperatorExpression::lhs() {
       findChild(syntax::NodeRole::BinaryOperatorExpression_leftHandSide));
 }
 
+syntax::Leaf *syntax::UnaryOperatorExpression::operatorToken() {
+  return llvm::cast_or_null<syntax::Leaf>(
+      findChild(syntax::NodeRole::UnaryOperatorExpression_operatorToken));
+}
+
+syntax::Expression *syntax::UnaryOperatorExpression::operand() {
+  return llvm::cast_or_null<syntax::Expression>(
+      findChild(syntax::NodeRole::UnaryOperatorExpression_operand));
+}
+
 syntax::Leaf *syntax::BinaryOperatorExpression::operatorToken() {
   return llvm::cast_or_null<syntax::Leaf>(
       findChild(syntax::NodeRole::BinaryOperatorExpression_operatorToken));
diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp
index 634f99f7c395c..e81e3c2b83542 100644
--- a/clang/unittests/Tooling/Syntax/TreeTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp
@@ -594,6 +594,161 @@ void test() {
 )txt");
 }
 
+TEST_F(SyntaxTreeTest, PostfixUnaryOperator) {
+  expectTreeDumpEqual(
+      R"cpp(
+void test(int a) {
+  a++;
+  a--;
+}
+    )cpp",
+      R"txt(
+*: TranslationUnit
+`-SimpleDeclaration
+  |-void
+  |-SimpleDeclarator
+  | |-test
+  | `-ParametersAndQualifiers
+  |   |-(
+  |   |-SimpleDeclaration
+  |   | |-int
+  |   | `-SimpleDeclarator
+  |   |   `-a
+  |   `-)
+  `-CompoundStatement
+    |-{
+    |-ExpressionStatement
+    | |-PostfixUnaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-a
+    | | `-++
+    | `-;
+    |-ExpressionStatement
+    | |-PostfixUnaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-a
+    | | `---
+    | `-;
+    `-}
+)txt");
+}
+
+TEST_F(SyntaxTreeTest, PrefixUnaryOperator) {
+  expectTreeDumpEqual(
+      R"cpp(
+void test(int a, int *ap, bool b) {
+  --a; ++a;
+  ~a; compl a;
+  -a;
+  +a;
+  &a;
+  *ap;
+  !b; not b;
+  __real a; __imag a;
+}
+    )cpp",
+      R"txt(
+*: TranslationUnit
+`-SimpleDeclaration
+  |-void
+  |-SimpleDeclarator
+  | |-test
+  | `-ParametersAndQualifiers
+  |   |-(
+  |   |-SimpleDeclaration
+  |   | |-int
+  |   | `-SimpleDeclarator
+  |   |   `-a
+  |   |-,
+  |   |-SimpleDeclaration
+  |   | |-int
+  |   | `-SimpleDeclarator
+  |   |   |-*
+  |   |   `-ap
+  |   |-,
+  |   |-SimpleDeclaration
+  |   | |-bool
+  |   | `-SimpleDeclarator
+  |   |   `-b
+  |   `-)
+  `-CompoundStatement
+    |-{
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |---
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-++
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-~
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-compl
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |--
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-+
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-&
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-*
+    | | `-UnknownExpression
+    | |   `-ap
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-!
+    | | `-UnknownExpression
+    | |   `-b
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-not
+    | | `-UnknownExpression
+    | |   `-b
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-__real
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    |-ExpressionStatement
+    | |-PrefixUnaryOperatorExpression
+    | | |-__imag
+    | | `-UnknownExpression
+    | |   `-a
+    | `-;
+    `-}
+)txt");
+}
+
 TEST_F(SyntaxTreeTest, BinaryOperator) {
   expectTreeDumpEqual(
       R"cpp(
@@ -1866,7 +2021,7 @@ const int const *const *volatile b;
 | |-SimpleDeclarator
 | | |-west
 | | |-=
-| | `-UnknownExpression
+| | `-PrefixUnaryOperatorExpression
 | |   |--
 | |   `-UnknownExpression
 | |     `-1

From 3be5e53f208d63135bb4e8499abdc1ac8a2b3266 Mon Sep 17 00:00:00 2001
From: Alex Richardson <Alexander.Richardson@cl.cam.ac.uk>
Date: Wed, 27 May 2020 15:13:42 +0100
Subject: [PATCH 233/770] [FileCheck] Allow parenthesized expressions

With this change it is possible to write FileCheck expressions such
as [[#(VAR+1)-2]]. Currently, the only supported arithmetic operators are
plus and minus, so this is not particularly useful yet. However, in our
CHERI fork we have tests that benefit from having multiplication in
FileCheck expressions. Allowing parenthesized expressions is the simplest
way for us to work around the current lack of operator precedence in
FileCheck expressions.

Reviewed By: thopre, jhenderson
Differential Revision: https://reviews.llvm.org/D77383
---
 llvm/docs/CommandGuide/FileCheck.rst       |  2 +
 llvm/lib/Support/FileCheck.cpp             | 39 ++++++++++
 llvm/lib/Support/FileCheckImpl.h           | 13 +++-
 llvm/test/FileCheck/numeric-expression.txt | 14 ++++
 llvm/unittests/Support/FileCheckTest.cpp   | 86 ++++++++++++++++++++++
 5 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/CommandGuide/FileCheck.rst b/llvm/docs/CommandGuide/FileCheck.rst
index 7d9a69e127b9e..d8a2e343026bf 100644
--- a/llvm/docs/CommandGuide/FileCheck.rst
+++ b/llvm/docs/CommandGuide/FileCheck.rst
@@ -694,6 +694,8 @@ The syntax of a numeric substitution is ``[[#%<fmtspec>,<expr>]]`` where:
   A numeric operand is a previously defined numeric variable, or an integer
   literal. The supported operators are ``+`` and ``-``. Spaces are accepted
   before, after and between any of these elements.
+  There is currently no support for operator precedence, but parentheses can
+  be used to change the evaluation order.
 
 For example:
 
diff --git a/llvm/lib/Support/FileCheck.cpp b/llvm/lib/Support/FileCheck.cpp
index 2797b8279cd42..300eea865f91b 100644
--- a/llvm/lib/Support/FileCheck.cpp
+++ b/llvm/lib/Support/FileCheck.cpp
@@ -273,6 +273,13 @@ Expected<std::unique_ptr<NumericVariableUse>> Pattern::parseNumericVariableUse(
 Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericOperand(
     StringRef &Expr, AllowedOperand AO, Optional<size_t> LineNumber,
     FileCheckPatternContext *Context, const SourceMgr &SM) {
+  if (Expr.startswith("(")) {
+    if (AO != AllowedOperand::Any)
+      return ErrorDiagnostic::get(
+          SM, Expr, "parenthesized expression not permitted here");
+    return parseParenExpr(Expr, LineNumber, Context, SM);
+  }
+
   if (AO == AllowedOperand::LineVar || AO == AllowedOperand::Any) {
     // Try to parse as a numeric variable use.
     Expected<Pattern::VariableProperties> ParseVarResult =
@@ -300,6 +307,38 @@ Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericOperand(
                               "invalid operand format '" + Expr + "'");
 }
 
+Expected<std::unique_ptr<ExpressionAST>>
+Pattern::parseParenExpr(StringRef &Expr, Optional<size_t> LineNumber,
+                        FileCheckPatternContext *Context, const SourceMgr &SM) {
+  Expr = Expr.ltrim(SpaceChars);
+  assert(Expr.startswith("("));
+
+  // Skip the opening parenthesis and parse the enclosed expression.
+  Expr.consume_front("(");
+  Expr = Expr.ltrim(SpaceChars);
+  if (Expr.empty())
+    return ErrorDiagnostic::get(SM, Expr, "missing operand in expression");
+
+  // Note: parseNumericOperand handles nested opening parentheses.
+  Expected<std::unique_ptr<ExpressionAST>> SubExprResult =
+      parseNumericOperand(Expr, AllowedOperand::Any, LineNumber, Context, SM);
+  Expr = Expr.ltrim(SpaceChars);
+  while (SubExprResult && !Expr.empty() && !Expr.startswith(")")) {
+    StringRef OrigExpr = Expr;
+    SubExprResult = parseBinop(OrigExpr, Expr, std::move(*SubExprResult), false,
+                               LineNumber, Context, SM);
+    Expr = Expr.ltrim(SpaceChars);
+  }
+  if (!SubExprResult)
+    return SubExprResult;
+
+  if (!Expr.consume_front(")")) {
+    return ErrorDiagnostic::get(SM, Expr,
+                                "missing ')' at end of nested expression");
+  }
+  return SubExprResult;
+}
+
 static uint64_t add(uint64_t LeftOp, uint64_t RightOp) {
   return LeftOp + RightOp;
 }
diff --git a/llvm/lib/Support/FileCheckImpl.h b/llvm/lib/Support/FileCheckImpl.h
index 8a7d58399aeab..f4f2fc21a2084 100644
--- a/llvm/lib/Support/FileCheckImpl.h
+++ b/llvm/lib/Support/FileCheckImpl.h
@@ -665,7 +665,8 @@ class Pattern {
   /// \p Context points to the class instance holding the live string and
   /// numeric variables. \returns the class representing that operand in the
   /// AST of the expression or an error holding a diagnostic against \p SM
-  /// otherwise.
+  /// otherwise. If \p Expr starts with a "(" this function will attempt to
+  /// parse a parenthesized expression.
   static Expected<std::unique_ptr<ExpressionAST>>
   parseNumericOperand(StringRef &Expr, AllowedOperand AO,
                       Optional<size_t> LineNumber,
@@ -684,6 +685,16 @@ class Pattern {
              std::unique_ptr<ExpressionAST> LeftOp, bool IsLegacyLineExpr,
              Optional<size_t> LineNumber, FileCheckPatternContext *Context,
              const SourceMgr &SM);
+
+  /// Parses a parenthesized expression inside \p Expr at line \p LineNumber, or
+  /// before input is parsed if \p LineNumber is None. \p Expr must start with
+  /// a '('. Accepts both literal values and numeric variables. Parameter \p
+  /// Context points to the class instance holding the live string and numeric
+  /// variables. \returns the class representing that operand in the AST of the
+  /// expression or an error holding a diagnostic against \p SM otherwise.
+  static Expected<std::unique_ptr<ExpressionAST>>
+  parseParenExpr(StringRef &Expr, Optional<size_t> LineNumber,
+                 FileCheckPatternContext *Context, const SourceMgr &SM);
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/FileCheck/numeric-expression.txt b/llvm/test/FileCheck/numeric-expression.txt
index 81a82b399e7ec..3d33e64a0a9ed 100644
--- a/llvm/test/FileCheck/numeric-expression.txt
+++ b/llvm/test/FileCheck/numeric-expression.txt
@@ -164,6 +164,20 @@ DEF EXPR GOOD MATCH  // CHECK-LABEL: DEF EXPR GOOD MATCH
 EMPTY NUM EXPR  // CHECK-LABEL: EMPTY NUM EXPR
 foo 104 bar  // CHECK-NEXT: {{^}}foo [[#]] bar
 
+; Numeric expressions using parentheses.
+RUN: %ProtectFileCheckOutput \
+RUN: not FileCheck -D#NUMVAR=10 --check-prefix PAREN-OP \
+RUN:               --input-file %s %s 2>&1 \
+RUN:   | FileCheck --strict-whitespace --check-prefix PAREN-OP-MSG %s
+
+PAREN EXPRESSIONS // PAREN-OP-LABEL: PAREN EXPRESSIONS
+11  // PAREN-OP-NEXT: [[#(NUMVAR+2)-1]]
+11  // PAREN-OP-NEXT: [[#NUMVAR+(2-1)]]
+11  // PAREN-OP-NEXT: [[#NUMVAR+(2-1]]
+PAREN-OP-MSG: numeric-expression.txt:[[#@LINE-1]]:36: error: missing ')' at end of nested expression
+PAREN-OP-MSG-NEXT: {{P}}AREN-OP-NEXT: {{\[\[#NUMVAR\+\(2\-1]\]}}
+PAREN-OP-MSG-NEXT: {{^}}                                   ^{{$}}
+
 ; Numeric expression using undefined variables.
 RUN: %ProtectFileCheckOutput \
 RUN: not FileCheck --check-prefix UNDEF-USE --input-file %s %s 2>&1 \
diff --git a/llvm/unittests/Support/FileCheckTest.cpp b/llvm/unittests/Support/FileCheckTest.cpp
index 6b0eee4b36c7f..75b7fba8759d8 100644
--- a/llvm/unittests/Support/FileCheckTest.cpp
+++ b/llvm/unittests/Support/FileCheckTest.cpp
@@ -724,6 +724,46 @@ TEST_F(FileCheckTest, ParseNumericSubstitutionBlock) {
       "implicit format conflict between 'FOO' (%u) and "
       "'VAR_LOWER_HEX' (%x), need an explicit format specifier",
       Tester.parseSubst("FOO+VAR_LOWER_HEX").takeError());
+
+  // Simple parenthesized expressions:
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("(1)"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("(1+1)"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("(1)+1"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("((1)+1)"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("((1)+X)"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("((X)+Y)"), Succeeded());
+
+  expectDiagnosticError("missing operand in expression",
+                        Tester.parseSubst("(").takeError());
+  expectDiagnosticError("missing ')' at end of nested expression",
+                        Tester.parseSubst("(1").takeError());
+  expectDiagnosticError("missing operand in expression",
+                        Tester.parseSubst("(1+").takeError());
+  expectDiagnosticError("missing ')' at end of nested expression",
+                        Tester.parseSubst("(1+1").takeError());
+  expectDiagnosticError("missing ')' at end of nested expression",
+                        Tester.parseSubst("((1+2+3").takeError());
+  expectDiagnosticError("missing ')' at end of nested expression",
+                        Tester.parseSubst("((1+2)+3").takeError());
+
+  // Test missing operation between operands:
+  expectDiagnosticError("unsupported operation '('",
+                        Tester.parseSubst("(1)(2)").takeError());
+  expectDiagnosticError("unsupported operation '('",
+                        Tester.parseSubst("2(X)").takeError());
+
+  // Test more closing than opening parentheses. The diagnostic messages are
+  // not ideal, but for now simply check that we reject invalid input.
+  expectDiagnosticError("invalid operand format ')'",
+                        Tester.parseSubst(")").takeError());
+  expectDiagnosticError("unsupported operation ')'",
+                        Tester.parseSubst("1)").takeError());
+  expectDiagnosticError("unsupported operation ')'",
+                        Tester.parseSubst("(1+2))").takeError());
+  expectDiagnosticError("unsupported operation ')'",
+                        Tester.parseSubst("(2))").takeError());
+  expectDiagnosticError("unsupported operation ')'",
+                        Tester.parseSubst("(1))(").takeError());
 }
 
 TEST_F(FileCheckTest, ParsePattern) {
@@ -844,6 +884,52 @@ TEST_F(FileCheckTest, Match) {
                        Succeeded());
 }
 
+TEST_F(FileCheckTest, MatchParen) {
+  PatternTester Tester;
+  // Check simple parenthesized expressions
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR:]]"));
+  expectNotFoundError(Tester.match("FAIL").takeError());
+  expectNotFoundError(Tester.match("").takeError());
+  EXPECT_THAT_EXPECTED(Tester.match("18"), Succeeded());
+
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR + (2 + 2)]]"));
+  expectNotFoundError(Tester.match("21").takeError());
+  EXPECT_THAT_EXPECTED(Tester.match("22"), Succeeded());
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR + (2)]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("20"), Succeeded());
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR+(2)]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("20"), Succeeded());
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR+(NUMVAR)]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("36"), Succeeded());
+
+  // Check nested parenthesized expressions:
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR+(2+(2))]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("22"), Succeeded());
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR+(2+(NUMVAR))]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("38"), Succeeded());
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR+((((NUMVAR))))]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("36"), Succeeded());
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#NUMVAR+((((NUMVAR)))-1)-1]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("34"), Succeeded());
+
+  // Parentheses can also be the first character after the '#':
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#(NUMVAR)]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("18"), Succeeded());
+  Tester.initNextPattern();
+  ASSERT_FALSE(Tester.parsePattern("[[#(NUMVAR+2)]]"));
+  EXPECT_THAT_EXPECTED(Tester.match("20"), Succeeded());
+}
+
 TEST_F(FileCheckTest, Substitution) {
   SourceMgr SM;
   FileCheckPatternContext Context;

From a888fc6b3412574f5869a8680acf4ed2bed1d2a2 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Thu, 21 May 2020 08:30:23 -0400
Subject: [PATCH 234/770] [OPENMP50]Initial support for use_device_addr clause.

Summary:
Added parsing/sema analysis/serialization support for use_device_addr
clauses.

Reviewers: jdoerfert

Subscribers: yaxunl, guansong, arphaman, sstefan1, llvm-commits, cfe-commits, caomhin

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D80404
---
 clang/include/clang/AST/OpenMPClause.h        | 104 ++++++
 clang/include/clang/AST/RecursiveASTVisitor.h |   7 +
 clang/include/clang/Sema/Sema.h               |   3 +
 clang/lib/AST/OpenMPClause.cpp                |  58 ++++
 clang/lib/AST/StmtProfile.cpp                 |   4 +
 clang/lib/Basic/OpenMPKinds.cpp               |   2 +
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |   1 +
 clang/lib/Parse/ParseOpenMP.cpp               |   5 +-
 clang/lib/Sema/SemaOpenMP.cpp                 |  73 ++++-
 clang/lib/Sema/TreeTransform.h                |  24 ++
 clang/lib/Serialization/ASTReader.cpp         |  51 +++
 clang/lib/Serialization/ASTWriter.cpp         |  20 ++
 clang/test/OpenMP/target_data_messages.c      |   8 +-
 ...et_data_use_device_ptr_addr_ast_print.cpp} |  28 +-
 ...rget_data_use_device_ptr_addr_messages.cpp | 300 ++++++++++++++++++
 .../target_data_use_device_ptr_messages.cpp   | 208 ------------
 clang/test/OpenMP/target_map_messages.cpp     |   2 +-
 .../test/OpenMP/target_teams_map_messages.cpp |   2 +-
 clang/tools/libclang/CIndex.cpp               |   4 +
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |   2 +
 20 files changed, 674 insertions(+), 232 deletions(-)
 rename clang/test/OpenMP/{target_data_use_device_ptr_ast_print.cpp => target_data_use_device_ptr_addr_ast_print.cpp} (70%)
 create mode 100644 clang/test/OpenMP/target_data_use_device_ptr_addr_messages.cpp
 delete mode 100644 clang/test/OpenMP/target_data_use_device_ptr_messages.cpp

diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 54e83f4619800..91e4d011a3e96 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -6597,6 +6597,110 @@ class OMPUseDevicePtrClause final
   }
 };
 
+/// This represents clause 'use_device_addr' in the '#pragma omp ...'
+/// directives.
+///
+/// \code
+/// #pragma omp target data use_device_addr(a,b)
+/// \endcode
+/// In this example directive '#pragma omp target data' has clause
+/// 'use_device_addr' with the variables 'a' and 'b'.
+class OMPUseDeviceAddrClause final
+    : public OMPMappableExprListClause<OMPUseDeviceAddrClause>,
+      private llvm::TrailingObjects<
+          OMPUseDeviceAddrClause, Expr *, ValueDecl *, unsigned,
+          OMPClauseMappableExprCommon::MappableComponent> {
+  friend class OMPClauseReader;
+  friend OMPMappableExprListClause;
+  friend OMPVarListClause;
+  friend TrailingObjects;
+
+  /// Build clause with number of variables \a NumVars.
+  ///
+  /// \param Locs Locations needed to build a mappable clause. It includes 1)
+  /// StartLoc: starting location of the clause (the clause keyword); 2)
+  /// LParenLoc: location of '('; 3) EndLoc: ending location of the clause.
+  /// \param Sizes All required sizes to build a mappable clause. It includes 1)
+  /// NumVars: number of expressions listed in this clause; 2)
+  /// NumUniqueDeclarations: number of unique base declarations in this clause;
+  /// 3) NumComponentLists: number of component lists in this clause; and 4)
+  /// NumComponents: total number of expression components in the clause.
+  explicit OMPUseDeviceAddrClause(const OMPVarListLocTy &Locs,
+                                  const OMPMappableExprListSizeTy &Sizes)
+      : OMPMappableExprListClause(llvm::omp::OMPC_use_device_addr, Locs,
+                                  Sizes) {}
+
+  /// Build an empty clause.
+  ///
+  /// \param Sizes All required sizes to build a mappable clause. It includes 1)
+  /// NumVars: number of expressions listed in this clause; 2)
+  /// NumUniqueDeclarations: number of unique base declarations in this clause;
+  /// 3) NumComponentLists: number of component lists in this clause; and 4)
+  /// NumComponents: total number of expression components in the clause.
+  explicit OMPUseDeviceAddrClause(const OMPMappableExprListSizeTy &Sizes)
+      : OMPMappableExprListClause(llvm::omp::OMPC_use_device_addr,
+                                  OMPVarListLocTy(), Sizes) {}
+
+  /// Define the sizes of each trailing object array except the last one. This
+  /// is required for TrailingObjects to work properly.
+  size_t numTrailingObjects(OverloadToken<Expr *>) const {
+    return varlist_size();
+  }
+  size_t numTrailingObjects(OverloadToken<ValueDecl *>) const {
+    return getUniqueDeclarationsNum();
+  }
+  size_t numTrailingObjects(OverloadToken<unsigned>) const {
+    return getUniqueDeclarationsNum() + getTotalComponentListNum();
+  }
+
+public:
+  /// Creates clause with a list of variables \a Vars.
+  ///
+  /// \param C AST context.
+  /// \param Locs Locations needed to build a mappable clause. It includes 1)
+  /// StartLoc: starting location of the clause (the clause keyword); 2)
+  /// LParenLoc: location of '('; 3) EndLoc: ending location of the clause.
+  /// \param Vars The original expression used in the clause.
+  /// \param Declarations Declarations used in the clause.
+  /// \param ComponentLists Component lists used in the clause.
+  static OMPUseDeviceAddrClause *
+  Create(const ASTContext &C, const OMPVarListLocTy &Locs,
+         ArrayRef<Expr *> Vars, ArrayRef<ValueDecl *> Declarations,
+         MappableExprComponentListsRef ComponentLists);
+
+  /// Creates an empty clause with the place for \a NumVars variables.
+  ///
+  /// \param C AST context.
+  /// \param Sizes All required sizes to build a mappable clause. It includes 1)
+  /// NumVars: number of expressions listed in this clause; 2)
+  /// NumUniqueDeclarations: number of unique base declarations in this clause;
+  /// 3) NumComponentLists: number of component lists in this clause; and 4)
+  /// NumComponents: total number of expression components in the clause.
+  static OMPUseDeviceAddrClause *
+  CreateEmpty(const ASTContext &C, const OMPMappableExprListSizeTy &Sizes);
+
+  child_range children() {
+    return child_range(reinterpret_cast<Stmt **>(varlist_begin()),
+                       reinterpret_cast<Stmt **>(varlist_end()));
+  }
+
+  const_child_range children() const {
+    auto Children = const_cast<OMPUseDeviceAddrClause *>(this)->children();
+    return const_child_range(Children.begin(), Children.end());
+  }
+
+  child_range used_children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+  const_child_range used_children() const {
+    return const_child_range(const_child_iterator(), const_child_iterator());
+  }
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == llvm::omp::OMPC_use_device_addr;
+  }
+};
+
 /// This represents clause 'is_device_ptr' in the '#pragma omp ...'
 /// directives.
 ///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index b30d456bd24a8..83ff49e405020 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3521,6 +3521,13 @@ bool RecursiveASTVisitor<Derived>::VisitOMPUseDevicePtrClause(
   return true;
 }
 
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPUseDeviceAddrClause(
+    OMPUseDeviceAddrClause *C) {
+  // The shared list helper is the only step; its result is the visit result.
+  return getDerived().VisitOMPClauseList(C);
+}
+
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPIsDevicePtrClause(
     OMPIsDevicePtrClause *C) {
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 9276852543068..e63f65e2580cc 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10776,6 +10776,9 @@ class Sema final {
   /// Called on well-formed 'use_device_ptr' clause.
   OMPClause *ActOnOpenMPUseDevicePtrClause(ArrayRef<Expr *> VarList,
                                            const OMPVarListLocTy &Locs);
+  /// Called on well-formed 'use_device_addr' clause.
+  OMPClause *ActOnOpenMPUseDeviceAddrClause(ArrayRef<Expr *> VarList,
+                                            const OMPVarListLocTy &Locs);
   /// Called on well-formed 'is_device_ptr' clause.
   OMPClause *ActOnOpenMPIsDevicePtrClause(ArrayRef<Expr *> VarList,
                                           const OMPVarListLocTy &Locs);
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 14c4c78e5f39f..fa1c80fc6bbf9 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -136,6 +136,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
@@ -227,6 +228,7 @@ const OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(const OMPClause *C)
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
@@ -1198,6 +1200,53 @@ OMPUseDevicePtrClause::CreateEmpty(const ASTContext &C,
   return new (Mem) OMPUseDevicePtrClause(Sizes);
 }
 
+OMPUseDeviceAddrClause *
+OMPUseDeviceAddrClause::Create(const ASTContext &C, const OMPVarListLocTy &Locs,
+                               ArrayRef<Expr *> Vars,
+                               ArrayRef<ValueDecl *> Declarations,
+                               MappableExprComponentListsRef ComponentLists) {
+  OMPMappableExprListSizeTy Sizes;
+  Sizes.NumVars = Vars.size();
+  Sizes.NumUniqueDeclarations = getUniqueDeclarationsTotalNumber(Declarations);
+  Sizes.NumComponentLists = ComponentLists.size();
+  Sizes.NumComponents = getComponentsTotalNumber(ComponentLists);
+
+  // We need to allocate:
+  // NumVars x Expr* - we have an original list expression for each clause
+  // list entry (only NumVars Expr* slots are requested below).
+  // NumUniqueDeclarations x ValueDecl* - unique base declarations associated
+  // with each component list.
+  // (NumUniqueDeclarations + NumComponentLists) x unsigned - we specify the
+  // number of lists for each unique declaration and the size of each component
+  // list.
+  // NumComponents x MappableComponent - the total of all the components in all
+  // the lists.
+  void *Mem = C.Allocate(
+      totalSizeToAlloc<Expr *, ValueDecl *, unsigned,
+                       OMPClauseMappableExprCommon::MappableComponent>(
+          Sizes.NumVars, Sizes.NumUniqueDeclarations,
+          Sizes.NumUniqueDeclarations + Sizes.NumComponentLists,
+          Sizes.NumComponents));
+
+  auto *Clause = new (Mem) OMPUseDeviceAddrClause(Locs, Sizes);
+
+  Clause->setVarRefs(Vars);
+  Clause->setClauseInfo(Declarations, ComponentLists);
+  return Clause;
+}
+
+OMPUseDeviceAddrClause *
+OMPUseDeviceAddrClause::CreateEmpty(const ASTContext &C,
+                                    const OMPMappableExprListSizeTy &Sizes) {
+  const auto NumCounts = Sizes.NumUniqueDeclarations + Sizes.NumComponentLists;
+  void *Storage = C.Allocate(
+      totalSizeToAlloc<Expr *, ValueDecl *, unsigned,
+                       OMPClauseMappableExprCommon::MappableComponent>(
+          Sizes.NumVars, Sizes.NumUniqueDeclarations, NumCounts,
+          Sizes.NumComponents));
+  return new (Storage) OMPUseDeviceAddrClause(Sizes);
+}
+
 OMPIsDevicePtrClause *
 OMPIsDevicePtrClause::Create(const ASTContext &C, const OMPVarListLocTy &Locs,
                              ArrayRef<Expr *> Vars,
@@ -1934,6 +1983,15 @@ void OMPClausePrinter::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *Node) {
   }
 }
 
+void OMPClausePrinter::VisitOMPUseDeviceAddrClause(
+    OMPUseDeviceAddrClause *Node) {
+  if (Node->varlist_empty())
+    return; // an empty list prints nothing, not even the clause name
+  OS << "use_device_addr";
+  VisitOMPClauseList(Node, '(');
+  OS << ")";
+}
+
 void OMPClausePrinter::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *Node) {
   if (!Node->varlist_empty()) {
     OS << "is_device_ptr";
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index bd2eeb699e65e..e573c045cb7ab 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -784,6 +784,10 @@ void OMPClauseProfiler::VisitOMPUseDevicePtrClause(
     const OMPUseDevicePtrClause *C) {
   VisitOMPClauseList(C);
 }
+void OMPClauseProfiler::VisitOMPUseDeviceAddrClause(
+    const OMPUseDeviceAddrClause *C) {
+  VisitOMPClauseList(C); // list handling only, nothing clause-specific
+}
 void OMPClauseProfiler::VisitOMPIsDevicePtrClause(
     const OMPIsDevicePtrClause *C) {
   VisitOMPClauseList(C);
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 8dddb66fa322a..a000e4dee3b85 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -163,6 +163,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind,
   case OMPC_hint:
   case OMPC_uniform:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
@@ -411,6 +412,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
   case OMPC_hint:
   case OMPC_uniform:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index d12aa65af0bae..ae4e3400fcbc4 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -4730,6 +4730,7 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind,
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index bd40e6b991a5d..5161c7d06cdab 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2497,7 +2497,7 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) {
 ///       in_reduction-clause | allocator-clause | allocate-clause |
 ///       acq_rel-clause | acquire-clause | release-clause | relaxed-clause |
 ///       depobj-clause | destroy-clause | detach-clause | inclusive-clause |
-///       exclusive-clause | uses_allocators-clause
+///       exclusive-clause | uses_allocators-clause | use_device_addr-clause
 ///
 OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
                                      OpenMPClauseKind CKind, bool FirstClause) {
@@ -2663,6 +2663,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_allocate:
   case OMPC_nontemporal:
@@ -3581,6 +3582,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
 ///       'from' '(' [ mapper '(' mapper-identifier ')' ':' ] list ')'
 ///    use_device_ptr-clause:
 ///       'use_device_ptr' '(' list ')'
+///    use_device_addr-clause:
+///       'use_device_addr' '(' list ')'
 ///    is_device_ptr-clause:
 ///       'is_device_ptr' '(' list ')'
 ///    allocate-clause:
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index e556969a786ab..a60a047db0e7a 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -5408,6 +5408,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective(
       case OMPC_to:
       case OMPC_from:
       case OMPC_use_device_ptr:
+      case OMPC_use_device_addr:
       case OMPC_is_device_ptr:
       case OMPC_nontemporal:
       case OMPC_order:
@@ -10165,12 +10166,18 @@ StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
 
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
 
-  // OpenMP [2.10.1, Restrictions, p. 97]
-  // At least one map clause must appear on the directive.
-  if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr)) {
+  // OpenMP [2.12.2, target data Construct, Restrictions]
+  // At least one map, use_device_addr or use_device_ptr clause must appear on
+  // the directive.
+  if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr) &&
+      (LangOpts.OpenMP < 50 || !hasClauses(Clauses, OMPC_use_device_addr))) {
+    StringRef Expected;
+    if (LangOpts.OpenMP < 50)
+      Expected = "'map' or 'use_device_ptr'";
+    else
+      Expected = "'map', 'use_device_ptr', or 'use_device_addr'";
     Diag(StartLoc, diag::err_omp_no_clause_for_directive)
-        << "'map' or 'use_device_ptr'"
-        << getOpenMPDirectiveName(OMPD_target_data);
+        << Expected << getOpenMPDirectiveName(OMPD_target_data);
     return StmtError();
   }
 
@@ -11535,6 +11542,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr,
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
@@ -12289,6 +12297,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
@@ -12731,6 +12740,7 @@ OMPClause *Sema::ActOnOpenMPSimpleClause(
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
@@ -12956,6 +12966,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause(
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_unified_address:
   case OMPC_unified_shared_memory:
@@ -13195,6 +13206,7 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
   case OMPC_to:
   case OMPC_from:
   case OMPC_use_device_ptr:
+  case OMPC_use_device_addr:
   case OMPC_is_device_ptr:
   case OMPC_atomic_default_mem_order:
   case OMPC_device_type:
@@ -13406,6 +13418,9 @@ OMPClause *Sema::ActOnOpenMPVarListClause(
   case OMPC_use_device_ptr:
     Res = ActOnOpenMPUseDevicePtrClause(VarList, Locs);
     break;
+  case OMPC_use_device_addr:
+    Res = ActOnOpenMPUseDeviceAddrClause(VarList, Locs);
+    break;
   case OMPC_is_device_ptr:
     Res = ActOnOpenMPIsDevicePtrClause(VarList, Locs);
     break;
@@ -18389,6 +18404,54 @@ OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef<Expr *> VarList,
       MVLI.VarBaseDeclarations, MVLI.VarComponents);
 }
 
+OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef<Expr *> VarList,
+                                                const OMPVarListLocTy &Locs) {
+  MappableVarListInfo MVLI(VarList);
+
+  for (Expr *RefExpr : VarList) {
+    assert(RefExpr && "NULL expr in OpenMP use_device_addr clause.");
+    SourceLocation ELoc;
+    SourceRange ERange;
+    Expr *SimpleRefExpr = RefExpr;
+    auto Item = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
+                               /*AllowArraySection=*/true);
+    // A set flag means the expression is kept as is and analyzed later.
+    if (Item.second)
+      MVLI.ProcessedVarList.push_back(RefExpr);
+    ValueDecl *D = Item.first;
+    if (!D)
+      continue;
+
+    // A list item that is not a VarDecl is accessed through a capture that is
+    // initialized with the item's current value.
+    Expr *ParenFreeRef = RefExpr->IgnoreParens();
+    DeclRefExpr *CaptureRef = nullptr;
+    if (isa<VarDecl>(D)) {
+      MVLI.ProcessedVarList.push_back(ParenFreeRef);
+    } else {
+      CaptureRef = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
+      MVLI.ProcessedVarList.push_back(CaptureRef);
+    }
+
+    // Record a firstprivate data-sharing attribute so that the variable is
+    // captured correctly.
+    DSAStack->addDSA(D, ParenFreeRef, OMPC_firstprivate, CaptureRef);
+
+    // Each list item is described by a component list with a single entry.
+    MVLI.VarBaseDeclarations.push_back(D);
+    MVLI.VarComponents.emplace_back();
+    MVLI.VarComponents.back().push_back(
+        OMPClauseMappableExprCommon::MappableComponent(SimpleRefExpr, D));
+  }
+
+  if (MVLI.ProcessedVarList.empty())
+    return nullptr;
+
+  return OMPUseDeviceAddrClause::Create(Context, Locs, MVLI.ProcessedVarList,
+                                        MVLI.VarBaseDeclarations,
+                                        MVLI.VarComponents);
+}
+
 OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef<Expr *> VarList,
                                               const OMPVarListLocTy &Locs) {
   MappableVarListInfo MVLI(VarList);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 923792fde7fcb..e4c71552f718f 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -2036,6 +2036,15 @@ class TreeTransform {
     return getSema().ActOnOpenMPUseDevicePtrClause(VarList, Locs);
   }
 
+  /// Build a new OpenMP 'use_device_addr' clause.
+  ///
+  /// By default, hands \p VarList and \p Locs to the corresponding Sema action
+  /// to build the clause. Subclasses may override this routine to change that.
+  OMPClause *RebuildOMPUseDeviceAddrClause(ArrayRef<Expr *> VarList,
+                                           const OMPVarListLocTy &Locs) {
+    return getSema().ActOnOpenMPUseDeviceAddrClause(VarList, Locs);
+  }
+
   /// Build a new OpenMP 'is_device_ptr' clause.
   ///
   /// By default, performs semantic analysis to build the new OpenMP clause.
@@ -9740,6 +9749,21 @@ OMPClause *TreeTransform<Derived>::TransformOMPUseDevicePtrClause(
   return getDerived().RebuildOMPUseDevicePtrClause(Vars, Locs);
 }
 
+template <typename Derived>
+OMPClause *TreeTransform<Derived>::TransformOMPUseDeviceAddrClause(
+    OMPUseDeviceAddrClause *C) {
+  llvm::SmallVector<Expr *, 16> NewVars;
+  NewVars.reserve(C->varlist_size());
+  for (auto *OldVar : C->varlists()) {
+    ExprResult NewVar = getDerived().TransformExpr(cast<Expr>(OldVar));
+    if (NewVar.isInvalid())
+      return nullptr; // one failed list item drops the whole clause
+    NewVars.push_back(NewVar.get());
+  }
+  OMPVarListLocTy Locs(C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
+  return getDerived().RebuildOMPUseDeviceAddrClause(NewVars, Locs);
+}
+
 template <typename Derived>
 OMPClause *
 TreeTransform<Derived>::TransformOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 16bcb18f4e68e..a5a1276253c7c 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11918,6 +11918,15 @@ OMPClause *OMPClauseReader::readClause() {
     C = OMPUseDevicePtrClause::CreateEmpty(Context, Sizes);
     break;
   }
+  case llvm::omp::OMPC_use_device_addr: {
+    OMPMappableExprListSizeTy Sizes;
+    Sizes.NumVars = Record.readInt();
+    Sizes.NumUniqueDeclarations = Record.readInt();
+    Sizes.NumComponentLists = Record.readInt();
+    Sizes.NumComponents = Record.readInt();
+    C = OMPUseDeviceAddrClause::CreateEmpty(Context, Sizes);
+    break;
+  }
   case llvm::omp::OMPC_is_device_ptr: {
     OMPMappableExprListSizeTy Sizes;
     Sizes.NumVars = Record.readInt();
@@ -12704,6 +12713,48 @@ void OMPClauseReader::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
   C->setComponents(Components, ListSizes);
 }
 
+void OMPClauseReader::VisitOMPUseDeviceAddrClause(OMPUseDeviceAddrClause *C) {
+  C->setLParenLoc(Record.readSourceLocation());
+  const auto VarCount = C->varlist_size();
+  const auto DeclCount = C->getUniqueDeclarationsNum();
+  const auto ListCount = C->getTotalComponentListNum();
+  const auto ComponentCount = C->getTotalComponentsNum();
+
+  SmallVector<Expr *, 16> VarRefs;
+  VarRefs.reserve(VarCount);
+  for (unsigned I = 0; I != VarCount; ++I)
+    VarRefs.push_back(Record.readSubExpr());
+  C->setVarRefs(VarRefs);
+
+  SmallVector<ValueDecl *, 16> UniqueDecls;
+  UniqueDecls.reserve(DeclCount);
+  for (unsigned I = 0; I != DeclCount; ++I)
+    UniqueDecls.push_back(Record.readDeclAs<ValueDecl>());
+  C->setUniqueDecls(UniqueDecls);
+
+  SmallVector<unsigned, 16> NumListsPerDecl;
+  NumListsPerDecl.reserve(DeclCount);
+  for (unsigned I = 0; I != DeclCount; ++I)
+    NumListsPerDecl.push_back(Record.readInt());
+  C->setDeclNumLists(NumListsPerDecl);
+
+  SmallVector<unsigned, 32> ComponentListSizes;
+  ComponentListSizes.reserve(ListCount);
+  for (unsigned I = 0; I != ListCount; ++I)
+    ComponentListSizes.push_back(Record.readInt());
+  C->setComponentListSizes(ComponentListSizes);
+
+  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
+  Components.reserve(ComponentCount);
+  for (unsigned I = 0; I != ComponentCount; ++I) {
+    // Expression first, then declaration: the order the writer emitted them.
+    Expr *AssociatedExpr = Record.readSubExpr();
+    auto *AssociatedDecl = Record.readDeclAs<ValueDecl>();
+    Components.emplace_back(AssociatedExpr, AssociatedDecl);
+  }
+  C->setComponents(Components, ComponentListSizes);
+}
+
 void OMPClauseReader::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
   C->setLParenLoc(Record.readSourceLocation());
   auto NumVars = C->varlist_size();
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 1e3adb588da29..9d81e137f0bb8 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6625,6 +6625,26 @@ void OMPClauseWriter::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
   }
 }
 
+void OMPClauseWriter::VisitOMPUseDeviceAddrClause(OMPUseDeviceAddrClause *C) {
+  Record.push_back(C->varlist_size()); // the four sizes come first
+  Record.push_back(C->getUniqueDeclarationsNum());
+  Record.push_back(C->getTotalComponentListNum());
+  Record.push_back(C->getTotalComponentsNum());
+  Record.AddSourceLocation(C->getLParenLoc());
+  for (auto *VarRef : C->varlists())
+    Record.AddStmt(VarRef);
+  for (auto *UniqueDecl : C->all_decls())
+    Record.AddDeclRef(UniqueDecl);
+  for (auto NumLists : C->all_num_lists())
+    Record.push_back(NumLists);
+  for (auto ListSize : C->all_lists_sizes())
+    Record.push_back(ListSize);
+  for (const auto &Component : C->all_components()) {
+    Record.AddStmt(Component.getAssociatedExpression());
+    Record.AddDeclRef(Component.getAssociatedDeclaration());
+  }
+}
+
 void OMPClauseWriter::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
   Record.push_back(C->varlist_size());
   Record.push_back(C->getUniqueDeclarationsNum());
diff --git a/clang/test/OpenMP/target_data_messages.c b/clang/test/OpenMP/target_data_messages.c
index 32d2c130d4e97..7a7fc0012af2b 100644
--- a/clang/test/OpenMP/target_data_messages.c
+++ b/clang/test/OpenMP/target_data_messages.c
@@ -1,6 +1,8 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s -Wuninitialized
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 100 -o - %s -Wuninitialized
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized
 
-// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -ferror-limit 100 -o - %s -Wuninitialized
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 100 -o - %s -Wuninitialized
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized
 
 void foo() { }
 
@@ -13,7 +15,7 @@ void xxx(int argc) {
 
 int main(int argc, char **argv) {
   int a;
-  #pragma omp target data // expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
+  #pragma omp target data // omp45-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}} omp50-error {{expected at least one 'map', 'use_device_ptr', or 'use_device_addr' clause for '#pragma omp target data'}}
   {}
   L1:
     foo();
diff --git a/clang/test/OpenMP/target_data_use_device_ptr_ast_print.cpp b/clang/test/OpenMP/target_data_use_device_ptr_addr_ast_print.cpp
similarity index 70%
rename from clang/test/OpenMP/target_data_use_device_ptr_ast_print.cpp
rename to clang/test/OpenMP/target_data_use_device_ptr_addr_ast_print.cpp
index ba429f50febed..93e8a853e45f0 100644
--- a/clang/test/OpenMP/target_data_use_device_ptr_ast_print.cpp
+++ b/clang/test/OpenMP/target_data_use_device_ptr_addr_ast_print.cpp
@@ -1,9 +1,10 @@
-// RxUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 
-// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp-simd -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -16,18 +17,19 @@ struct SA {
   int i, j;
   int *k = &j;
   int *&z = k;
+  int &y = i;
   void func(int arg) {
-#pragma omp target data map(tofrom: i) use_device_ptr(k)
+#pragma omp target data map(tofrom: i) use_device_ptr(k) use_device_addr(i, j)
     {}
-#pragma omp target data map(tofrom: i) use_device_ptr(z)
+#pragma omp target data map(tofrom: i) use_device_ptr(z) use_device_addr(k, y)
     {}
   return;
  }
 };
 // CHECK: struct SA
 // CHECK: void func(
-// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->k){{$}}
-// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->z)
+// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->k) use_device_addr(this->i,this->j){{$}}
+// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->z) use_device_addr(this->k,this->y)
 struct SB {
   unsigned A;
   unsigned B;
@@ -143,13 +145,13 @@ int main(int argc, char **argv) {
 // CHECK-NEXT: int &j = i;
 // CHECK-NEXT: int *k = &j;
 // CHECK-NEXT: int *&z = k;
-#pragma omp target data map(tofrom: i) use_device_ptr(k)
-// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k)
+#pragma omp target data map(tofrom: i) use_device_ptr(k) use_device_addr(i, j)
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k) use_device_addr(i,j)
   {}
 // CHECK-NEXT: {
 // CHECK-NEXT: }
-#pragma omp target data map(tofrom: i) use_device_ptr(z)
-// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z)
+#pragma omp target data map(tofrom: i) use_device_ptr(z) use_device_addr(i, j, k[:i])
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z) use_device_addr(i,j,k[:i])
   {}
   return tmain<int>(argc) + (*tmain<int*>(&argc));
 }
diff --git a/clang/test/OpenMP/target_data_use_device_ptr_addr_messages.cpp b/clang/test/OpenMP/target_data_use_device_ptr_addr_messages.cpp
new file mode 100644
index 0000000000000..98dc56ea07d24
--- /dev/null
+++ b/clang/test/OpenMP/target_data_use_device_ptr_addr_messages.cpp
@@ -0,0 +1,300 @@
+// RUN: %clang_cc1 -std=c++11 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 200 %s -Wuninitialized
+// RUN: %clang_cc1 -std=c++11 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 200 %s -Wuninitialized
+
+// RUN: %clang_cc1 -std=c++11 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 200 %s -Wuninitialized
+// RUN: %clang_cc1 -std=c++11 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 200 %s -Wuninitialized
+struct ST {
+  int *a;
+};
+struct SA {
+  const int d = 5;
+  const int da[5] = { 0 };
+  ST e;
+  ST g[10];
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  void func(int arg) {
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+    {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+    {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+    {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+    {}
+#pragma omp target data map(i) use_device_ptr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(k) // OK
+    {}
+#pragma omp target data map(i) use_device_ptr(z) // OK
+    {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_addr // expected-error {{expected '(' after 'use_device_addr'}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr() // expected-error {{expected expression}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(alloc) // expected-error {{use of undeclared identifier 'alloc'}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(i) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(j) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(k) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(z) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(aa) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(e) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(g) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(k,i,j) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(d) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+#pragma omp target data map(i) use_device_addr(da) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+    {}
+  return;
+ }
+};
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef int from;
+
+template <typename T, int I>
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+  T aa[10];
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+  {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(z) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_addr // expected-error {{expected '(' after 'use_device_addr'}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr() // expected-error {{expected expression}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(alloc) // expected-error {{use of undeclared identifier 'alloc'}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(i) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(j) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(k) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(z) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(aa) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(e) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(g) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(k,i,j) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(d) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(da) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+  {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(z) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_addr // expected-error {{expected '(' after 'use_device_addr'}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr() // expected-error {{expected expression}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(alloc) // expected-error {{use of undeclared identifier 'alloc'}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(i) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(j) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(k) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(z) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(aa) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(e) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(g) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(k,i,j) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(d) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+#pragma omp target data map(i) use_device_addr(da) // omp45-error {{unexpected OpenMP clause 'use_device_addr' in directive '#pragma omp target data'}}
+  {}
+  return tmain<int, 3>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}}
+}
diff --git a/clang/test/OpenMP/target_data_use_device_ptr_messages.cpp b/clang/test/OpenMP/target_data_use_device_ptr_messages.cpp
deleted file mode 100644
index 6ce6f9db7d224..0000000000000
--- a/clang/test/OpenMP/target_data_use_device_ptr_messages.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-// RUN: %clang_cc1 -std=c++11 -verify -fopenmp -ferror-limit 200 %s -Wuninitialized
-
-// RUN: %clang_cc1 -std=c++11 -verify -fopenmp-simd -ferror-limit 200 %s -Wuninitialized
-struct ST {
-  int *a;
-};
-struct SA {
-  const int d = 5;
-  const int da[5] = { 0 };
-  ST e;
-  ST g[10];
-  int i;
-  int &j = i;
-  int *k = &j;
-  int *&z = k;
-  int aa[10];
-  void func(int arg) {
-#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
-    {}
-#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
-    {}
-#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
-    {}
-#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
-    {}
-#pragma omp target data map(i) use_device_ptr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(k) // OK
-    {}
-#pragma omp target data map(i) use_device_ptr(z) // OK
-    {}
-#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-    {}
-  return;
- }
-};
-struct SB {
-  unsigned A;
-  unsigned B;
-  float Arr[100];
-  float *Ptr;
-  float *foo() {
-    return &Arr[0];
-  }
-};
-
-struct SC {
-  unsigned A : 2;
-  unsigned B : 3;
-  unsigned C;
-  unsigned D;
-  float Arr[100];
-  SB S;
-  SB ArrS[100];
-  SB *PtrS;
-  SB *&RPtrS;
-  float *Ptr;
-
-  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
-};
-
-union SD {
-  unsigned A;
-  float B;
-};
-
-struct S1;
-extern S1 a;
-class S2 {
-  mutable int a;
-public:
-  S2():a(0) { }
-  S2(S2 &s2):a(s2.a) { }
-  static float S2s;
-  static const float S2sc;
-};
-const float S2::S2sc = 0;
-const S2 b;
-const S2 ba[5];
-class S3 {
-  int a;
-public:
-  S3():a(0) { }
-  S3(S3 &s3):a(s3.a) { }
-};
-const S3 c;
-const S3 ca[5];
-extern const int f;
-class S4 {
-  int a;
-  S4();
-  S4(const S4 &s4);
-public:
-  S4(int v):a(v) { }
-};
-class S5 {
-  int a;
-  S5():a(0) {}
-  S5(const S5 &s5):a(s5.a) { }
-public:
-  S5(int v):a(v) { }
-};
-
-S3 h;
-#pragma omp threadprivate(h)
-
-typedef int from;
-
-template <typename T, int I>
-T tmain(T argc) {
-  const T d = 5;
-  const T da[5] = { 0 };
-  S4 e(4);
-  S5 g(5);
-  T i;
-  T &j = i;
-  T *k = &j;
-  T *&z = k;
-  T aa[10];
-#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
-  {}
-#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
-  {}
-#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
-  {}
-#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
-  {}
-#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(k) // OK
-  {}
-#pragma omp target data map(i) use_device_ptr(z) // OK
-  {}
-#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-  return 0;
-}
-
-int main(int argc, char **argv) {
-  const int d = 5;
-  const int da[5] = { 0 };
-  S4 e(4);
-  S5 g(5);
-  int i;
-  int &j = i;
-  int *k = &j;
-  int *&z = k;
-  int aa[10];
-#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
-  {}
-#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
-  {}
-#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
-  {}
-#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
-  {}
-#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(k) // OK
-  {}
-#pragma omp target data map(i) use_device_ptr(z) // OK
-  {}
-#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
-  {}
-  return tmain<int, 3>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}}
-}
diff --git a/clang/test/OpenMP/target_map_messages.cpp b/clang/test/OpenMP/target_map_messages.cpp
index 556df1cf3e146..92edd12e9449e 100644
--- a/clang/test/OpenMP/target_map_messages.cpp
+++ b/clang/test/OpenMP/target_map_messages.cpp
@@ -598,7 +598,7 @@ int main(int argc, char **argv) {
   const int (&l)[5] = da;
   SC1 s;
   SC1 *p;
-#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} le45-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}} le50-error {{expected at least one 'map', 'use_device_ptr', or 'use_device_addr' clause for '#pragma omp target data'}}
 #pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
 #pragma omp target data map() // expected-error {{expected expression}}
 #pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
diff --git a/clang/test/OpenMP/target_teams_map_messages.cpp b/clang/test/OpenMP/target_teams_map_messages.cpp
index ec89831f691fa..11115d501912a 100644
--- a/clang/test/OpenMP/target_teams_map_messages.cpp
+++ b/clang/test/OpenMP/target_teams_map_messages.cpp
@@ -488,7 +488,7 @@ int main(int argc, char **argv) {
   int y;
   int to, tofrom, always;
   const int (&l)[5] = da;
-#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} le45-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}} le50-error {{expected at least one 'map', 'use_device_ptr', or 'use_device_addr' clause for '#pragma omp target data'}}
 #pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
 #pragma omp target data map() // expected-error {{expected expression}}
 #pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 79a970c229dd4..bff23f52b4597 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2489,6 +2489,10 @@ void OMPClauseEnqueue::VisitOMPUseDevicePtrClause(
     const OMPUseDevicePtrClause *C) {
   VisitOMPClauseList(C);
 }
+void OMPClauseEnqueue::VisitOMPUseDeviceAddrClause(
+    const OMPUseDeviceAddrClause *C) {
+  VisitOMPClauseList(C);
+}
 void OMPClauseEnqueue::VisitOMPIsDevicePtrClause(
     const OMPIsDevicePtrClause *C) {
   VisitOMPClauseList(C);
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index e16e7c6ad1f9a..5f36245403224 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -198,6 +198,7 @@ __OMP_CLAUSE(inclusive, OMPInclusiveClause)
 __OMP_CLAUSE(exclusive, OMPExclusiveClause)
 __OMP_CLAUSE(uses_allocators, OMPUsesAllocatorsClause)
 __OMP_CLAUSE(affinity, OMPAffinityClause)
+__OMP_CLAUSE(use_device_addr, OMPUseDeviceAddrClause)
 
 __OMP_CLAUSE_NO_CLASS(uniform)
 __OMP_CLAUSE_NO_CLASS(device_type)
@@ -904,6 +905,7 @@ __OMP_DIRECTIVE_CLAUSE(target_data, 1, ~0, if)
 __OMP_DIRECTIVE_CLAUSE(target_data, 1, ~0, device)
 __OMP_DIRECTIVE_CLAUSE(target_data, 1, ~0, map)
 __OMP_DIRECTIVE_CLAUSE(target_data, 1, ~0, use_device_ptr)
+__OMP_DIRECTIVE_CLAUSE(target_data, 50, ~0, use_device_addr)
 
 __OMP_DIRECTIVE_CLAUSE(target_enter_data, 1, ~0, if)
 __OMP_DIRECTIVE_CLAUSE(target_enter_data, 1, ~0, device)

From 31f40f603d0c00b313397196124c5f39090badf0 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Wed, 27 May 2020 08:45:55 -0700
Subject: [PATCH 235/770] [mlir] Add simple generator for return types

Take advantage of equality constraints to generate the type inference interface.
This is used for equality and trivially built types. The type inference method
is only generated when no type inference trait is specified already.

This reorders verification, which changes some test error messages.

Differential Revision: https://reviews.llvm.org/D80484
---
 mlir/docs/OpDefinitions.md                    |  13 +-
 .../include/mlir/Dialect/Shape/IR/ShapeOps.td |   5 +-
 mlir/include/mlir/TableGen/Attribute.h        |   3 +
 mlir/include/mlir/TableGen/Operator.h         |  42 ++++++
 mlir/lib/Dialect/Shape/IR/Shape.cpp           |   9 --
 mlir/lib/TableGen/Attribute.cpp               |   2 +
 mlir/lib/TableGen/Operator.cpp                | 141 ++++++++++++++++--
 mlir/test/lib/Dialect/Test/TestOps.td         |  20 ---
 mlir/test/mlir-tblgen/op-decl.td              |  23 ++-
 mlir/test/mlir-tblgen/types.mlir              |   4 +-
 mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp   | 128 +++++++++++-----
 11 files changed, 299 insertions(+), 91 deletions(-)

diff --git a/mlir/docs/OpDefinitions.md b/mlir/docs/OpDefinitions.md
index ddabae2225e7b..42c431d13f8ec 100644
--- a/mlir/docs/OpDefinitions.md
+++ b/mlir/docs/OpDefinitions.md
@@ -1,4 +1,4 @@
-# Table-driven Operation Definition Specification (ODS)
+# Operation Definition Specification (ODS)
 
 In addition to specializing the `mlir::Op` C++ template, MLIR also supports
 defining operations in a table-driven manner. This is achieved via
@@ -526,10 +526,9 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
                   IntegerAttr i32_attr, FloatAttr f32_attr, ...);
 
 // All operands/attributes have aggregate parameters.
-// Generated if InferTypeOpInterface interface is specified.
+// Generated if return type can be inferred.
 static void build(OpBuilder &odsBuilder, OperationState &odsState,
-                  ValueRange operands,
-                  ArrayRef<NamedAttribute> attributes);
+                  ValueRange operands, ArrayRef<NamedAttribute> attributes);
 
 // (And manually specified builders depending on the specific op.)
 ```
@@ -554,6 +553,12 @@ restrictions.) Otherwise, the builder of the third form will still be generated
 but default values for the attributes not at the end of the `arguments` list
 will not be supplied in the builder's signature.
 
+ODS will generate a builder that doesn't require return types to be specified if
+
+*   Op implements InferTypeOpInterface interface;
+*   All return types are either buildable types or are the same as a given
+    operand (e.g., `AllTypesMatch` constraint between operand and result);
+
 And there may potentially exist other builders depending on the specific op;
 please refer to the
 [generated C++ file](#run-mlir-tblgen-to-see-the-generated-content) for the
diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 0278d7bbeb065..a9759fc6a7343 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -91,10 +91,7 @@ def Shape_BroadcastOp : Shape_Op<"broadcast",
   let hasFolder = 1;
 }
 
-def Shape_ConstShapeOp : Shape_Op<"const_shape",
-    [ConstantLike,
-     NoSideEffect,
-     DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+def Shape_ConstShapeOp : Shape_Op<"const_shape", [ConstantLike, NoSideEffect]> {
   let summary = "Creates a constant of !shape.shape type.";
   let description = [{
     Creates a !shape.shape with rank given by the length of `shape` and with
diff --git a/mlir/include/mlir/TableGen/Attribute.h b/mlir/include/mlir/TableGen/Attribute.h
index f99939392e932..4571ca8ee9b38 100644
--- a/mlir/include/mlir/TableGen/Attribute.h
+++ b/mlir/include/mlir/TableGen/Attribute.h
@@ -230,6 +230,9 @@ class StructAttr : public Attribute {
   std::vector<StructFieldAttr> getAllFields() const;
 };
 
+// Name of infer type op interface.
+extern const char *inferTypeOpInterface;
+
 } // end namespace tblgen
 } // end namespace mlir
 
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index e65bc55a84f5f..040f52314cea0 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -23,6 +23,7 @@
 #include "mlir/TableGen/Type.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/SMLoc.h"
 
@@ -227,10 +228,45 @@ class Operator {
   // debugging purposes.
   void print(llvm::raw_ostream &os) const;
 
+  // Return whether all the result types are known.
+  bool allResultTypesKnown() const { return allResultsHaveKnownTypes; };
+
+  // Pair representing either an index to an argument or a type constraint. Only
+  // one of these entries should have the non-default value.
+  struct ArgOrType {
+    explicit ArgOrType(int index) : index(index), constraint(None) {}
+    explicit ArgOrType(TypeConstraint constraint)
+        : index(None), constraint(constraint) {}
+    bool isArg() const {
+      assert(constraint.hasValue() ^ index.hasValue());
+      return index.hasValue();
+    }
+    bool isType() const {
+      assert(constraint.hasValue() ^ index.hasValue());
+      return constraint.hasValue();
+    }
+
+    int getArg() const { return *index; }
+    TypeConstraint getType() const { return *constraint; }
+
+  private:
+    Optional<int> index;
+    Optional<TypeConstraint> constraint;
+  };
+
+  // Return all arguments or type constraints with same type as result[index].
+  // Requires: all result types are known.
+  ArrayRef<ArgOrType> getSameTypeAsResult(int index) const;
+
 private:
   // Populates the vectors containing operands, attributes, results and traits.
   void populateOpStructure();
 
+  // Populates type inference info (mostly equality), given a mapping from
+  // names to indices for arguments and results.
+  void populateTypeInferenceInfo(
+      const llvm::StringMap<int> &argumentsAndResultsIndex);
+
   // The dialect of this op.
   Dialect dialect;
 
@@ -261,12 +297,18 @@ class Operator {
   // The regions of this op.
   SmallVector<NamedRegion, 1> regions;
 
+  // The argument with the same type as the result.
+  SmallVector<SmallVector<ArgOrType, 2>, 4> resultTypeMapping;
+
   // The number of native attributes stored in the leading positions of
   // `attributes`.
   int numNativeAttributes;
 
   // The TableGen definition of this op.
   const llvm::Record &def;
+
+  // Whether the types of all results are known.
+  bool allResultsHaveKnownTypes;
 };
 
 } // end namespace tblgen
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index 095c41720fbae..fa9552fc86945 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -223,15 +223,6 @@ static ParseResult parseConstShapeOp(OpAsmParser &parser,
 
 OpFoldResult ConstShapeOp::fold(ArrayRef<Attribute>) { return shape(); }
 
-LogicalResult
-ConstShapeOp::inferReturnTypes(MLIRContext *context,
-                               Optional<Location> location, ValueRange operands,
-                               DictionaryAttr attributes, RegionRange regions,
-                               SmallVectorImpl<Type> &inferredReturnTypes) {
-  inferredReturnTypes.push_back(ShapeType::get(context));
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // ConstSizeOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/TableGen/Attribute.cpp b/mlir/lib/TableGen/Attribute.cpp
index 89dce1958991d..fe1fffbc1a695 100644
--- a/mlir/lib/TableGen/Attribute.cpp
+++ b/mlir/lib/TableGen/Attribute.cpp
@@ -288,3 +288,5 @@ tblgen::StructAttr::getAllFields() const {
 
   return attributes;
 }
+
+const char *mlir::tblgen::inferTypeOpInterface = "InferTypeOpInterface";
diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp
index 808ba7aabc766..2f77184980e28 100644
--- a/mlir/lib/TableGen/Operator.cpp
+++ b/mlir/lib/TableGen/Operator.cpp
@@ -14,6 +14,9 @@
 #include "mlir/TableGen/OpTrait.h"
 #include "mlir/TableGen/Predicate.h"
 #include "mlir/TableGen/Type.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
@@ -155,13 +158,13 @@ auto tblgen::Operator::getArgDecorators(int index) const
 
 const tblgen::OpTrait *tblgen::Operator::getTrait(StringRef trait) const {
   for (const auto &t : traits) {
-    if (auto opTrait = dyn_cast<tblgen::NativeOpTrait>(&t)) {
+    if (const auto *opTrait = dyn_cast<tblgen::NativeOpTrait>(&t)) {
       if (opTrait->getTrait() == trait)
         return opTrait;
-    } else if (auto opTrait = dyn_cast<tblgen::InternalOpTrait>(&t)) {
+    } else if (const auto *opTrait = dyn_cast<tblgen::InternalOpTrait>(&t)) {
       if (opTrait->getTrait() == trait)
         return opTrait;
-    } else if (auto opTrait = dyn_cast<tblgen::InterfaceOpTrait>(&t)) {
+    } else if (const auto *opTrait = dyn_cast<tblgen::InterfaceOpTrait>(&t)) {
       if (opTrait->getTrait() == trait)
         return opTrait;
     }
@@ -252,22 +255,126 @@ auto tblgen::Operator::getArg(int index) const -> Argument {
   return arguments[index];
 }
 
+// Mapping from result index to combined argument and result index. Arguments
+// are indexed to match getArg index, while the result indexes are mapped to
+// avoid overlap.
+static int resultIndex(int i) { return -1 - i; }
+
+bool tblgen::Operator::isVariadic() const {
+  return any_of(llvm::concat<const NamedTypeConstraint>(operands, results),
+                [](const NamedTypeConstraint &op) { return op.isVariadic(); });
+}
+
+void tblgen::Operator::populateTypeInferenceInfo(
+    const llvm::StringMap<int> &argumentsAndResultsIndex) {
+  // If the type inference op interface is not registered, then do not attempt
+  // to determine if the result types can be inferred.
+  auto &recordKeeper = def.getRecords();
+  auto *inferTrait = recordKeeper.getDef(inferTypeOpInterface);
+  allResultsHaveKnownTypes = false;
+  if (!inferTrait)
+    return;
+
+  // If there are no results, then skip this, else the generated build method
+  // overlaps with another autogenerated builder.
+  if (getNumResults() == 0)
+    return;
+
+  // Skip for ops with variadic operands/results.
+  // TODO: This can be relaxed.
+  if (isVariadic())
+    return;
+
+  // Skip cases currently being custom generated.
+  // TODO: Remove special cases.
+  if (getTrait("OpTrait::SameOperandsAndResultType"))
+    return;
+
+  // We create equivalence classes of argument/result types where arguments
+  // and results are mapped into the same index space and indices corresponding
+  // to the same type are in the same equivalence class.
+  llvm::EquivalenceClasses<int> ecs;
+  resultTypeMapping.resize(getNumResults());
+  // Captures the argument whose type matches a given result type. Preference
+  // towards capturing operands first before attributes.
+  auto captureMapping = [&](int i) {
+    bool found = false;
+    ecs.insert(resultIndex(i));
+    auto mi = ecs.findLeader(resultIndex(i));
+    for (auto me = ecs.member_end(); mi != me; ++mi) {
+      if (*mi < 0) {
+        auto tc = getResultTypeConstraint(i);
+        if (tc.getBuilderCall().hasValue()) {
+          resultTypeMapping[i].emplace_back(tc);
+          found = true;
+        }
+        continue;
+      }
+
+      if (auto *attr = getArg(*mi).dyn_cast<NamedAttribute *>()) {
+        // TODO: Handle attributes.
+        continue;
+      } else {
+        resultTypeMapping[i].emplace_back(*mi);
+        found = true;
+      }
+    }
+    return found;
+  };
+
+  for (const OpTrait &trait : traits) {
+    const llvm::Record &def = trait.getDef();
+    // If the infer type op interface was manually added, then treat it as
+    // intention that the op needs special handling.
+    // TODO: Reconsider whether to always generate, this is more conservative
+    // and keeps existing behavior so starting that way for now.
+    if (def.isSubClassOf(
+            llvm::formatv("{0}::Trait", inferTypeOpInterface).str()))
+      return;
+    if (const auto *opTrait = dyn_cast<tblgen::InterfaceOpTrait>(&trait))
+      if (opTrait->getTrait().startswith(inferTypeOpInterface))
+        return;
+
+    if (!def.isSubClassOf("AllTypesMatch"))
+      continue;
+
+    auto values = def.getValueAsListOfStrings("values");
+    auto root = argumentsAndResultsIndex.lookup(values.front());
+    for (StringRef str : values)
+      ecs.unionSets(argumentsAndResultsIndex.lookup(str), root);
+  }
+
+  // Verifies that all output types have a corresponding known input type
+  // and chooses matching operand or attribute (in that order) that
+  // matches it.
+  allResultsHaveKnownTypes =
+      all_of(llvm::seq<int>(0, getNumResults()), captureMapping);
+
+  // If the types could be computed, then add type inference trait.
+  if (allResultsHaveKnownTypes)
+    traits.push_back(OpTrait::create(inferTrait->getDefInit()));
+}
+
 void tblgen::Operator::populateOpStructure() {
   auto &recordKeeper = def.getRecords();
-  auto typeConstraintClass = recordKeeper.getClass("TypeConstraint");
-  auto attrClass = recordKeeper.getClass("Attr");
-  auto derivedAttrClass = recordKeeper.getClass("DerivedAttr");
-  auto opVarClass = recordKeeper.getClass("OpVariable");
+  auto *typeConstraintClass = recordKeeper.getClass("TypeConstraint");
+  auto *attrClass = recordKeeper.getClass("Attr");
+  auto *derivedAttrClass = recordKeeper.getClass("DerivedAttr");
+  auto *opVarClass = recordKeeper.getClass("OpVariable");
   numNativeAttributes = 0;
 
   DagInit *argumentValues = def.getValueAsDag("arguments");
   unsigned numArgs = argumentValues->getNumArgs();
 
+  // Mapping from argument or result name to index. Arguments are indexed
+  // to match getArg index, while the results are negatively indexed.
+  llvm::StringMap<int> argumentsAndResultsIndex;
+
   // Handle operands and native attributes.
   for (unsigned i = 0; i != numArgs; ++i) {
-    auto arg = argumentValues->getArg(i);
+    auto *arg = argumentValues->getArg(i);
     auto givenName = argumentValues->getArgNameStr(i);
-    auto argDefInit = dyn_cast<DefInit>(arg);
+    auto *argDefInit = dyn_cast<DefInit>(arg);
     if (!argDefInit)
       PrintFatalError(def.getLoc(),
                       Twine("undefined type for argument #") + Twine(i));
@@ -290,6 +397,8 @@ void tblgen::Operator::populateOpStructure() {
       PrintFatalError(def.getLoc(), "unexpected def type; only defs deriving "
                                     "from TypeConstraint or Attr are allowed");
     }
+    if (!givenName.empty())
+      argumentsAndResultsIndex[givenName] = i;
   }
 
   // Handle derived attributes.
@@ -348,6 +457,8 @@ void tblgen::Operator::populateOpStructure() {
     if (resultDef->isSubClassOf(opVarClass))
       resultDef = resultDef->getValueAsDef("constraint");
     results.push_back({name, TypeConstraint(resultDef)});
+    if (!name.empty())
+      argumentsAndResultsIndex[name] = resultIndex(i);
   }
 
   // Handle successors
@@ -375,17 +486,19 @@ void tblgen::Operator::populateOpStructure() {
 
   // Create list of traits, skipping over duplicates: appending to lists in
   // tablegen is easy, making them unique less so, so dedupe here.
-  if (auto traitList = def.getValueAsListInit("traits")) {
+  if (auto *traitList = def.getValueAsListInit("traits")) {
     // This is uniquing based on pointers of the trait.
     SmallPtrSet<const llvm::Init *, 32> traitSet;
     traits.reserve(traitSet.size());
-    for (auto traitInit : *traitList) {
+    for (auto *traitInit : *traitList) {
       // Keep traits in the same order while skipping over duplicates.
       if (traitSet.insert(traitInit).second)
         traits.push_back(OpTrait::create(traitInit));
     }
   }
 
+  populateTypeInferenceInfo(argumentsAndResultsIndex);
+
   // Handle regions
   auto *regionsDag = def.getValueAsDag("regions");
   auto *regionsOp = dyn_cast<DefInit>(regionsDag->getOperator());
@@ -415,6 +528,12 @@ void tblgen::Operator::populateOpStructure() {
   LLVM_DEBUG(print(llvm::dbgs()));
 }
 
+auto tblgen::Operator::getSameTypeAsResult(int index) const
+    -> ArrayRef<ArgOrType> {
+  assert(allResultTypesKnown());
+  return resultTypeMapping[index];
+}
+
 ArrayRef<llvm::SMLoc> tblgen::Operator::getLoc() const { return def.getLoc(); }
 
 bool tblgen::Operator::hasDescription() const {
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 9e95932b5680a..997d8eb44ae59 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -756,15 +756,6 @@ def OpSymbolBindingA : TEST_Op<"symbol_binding_a", []> {
 def OpSymbolBindingB : TEST_Op<"symbol_binding_b", []> {
   let arguments = (ins I32:$operand);
   let results = (outs I32);
-
-  let builders = [
-    OpBuilder<
-      "OpBuilder &builder, OperationState &state, Value operand",
-      [{
-        state.types.assign({builder.getIntegerType(32)});
-        state.addOperands({operand});
-      }]>
-  ];
 }
 def OpSymbolBindingC : TEST_Op<"symbol_binding_c", []> {
   let arguments = (ins I32:$operand);
@@ -868,17 +859,6 @@ def AnotherThreeResultOp : TEST_Op<"another_three_result"> {
 def TwoResultOp : TEST_Op<"two_result"> {
   let arguments = (ins MultiResultOpEnum:$kind);
   let results = (outs I32:$result1, F32:$result2);
-
-  let builders = [
-    OpBuilder<
-      "OpBuilder &builder, OperationState &state, IntegerAttr kind",
-      [{
-        auto i32 = builder.getIntegerType(32);
-        auto f32 = builder.getF32Type();
-        state.types.assign({i32, f32});
-        state.addAttribute("kind", kind);
-      }]>
-  ];
 }
 
 def AnotherTwoResultOp : TEST_Op<"another_two_result"> {
diff --git a/mlir/test/mlir-tblgen/op-decl.td b/mlir/test/mlir-tblgen/op-decl.td
index c68d03c96b308..565f1921125a3 100644
--- a/mlir/test/mlir-tblgen/op-decl.td
+++ b/mlir/test/mlir-tblgen/op-decl.td
@@ -1,6 +1,7 @@
 // RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck --dump-input-on-failure %s
 
 include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 def Test_Dialect : Dialect {
@@ -44,8 +45,6 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> {
   }];
 }
 
-// CHECK: class AOp;
-
 // CHECK-LABEL: NS::AOp declarations
 
 // CHECK: class AOpOperandAdaptor {
@@ -150,6 +149,26 @@ def NS_EOp : NS_Op<"op_with_optionals", []> {
 // CHECK:   Value b();
 // CHECK:   static void build(OpBuilder &odsBuilder, OperationState &odsState, /*optional*/Type b, /*optional*/Value a)
 
+// Check that all types match constraint results in generating builder.
+// ---
+
+def NS_FOp : NS_Op<"op_with_all_types_constraint",
+    [AllTypesMatch<["a", "b"]>]> {
+  let arguments = (ins AnyType:$a);
+  let results = (outs AnyType:$b);
+}
+
+// CHECK-LABEL: class FOp :
+// CHECK: static LogicalResult inferReturnTypes
+
+def NS_GOp : NS_Op<"op_with_fixed_return_type", []> {
+  let arguments = (ins AnyType:$a);
+  let results = (outs I32:$b);
+}
+
+// CHECK-LABEL: class GOp :
+// CHECK: static LogicalResult inferReturnTypes
+
 // Check that default builders can be suppressed.
 // ---
 
diff --git a/mlir/test/mlir-tblgen/types.mlir b/mlir/test/mlir-tblgen/types.mlir
index 6850b77c7672e..6a0a80ca5e5fc 100644
--- a/mlir/test/mlir-tblgen/types.mlir
+++ b/mlir/test/mlir-tblgen/types.mlir
@@ -438,7 +438,7 @@ func @operand_rank_equals_result_size_failure(%arg : tensor<1x2x3x4xi32>) {
 // -----
 
 func @same_types_element_mismatch(%arg0: tensor<* x i32>, %arg1: tensor<* x f32>) {
-  // expected-error@+1 {{all of {x, res} have same type}}
+  // expected-error@+1 {{type incompatible with return type of operation}}
   "test.operand0_and_result_have_same_type"(%arg0, %arg1) : (tensor<* x i32>, tensor<* x f32>) -> tensor<* x f32>
   return
 }
@@ -446,7 +446,7 @@ func @same_types_element_mismatch(%arg0: tensor<* x i32>, %arg1: tensor<* x f32>
 // -----
 
 func @same_types_shape_mismatch(%arg0: tensor<1x2xi32>, %arg1: tensor<2x1xi32>) {
-  // expected-error@+1 {{all of {x, res} have same type}}
+  // expected-error@+1 {{type incompatible with return type of operation}}
   "test.operand0_and_result_have_same_type"(%arg0, %arg1) : (tensor<1x2xi32>, tensor<2x1xi32>) -> tensor<2x1xi32>
   return
 }
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 2010262f21858..0b55825d1a46c 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -295,9 +295,15 @@ class OpEmitter {
   // Generate the OpInterface methods.
   void genOpInterfaceMethods();
 
+  // Generate op interface method.
+  void genOpInterfaceMethod(const tblgen::InterfaceOpTrait *trait);
+
   // Generate the side effect interface methods.
   void genSideEffectInterfaceMethods();
 
+  // Generate the type inference interface methods.
+  void genTypeInterfaceMethods();
+
 private:
   // The TableGen record for this op.
   // TODO(antiagainst,zinenko): OpEmitter should not have a Record directly,
@@ -321,6 +327,7 @@ OpEmitter::OpEmitter(const Operator &op)
   verifyCtx.withOp("(*this->getOperation())");
 
   genTraits();
+
   // Generate C++ code for various op methods. The order here determines the
   // methods in the generated file.
   genOpAsmInterface();
@@ -341,6 +348,7 @@ OpEmitter::OpEmitter(const Operator &op)
   genOpInterfaceMethods();
   generateOpFormat(op, opClass);
   genSideEffectInterfaceMethods();
+  genTypeInterfaceMethods();
 }
 
 void OpEmitter::emitDecl(const Operator &op, raw_ostream &os) {
@@ -750,6 +758,10 @@ static bool canGenerateUnwrappedBuilder(Operator &op) {
   return canGenerate;
 }
 
+static bool canInferType(Operator &op) {
+  return op.getTrait("InferTypeOpInterface::Trait") && op.getNumRegions() == 0;
+}
+
 void OpEmitter::genSeparateArgParamBuilder() {
   SmallVector<AttrParamKind, 2> attrBuilderType;
   attrBuilderType.push_back(AttrParamKind::WrappedAttr);
@@ -814,11 +826,9 @@ void OpEmitter::genSeparateArgParamBuilder() {
     llvm_unreachable("unhandled TypeParamKind");
   };
 
-  bool canInferType =
-      op.getTrait("InferTypeOpInterface::Trait") && op.getNumRegions() == 0;
   for (auto attrType : attrBuilderType) {
     emit(attrType, TypeParamKind::Separate, /*inferType=*/false);
-    if (canInferType)
+    if (canInferType(op))
       emit(attrType, TypeParamKind::None, /*inferType=*/true);
     // Emit separate arg build with collective type, unless there is only one
     // variadic result, in which case the above would have already generated
@@ -1070,11 +1080,8 @@ void OpEmitter::genCollectiveParamBuilder() {
   body << "  " << builderOpState << ".addTypes(resultTypes);\n";
 
   // Generate builder that infers type too.
-  // TODO(jpienaar): Subsume this with general checking if type can be inferred
-  // automatically.
   // TODO(jpienaar): Expand to handle regions and successors.
-  if (op.getTrait("InferTypeOpInterface::Trait") && op.getNumRegions() == 0 &&
-      op.getNumSuccessors() == 0)
+  if (canInferType(op) && op.getNumSuccessors() == 0)
     genInferredTypeCollectiveParamBuilder();
 }
 
@@ -1318,40 +1325,43 @@ void OpEmitter::genFolderDecls() {
   }
 }
 
+void OpEmitter::genOpInterfaceMethod(const tblgen::InterfaceOpTrait *opTrait) {
+  auto interface = opTrait->getOpInterface();
+
+  // Get the set of methods that should always be declared.
+  auto alwaysDeclaredMethodsVec = opTrait->getAlwaysDeclaredMethods();
+  llvm::StringSet<> alwaysDeclaredMethods;
+  alwaysDeclaredMethods.insert(alwaysDeclaredMethodsVec.begin(),
+                               alwaysDeclaredMethodsVec.end());
+
+  for (const OpInterfaceMethod &method : interface.getMethods()) {
+    // Don't declare if the method has a body.
+    if (method.getBody())
+      continue;
+    // Don't declare if the method has a default implementation and the op
+    // didn't request that it always be declared.
+    if (method.getDefaultImplementation() &&
+        !alwaysDeclaredMethods.count(method.getName()))
+      continue;
+
+    std::string args;
+    llvm::raw_string_ostream os(args);
+    interleaveComma(method.getArguments(), os,
+                    [&](const OpInterfaceMethod::Argument &arg) {
+                      os << arg.type << " " << arg.name;
+                    });
+    opClass.newMethod(method.getReturnType(), method.getName(), os.str(),
+                      method.isStatic() ? OpMethod::MP_Static
+                                        : OpMethod::MP_None,
+                      /*declOnly=*/true);
+  }
+}
+
 void OpEmitter::genOpInterfaceMethods() {
   for (const auto &trait : op.getTraits()) {
-    auto opTrait = dyn_cast<tblgen::InterfaceOpTrait>(&trait);
-    if (!opTrait || !opTrait->shouldDeclareMethods())
-      continue;
-    auto interface = opTrait->getOpInterface();
-
-    // Get the set of methods that should always be declared.
-    auto alwaysDeclaredMethodsVec = opTrait->getAlwaysDeclaredMethods();
-    llvm::StringSet<> alwaysDeclaredMethods;
-    alwaysDeclaredMethods.insert(alwaysDeclaredMethodsVec.begin(),
-                                 alwaysDeclaredMethodsVec.end());
-
-    for (const OpInterfaceMethod &method : interface.getMethods()) {
-      // Don't declare if the method has a body.
-      if (method.getBody())
-        continue;
-      // Don't declare if the method has a default implementation and the op
-      // didn't request that it always be declared.
-      if (method.getDefaultImplementation() &&
-          !alwaysDeclaredMethods.count(method.getName()))
-        continue;
-
-      std::string args;
-      llvm::raw_string_ostream os(args);
-      interleaveComma(method.getArguments(), os,
-                      [&](const OpInterfaceMethod::Argument &arg) {
-                        os << arg.type << " " << arg.name;
-                      });
-      opClass.newMethod(method.getReturnType(), method.getName(), os.str(),
-                        method.isStatic() ? OpMethod::MP_Static
-                                          : OpMethod::MP_None,
-                        /*declOnly=*/true);
-    }
+    if (const auto *opTrait = dyn_cast<tblgen::InterfaceOpTrait>(&trait))
+      if (opTrait->shouldDeclareMethods())
+        genOpInterfaceMethod(opTrait);
   }
 }
 
@@ -1431,6 +1441,46 @@ void OpEmitter::genSideEffectInterfaceMethods() {
   }
 }
 
+void OpEmitter::genTypeInterfaceMethods() {
+  if (!op.allResultTypesKnown())
+    return;
+
+  auto &method = opClass.newMethod(
+      "LogicalResult", "inferReturnTypes",
+      "MLIRContext* context, Optional<Location> location, "
+      "ValueRange operands, DictionaryAttr attributes, RegionRange regions, "
+      "SmallVectorImpl<Type>& inferredReturnTypes",
+      OpMethod::MP_Static,
+      /*declOnly=*/false);
+  auto &os = method.body();
+  os << "  inferredReturnTypes.resize(" << op.getNumResults() << ");\n";
+
+  FmtContext fctx;
+  fctx.withBuilder("odsBuilder");
+  os << "  Builder odsBuilder(context);\n";
+
+  auto emitType =
+      [&](const tblgen::Operator::ArgOrType &type) -> OpMethodBody & {
+    if (type.isArg()) {
+      auto argIndex = type.getArg();
+      assert(!op.getArg(argIndex).is<NamedAttribute *>());
+      return os << "operands[" << argIndex << "].getType()";
+    } else {
+      return os << tgfmt(*type.getType().getBuilderCall(), &fctx);
+    }
+  };
+
+  for (int i = 0, e = op.getNumResults(); i != e; ++i) {
+    os << "  inferredReturnTypes[" << i << "] = ";
+    auto types = op.getSameTypeAsResult(i);
+    emitType(types[0]) << ";\n";
+    if (types.size() == 1)
+      continue;
+    // TODO: We could verify equality here, but skipping that for verification.
+  }
+  os << "  return success();";
+}
+
 void OpEmitter::genParser() {
   if (!hasStringAttribute(def, "parser") ||
       hasStringAttribute(def, "assemblyFormat"))

From 6022efb0e9cbb350f7b690acd0cfa4b87b1dfe87 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 May 2020 16:57:28 +0100
Subject: [PATCH 236/770] CoverageFilters.h - reduce unnecessary includes to
 forward declarations. NFC.

---
 llvm/tools/llvm-cov/CodeCoverage.cpp    |  1 +
 llvm/tools/llvm-cov/CoverageFilters.cpp |  1 +
 llvm/tools/llvm-cov/CoverageFilters.h   | 10 +++++++---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 1d464431cbbe2..b3c895b44a6d6 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/SpecialCaseList.h"
 #include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/ToolOutputFile.h"
diff --git a/llvm/tools/llvm-cov/CoverageFilters.cpp b/llvm/tools/llvm-cov/CoverageFilters.cpp
index ca241e386e87e..da3b5214eec4c 100644
--- a/llvm/tools/llvm-cov/CoverageFilters.cpp
+++ b/llvm/tools/llvm-cov/CoverageFilters.cpp
@@ -13,6 +13,7 @@
 #include "CoverageFilters.h"
 #include "CoverageSummaryInfo.h"
 #include "llvm/Support/Regex.h"
+#include "llvm/Support/SpecialCaseList.h"
 
 using namespace llvm;
 
diff --git a/llvm/tools/llvm-cov/CoverageFilters.h b/llvm/tools/llvm-cov/CoverageFilters.h
index ce56e16071117..ccaa7a9df5905 100644
--- a/llvm/tools/llvm-cov/CoverageFilters.h
+++ b/llvm/tools/llvm-cov/CoverageFilters.h
@@ -13,13 +13,17 @@
 #ifndef LLVM_COV_COVERAGEFILTERS_H
 #define LLVM_COV_COVERAGEFILTERS_H
 
-#include "CoverageSummaryInfo.h"
-#include "llvm/ProfileData/Coverage/CoverageMapping.h"
-#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/ADT/StringRef.h"
 #include <memory>
 #include <vector>
 
 namespace llvm {
+class SpecialCaseList;
+
+namespace coverage {
+class CoverageMapping;
+struct FunctionRecord;
+}; // namespace coverage
 
 /// Matches specific functions that pass the requirement of this filter.
 class CoverageFilter {

From aca3d067efe194539efd1e0fcf03820a2c377753 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Wed, 27 May 2020 09:55:29 -0400
Subject: [PATCH 237/770] Fix Darwin 'constinit thread_local' variables.

Unlike other platforms using ItaniumCXXABI, Darwin does not allow the
creation of a thread-wrapper function for a variable in the TU of
users. Because of this, it can set the linkage of the thread-local
symbol to internal, with the assumption that no TUs other than the one
defining the variable will need it.

However, constinit thread_local variables do not require the use of
the thread-wrapper call, so users reference the variable
directly. Thus, it must not be converted to internal, or users will
get a link failure.

This was a regression introduced by the optimization in
00223827a952f66e7426c9881a2a4229e59bb019.

Differential Revision: https://reviews.llvm.org/D80417
---
 clang/lib/CodeGen/CodeGenModule.cpp           | 27 +++++----
 .../cxx2a-thread-local-constinit.cpp          | 57 ++++++++++++-------
 2 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index f43bc6434dafd..89a95db086804 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4136,17 +4136,24 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
 
   GV->setAlignment(getContext().getDeclAlign(D).getAsAlign());
 
-  // On Darwin, if the normal linkage of a C++ thread_local variable is
-  // LinkOnce or Weak, we keep the normal linkage to prevent multiple
-  // copies within a linkage unit; otherwise, the backing variable has
-  // internal linkage and all accesses should just be calls to the
-  // Itanium-specified entry point, which has the normal linkage of the
-  // variable. This is to preserve the ability to change the implementation
-  // behind the scenes.
-  if (!D->isStaticLocal() && D->getTLSKind() == VarDecl::TLS_Dynamic &&
+  // On Darwin, unlike other Itanium C++ ABI platforms, the thread-wrapper
+  // function is only defined alongside the variable, not also alongside
+  // callers. Normally, all accesses to a thread_local go through the
+  // thread-wrapper in order to ensure initialization has occurred, so the
+  // underlying variable will never be used other than by the thread-wrapper,
+  // and it can be converted to internal linkage.
+  //
+  // However, if the variable has the 'constinit' attribute, it _can_ be
+  // referenced directly, without calling the thread-wrapper, so the linkage
+  // must not be changed.
+  //
+  // Additionally, if the variable isn't plain external linkage, e.g. if it's
+  // weak or linkonce, the de-duplication semantics are important to preserve,
+  // so we don't change the linkage.
+  if (D->getTLSKind() == VarDecl::TLS_Dynamic &&
+      Linkage == llvm::GlobalValue::ExternalLinkage &&
       Context.getTargetInfo().getTriple().isOSDarwin() &&
-      !llvm::GlobalVariable::isLinkOnceLinkage(Linkage) &&
-      !llvm::GlobalVariable::isWeakLinkage(Linkage))
+      !D->hasAttr<ConstInitAttr>())
     Linkage = llvm::GlobalValue::InternalLinkage;
 
   GV->setLinkage(Linkage);
diff --git a/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp b/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp
index f47707555098b..99c5d721dc476 100644
--- a/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp
+++ b/clang/test/CodeGenCXX/cxx2a-thread-local-constinit.cpp
@@ -1,41 +1,55 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++2a %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++2a %s -emit-llvm -o - | FileCheck --check-prefix=CHECK --check-prefix=LINUX %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin12  -std=c++2a %s -emit-llvm -o - | FileCheck --check-prefix=CHECK --check-prefix=DARWIN %s
+
+// Check variable definitions/declarations. Note that on Darwin, typically the
+// variable's symbol is marked internal, and only the _ZTW function is
+// exported. Except: constinit variables do get exported, even on darwin.
+
+// CHECK-DAG:  @a = external thread_local global i32
+// CHECK-DAG:  @b = external thread_local global i32
+// LINUX-DAG:  @c = thread_local global i32 0, align 4
+// DARWIN-DAG: @c = internal thread_local global i32 0, align 4
+// LINUX-DAG:  @d = thread_local global i32 0, align 4
+// DARWIN-DAG: @d = internal thread_local global i32 0, align 4
+// CHECK-DAG:  @e = external thread_local global %struct.Destructed, align 4
+// CHECK-DAG:  @e2 = thread_local global %struct.Destructed zeroinitializer, align 4
+// CHECK-DAG:  @f = thread_local global i32 4, align 4
 
-// CHECK-DAG: @a = external thread_local global i32
 extern thread_local int a;
-
-// CHECK-DAG: @b = external thread_local global i32
 extern thread_local constinit int b;
 
-// CHECK-LABEL: define i32 @_Z1fv()
-// CHECK: call i32* @_ZTW1a()
+// CHECK-LABEL: define i32 @_Z5get_av()
+// CHECK: call {{(cxx_fast_tlscc )?}}i32* @_ZTW1a()
 // CHECK: }
-int f() { return a; }
+int get_a() { return a; }
 
-// CHECK-LABEL: define linkonce_odr {{.*}} @_ZTW1a()
-// CHECK: br i1
-// CHECK: call void @_ZTH1a()
-// CHECK: }
+// LINUX-LABEL: define linkonce_odr {{.*}} @_ZTW1a()
+// LINUX: br i1
+// LINUX: call void @_ZTH1a()
+// LINUX: }
+// DARWIN-NOT: define {{.*}}@_ZTW1a()
 
-// CHECK-LABEL: define i32 @_Z1gv()
+// CHECK-LABEL: define i32 @_Z5get_bv()
 // CHECK-NOT: call
 // CHECK: load i32, i32* @b
 // CHECK-NOT: call
 // CHECK: }
-int g() { return b; }
+int get_b() { return b; }
 
 // CHECK-NOT: define {{.*}} @_ZTW1b()
 
 extern thread_local int c;
 
-// CHECK-LABEL: define i32 @_Z1hv()
-// CHECK: call i32* @_ZTW1c()
+// CHECK-LABEL: define i32 @_Z5get_cv()
+// LINUX: call {{(cxx_fast_tlscc )?}}i32* @_ZTW1c()
 // CHECK: load i32, i32* %
 // CHECK: }
-int h() { return c; }
+int get_c() { return c; }
 
 // Note: use of 'c' does not trigger initialization of 'd', because 'c' has a
 // constant initializer.
-// CHECK-LABEL: define weak_odr {{.*}} @_ZTW1c()
+// DARWIN-LABEL: define cxx_fast_tlscc {{.*}} @_ZTW1c()
+// LINUX-LABEL: define weak_odr {{.*}} @_ZTW1c()
 // CHECK-NOT: br i1
 // CHECK-NOT: call
 // CHECK: ret i32* @c
@@ -55,15 +69,18 @@ struct Destructed {
 };
 
 extern thread_local constinit Destructed e;
-// CHECK-LABEL: define i32 @_Z1iv()
+// CHECK-LABEL: define i32 @_Z5get_ev()
 // CHECK: call {{.*}}* @_ZTW1e()
 // CHECK: }
-int i() { return e.n; }
+int get_e() { return e.n; }
 
 // CHECK: define {{.*}}[[E2_INIT:@__cxx_global_var_init[^(]*]](
-// CHECK: call {{.*}} @__cxa_thread_atexit({{.*}} @_ZN10DestructedD1Ev {{.*}} @e2
+// LINUX: call {{.*}} @__cxa_thread_atexit({{.*}} @_ZN10DestructedD1Ev {{.*}} @e2
+// DARWIN: call {{.*}} @_tlv_atexit({{.*}} @_ZN10DestructedD1Ev {{.*}} @e2
 thread_local constinit Destructed e2;
 
+thread_local constinit int f = 4;
+
 // CHECK-LABEL: define {{.*}}__tls_init
 // CHECK: call {{.*}} [[D_INIT]]
 // CHECK: call {{.*}} [[E2_INIT]]

From b0404681171d8cfebdb1f439f45aeb1001321eb7 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Wed, 27 May 2020 12:05:55 -0400
Subject: [PATCH 238/770] Fix warning `-Wpedantic`. NFC.

---
 llvm/include/llvm/Support/SpecialCaseList.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index 330e96a7b9acb..2d4930fb407e0 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -64,7 +64,7 @@ class StringRef;
 
 namespace vfs {
 class FileSystem;
-};
+}
 
 class SpecialCaseList {
 public:

From 495f18292b2bc90a162b79d187c6d14ecfbe98f9 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Tue, 26 May 2020 16:54:02 +0000
Subject: [PATCH 239/770] [VFABI] Fix parsing of uniform parameters that
 shouldn't expect step or positional data.

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80575
---
 llvm/lib/Analysis/VFABIDemangling.cpp         | 32 +++------------
 .../Analysis/VectorFunctionABITest.cpp        | 40 +++++++++----------
 2 files changed, 26 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp
index a975e0ff493a9..0192a216b2f7d 100644
--- a/llvm/lib/Analysis/VFABIDemangling.cpp
+++ b/llvm/lib/Analysis/VFABIDemangling.cpp
@@ -214,28 +214,6 @@ ParseRet tryParseLinearWithCompileTimeStep(StringRef &ParseString,
   return ParseRet::None;
 }
 
-/// The function looks for the following strings at the beginning of
-/// the input string `ParseString`:
-///
-/// "u" <number>
-///
-/// On success, it removes the parsed parameter from `ParseString`,
-/// sets `PKind` to the correspondent enum value, sets `Pos` to
-/// <number>, and return success.  On a syntax error, it return a
-/// parsing error. If nothing is parsed, it returns None.
-ParseRet tryParseUniform(StringRef &ParseString, VFParamKind &PKind, int &Pos) {
-  // "u" <Pos>
-  const char *UniformToken = "u";
-  if (ParseString.consume_front(UniformToken)) {
-    PKind = VFABI::getVFParamKindFromString(UniformToken);
-    if (ParseString.consumeInteger(10, Pos))
-      return ParseRet::Error;
-
-    return ParseRet::OK;
-  }
-  return ParseRet::None;
-}
-
 /// Looks into the <parameters> part of the mangled name in search
 /// for valid paramaters at the beginning of the string
 /// `ParseString`.
@@ -252,6 +230,12 @@ ParseRet tryParseParameter(StringRef &ParseString, VFParamKind &PKind,
     return ParseRet::OK;
   }
 
+  if (ParseString.consume_front("u")) {
+    PKind = VFParamKind::OMP_Uniform;
+    StepOrPos = 0;
+    return ParseRet::OK;
+  }
+
   const ParseRet HasLinearRuntime =
       tryParseLinearWithRuntimeStep(ParseString, PKind, StepOrPos);
   if (HasLinearRuntime != ParseRet::None)
@@ -262,10 +246,6 @@ ParseRet tryParseParameter(StringRef &ParseString, VFParamKind &PKind,
   if (HasLinearCompileTime != ParseRet::None)
     return HasLinearCompileTime;
 
-  const ParseRet HasUniform = tryParseUniform(ParseString, PKind, StepOrPos);
-  if (HasUniform != ParseRet::None)
-    return HasUniform;
-
   return ParseRet::None;
 }
 
diff --git a/llvm/unittests/Analysis/VectorFunctionABITest.cpp b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
index cc6dbe84c09f7..6668529f49e09 100644
--- a/llvm/unittests/Analysis/VectorFunctionABITest.cpp
+++ b/llvm/unittests/Analysis/VectorFunctionABITest.cpp
@@ -326,7 +326,7 @@ TEST_F(VFABIParserTest, Align) {
 }
 
 TEST_F(VFABIParserTest, ParseUniform) {
-  EXPECT_TRUE(invokeParser("_ZGVnN2u0_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVnN2u_sin"));
   EXPECT_EQ(VF, (unsigned)2);
   EXPECT_FALSE(IsMasked());
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
@@ -334,10 +334,10 @@ TEST_F(VFABIParserTest, ParseUniform) {
   EXPECT_EQ(Parameters.size(), (unsigned)1);
   EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::OMP_Uniform, 0}));
   EXPECT_EQ(ScalarName, "sin");
-  EXPECT_EQ(VectorName, "_ZGVnN2u0_sin");
+  EXPECT_EQ(VectorName, "_ZGVnN2u_sin");
 
-  EXPECT_FALSE(invokeParser("_ZGVnN2u_sin"));
-  EXPECT_FALSE(invokeParser("_ZGVnN2ul_sin"));
+  // Uniform doesn't expect extra data.
+  EXPECT_FALSE(invokeParser("_ZGVnN2u0_sin"));
 }
 
 TEST_F(VFABIParserTest, ISAIndependentMangling) {
@@ -353,7 +353,7 @@ TEST_F(VFABIParserTest, ISAIndependentMangling) {
       VFParameter({6, VFParamKind::OMP_LinearVal, 10}),
       VFParameter({7, VFParamKind::OMP_LinearUVal, 100}),
       VFParameter({8, VFParamKind::OMP_LinearRef, 1000}),
-      VFParameter({9, VFParamKind::OMP_Uniform, 2}),
+      VFParameter({9, VFParamKind::OMP_Uniform, 0}),
   };
 
 #define __COMMON_CHECKS                                                        \
@@ -367,54 +367,54 @@ TEST_F(VFABIParserTest, ISAIndependentMangling) {
   } while (0)
 
   // Advanced SIMD: <isa> = "n"
-  EXPECT_TRUE(invokeParser("_ZGVnN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVnN2vls2Ls27Us4Rs5l1L10U100R1000u_sin"));
   EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD);
   __COMMON_CHECKS;
-  EXPECT_EQ(VectorName, "_ZGVnN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin");
+  EXPECT_EQ(VectorName, "_ZGVnN2vls2Ls27Us4Rs5l1L10U100R1000u_sin");
 
   // SVE: <isa> = "s"
-  EXPECT_TRUE(invokeParser("_ZGVsN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVsN2vls2Ls27Us4Rs5l1L10U100R1000u_sin"));
   EXPECT_EQ(ISA, VFISAKind::SVE);
   __COMMON_CHECKS;
-  EXPECT_EQ(VectorName, "_ZGVsN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin");
+  EXPECT_EQ(VectorName, "_ZGVsN2vls2Ls27Us4Rs5l1L10U100R1000u_sin");
 
   // SSE: <isa> = "b"
-  EXPECT_TRUE(invokeParser("_ZGVbN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVbN2vls2Ls27Us4Rs5l1L10U100R1000u_sin"));
   EXPECT_EQ(ISA, VFISAKind::SSE);
   __COMMON_CHECKS;
-  EXPECT_EQ(VectorName, "_ZGVbN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin");
+  EXPECT_EQ(VectorName, "_ZGVbN2vls2Ls27Us4Rs5l1L10U100R1000u_sin");
 
   // AVX: <isa> = "c"
-  EXPECT_TRUE(invokeParser("_ZGVcN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVcN2vls2Ls27Us4Rs5l1L10U100R1000u_sin"));
   EXPECT_EQ(ISA, VFISAKind::AVX);
   __COMMON_CHECKS;
-  EXPECT_EQ(VectorName, "_ZGVcN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin");
+  EXPECT_EQ(VectorName, "_ZGVcN2vls2Ls27Us4Rs5l1L10U100R1000u_sin");
 
   // AVX2: <isa> = "d"
-  EXPECT_TRUE(invokeParser("_ZGVdN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVdN2vls2Ls27Us4Rs5l1L10U100R1000u_sin"));
   EXPECT_EQ(ISA, VFISAKind::AVX2);
   __COMMON_CHECKS;
-  EXPECT_EQ(VectorName, "_ZGVdN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin");
+  EXPECT_EQ(VectorName, "_ZGVdN2vls2Ls27Us4Rs5l1L10U100R1000u_sin");
 
   // AVX512: <isa> = "e"
-  EXPECT_TRUE(invokeParser("_ZGVeN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVeN2vls2Ls27Us4Rs5l1L10U100R1000u_sin"));
   EXPECT_EQ(ISA, VFISAKind::AVX512);
   __COMMON_CHECKS;
-  EXPECT_EQ(VectorName, "_ZGVeN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin");
+  EXPECT_EQ(VectorName, "_ZGVeN2vls2Ls27Us4Rs5l1L10U100R1000u_sin");
 
   // LLVM: <isa> = "_LLVM_" internal vector function.
   EXPECT_TRUE(invokeParser(
-      "_ZGV_LLVM_N2vls2Ls27Us4Rs5l1L10U100R1000u2_sin(vectorf)", "vectorf"));
+      "_ZGV_LLVM_N2vls2Ls27Us4Rs5l1L10U100R1000u_sin(vectorf)", "vectorf"));
   EXPECT_EQ(ISA, VFISAKind::LLVM);
   __COMMON_CHECKS;
   EXPECT_EQ(VectorName, "vectorf");
 
   // Unknown ISA (randomly using "q"). This test will need update if
   // some targets decide to use "q" as their ISA token.
-  EXPECT_TRUE(invokeParser("_ZGVqN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin"));
+  EXPECT_TRUE(invokeParser("_ZGVqN2vls2Ls27Us4Rs5l1L10U100R1000u_sin"));
   EXPECT_EQ(ISA, VFISAKind::Unknown);
   __COMMON_CHECKS;
-  EXPECT_EQ(VectorName, "_ZGVqN2vls2Ls27Us4Rs5l1L10U100R1000u2_sin");
+  EXPECT_EQ(VectorName, "_ZGVqN2vls2Ls27Us4Rs5l1L10U100R1000u_sin");
 
 #undef __COMMON_CHECKS
 }

From 1af3705c7fe23db9d5308bfdf07bfbd04398b895 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Wed, 27 May 2020 09:14:54 -0700
Subject: [PATCH 240/770] Start migrating away from statepoint's inline length
 prefixed argument bundles

In the current statepoint design, we have four distinct groups of operands to the call: call args, gc transition args, deopt args, and gc args. This format preexisted the support in IR for operand bundles and was in fact one of the inspirations for the extension. However, we never went back and rearchitected statepoints to fully leverage bundles.

This change is the first in a small sequence to do so. All this does is extend the SelectionDAG lowering code to allow deopt and gc transition operands to be specified in either inline argument bundles or operand bundles.

Differential Revision: https://reviews.llvm.org/D8059
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  1 +
 .../SelectionDAG/StatepointLowering.cpp       | 22 ++++++++++++++++---
 llvm/lib/IR/Verifier.cpp                      | 14 ++++++++++++
 .../statepoint-gctransition-call-lowering.ll  | 15 +++++++++++++
 llvm/test/CodeGen/X86/statepoint-regs.ll      | 17 ++++++++++++++
 5 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dd03e415910cc..c1b4d7431ca8f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2775,6 +2775,7 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
   // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
   // have to do anything here to lower funclet bundles.
   assert(!I.hasOperandBundlesOtherThan({LLVMContext::OB_deopt,
+                                        LLVMContext::OB_gc_transition,
                                         LLVMContext::OB_funclet,
                                         LLVMContext::OB_cfguardtarget}) &&
          "Cannot lower invokes with arbitrary operand bundles yet!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index d64515d763d35..9a35bd41d1167 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -866,10 +866,26 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
 
   SI.GCArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
   SI.StatepointInstr = ISP.getInstruction();
-  SI.GCTransitionArgs = ArrayRef<const Use>(ISP.gc_transition_args_begin(),
-                                            ISP.gc_transition_args_end());
   SI.ID = ISP.getID();
-  SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end());
+
+  if (auto Opt = ISP.getCall()->getOperandBundle(LLVMContext::OB_deopt)) {
+    assert(ISP.deopt_begin() == ISP.deopt_end() &&
+           "can't list both deopt operands and deopt bundle");
+    auto &Inputs = Opt->Inputs;
+    SI.DeoptState = ArrayRef<const Use>(Inputs.begin(), Inputs.end());
+  } else {
+    SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end());
+  }
+  if (auto Opt = ISP.getCall()->getOperandBundle(LLVMContext::OB_gc_transition)) {
+    assert(ISP.gc_transition_args_begin() == ISP.gc_transition_args_end() &&
+           "can't list both gc_transition operands and bundle");
+    auto &Inputs = Opt->Inputs;
+    SI.GCTransitionArgs = ArrayRef<const Use>(Inputs.begin(), Inputs.end());
+  } else {
+    SI.GCTransitionArgs = ArrayRef<const Use>(ISP.gc_transition_args_begin(),
+                                              ISP.gc_transition_args_end());
+  }
+
   SI.StatepointFlags = ISP.getFlags();
   SI.NumPatchBytes = ISP.getNumPatchBytes();
   SI.EHPadBB = EHPadBB;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 5ca6762d1c7fd..f4680fffa8582 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2085,6 +2085,13 @@ void Verifier::verifyStatepoint(const CallBase &Call) {
          "gc.statepoint number of transition arguments must be positive", Call);
   const int EndTransitionArgsInx = EndCallArgsInx + 1 + NumTransitionArgs;
 
+  // We're migrating away from inline operands to operand bundles, enforce
+  // the either/or property during transition.
+  if (Call.getOperandBundle(LLVMContext::OB_gc_transition)) {
+    Assert(NumTransitionArgs == 0,
+           "can't use both gc-transition operands and bundle on a statepoint");
+  }
+
   const Value *NumDeoptArgsV = Call.getArgOperand(EndTransitionArgsInx + 1);
   Assert(isa<ConstantInt>(NumDeoptArgsV),
          "gc.statepoint number of deoptimization arguments "
@@ -2096,6 +2103,13 @@ void Verifier::verifyStatepoint(const CallBase &Call) {
          "must be positive",
          Call);
 
+  // We're migrating away from inline operands to operand bundles, enforce
+  // the either/or property during transition.
+  if (Call.getOperandBundle(LLVMContext::OB_deopt)) {
+    Assert(NumDeoptArgs == 0,
+           "can't use both deopt operands and deopt bundle on a statepoint");
+  }
+
   const int ExpectedNumArgs =
       7 + NumCallArgs + NumTransitionArgs + NumDeoptArgs;
   Assert(ExpectedNumArgs <= (int)Call.arg_size(),
diff --git a/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll b/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
index 90f2002e2d452..c98badf682686 100644
--- a/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
+++ b/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
@@ -116,6 +116,21 @@ entry:
   ret i32 %call1
 }
 
+; Same as test_transition_args_2 except using bundle format
+define i32 @test_bundle() gc "statepoint-example" {
+; CHECK-LABEL: test_bundle
+; CHECK: pushq %rax
+; CHECK: callq return_i32
+; CHECK: popq %rcx
+; CHECK: retq
+entry:
+  %val = alloca i32
+  %arg = alloca i8
+  %safepoint_token = call token (i64, i32, i32 (i32, i8*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32i32p0i8f(i64 0, i32 0, i32 (i32, i8*)* @return_i32_with_args, i32 2, i32 1, i32 0, i8* %arg, i32 0, i32 0) ["gc-transition" (i32* %val, i64 42)]
+  %call1 = call i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
+  ret i32 %call1
+}
+
 declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...)
 declare i1 @llvm.experimental.gc.result.i1(token)
 
diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
index 0f5bad8ee7ddf..b137b18e88e32 100644
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -691,6 +691,23 @@ define i32 addrspace(1)*  @test_fpconst_deopt(i32 addrspace(1)* %in) gc "statepo
     %out = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %statepoint_token, i32 27, i32 27)
     ret i32 addrspace(1)* %out
 }
+
+; Same as test1, but using deopt bundle
+define void @test1b(i32 %a) gc "statepoint-example" {
+; CHECK-LABEL: test1b:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    callq _bar ## 4-byte Folded Reload
+; CHECK-NEXT:  Ltmp19:
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a)]
+  ret void
+}
+
 ; CHECK-LABEL: __LLVM_StackMaps:
 ; CHECK: .long   Ltmp18-_test_fpconst_deopt
 ; CHECK-NEXT: .short	0

From 5ba874e4724e72838dfbb3e4b40392e0b24cc6f4 Mon Sep 17 00:00:00 2001
From: David Truby <david.truby@arm.com>
Date: Tue, 5 May 2020 13:04:32 +0100
Subject: [PATCH 241/770] [MLIR] [OpenMP] Add basic OpenMP parallel operation

Summary:
This includes a basic implementation for the OpenMP parallel
operation without a custom pretty-printer and parser.
The if, num_threads, default, private, firstprivate, shared, copyin
and proc_bind clauses are included in this implementation.

Currently the reduction clause is omitted as it is more complex and
requires analysis to see if we can share implementation with the loop
dialect. The allocate clause is also omitted.

A discussion about the design of this operation can be found here:
https://llvm.discourse.group/t/openmp-parallel-operation-design-issues/686

The current OpenMP Specification can be found here:
https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5.0.pdf

Co-authored-by: Kiran Chandramohan <kiran.chandramohan@arm.com>

Reviewers: jdoerfert

Subscribers: mgorny, yaxunl, kristof.beyls, guansong, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, Joonsoo, grosul1, frgossen, Kayjukh, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79410
---
 .../mlir/Dialect/OpenMP/CMakeLists.txt        |   8 +-
 .../mlir/Dialect/OpenMP/OpenMPDialect.h       |   2 +
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 119 ++++++++++++++++--
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |   5 +
 mlir/test/Dialect/OpenMP/ops.mlir             |  36 ++++++
 5 files changed, 157 insertions(+), 13 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
index 0df7a04f52b9e..1254bbe8b8fc9 100644
--- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
@@ -1,2 +1,8 @@
-add_mlir_dialect(OpenMPOps omp)
+set(LLVM_TARGET_DEFINITIONS OpenMPOps.td)
+mlir_tablegen(OpenMPOpsDialect.h.inc -gen-dialect-decls -dialect=omp)
+mlir_tablegen(OpenMPOps.h.inc -gen-op-decls)
+mlir_tablegen(OpenMPOps.cpp.inc -gen-op-defs)
+mlir_tablegen(OpenMPOpsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(OpenMPOpsEnums.cpp.inc -gen-enum-defs)
 add_mlir_doc(OpenMPOps -gen-dialect-doc OpenMPDialect Dialects/)
+add_public_tablegen_target(MLIROpenMPOpsIncGen)
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
index 6761b51b55b58..8f0bb93e1043e 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
@@ -16,6 +16,8 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 
+#include "mlir/Dialect/OpenMP/OpenMPOpsEnums.h.inc"
+
 namespace mlir {
 namespace omp {
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index af5e0433ef3ca..27b2110bf71ed 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -18,16 +18,103 @@ include "mlir/IR/OpBase.td"
 
 def OpenMP_Dialect : Dialect {
   let name = "omp";
+  let cppNamespace = "omp";
 }
 
 class OpenMP_Op<string mnemonic, list<OpTrait> traits = []> :
       Op<OpenMP_Dialect, mnemonic, traits>;
 
-def BarrierOp : OpenMP_Op<"barrier"> {
-  let summary = "barrier construct";
+
+//===----------------------------------------------------------------------===//
+// 2.6 parallel Construct
+//===----------------------------------------------------------------------===//
+
+// Possible values for the default clause
+def ClauseDefaultPrivate : StrEnumAttrCase<"defprivate">;
+def ClauseDefaultFirstPrivate : StrEnumAttrCase<"deffirstprivate">;
+def ClauseDefaultShared : StrEnumAttrCase<"defshared">;
+def ClauseDefaultNone : StrEnumAttrCase<"defnone">;
+
+def ClauseDefault : StrEnumAttr<
+    "ClauseDefault",
+    "default clause",
+    [ClauseDefaultPrivate, ClauseDefaultFirstPrivate, ClauseDefaultShared,
+     ClauseDefaultNone]> {
+  let cppNamespace = "::mlir::omp";
+}
+
+// Possible values for the proc_bind clause
+def ClauseProcMaster : StrEnumAttrCase<"master">;
+def ClauseProcClose : StrEnumAttrCase<"close">;
+def ClauseProcSpread : StrEnumAttrCase<"spread">;
+
+def ClauseProcBind : StrEnumAttr<
+    "ClauseProcBind",
+    "procbind clause",
+    [ClauseProcMaster, ClauseProcClose, ClauseProcSpread]> {
+  let cppNamespace = "::mlir::omp";
+}
+
+def ParallelOp : OpenMP_Op<"parallel", [AttrSizedOperandSegments]> {
+  let summary = "parallel construct";
   let description = [{
-    The barrier construct specifies an explicit barrier at the point at which
-    the construct appears.
+    The parallel construct includes a region of code which is to be executed
+    by a team of threads.
+
+    The optional $if_expr_var parameter specifies a boolean result of a
+    conditional check. If this value is 1 or is not provided then the parallel
+    region runs as normal, if it is 0 then the parallel region is executed with
+    one thread.
+
+    The optional $num_threads_var parameter specifies the number of threads which
+    should be used to execute the parallel region.
+
+    The optional $default_val attribute specifies the default data sharing attribute
+    of variables used in the parallel region that are not passed explicitly as parameters
+    to the operation.
+
+    The $private_vars, $firstprivate_vars, $shared_vars and $copyin_vars parameters
+    are a variadic list of variables that specify the data sharing attribute of
+    those variables.
+
+    The optional $proc_bind_val attribute controls the thread affinity for the execution
+    of the parallel region.
+  }];
+
+  let arguments = (ins Optional<I1>:$if_expr_var,
+             Optional<AnyInteger>:$num_threads_var,
+             OptionalAttr<ClauseDefault>:$default_val,
+             Variadic<AnyType>:$private_vars,
+             Variadic<AnyType>:$firstprivate_vars,
+             Variadic<AnyType>:$shared_vars,
+             Variadic<AnyType>:$copyin_vars,
+             OptionalAttr<ClauseProcBind>:$proc_bind_val);
+
+  let regions = (region AnyRegion:$region);
+}
+
+def TerminatorOp : OpenMP_Op<"terminator", [Terminator]> {
+  let summary = "terminator for OpenMP regions.";
+  let description = [{
+    A terminator operation for regions that appear in the body of OpenMP
+    operation.  These regions are not expected to return any value so the
+    terminator takes no operands. The terminator op returns control to the
+    enclosing op.
+  }];
+
+  let parser = [{ return success(); }];
+  let printer = [{ p << getOperationName(); }];
+}
+
+//===----------------------------------------------------------------------===//
+// 2.10.4 taskyield Construct
+//===----------------------------------------------------------------------===//
+
+def TaskyieldOp : OpenMP_Op<"taskyield"> {
+  let summary = "taskyield construct";
+  let description = [{
+    The taskyield construct specifies that the current task can be suspended
+    in favor of execution of a different task.
   }];
 
   let assemblyFormat = "attr-dict";
@@ -50,21 +137,29 @@ def FlushOp : OpenMP_Op<"flush"> {
   let assemblyFormat = "attr-dict ($varList^ `:` type($varList))?";
 }
 
-def TaskwaitOp : OpenMP_Op<"taskwait"> {
-  let summary = "taskwait construct";
+//===----------------------------------------------------------------------===//
+// 2.17.2 barrier Construct
+//===----------------------------------------------------------------------===//
+
+def BarrierOp : OpenMP_Op<"barrier"> {
+  let summary = "barrier construct";
   let description = [{
-    The taskwait construct specifies a wait on the completion of child tasks
-    of the current task.
+    The barrier construct specifies an explicit barrier at the point at which
+    the construct appears.
   }];
 
   let assemblyFormat = "attr-dict";
 }
 
-def TaskyieldOp : OpenMP_Op<"taskyield"> {
-  let summary = "taskyield construct";
+//===----------------------------------------------------------------------===//
+// 2.17.5 taskwait Construct
+//===----------------------------------------------------------------------===//
+
+def TaskwaitOp : OpenMP_Op<"taskwait"> {
+  let summary = "taskwait construct";
   let description = [{
-    The taskyield construct specifies that the current task can be suspended
-    in favor of execution of a different task.
+    The taskwait construct specifies a wait on the completion of child tasks
+    of the current task.
   }];
 
   let assemblyFormat = "attr-dict";
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index edb1a4eb5eb4f..99c592c25b83f 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -11,8 +11,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/OpImplementation.h"
 
+#include "llvm/ADT/StringSwitch.h"
+
+#include "mlir/Dialect/OpenMP/OpenMPOpsEnums.cpp.inc"
+
 using namespace mlir;
 using namespace mlir::omp;
 
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 8a556196b0eab..bffc82417761e 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -35,3 +35,39 @@ func @omp_flush(%arg0 : !llvm.i32) -> () {
 
   return
 }
+
+func @omp_terminator() -> () {
+  // CHECK: omp.terminator
+  omp.terminator
+}
+
+func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : si32) -> () {
+  // CHECK: omp_parallel
+  "omp.parallel" (%if_cond, %num_threads, %data_var, %data_var, %data_var, %data_var) ({
+
+  // test without if condition
+  // CHECK: omp.parallel
+    "omp.parallel"(%num_threads, %data_var, %data_var, %data_var, %data_var) ({
+      omp.terminator
+    }) {operand_segment_sizes = dense<[0,1,1,1,1,1]>: vector<6xi32>, default_val = "defshared"} : (si32, memref<i32>, memref<i32>, memref<i32>, memref<i32>) -> ()
+
+  // CHECK: omp.barrier
+    omp.barrier
+
+  // test without num_threads
+  // CHECK: omp.parallel
+    "omp.parallel"(%if_cond, %data_var, %data_var, %data_var, %data_var) ({
+      omp.terminator
+    }) {operand_segment_sizes = dense<[1,0,1,1,1,1]> : vector<6xi32>} : (i1, memref<i32>, memref<i32>, memref<i32>, memref<i32>) -> ()
+
+    omp.terminator
+  }) {operand_segment_sizes = dense<[1,1,1,1,1,1]> : vector<6xi32>, proc_bind_val = "spread"} : (i1, si32, memref<i32>, memref<i32>, memref<i32>, memref<i32>) -> ()
+
+  // test with multiple parameters for single variadic argument
+  // CHECK: omp.parallel
+  "omp.parallel" (%data_var, %data_var, %data_var, %data_var, %data_var) ({
+    omp.terminator
+  }) {operand_segment_sizes = dense<[0,0,1,2,1,1]> : vector<6xi32>} : (memref<i32>, memref<i32>, memref<i32>, memref<i32>, memref<i32>) -> ()
+
+  return
+}

From 4f0eba28eba873de402d9742d62fcae89f4c2363 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Wed, 27 May 2020 12:31:59 -0400
Subject: [PATCH 242/770] [gn build] (manually) port dedaf3a2ac5

---
 llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
index 2c33adb2a6d4d..7793901770f3b 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
@@ -497,6 +497,13 @@ static_library("builtins") {
     ]
   }
 
+  if (current_cpu == "ve") {
+    sources += [
+      "ve/grow_stack.S",
+      "ve/grow_stack_align.S",
+    ]
+  }
+
   if (!compiler_rt_exclude_atomic_builtin) {
     sources += [
       # This comment prevents `gn format` from putting the file on the same line

From 0d20ed664ff2d51dae14f9324a64e4433e6b663e Mon Sep 17 00:00:00 2001
From: Bardia Mahjour <bmahjour@ca.ibm.com>
Date: Wed, 27 May 2020 12:33:46 -0400
Subject: [PATCH 243/770] [DDG] Data Dependence Graph - Add query function for
 memory dependencies between two nodes

Summary:
When working with the DDG it's useful to be able to query details of the
memory dependencies between two nodes connected by a memory edge. The DDG
does not hold a copy of the dependencies, but it contains a reference to a
DependenceInfo object through which dependence information can be queried.
This patch adds a query function to the DDG to obtain all the Dependence
objects that exist between instructions of two nodes.

Authored By: bmahjour

Reviewers: Meinersbur, Whitney, etiotto

Reviewed By: Whitney

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80529
---
 llvm/include/llvm/Analysis/DDG.h       |  32 +++++++
 llvm/unittests/Analysis/CMakeLists.txt |   1 +
 llvm/unittests/Analysis/DDGTest.cpp    | 128 +++++++++++++++++++++++++
 3 files changed, 161 insertions(+)
 create mode 100644 llvm/unittests/Analysis/DDGTest.cpp

diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h
index ca7d8c6fa6a6e..9e2b7907eaec8 100644
--- a/llvm/include/llvm/Analysis/DDG.h
+++ b/llvm/include/llvm/Analysis/DDG.h
@@ -284,6 +284,12 @@ template <typename NodeType> class DependenceGraphInfo {
     return *Root;
   }
 
+  /// Collect all the data dependency infos coming from any pair of memory
+  /// accesses from \p Src to \p Dst, and store them into \p Deps. Return true
+  /// if a dependence exists, and false otherwise.
+  bool getDependencies(const NodeType &Src, const NodeType &Dst,
+                       DependenceList &Deps) const;
+
 protected:
   // Name of the graph.
   std::string Name;
@@ -431,6 +437,32 @@ class DDGAnalysisPrinterPass : public PassInfoMixin<DDGAnalysisPrinterPass> {
   raw_ostream &OS;
 };
 
+//===--------------------------------------------------------------------===//
+// DependenceGraphInfo Implementation
+//===--------------------------------------------------------------------===//
+
+template <typename NodeType>
+bool DependenceGraphInfo<NodeType>::getDependencies(
+    const NodeType &Src, const NodeType &Dst, DependenceList &Deps) const {
+  assert(Deps.empty() && "Expected empty output list at the start.");
+
+  // List of memory access instructions from src and dst nodes.
+  SmallVector<Instruction *, 8> SrcIList, DstIList;
+  auto isMemoryAccess = [](const Instruction *I) {
+    return I->mayReadOrWriteMemory();
+  };
+  Src.collectInstructions(isMemoryAccess, SrcIList);
+  Dst.collectInstructions(isMemoryAccess, DstIList);
+
+  for (auto *SrcI : SrcIList)
+    for (auto *DstI : DstIList)
+      if (auto Dep =
+              const_cast<DependenceInfo *>(&DI)->depends(SrcI, DstI, true))
+        Deps.push_back(std::move(Dep));
+
+  return !Deps.empty();
+}
+
 //===--------------------------------------------------------------------===//
 // GraphTraits specializations for the DDG
 //===--------------------------------------------------------------------===//
diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt
index 254b2893cac5b..f344d6c7bc25d 100644
--- a/llvm/unittests/Analysis/CMakeLists.txt
+++ b/llvm/unittests/Analysis/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_unittest(AnalysisTests
   CaptureTrackingTest.cpp
   CFGTest.cpp
   CGSCCPassManagerTest.cpp
+  DDGTest.cpp
   DivergenceAnalysisTest.cpp
   DomTreeUpdaterTest.cpp
   GlobalsModRefTest.cpp
diff --git a/llvm/unittests/Analysis/DDGTest.cpp b/llvm/unittests/Analysis/DDGTest.cpp
new file mode 100644
index 0000000000000..a8dce776f12cf
--- /dev/null
+++ b/llvm/unittests/Analysis/DDGTest.cpp
@@ -0,0 +1,128 @@
+//===- DDGTest.cpp - DDGAnalysis unit tests -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DDG.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+/// Build the DDG analysis for a loop and run the given test \p Test.
+static void runTest(Module &M, StringRef FuncName,
+                    function_ref<void(Function &F, LoopInfo &LI,
+                                      DependenceInfo &DI, ScalarEvolution &SE)>
+                        Test) {
+  auto *F = M.getFunction(FuncName);
+  ASSERT_NE(F, nullptr) << "Could not find " << FuncName;
+
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI(TLII);
+  AssumptionCache AC(*F);
+  DominatorTree DT(*F);
+  LoopInfo LI(DT);
+  ScalarEvolution SE(*F, TLI, AC, DT, LI);
+  AAResults AA(TLI);
+  DependenceInfo DI(F, &AA, &SE, &LI);
+  Test(*F, LI, DI, SE);
+}
+
+static std::unique_ptr<Module> makeLLVMModule(LLVMContext &Context,
+                                              const char *ModuleStr) {
+  SMDiagnostic Err;
+  return parseAssemblyString(ModuleStr, Err, Context);
+}
+
+TEST(DDGTest, getDependencies) {
+  const char *ModuleStr =
+      "target datalayout = \"e-m:e-i64:64-n32:64\"\n"
+      "target triple = \"powerpc64le-unknown-linux-gnu\"\n"
+      "\n"
+      "define dso_local void @foo(i32 signext %n, i32* noalias %A, i32* "
+      "noalias %B) {\n"
+      "entry:\n"
+      "   %cmp1 = icmp sgt i32 %n, 0\n"
+      "   br i1 %cmp1, label %for.body.preheader, label %for.end\n"
+      "\n"
+      "for.body.preheader:\n"
+      "   %wide.trip.count = zext i32 %n to i64\n"
+      "   br label %for.body\n"
+      " \n"
+      " for.body:\n"
+      "   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ "
+      "%indvars.iv.next, %for.body ]\n"
+      "   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv\n"
+      "  %0 = trunc i64 %indvars.iv to i32\n"
+      "  store i32 %0, i32* %arrayidx, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 "
+      "%indvars.iv.next\n"
+      "  %1 = load i32, i32* %arrayidx2, align 4\n"
+      "  %add3 = add nsw i32 %1, 1\n"
+      "  %arrayidx5 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv\n"
+      "  store i32 %add3, i32* %arrayidx5, align 4\n"
+      "  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count\n"
+      "  br i1 %exitcond, label %for.body, label %for.end.loopexit\n"
+      "\n"
+      "for.end.loopexit:\n"
+      "  br label %for.end\n"
+      "\n"
+      "for.end:\n"
+      "  ret void\n"
+      "}\n";
+
+  LLVMContext Context;
+  std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleStr);
+
+  runTest(
+      *M, "foo",
+      [&](Function &F, LoopInfo &LI, DependenceInfo &DI, ScalarEvolution &SE) {
+        Loop *L = *LI.begin();
+        assert(L && "expected the loop to be identified.");
+
+        DataDependenceGraph DDG(*L, LI, DI);
+
+        // Collect all the nodes that have an outgoing memory edge
+        // while collecting all memory edges as well. There should
+        // only be one node with an outgoing memory edge and there
+        // should only be one memory edge in the entire graph.
+        std::vector<DDGNode *> DependenceSourceNodes;
+        std::vector<DDGEdge *> MemoryEdges;
+        for (DDGNode *N : DDG) {
+          bool SourceAdded = false; // Record each source node only once.
+          for (DDGEdge *E : *N) {
+            if (E->isMemoryDependence()) {
+              MemoryEdges.push_back(E);
+              if (!SourceAdded) {
+                DependenceSourceNodes.push_back(N);
+                SourceAdded = true;
+              }
+            }
+          }
+        }
+
+        EXPECT_EQ(DependenceSourceNodes.size(), 1ull);
+        EXPECT_EQ(MemoryEdges.size(), 1ull);
+
+        DataDependenceGraph::DependenceList DL;
+        DDG.getDependencies(*DependenceSourceNodes.back(),
+                            MemoryEdges.back()->getTargetNode(), DL);
+
+        EXPECT_EQ(DL.size(), 1ull);
+        EXPECT_TRUE(DL.back()->isAnti());
+        EXPECT_EQ(DL.back()->getLevels(), 1u);
+        EXPECT_NE(DL.back()->getDistance(1), nullptr);
+        EXPECT_EQ(DL.back()->getDistance(1),
+                  SE.getOne(DL.back()->getDistance(1)->getType()));
+      });
+}

From bed78845e555790c0bcbe34d04436fae41a3fa5f Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 27 May 2020 16:41:00 +0000
Subject: [PATCH 244/770] [gn build] Port 0d20ed664ff

---
 llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
index 03c5dc3f25039..132353820918a 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
@@ -19,6 +19,7 @@ unittest("AnalysisTests") {
     "CGSCCPassManagerTest.cpp",
     "CallGraphTest.cpp",
     "CaptureTrackingTest.cpp",
+    "DDGTest.cpp",
     "DivergenceAnalysisTest.cpp",
     "DomTreeUpdaterTest.cpp",
     "GlobalsModRefTest.cpp",

From 29f8056b54ea5ea6b333e3b8f11de2cc327d1421 Mon Sep 17 00:00:00 2001
From: Ties Stuij <ties.stuij@arm.com>
Date: Wed, 27 May 2020 17:25:10 +0100
Subject: [PATCH 245/770] [CodeGen] fix typo `def nxv1bf32` -> `def nxv1f32`

The `Add bfloat MVT type` patch introduced a typo in the nxv1f32 definition
in llvm/include/llvm/CodeGen/ValueTypes.td:
https://reviews.llvm.org/D79706/new/#inline-740433

This patch fixes that.
---
 llvm/include/llvm/CodeGen/ValueTypes.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index e08a33a50df68..caa3d4daab318 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -163,7 +163,7 @@ def nxv8f16  : ValueType<128, 128>; // n x  8 x f16 vector value
 def nxv2bf16 : ValueType<32 , 129>; // n x  2 x bf16 vector value
 def nxv4bf16 : ValueType<64 , 130>; // n x  4 x bf16 vector value
 def nxv8bf16 : ValueType<128, 131>; // n x  8 x bf16 vector value
-def nxv1bf32 : ValueType<32 , 132>; // n x  1 x f32 vector value
+def nxv1f32  : ValueType<32 , 132>; // n x  1 x f32 vector value
 def nxv2f32  : ValueType<64 , 133>; // n x  2 x f32 vector value
 def nxv4f32  : ValueType<128, 134>; // n x  4 x f32 vector value
 def nxv8f32  : ValueType<256, 135>; // n x  8 x f32 vector value

From 4d6f44f5f0925f2d05431065d9f197644d07b1b5 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <ravishankarm@google.com>
Date: Wed, 27 May 2020 09:36:32 -0700
Subject: [PATCH 246/770] [mlir][spirv] Lower allocation/deallocations of
 workgroup memory.

The allocation of workgroup memory is lowered to a
spv.globalVariable. Only static-size allocations with an int or float
element type are handled. The lowering does account for element types
that are not supported in the lowered spv.module based on
the extensions/capabilities and adjusts the number of elements to get
the same byte length.

Differential Revision: https://reviews.llvm.org/D80411
---
 .../mlir/Dialect/SPIRV/SPIRVLowering.h        |   6 +
 .../ConvertStandardToSPIRV.cpp                | 128 +++++++++++++---
 mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp      |  50 +++---
 .../Conversion/GPUToSPIRV/load-store.mlir     |   8 +-
 mlir/test/Conversion/GPUToSPIRV/loop.mlir     |  12 +-
 .../Conversion/StandardToSPIRV/alloc.mlir     | 144 ++++++++++++++++++
 .../StandardToSPIRV/std-ops-to-spirv.mlir     |  12 +-
 7 files changed, 313 insertions(+), 47 deletions(-)
 create mode 100644 mlir/test/Conversion/StandardToSPIRV/alloc.mlir

diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
index ba0b7ea0714cf..1fa668d7ddc0f 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
@@ -41,6 +41,12 @@ class SPIRVTypeConverter : public TypeConverter {
 public:
   explicit SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr);
 
+  /// Gets the number of bytes used for a type when converted to SPIR-V
+  /// type. Note that it doesn't account for whether the type is legal for a
+  /// SPIR-V target (described by spirv::TargetEnvAttr). Returns None on
+  /// failure.
+  static Optional<int64_t> getConvertedTypeNumBytes(Type);
+
   /// Gets the SPIR-V correspondence for the standard index type.
   static Type getIndexType(MLIRContext *context);
 
diff --git a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp
index 560bc4acf4369..facdbf7d096ac 100644
--- a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp
+++ b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp
@@ -169,22 +169,51 @@ bool isUnsignedOp() {
     return true;                                                               \
   }
 
-CHECK_UNSIGNED_OP(spirv::AtomicUMaxOp);
-CHECK_UNSIGNED_OP(spirv::AtomicUMinOp);
-CHECK_UNSIGNED_OP(spirv::BitFieldUExtractOp);
-CHECK_UNSIGNED_OP(spirv::ConvertUToFOp);
-CHECK_UNSIGNED_OP(spirv::GroupNonUniformUMaxOp);
-CHECK_UNSIGNED_OP(spirv::GroupNonUniformUMinOp);
-CHECK_UNSIGNED_OP(spirv::UConvertOp);
-CHECK_UNSIGNED_OP(spirv::UDivOp);
-CHECK_UNSIGNED_OP(spirv::UGreaterThanEqualOp);
-CHECK_UNSIGNED_OP(spirv::UGreaterThanOp);
-CHECK_UNSIGNED_OP(spirv::ULessThanEqualOp);
-CHECK_UNSIGNED_OP(spirv::ULessThanOp);
-CHECK_UNSIGNED_OP(spirv::UModOp);
+CHECK_UNSIGNED_OP(spirv::AtomicUMaxOp)
+CHECK_UNSIGNED_OP(spirv::AtomicUMinOp)
+CHECK_UNSIGNED_OP(spirv::BitFieldUExtractOp)
+CHECK_UNSIGNED_OP(spirv::ConvertUToFOp)
+CHECK_UNSIGNED_OP(spirv::GroupNonUniformUMaxOp)
+CHECK_UNSIGNED_OP(spirv::GroupNonUniformUMinOp)
+CHECK_UNSIGNED_OP(spirv::UConvertOp)
+CHECK_UNSIGNED_OP(spirv::UDivOp)
+CHECK_UNSIGNED_OP(spirv::UGreaterThanEqualOp)
+CHECK_UNSIGNED_OP(spirv::UGreaterThanOp)
+CHECK_UNSIGNED_OP(spirv::ULessThanEqualOp)
+CHECK_UNSIGNED_OP(spirv::ULessThanOp)
+CHECK_UNSIGNED_OP(spirv::UModOp)
 
 #undef CHECK_UNSIGNED_OP
 
+/// Returns true if the allocations of type `t` can be lowered to SPIR-V.
+static bool isAllocationSupported(MemRefType t) {
+  // Currently only support workgroup local memory allocations with static
+  // shape and int or float element type.
+  return t.hasStaticShape() &&
+         SPIRVTypeConverter::getMemorySpaceForStorageClass(
+             spirv::StorageClass::Workgroup) == t.getMemorySpace() &&
+         t.getElementType().isIntOrFloat();
+}
+
+/// Returns the scope to use for atomic operations used for emulating store
+/// operations of unsupported integer bitwidths, based on the memref
+/// type. Returns None on failure.
+static Optional<spirv::Scope> getAtomicOpScope(MemRefType t) {
+  Optional<spirv::StorageClass> storageClass =
+      SPIRVTypeConverter::getStorageClassForMemorySpace(t.getMemorySpace());
+  if (!storageClass)
+    return {};
+  switch (*storageClass) {
+  case spirv::StorageClass::StorageBuffer:
+    return spirv::Scope::Device;
+  case spirv::StorageClass::Workgroup:
+    return spirv::Scope::Workgroup;
+  default: {
+  }
+  }
+  return {};
+}
+
 //===----------------------------------------------------------------------===//
 // Operation conversion
 //===----------------------------------------------------------------------===//
@@ -195,6 +224,67 @@ CHECK_UNSIGNED_OP(spirv::UModOp);
 
 namespace {
 
+/// Converts an allocation operation to SPIR-V. Currently only supports lowering
+/// to Workgroup memory when the size is constant.  Note that this pattern needs
+/// to be applied in a pass that runs at least at spv.module scope since it will
+/// add global variables into the spv.module.
+class AllocOpPattern final : public SPIRVOpLowering<AllocOp> {
+public:
+  using SPIRVOpLowering<AllocOp>::SPIRVOpLowering;
+
+  LogicalResult
+  matchAndRewrite(AllocOp operation, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    MemRefType allocType = operation.getType();
+    if (!isAllocationSupported(allocType))
+      return operation.emitError("unhandled allocation type");
+
+    // Get the SPIR-V type for the allocation.
+    Type spirvType = typeConverter.convertType(allocType);
+
+    // Insert spv.globalVariable for this allocation.
+    Operation *parent =
+        SymbolTable::getNearestSymbolTable(operation.getParentOp());
+    if (!parent)
+      return failure();
+    Location loc = operation.getLoc();
+    spirv::GlobalVariableOp varOp;
+    {
+      OpBuilder::InsertionGuard guard(rewriter);
+      Block &entryBlock = *parent->getRegion(0).begin();
+      rewriter.setInsertionPointToStart(&entryBlock);
+      auto varOps = entryBlock.getOps<spirv::GlobalVariableOp>();
+      std::string varName =
+          std::string("__workgroup_mem__") +
+          std::to_string(std::distance(varOps.begin(), varOps.end()));
+      varOp = rewriter.create<spirv::GlobalVariableOp>(
+          loc, TypeAttr::get(spirvType), varName,
+          /*initializer = */ nullptr);
+    }
+
+    // Get pointer to global variable at the current scope.
+    rewriter.replaceOpWithNewOp<spirv::AddressOfOp>(operation, varOp);
+    return success();
+  }
+};
+
+/// Removes a deallocation if it is a supported allocation. Currently only
+/// removes deallocation if the memory space is workgroup memory.
+class DeallocOpPattern final : public SPIRVOpLowering<DeallocOp> {
+public:
+  using SPIRVOpLowering<DeallocOp>::SPIRVOpLowering;
+
+  LogicalResult
+  matchAndRewrite(DeallocOp operation, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    MemRefType deallocType = operation.memref().getType().cast<MemRefType>();
+    if (!isAllocationSupported(deallocType))
+      return operation.emitError("unhandled deallocation type");
+    rewriter.eraseOp(operation);
+    return success();
+  }
+};
+
 /// Converts unary and binary standard operations to SPIR-V operations.
 template <typename StdOp, typename SPIRVOp>
 class UnaryAndBinaryOpPattern final : public SPIRVOpLowering<StdOp> {
@@ -823,12 +913,15 @@ IntStoreOpPattern::matchAndRewrite(StoreOp storeOp, ArrayRef<Value> operands,
       shiftValue(loc, storeOperands.value(), offset, mask, dstBits, rewriter);
   Value adjustedPtr = adjustAccessChainForBitwidth(typeConverter, accessChainOp,
                                                    srcBits, dstBits, rewriter);
+  Optional<spirv::Scope> scope = getAtomicOpScope(memrefType);
+  if (!scope)
+    return failure();
   Value result = rewriter.create<spirv::AtomicAndOp>(
-      loc, dstType, adjustedPtr, spirv::Scope::Device,
-      spirv::MemorySemantics::AcquireRelease, clearBitsMask);
+      loc, dstType, adjustedPtr, *scope, spirv::MemorySemantics::AcquireRelease,
+      clearBitsMask);
   result = rewriter.create<spirv::AtomicOrOp>(
-      loc, dstType, adjustedPtr, spirv::Scope::Device,
-      spirv::MemorySemantics::AcquireRelease, storeVal);
+      loc, dstType, adjustedPtr, *scope, spirv::MemorySemantics::AcquireRelease,
+      storeVal);
 
   // The AtomicOrOp has no side effect. Since it is already inserted, we can
   // just remove the original StoreOp. Note that rewriter.replaceOp()
@@ -913,6 +1006,7 @@ void populateStandardToSPIRVPatterns(MLIRContext *context,
       UnaryAndBinaryOpPattern<UnsignedDivIOp, spirv::UDivOp>,
       UnaryAndBinaryOpPattern<UnsignedRemIOp, spirv::UModOp>,
       UnaryAndBinaryOpPattern<UnsignedShiftRightOp, spirv::ShiftRightLogicalOp>,
+      AllocOpPattern, DeallocOpPattern,
       BitwiseOpPattern<AndOp, spirv::LogicalAndOp, spirv::BitwiseAndOp>,
       BitwiseOpPattern<OrOp, spirv::LogicalOrOp, spirv::BitwiseOrOp>,
       BoolCmpIOpPattern, ConstantCompositeOpPattern, ConstantScalarOpPattern,
diff --git a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
index c9f2983e232b0..dfc2728ef7109 100644
--- a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
+++ b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
@@ -218,6 +218,10 @@ static Optional<int64_t> getTypeNumBytes(Type t) {
   return llvm::None;
 }
 
+Optional<int64_t> SPIRVTypeConverter::getConvertedTypeNumBytes(Type t) {
+  return getTypeNumBytes(t);
+}
+
 /// Converts a scalar `type` to a suitable type under the given `targetEnv`.
 static Optional<Type>
 convertScalarType(const spirv::TargetEnv &targetEnv, spirv::ScalarType type,
@@ -383,8 +387,11 @@ static Optional<Type> convertMemrefType(const spirv::TargetEnv &targetEnv,
   auto arrayType =
       spirv::ArrayType::get(*arrayElemType, arrayElemCount, *arrayElemSize);
 
-  // Wrap in a struct to satisfy Vulkan interface requirements.
-  auto structType = spirv::StructType::get(arrayType, 0);
+  // Wrap in a struct to satisfy Vulkan interface requirements. Memrefs with
+  // workgroup storage class do not need the struct to be laid out explicitly.
+  auto structType = *storageClass == spirv::StorageClass::Workgroup
+                        ? spirv::StructType::get(arrayType)
+                        : spirv::StructType::get(arrayType, 0);
   return spirv::PointerType::get(structType, *storageClass);
 }
 
@@ -574,35 +581,40 @@ spirv::AccessChainOp mlir::spirv::getElementPtr(
     SPIRVTypeConverter &typeConverter, MemRefType baseType, Value basePtr,
     ArrayRef<Value> indices, Location loc, OpBuilder &builder) {
   // Get base and offset of the MemRefType and verify they are static.
+
   int64_t offset;
   SmallVector<int64_t, 4> strides;
   if (failed(getStridesAndOffset(baseType, strides, offset)) ||
-      llvm::is_contained(strides, MemRefType::getDynamicStrideOrOffset())) {
+      llvm::is_contained(strides, MemRefType::getDynamicStrideOrOffset()) ||
+      offset == MemRefType::getDynamicStrideOrOffset()) {
     return nullptr;
   }
 
   auto indexType = typeConverter.getIndexType(builder.getContext());
-
-  Value ptrLoc = nullptr;
-  assert(indices.size() == strides.size() &&
-         "must provide indices for all dimensions");
-  for (auto index : enumerate(indices)) {
-    Value strideVal = builder.create<spirv::ConstantOp>(
-        loc, indexType, IntegerAttr::get(indexType, strides[index.index()]));
-    Value update = builder.create<spirv::IMulOp>(loc, strideVal, index.value());
-    ptrLoc =
-        (ptrLoc ? builder.create<spirv::IAddOp>(loc, ptrLoc, update).getResult()
-                : update);
-  }
   SmallVector<Value, 2> linearizedIndices;
   // Add a '0' at the start to index into the struct.
   auto zero = spirv::ConstantOp::getZero(indexType, loc, builder);
   linearizedIndices.push_back(zero);
-  // If it is a zero-rank memref type, extract the element directly.
-  if (!ptrLoc) {
-    ptrLoc = zero;
+
+  if (baseType.getRank() == 0) {
+    linearizedIndices.push_back(zero);
+  } else {
+    // TODO: Instead of this logic, use affine.apply and add patterns for
+    // lowering affine.apply to standard ops. These will get lowered to SPIR-V
+    // ops by the DialectConversion framework.
+    Value ptrLoc = builder.create<spirv::ConstantOp>(
+        loc, indexType, IntegerAttr::get(indexType, offset));
+    assert(indices.size() == strides.size() &&
+           "must provide indices for all dimensions");
+    for (auto index : enumerate(indices)) {
+      Value strideVal = builder.create<spirv::ConstantOp>(
+          loc, indexType, IntegerAttr::get(indexType, strides[index.index()]));
+      Value update =
+          builder.create<spirv::IMulOp>(loc, strideVal, index.value());
+      ptrLoc = builder.create<spirv::IAddOp>(loc, ptrLoc, update);
+    }
+    linearizedIndices.push_back(ptrLoc);
   }
-  linearizedIndices.push_back(ptrLoc);
   return builder.create<spirv::AccessChainOp>(loc, basePtr, linearizedIndices);
 }
 
diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
index 543f364c93f08..12a5d9df61a87 100644
--- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
@@ -58,13 +58,15 @@ module attributes {
       %12 = addi %arg3, %0 : index
       // CHECK: %[[INDEX2:.*]] = spv.IAdd %[[ARG4]], %[[LOCALINVOCATIONIDX]]
       %13 = addi %arg4, %3 : index
+      // CHECK: %[[ZERO:.*]] = spv.constant 0 : i32
+      // CHECK: %[[OFFSET1_0:.*]] = spv.constant 0 : i32
       // CHECK: %[[STRIDE1_1:.*]] = spv.constant 4 : i32
-      // CHECK: %[[OFFSET1_1:.*]] = spv.IMul %[[STRIDE1_1]], %[[INDEX1]] : i32
+      // CHECK: %[[UPDATE1_1:.*]] = spv.IMul %[[STRIDE1_1]], %[[INDEX1]] : i32
+      // CHECK: %[[OFFSET1_1:.*]] = spv.IAdd %[[OFFSET1_0]], %[[UPDATE1_1]] : i32
       // CHECK: %[[STRIDE1_2:.*]] = spv.constant 1 : i32
       // CHECK: %[[UPDATE1_2:.*]] = spv.IMul %[[STRIDE1_2]], %[[INDEX2]] : i32
       // CHECK: %[[OFFSET1_2:.*]] = spv.IAdd %[[OFFSET1_1]], %[[UPDATE1_2]] : i32
-      // CHECK: %[[ZERO1:.*]] = spv.constant 0 : i32
-      // CHECK: %[[PTR1:.*]] = spv.AccessChain %[[ARG0]]{{\[}}%[[ZERO1]], %[[OFFSET1_2]]{{\]}}
+      // CHECK: %[[PTR1:.*]] = spv.AccessChain %[[ARG0]]{{\[}}%[[ZERO]], %[[OFFSET1_2]]{{\]}}
       // CHECK-NEXT: %[[VAL1:.*]] = spv.Load "StorageBuffer" %[[PTR1]]
       %14 = load %arg0[%12, %13] : memref<12x4xf32>
       // CHECK: %[[PTR2:.*]] = spv.AccessChain %[[ARG1]]{{\[}}{{%.*}}, {{%.*}}{{\]}}
diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
index 5bc44cf0ba056..7c5df798438fc 100644
--- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
@@ -28,13 +28,17 @@ module attributes {
       // CHECK:        %[[CMP:.*]] = spv.SLessThan %[[INDVAR]], %[[UB]] : i32
       // CHECK:        spv.BranchConditional %[[CMP]], ^[[BODY:.*]], ^[[MERGE:.*]]
       // CHECK:      ^[[BODY]]:
-      // CHECK:        %[[STRIDE1:.*]] = spv.constant 1 : i32
-      // CHECK:        %[[INDEX1:.*]] = spv.IMul %[[STRIDE1]], %[[INDVAR]] : i32
       // CHECK:        %[[ZERO1:.*]] = spv.constant 0 : i32
+      // CHECK:        %[[OFFSET1:.*]] = spv.constant 0 : i32
+      // CHECK:        %[[STRIDE1:.*]] = spv.constant 1 : i32
+      // CHECK:        %[[UPDATE1:.*]] = spv.IMul %[[STRIDE1]], %[[INDVAR]] : i32
+      // CHECK:        %[[INDEX1:.*]] = spv.IAdd %[[OFFSET1]], %[[UPDATE1]] : i32
       // CHECK:        spv.AccessChain {{%.*}}{{\[}}%[[ZERO1]], %[[INDEX1]]{{\]}}
-      // CHECK:        %[[STRIDE2:.*]] = spv.constant 1 : i32
-      // CHECK:        %[[INDEX2:.*]] = spv.IMul %[[STRIDE2]], %[[INDVAR]] : i32
       // CHECK:        %[[ZERO2:.*]] = spv.constant 0 : i32
+      // CHECK:        %[[OFFSET2:.*]] = spv.constant 0 : i32
+      // CHECK:        %[[STRIDE2:.*]] = spv.constant 1 : i32
+      // CHECK:        %[[UPDATE2:.*]] = spv.IMul %[[STRIDE2]], %[[INDVAR]] : i32
+      // CHECK:        %[[INDEX2:.*]] = spv.IAdd %[[OFFSET2]], %[[UPDATE2]] : i32
       // CHECK:        spv.AccessChain {{%.*}}[%[[ZERO2]], %[[INDEX2]]]
       // CHECK:        %[[INCREMENT:.*]] = spv.IAdd %[[INDVAR]], %[[STEP]] : i32
       // CHECK:        spv.Branch ^[[HEADER]](%[[INCREMENT]] : i32)
diff --git a/mlir/test/Conversion/StandardToSPIRV/alloc.mlir b/mlir/test/Conversion/StandardToSPIRV/alloc.mlir
new file mode 100644
index 0000000000000..3cbeda1cafb07
--- /dev/null
+++ b/mlir/test/Conversion/StandardToSPIRV/alloc.mlir
@@ -0,0 +1,144 @@
+// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -convert-std-to-spirv -canonicalize -verify-diagnostics %s -o - | FileCheck %s
+
+//===----------------------------------------------------------------------===//
+// std allocation/deallocation ops
+//===----------------------------------------------------------------------===//
+
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+  }
+{
+  func @alloc_dealloc_workgroup_mem(%arg0 : index, %arg1 : index) {
+    %0 = alloc() : memref<4x5xf32, 3>
+    %1 = load %0[%arg0, %arg1] : memref<4x5xf32, 3>
+    store %1, %0[%arg0, %arg1] : memref<4x5xf32, 3>
+    dealloc %0 : memref<4x5xf32, 3>
+    return
+  }
+}
+//     CHECK: spv.globalVariable @[[VAR:.+]] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4>>, Workgroup>
+//     CHECK: func @alloc_dealloc_workgroup_mem
+// CHECK-NOT:   alloc
+//     CHECK:   %[[PTR:.+]] = spv._address_of @[[VAR]]
+//     CHECK:   %[[LOADPTR:.+]] = spv.AccessChain %[[PTR]]
+//     CHECK:   %[[VAL:.+]] = spv.Load "Workgroup" %[[LOADPTR]] : f32
+//     CHECK:   %[[STOREPTR:.+]] = spv.AccessChain %[[PTR]]
+//     CHECK:   spv.Store "Workgroup" %[[STOREPTR]], %[[VAL]] : f32
+// CHECK-NOT:   dealloc
+//     CHECK:   spv.Return
+
+// -----
+
+// TODO: Uncomment this test when the extension handling correctly
+// converts an i16 type to i32 type and handles the load/stores
+// correctly.
+
+// module attributes {
+//   spv.target_env = #spv.target_env<
+//     #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+//     {max_compute_workgroup_invocations = 128 : i32,
+//      max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+//   }
+// {
+//   func @alloc_dealloc_workgroup_mem(%arg0 : index, %arg1 : index) {
+//     %0 = alloc() : memref<4x5xi16, 3>
+//     %1 = load %0[%arg0, %arg1] : memref<4x5xi16, 3>
+//     store %1, %0[%arg0, %arg1] : memref<4x5xi16, 3>
+//     dealloc %0 : memref<4x5xi16, 3>
+//     return
+//   }
+// }
+
+// -----
+
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+  }
+{
+  func @two_allocs() {
+    %0 = alloc() : memref<4x5xf32, 3>
+    %1 = alloc() : memref<2x3xi32, 3>
+    return
+  }
+}
+
+//  CHECK-DAG: spv.globalVariable @__workgroup_mem__{{[0-9]+}}
+// CHECK-SAME:   !spv.ptr<!spv.struct<!spv.array<6 x i32, stride=4>>, Workgroup>
+//  CHECK-DAG: spv.globalVariable @__workgroup_mem__{{[0-9]+}}
+// CHECK-SAME:   !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4>>, Workgroup>
+//      CHECK: spv.func @two_allocs()
+//      CHECK: spv.Return
+
+// -----
+
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+  }
+{
+  func @alloc_dealloc_dynamic_workgroup_mem(%arg0 : index) {
+    // expected-error @+2 {{unhandled allocation type}}
+    // expected-error @+1 {{'std.alloc' op operand #0 must be index}}
+    %0 = alloc(%arg0) : memref<4x?xf32, 3>
+    return
+  }
+}
+
+// -----
+
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+  }
+{
+  func @alloc_dealloc_mem() {
+    // expected-error @+1 {{unhandled allocation type}}
+    %0 = alloc() : memref<4x5xf32>
+    return
+  }
+}
+
+
+// -----
+
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+  }
+{
+  func @alloc_dealloc_dynamic_workgroup_mem(%arg0 : memref<4x?xf32, 3>) {
+    // expected-error @+2 {{unhandled deallocation type}}
+    // expected-error @+1 {{'std.dealloc' op operand #0 must be memref of any type values}}
+    dealloc %arg0 : memref<4x?xf32, 3>
+    return
+  }
+}
+
+// -----
+
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+  }
+{
+  func @alloc_dealloc_mem(%arg0 : memref<4x5xf32>) {
+    // expected-error @+2 {{unhandled deallocation type}}
+    // expected-error @+1 {{op operand #0 must be memref of any type values}}
+    dealloc %arg0 : memref<4x5xf32>
+    return
+  }
+}
diff --git a/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir
index bf54dbaadb183..3fe24d05dd2e2 100644
--- a/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir
+++ b/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir
@@ -747,9 +747,11 @@ func @load_i8(%arg0: memref<i8>) {
 // CHECK-LABEL: @load_i16
 //       CHECK: (%[[ARG0:.+]]: {{.*}}, %[[ARG1:.+]]: i32)
 func @load_i16(%arg0: memref<10xi16>, %index : index) {
-  //     CHECK: %[[ONE:.+]] = spv.constant 1 : i32
-  //     CHECK: %[[FLAT_IDX:.+]] = spv.IMul %[[ONE]], %[[ARG1]] : i32
   //     CHECK: %[[ZERO:.+]] = spv.constant 0 : i32
+  //     CHECK: %[[OFFSET:.+]] = spv.constant 0 : i32
+  //     CHECK: %[[ONE:.+]] = spv.constant 1 : i32
+  //     CHECK: %[[UPDATE:.+]] = spv.IMul %[[ONE]], %[[ARG1]] : i32
+  //     CHECK: %[[FLAT_IDX:.+]] = spv.IAdd %[[OFFSET]], %[[UPDATE]] : i32
   //     CHECK: %[[TWO1:.+]] = spv.constant 2 : i32
   //     CHECK: %[[QUOTIENT:.+]] = spv.SDiv %[[FLAT_IDX]], %[[TWO1]] : i32
   //     CHECK: %[[PTR:.+]] = spv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]]
@@ -811,9 +813,11 @@ func @store_i8(%arg0: memref<i8>, %value: i8) {
 // CHECK-LABEL: @store_i16
 //       CHECK: (%[[ARG0:.+]]: {{.*}}, %[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32)
 func @store_i16(%arg0: memref<10xi16>, %index: index, %value: i16) {
-  //     CHECK: %[[ONE:.+]] = spv.constant 1 : i32
-  //     CHECK: %[[FLAT_IDX:.+]] = spv.IMul %[[ONE]], %[[ARG1]] : i32
   //     CHECK: %[[ZERO:.+]] = spv.constant 0 : i32
+  //     CHECK: %[[OFFSET:.+]] = spv.constant 0 : i32
+  //     CHECK: %[[ONE:.+]] = spv.constant 1 : i32
+  //     CHECK: %[[UPDATE:.+]] = spv.IMul %[[ONE]], %[[ARG1]] : i32
+  //     CHECK: %[[FLAT_IDX:.+]] = spv.IAdd %[[OFFSET]], %[[UPDATE]] : i32
   //     CHECK: %[[TWO:.+]] = spv.constant 2 : i32
   //     CHECK: %[[SIXTEEN:.+]] = spv.constant 16 : i32
   //     CHECK: %[[IDX:.+]] = spv.SMod %[[FLAT_IDX]], %[[TWO]] : i32

From 5b4cd2d4c42360469ccc9f59aa04a1a24b290df9 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 09:49:08 -0700
Subject: [PATCH 247/770] [X86] Assemble movzb 1280(%rbx, %r12), %r12 after
 D80608

ffmpeg/libavcodec/x86/h264_cabac.c inline assembly may produce
movzb 1280(%rbx, %r12), %r12

After D80608, llvm-mc errors:

error: unknown use of instruction mnemonic without a size suffix
---
 llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 2 +-
 llvm/test/MC/X86/x86-64.s                      | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index a842a91bbb069..91edc1a81c3f5 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3471,7 +3471,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
 
   for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
     Tmp.back() = Suffixes[I];
-    if (MemOp)
+    if (MemOp && HasVectorReg)
       MemOp->Mem.Size = MemSize[I];
     Match[I] = Match_MnemonicFail;
     if (MemOp || !HasVectorReg) {
diff --git a/llvm/test/MC/X86/x86-64.s b/llvm/test/MC/X86/x86-64.s
index a1c7e431cef7e..1b73aced06c7e 100644
--- a/llvm/test/MC/X86/x86-64.s
+++ b/llvm/test/MC/X86/x86-64.s
@@ -869,6 +869,9 @@ movsx (%rax), %ax
 // CHECK: encoding: [0x66,0x0f,0xb6,0x00]
 movzx (%rax), %ax
 
+// CHECK: movzbq	1280(%rbx,%r11), %r12
+// CHECK: encoding: [0x4e,0x0f,0xb6,0xa4,0x1b,0x00,0x05,0x00,0x00]
+movzb 1280(%rbx, %r11), %r12
 
 // rdar://7873482
 // CHECK: [0x65,0x8b,0x04,0x25,0x7c,0x00,0x00,0x00]

From 74a51753a6c2c587f650174e19f99279e8e4ef35 Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Wed, 27 May 2020 19:21:54 +0200
Subject: [PATCH 248/770] [lldb] Make order of completions for expressions
 deterministic and sorted by Clang's priority values.

Summary:

It turns out that the order in which we provide completions for expressions is
nondeterministic. This leads to confusing user experience and also breaks the
reproducer tests (as two LLDB tests can go out of sync due to the
non-determinism in the completion lists)

The reason for the non-determinism is that the CompletionConsumer informs us
about decls in the order in which it finds declarations in the lookup store of
the DeclContexts it visits (mainly this snippet in SemaLookup.cpp):

``` lang=c++
    // Enumerate all of the results in this context.
    for (DeclContextLookupResult R :
         Load ? Ctx->lookups()
              : Ctx->noload_lookups(/*PreserveInternalState=*/false)) {
       [...]
```

This storage of the lookup is sorted by pointer values (see the hash of
`DeclarationName`) and can therefore be non-deterministic. The LLDB code
completion consumer that receives these calls originally expected that the order
of declarations is defined by Clang, but it seems the API expects the client to
provide an order to the completions.

This patch fixes the issue as follows:

* We sort the completions we get from Clang alphabetically and also by the
priority value we get from Clang (with priority value sorting having precedence
over the alphabetical sorting)

* We make all the functions/variables that touch a completion before the sorting
const-qualified. The idea is that this should ensure that we never have
observable side effects from touching these declarations in a non-deterministic
order (e.g., we don't try to complete the type by accident).

This way we behave like the other parts of Clang which also sort the results by
some deterministic value (usually the name or something computed from a name,
e.g., edit distance to a given string).

We most likely also need to fix the Clang code to make the loop I listed above
deterministic to prevent these issues in the future (tracked in rdar://63442513
). This wouldn't replace the functionality provided in this patch though as we
would still need the priority and overall alphabetical sorting.

Note: I had to increase the lldb-vscode completion limit to 100 as the tests
look for strings that aren't in the first 50 results anymore due to variable
names starting with letters like 'v' (which are now always shown much further
down in the list due to the alphabetical sorting).

Fixes rdar://63200995

Reviewers: JDevlieghere, clayborg

Reviewed By: JDevlieghere

Subscribers: mgrang, abidh

Differential Revision: https://reviews.llvm.org/D80292
---
 .../Python/lldbsuite/test/lldbtest.py         |  37 +++-
 .../Clang/ClangExpressionParser.cpp           | 176 +++++++++++-------
 .../completion/TestExprCompletion.py          |  22 +--
 lldb/tools/lldb-vscode/lldb-vscode.cpp        |   2 +-
 4 files changed, 158 insertions(+), 79 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 639f99463d927..0dee4f217c801 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -2147,13 +2147,27 @@ def match(
 
         return match_object
 
-    def check_completion_with_desc(self, str_input, match_desc_pairs):
+    def check_completion_with_desc(self, str_input, match_desc_pairs, enforce_order=False):
+        """
+        Checks that when the given input is completed, the given list of
+        completions and descriptions is returned.
+        :param str_input: The input that should be completed. The completion happens at the end of the string.
+        :param match_desc_pairs: A list of pairs that indicate what completions have to be in the list of
+                                 completions returned by LLDB. The first element of the pair is the completion
+                                 string that LLDB should generate and the second element the description.
+        :param enforce_order: True iff the order in which the completions are returned by LLDB
+                              should match the order of the match_desc_pairs pairs.
+        """
         interp = self.dbg.GetCommandInterpreter()
         match_strings = lldb.SBStringList()
         description_strings = lldb.SBStringList()
         num_matches = interp.HandleCompletionWithDescriptions(str_input, len(str_input), 0, -1, match_strings, description_strings)
         self.assertEqual(len(description_strings), len(match_strings))
 
+        # The index of the last matched description in description_strings or
+        # -1 if no description has been matched yet.
+        last_found_index = -1
+        out_of_order_errors = ""
         missing_pairs = []
         for pair in match_desc_pairs:
             found_pair = False
@@ -2162,20 +2176,35 @@ def check_completion_with_desc(self, str_input, match_desc_pairs):
                 description_candidate = description_strings.GetStringAtIndex(i)
                 if match_candidate == pair[0] and description_candidate == pair[1]:
                     found_pair = True
+                    if enforce_order and last_found_index > i:
+                        new_err = ("Found completion " + pair[0] + " at index " +
+                                  str(i) + " in returned completion list but " +
+                                  "should have been after completion " +
+                                  match_strings.GetStringAtIndex(last_found_index) +
+                                  " (index:" + str(last_found_index) + ")\n")
+                        out_of_order_errors += new_err
+                    last_found_index = i
                     break
             if not found_pair:
                 missing_pairs.append(pair)
 
+        error_msg = ""
+        got_failure = False
         if len(missing_pairs):
-            error_msg = "Missing pairs:\n"
+            got_failure = True
+            error_msg += "Missing pairs:\n"
             for pair in missing_pairs:
                 error_msg += " [" + pair[0] + ":" + pair[1] + "]\n"
+        if len(out_of_order_errors):
+            got_failure = True
+            error_msg += out_of_order_errors
+        if got_failure:
             error_msg += "Got the following " + str(num_matches) + " completions back:\n"
             for i in range(num_matches + 1):
                 match_candidate = match_strings.GetStringAtIndex(i)
                 description_candidate = description_strings.GetStringAtIndex(i)
-                error_msg += "[" + match_candidate + ":" + description_candidate + "]\n"
-            self.assertEqual(0, len(missing_pairs), error_msg)
+                error_msg += "[" + match_candidate + ":" + description_candidate + "] index " + str(i) + "\n"
+            self.assertFalse(got_failure, error_msg)
 
     def complete_exactly(self, str_input, patterns):
         self.complete_from_to(str_input, patterns, True)
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
index 8885cbc85b2c3..14dd0656bf82b 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
@@ -666,11 +666,33 @@ class CodeComplete : public CodeCompleteConsumer {
 
   std::string m_expr;
   unsigned m_position = 0;
-  CompletionRequest &m_request;
   /// The printing policy we use when printing declarations for our completion
   /// descriptions.
   clang::PrintingPolicy m_desc_policy;
 
+  struct CompletionWithPriority {
+    CompletionResult::Completion completion;
+    /// See CodeCompletionResult::Priority;
+    unsigned Priority;
+
+    /// Establishes a deterministic order in a list of CompletionWithPriority.
+    /// The order returned here is the order in which the completions are
+    /// displayed to the user.
+    bool operator<(const CompletionWithPriority &o) const {
+      // High priority results should come first.
+      if (Priority != o.Priority)
+        return Priority > o.Priority;
+
+      // Identical priority, so just make sure it's a deterministic order.
+      return completion.GetUniqueKey() < o.completion.GetUniqueKey();
+    }
+  };
+
+  /// The stored completions.
+  /// Warning: These are in a non-deterministic order until they are sorted
+  /// and returned back to the caller.
+  std::vector<CompletionWithPriority> m_completions;
+
   /// Returns true if the given character can be used in an identifier.
   /// This also returns true for numbers because for completion we usually
   /// just iterate backwards over iterators.
@@ -687,7 +709,7 @@ class CodeComplete : public CodeCompleteConsumer {
   /// Drops all tokens in front of the expression that are unrelated for
   /// the completion of the cmd line. 'unrelated' means here that the token
   /// is not interested for the lldb completion API result.
-  StringRef dropUnrelatedFrontTokens(StringRef cmd) {
+  StringRef dropUnrelatedFrontTokens(StringRef cmd) const {
     if (cmd.empty())
       return cmd;
 
@@ -708,7 +730,7 @@ class CodeComplete : public CodeCompleteConsumer {
   }
 
   /// Removes the last identifier token from the given cmd line.
-  StringRef removeLastToken(StringRef cmd) {
+  StringRef removeLastToken(StringRef cmd) const {
     while (!cmd.empty() && IsIdChar(cmd.back())) {
       cmd = cmd.drop_back();
     }
@@ -719,7 +741,7 @@ class CodeComplete : public CodeCompleteConsumer {
   /// existing command. Returns the completion string that can be returned to
   /// the lldb completion API.
   std::string mergeCompletion(StringRef existing, unsigned pos,
-                              StringRef completion) {
+                              StringRef completion) const {
     StringRef existing_command = existing.substr(0, pos);
     // We rewrite the last token with the completion, so let's drop that
     // token from the command.
@@ -741,11 +763,10 @@ class CodeComplete : public CodeCompleteConsumer {
   /// \param[out] position
   ///    The character position of the user cursor in the `expr` parameter.
   ///
-  CodeComplete(CompletionRequest &request, clang::LangOptions ops,
-               std::string expr, unsigned position)
+  CodeComplete(clang::LangOptions ops, std::string expr, unsigned position)
       : CodeCompleteConsumer(CodeCompleteOptions()),
         m_info(std::make_shared<GlobalCodeCompletionAllocator>()), m_expr(expr),
-        m_position(position), m_request(request), m_desc_policy(ops) {
+        m_position(position), m_desc_policy(ops) {
 
     // Ensure that the printing policy is producing a description that is as
     // short as possible.
@@ -758,9 +779,6 @@ class CodeComplete : public CodeCompleteConsumer {
     m_desc_policy.Bool = true;
   }
 
-  /// Deregisters and destroys this code-completion consumer.
-  ~CodeComplete() override {}
-
   /// \name Code-completion filtering
   /// Check if the result should be filtered out.
   bool isResultFilteredOut(StringRef Filter,
@@ -788,6 +806,85 @@ class CodeComplete : public CodeCompleteConsumer {
     return true;
   }
 
+private:
+  /// Generate the completion strings for the given CodeCompletionResult.
+  /// Note that this function has to process results that could come in
+  /// non-deterministic order, so this function should have no side effects.
+  /// To make this easier to enforce, this function and all its parameters
+  /// should always be const-qualified.
+  /// \return Returns llvm::None if no completion should be provided for the
+  ///         given CodeCompletionResult.
+  llvm::Optional<CompletionWithPriority>
+  getCompletionForResult(const CodeCompletionResult &R) const {
+    std::string ToInsert;
+    std::string Description;
+    // Handle the different completion kinds that come from the Sema.
+    switch (R.Kind) {
+    case CodeCompletionResult::RK_Declaration: {
+      const NamedDecl *D = R.Declaration;
+      ToInsert = R.Declaration->getNameAsString();
+      // If we have a function decl that has no arguments we want to
+      // complete the empty parentheses for the user. If the function has
+      // arguments, we at least complete the opening bracket.
+      if (const FunctionDecl *F = dyn_cast<FunctionDecl>(D)) {
+        if (F->getNumParams() == 0)
+          ToInsert += "()";
+        else
+          ToInsert += "(";
+        raw_string_ostream OS(Description);
+        F->print(OS, m_desc_policy, false);
+        OS.flush();
+      } else if (const VarDecl *V = dyn_cast<VarDecl>(D)) {
+        Description = V->getType().getAsString(m_desc_policy);
+      } else if (const FieldDecl *F = dyn_cast<FieldDecl>(D)) {
+        Description = F->getType().getAsString(m_desc_policy);
+      } else if (const NamespaceDecl *N = dyn_cast<NamespaceDecl>(D)) {
+        // If we try to complete a namespace, then we can directly append
+        // the '::'.
+        if (!N->isAnonymousNamespace())
+          ToInsert += "::";
+      }
+      break;
+    }
+    case CodeCompletionResult::RK_Keyword:
+      ToInsert = R.Keyword;
+      break;
+    case CodeCompletionResult::RK_Macro:
+      ToInsert = R.Macro->getName().str();
+      break;
+    case CodeCompletionResult::RK_Pattern:
+      ToInsert = R.Pattern->getTypedText();
+      break;
+    }
+    // We also filter some internal lldb identifiers here. The user
+    // shouldn't see these.
+    if (llvm::StringRef(ToInsert).startswith("$__lldb_"))
+      return llvm::None;
+    if (ToInsert.empty())
+      return llvm::None;
+    // Merge the suggested Token into the existing command line to comply
+    // with the kind of result the lldb API expects.
+    std::string CompletionSuggestion =
+        mergeCompletion(m_expr, m_position, ToInsert);
+
+    CompletionResult::Completion completion(CompletionSuggestion, Description,
+                                            CompletionMode::Normal);
+    return {{completion, R.Priority}};
+  }
+
+public:
+  /// Adds the completions to the given CompletionRequest.
+  void GetCompletions(CompletionRequest &request) {
+    // Bring m_completions into a deterministic order and pass it on to the
+    // CompletionRequest.
+    llvm::sort(m_completions);
+
+    for (const CompletionWithPriority &C : m_completions)
+      request.AddCompletion(C.completion.GetCompletion(),
+                            C.completion.GetDescription(),
+                            C.completion.GetMode());
+  }
+
   /// \name Code-completion callbacks
   /// Process the finalized code-completion results.
   void ProcessCodeCompleteResults(Sema &SemaRef, CodeCompletionContext Context,
@@ -806,59 +903,11 @@ class CodeComplete : public CodeCompleteConsumer {
         continue;
 
       CodeCompletionResult &R = Results[I];
-      std::string ToInsert;
-      std::string Description;
-      // Handle the different completion kinds that come from the Sema.
-      switch (R.Kind) {
-      case CodeCompletionResult::RK_Declaration: {
-        const NamedDecl *D = R.Declaration;
-        ToInsert = R.Declaration->getNameAsString();
-        // If we have a function decl that has no arguments we want to
-        // complete the empty parantheses for the user. If the function has
-        // arguments, we at least complete the opening bracket.
-        if (const FunctionDecl *F = dyn_cast<FunctionDecl>(D)) {
-          if (F->getNumParams() == 0)
-            ToInsert += "()";
-          else
-            ToInsert += "(";
-          raw_string_ostream OS(Description);
-          F->print(OS, m_desc_policy, false);
-          OS.flush();
-        } else if (const VarDecl *V = dyn_cast<VarDecl>(D)) {
-          Description = V->getType().getAsString(m_desc_policy);
-        } else if (const FieldDecl *F = dyn_cast<FieldDecl>(D)) {
-          Description = F->getType().getAsString(m_desc_policy);
-        } else if (const NamespaceDecl *N = dyn_cast<NamespaceDecl>(D)) {
-          // If we try to complete a namespace, then we can directly append
-          // the '::'.
-          if (!N->isAnonymousNamespace())
-            ToInsert += "::";
-        }
-        break;
-      }
-      case CodeCompletionResult::RK_Keyword:
-        ToInsert = R.Keyword;
-        break;
-      case CodeCompletionResult::RK_Macro:
-        ToInsert = R.Macro->getName().str();
-        break;
-      case CodeCompletionResult::RK_Pattern:
-        ToInsert = R.Pattern->getTypedText();
-        break;
-      }
-      // At this point all information is in the ToInsert string.
-
-      // We also filter some internal lldb identifiers here. The user
-      // shouldn't see these.
-      if (StringRef(ToInsert).startswith("$__lldb_"))
+      llvm::Optional<CompletionWithPriority> CompletionAndPriority =
+          getCompletionForResult(R);
+      if (!CompletionAndPriority)
         continue;
-      if (!ToInsert.empty()) {
-        // Merge the suggested Token into the existing command line to comply
-        // with the kind of result the lldb API expects.
-        std::string CompletionSuggestion =
-            mergeCompletion(m_expr, m_position, ToInsert);
-        m_request.AddCompletion(CompletionSuggestion, Description);
-      }
+      m_completions.push_back(*CompletionAndPriority);
     }
   }
 
@@ -895,12 +944,13 @@ bool ClangExpressionParser::Complete(CompletionRequest &request, unsigned line,
   // the LLVMUserExpression which exposes the right API. This should never fail
   // as we always have a ClangUserExpression whenever we call this.
   ClangUserExpression *llvm_expr = cast<ClangUserExpression>(&m_expr);
-  CodeComplete CC(request, m_compiler->getLangOpts(), llvm_expr->GetUserText(),
+  CodeComplete CC(m_compiler->getLangOpts(), llvm_expr->GetUserText(),
                   typed_pos);
   // We don't need a code generator for parsing.
   m_code_generator.reset();
   // Start parsing the expression with our custom code completion consumer.
   ParseInternal(mgr, &CC, line, pos);
+  CC.GetCompletions(request);
   return true;
 }
 
diff --git a/lldb/test/API/commands/expression/completion/TestExprCompletion.py b/lldb/test/API/commands/expression/completion/TestExprCompletion.py
index 5266266b6ab21..9ff9052bb3fc2 100644
--- a/lldb/test/API/commands/expression/completion/TestExprCompletion.py
+++ b/lldb/test/API/commands/expression/completion/TestExprCompletion.py
@@ -201,26 +201,26 @@ def test_expr_completion_with_descriptions(self):
                                           '// Break here', self.main_source_spec)
 
         self.check_completion_with_desc("expr ", [
-            # VarDecls have their type as description.
-            ["some_expr", "Expr &"],
             # builtin types have no description.
             ["int", ""],
-            ["float", ""]
-        ])
+            ["float", ""],
+            # VarDecls have their type as description.
+            ["some_expr", "Expr &"],
+        ], enforce_order = True)
         self.check_completion_with_desc("expr some_expr.", [
             # Functions have their signature as description.
-            ["some_expr.Self()", "Expr &Self()"],
+            ["some_expr.~Expr()", "inline ~Expr()"],
             ["some_expr.operator=(", "inline Expr &operator=(const Expr &)"],
-            ["some_expr.FooNumbersBar1()", "int FooNumbersBar1()"],
+            # FieldDecls have their type as description.
+            ["some_expr.MemberVariableBar", "int"],
             ["some_expr.StaticMemberMethodBar()", "static int StaticMemberMethodBar()"],
-            ["some_expr.FooWithArgsBar(", "int FooWithArgsBar(int)"],
+            ["some_expr.Self()", "Expr &Self()"],
             ["some_expr.FooNoArgsBar()", "int FooNoArgsBar()"],
+            ["some_expr.FooWithArgsBar(", "int FooWithArgsBar(int)"],
+            ["some_expr.FooNumbersBar1()", "int FooNumbersBar1()"],
             ["some_expr.FooUnderscoreBar_()", "int FooUnderscoreBar_()"],
             ["some_expr.FooWithMultipleArgsBar(", "int FooWithMultipleArgsBar(int, int)"],
-            ["some_expr.~Expr()", "inline ~Expr()"],
-            # FieldDecls have their type as description.
-            ["some_expr.MemberVariableBar", "int"],
-        ])
+        ], enforce_order = True)
 
     def assume_no_completions(self, str_input, cursor_pos = None):
         interp = self.dbg.GetCommandInterpreter()
diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp
index 764eded8ce8dc..f1620d945fbc2 100644
--- a/lldb/tools/lldb-vscode/lldb-vscode.cpp
+++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp
@@ -967,7 +967,7 @@ void request_completions(const llvm::json::Object &request) {
     text.c_str(),
     actual_column,
     0, -1, matches, descriptions);
-  size_t count = std::min((uint32_t)50, matches.GetSize());
+  size_t count = std::min((uint32_t)100, matches.GetSize());
   targets.reserve(count);
   for (size_t i = 0; i < count; i++) {
     std::string match = matches.GetStringAtIndex(i);

From 07cd19efa2a63b01aea9b516a7a003cb7f750a12 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 26 May 2020 22:08:31 -0400
Subject: [PATCH 249/770] AMDGPU: Fix dropping MI flags when rewriting
 instructions

All 3 passes that change instruction encodings were dropping MI
flags. This avoids scheduling regressions caused by setting
mayRaiseFPExceptions on FP instructions for non-strictfp functions.
---
 llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp      |  4 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  3 ++-
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp     | 25 ++++++++++++-------
 llvm/test/CodeGen/AMDGPU/dpp_combine.mir      | 23 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/sdwa-ops.mir         |  8 +++---
 .../CodeGen/AMDGPU/sdwa-peephole-instr.mir    | 25 ++++++++++++++++++-
 .../AMDGPU/shrink-instructions-flags.mir      | 24 ++++++++++++++++++
 7 files changed, 96 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 10e2c3a263f17..1fa75504493b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -168,7 +168,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
   }
 
   auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
-                         OrigMI.getDebugLoc(), TII->get(DPPOp));
+                         OrigMI.getDebugLoc(), TII->get(DPPOp))
+    .setMIFlags(OrigMI.getFlags());
+
   bool Fail = false;
   do {
     auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fd1da238a8761..06dd11fdbf618 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3272,7 +3272,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
                                            unsigned Op32) const {
   MachineBasicBlock *MBB = MI.getParent();;
   MachineInstrBuilder Inst32 =
-    BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
+    BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
+    .setMIFlags(MI.getFlags());
 
   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
   // For VOPC instructions, this is replaced by an implicit def of vcc.
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index c4f511abc4aee..9a1855c3458be 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -922,18 +922,24 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
     if (I->modifiesRegister(AMDGPU::VCC, TRI))
       return;
   }
+
   // Make the two new e32 instruction variants.
   // Replace MI with V_{SUB|ADD}_I32_e32
-  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
-  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
-  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
-  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
+    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
+    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
+    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
+    .setMIFlags(MI.getFlags());
+
   MI.eraseFromParent();
+
   // Replace MISucc with V_{SUBB|ADDC}_U32_e32
-  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
-  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
-  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
-  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+  BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
+    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
+    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
+    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
+    .setMIFlags(MISucc.getFlags());
+
   MISucc.eraseFromParent();
 }
 
@@ -1010,7 +1016,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
 
   // Create SDWA version of instruction MI and initialize its operands
   MachineInstrBuilder SDWAInst =
-    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
+    .setMIFlags(MI.getFlags());
 
   // Copy dst, if it is present in original then should also be present in SDWA
   MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
index 0c4d0e0e2384e..859c21d8842fb 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -293,6 +293,8 @@ body:             |
     %19:vgpr_32 = V_ADD_I32_e32 5, %18, implicit-def $vcc, implicit $exec
 ...
 
+---
+
 # check for floating point modifiers
 # GCN-LABEL: name: add_f32_e64
 # GCN: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
@@ -810,3 +812,24 @@ body: |
     %4:sreg_64_xexec = IMPLICIT_DEF
     %5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec
 ...
+
+---
+
+# Make sure flags aren't dropped
+# GCN-LABEL: name: flags_add_f32_e64
+# GCN: %4:vgpr_32 = nnan nofpexcept V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $exec
+name: flags_add_f32_e64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = IMPLICIT_DEF
+
+    %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
+    %4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $exec
+    S_ENDPGM 0, implicit %4
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
index a759972d3a105..c181f51e747fd 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
@@ -3,8 +3,8 @@
 
 # test for 3 consecutive _sdwa's
 # GFX9-LABEL: name:            test1_add_co_sdwa
-# GFX9: V_ADD_I32_sdwa
-# GFX9-NEXT: V_ADDC_U32_e32
+# GFX9: = nsw V_ADD_I32_sdwa
+# GFX9-NEXT: = nuw V_ADDC_U32_e32
 # GFX9: V_ADD_I32_sdwa
 # GFX9-NEXT: V_ADDC_U32_e32
 # GFX9: V_ADD_I32_sdwa
@@ -26,8 +26,8 @@ body:             |
     %22:sreg_32_xm0 = S_MOV_B32 255
     %23:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec
     %30:vreg_64 = COPY $sgpr0_sgpr1
-    %63:vgpr_32, %65:sreg_64_xexec = V_ADD_I32_e64 %30.sub0, %23, 0, implicit $exec
-    %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec
+    %63:vgpr_32, %65:sreg_64_xexec = nsw V_ADD_I32_e64 %30.sub0, %23, 0, implicit $exec
+    %64:vgpr_32, dead %66:sreg_64_xexec = nuw V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec
     %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1
     GLOBAL_STORE_DWORDX2_SADDR %30, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8)
 
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
index 3a3bda4a807ad..8ba20b4a66ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -361,7 +361,6 @@ body:             |
 # GFX9: $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, implicit-def $exec, implicit $exec
 
 
-
 name:            vopc_instructions
 tracksRegLiveness: true
 registers:
@@ -445,3 +444,27 @@ body:             |
     FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# GCN-LABEL: name: preserve_flags
+# GCN: = nnan nofpexcept V_ADD_F32_sdwa 0, %4, 0, %4, 0, 0, 6, 0, 5, 1, implicit $exec
+
+---
+name: preserve_flags
+tracksRegLiveness: true
+
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32 = S_MOV_B32 65535
+    %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec
+    %3:vgpr_32 = V_AND_B32_e32 %1, %2, implicit $exec
+    %4:vgpr_32 = V_LSHLREV_B32_e64 16, %3, implicit $exec
+    %5:vgpr_32 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+    %6:vgpr_32 = V_BFE_U32 %4, 8, 8, implicit $exec
+    %7:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 %5, %6, implicit $mode, implicit $exec
+    S_ENDPGM 0, implicit %7
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
new file mode 100644
index 0000000000000..b8c36bc77148f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
@@ -0,0 +1,24 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -run-pass=si-shrink-instructions %s -o - | FileCheck %s
+
+# Make sure flags are preserved when shrinking instructions
+---
+
+name:            shrink_fadd_f32_flags
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: shrink_fadd_f32_flags
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: %2:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $exec
+    ; CHECK: S_NOP 0
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    S_NOP 0
+
+...

From e7f1067ad6f116ff1e4bfc0f7fe1977f172b0ea0 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 27 May 2020 10:27:44 -0700
Subject: [PATCH 250/770] [lldb/Reproducers] Skip API logging in the DUMMY
 macro

The purpose of the LLDB_RECORD_DUMMY macro is twofold: it is used in
functions that take arguments that we don't know how to serialize (e.g.
void*) and it's used by functions where we want to avoid doing excessive
work because they can be called from a signal handler (e.g.
setTerminalWidth).

To support the latter case, I've disabled API logging from the Recorder
ctor used by the DUMMY macro. This ensures we don't allocate memory when
called from a signal handler.
---
 lldb/include/lldb/Utility/ReproducerInstrumentation.h | 8 ++++----
 lldb/source/Utility/ReproducerInstrumentation.cpp     | 9 +++++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lldb/include/lldb/Utility/ReproducerInstrumentation.h b/lldb/include/lldb/Utility/ReproducerInstrumentation.h
index 346eac52501a6..f06b8c0388185 100644
--- a/lldb/include/lldb/Utility/ReproducerInstrumentation.h
+++ b/lldb/include/lldb/Utility/ReproducerInstrumentation.h
@@ -207,11 +207,10 @@ template <typename... Ts> inline std::string stringify_args(const Ts &... ts) {
 /// anything. It's used to track API boundaries when we cannot record for
 /// technical reasons.
 #define LLDB_RECORD_DUMMY(Result, Class, Method, Signature, ...)               \
-  lldb_private::repro::Recorder _recorder(LLVM_PRETTY_FUNCTION,                \
-                                          stringify_args(__VA_ARGS__));
+  lldb_private::repro::Recorder _recorder;
 
 #define LLDB_RECORD_DUMMY_NO_ARGS(Result, Class, Method)                       \
-  lldb_private::repro::Recorder _recorder(LLVM_PRETTY_FUNCTION);
+  lldb_private::repro::Recorder _recorder;
 
 namespace lldb_private {
 namespace repro {
@@ -727,7 +726,8 @@ struct EmptyArg {};
 /// this class is also used for logging.
 class Recorder {
 public:
-  Recorder(llvm::StringRef pretty_func = {}, std::string &&pretty_args = {});
+  Recorder();
+  Recorder(llvm::StringRef pretty_func, std::string &&pretty_args = {});
   ~Recorder();
 
   /// Records a single function call.
diff --git a/lldb/source/Utility/ReproducerInstrumentation.cpp b/lldb/source/Utility/ReproducerInstrumentation.cpp
index 46bf6b76e1d2e..09aea69d83138 100644
--- a/lldb/source/Utility/ReproducerInstrumentation.cpp
+++ b/lldb/source/Utility/ReproducerInstrumentation.cpp
@@ -179,6 +179,15 @@ unsigned ObjectToIndex::GetIndexForObjectImpl(const void *object) {
   return m_mapping[object];
 }
 
+Recorder::Recorder()
+    : m_serializer(nullptr), m_pretty_func(), m_pretty_args(),
+      m_local_boundary(false), m_result_recorded(true) {
+  if (!g_global_boundary) {
+    g_global_boundary = true;
+    m_local_boundary = true;
+  }
+}
+
 Recorder::Recorder(llvm::StringRef pretty_func, std::string &&pretty_args)
     : m_serializer(nullptr), m_pretty_func(pretty_func),
       m_pretty_args(pretty_args), m_local_boundary(false),

From 6407aa9d2e0e225bc81d3b2602d6e6ed79912ec2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Mart=C3=ADn?= <mardani29@yahoo.es>
Date: Wed, 27 May 2020 18:17:07 +0200
Subject: [PATCH 251/770] [clangd] Add access specifier information to hover
 contents

Summary:
For https://github.com/clangd/clangd/issues/382

This commit adds access specifier information to the hover
contents. For example, the hover information of a class field or
member function will now indicate if the field or member is private,
public, or protected. This can be particularly useful when a developer
is in the implementation file and wants to know if a particular member
definition is public or private.

Reviewers: kadircet

Reviewed By: kadircet

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80472
---
 clang-tools-extra/clang-doc/Generators.cpp    | 14 -----
 clang-tools-extra/clang-doc/Generators.h      |  2 -
 clang-tools-extra/clang-doc/HTMLGenerator.cpp |  4 +-
 clang-tools-extra/clang-doc/MDGenerator.cpp   |  4 +-
 clang-tools-extra/clangd/Hover.cpp            |  6 +-
 clang-tools-extra/clangd/Hover.h              |  3 +
 .../clangd/unittests/HoverTests.cpp           | 60 ++++++++++++++++++-
 clang/include/clang/Basic/Specifiers.h        | 14 +++++
 clang/lib/AST/DeclPrinter.cpp                 | 10 ++--
 clang/lib/AST/JSONNodeDumper.cpp              | 12 ++--
 clang/lib/AST/TextNodeDumper.cpp              | 18 ++----
 11 files changed, 99 insertions(+), 48 deletions(-)

diff --git a/clang-tools-extra/clang-doc/Generators.cpp b/clang-tools-extra/clang-doc/Generators.cpp
index ec7133466f2e9..3b7dcf93411af 100644
--- a/clang-tools-extra/clang-doc/Generators.cpp
+++ b/clang-tools-extra/clang-doc/Generators.cpp
@@ -27,20 +27,6 @@ findGeneratorByName(llvm::StringRef Format) {
 
 // Enum conversion
 
-std::string getAccess(AccessSpecifier AS) {
-  switch (AS) {
-  case AccessSpecifier::AS_public:
-    return "public";
-  case AccessSpecifier::AS_protected:
-    return "protected";
-  case AccessSpecifier::AS_private:
-    return "private";
-  case AccessSpecifier::AS_none:
-    return {};
-  }
-  llvm_unreachable("Unknown AccessSpecifier");
-}
-
 std::string getTagType(TagTypeKind AS) {
   switch (AS) {
   case TagTypeKind::TTK_Class:
diff --git a/clang-tools-extra/clang-doc/Generators.h b/clang-tools-extra/clang-doc/Generators.h
index 799d503b10231..89c6b34c43844 100644
--- a/clang-tools-extra/clang-doc/Generators.h
+++ b/clang-tools-extra/clang-doc/Generators.h
@@ -42,8 +42,6 @@ typedef llvm::Registry<Generator> GeneratorRegistry;
 llvm::Expected<std::unique_ptr<Generator>>
 findGeneratorByName(llvm::StringRef Format);
 
-std::string getAccess(AccessSpecifier AS);
-
 std::string getTagType(TagTypeKind AS);
 
 } // namespace doc
diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
index dc569e2a482c7..49ff36a02be7f 100644
--- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
@@ -402,7 +402,7 @@ genRecordMembersBlock(const llvm::SmallVector<MemberTypeInfo, 4> &Members,
   Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_UL));
   auto &ULBody = Out.back();
   for (const auto &M : Members) {
-    std::string Access = getAccess(M.Access);
+    std::string Access = getAccessSpelling(M.Access).str();
     if (Access != "")
       Access = Access + " ";
     auto LIBody = std::make_unique<TagNode>(HTMLTag::TAG_LI);
@@ -679,7 +679,7 @@ genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx,
   Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_P));
   auto &FunctionHeader = Out.back();
 
-  std::string Access = getAccess(I.Access);
+  std::string Access = getAccessSpelling(I.Access).str();
   if (Access != "")
     FunctionHeader->Children.emplace_back(
         std::make_unique<TextNode>(Access + " "));
diff --git a/clang-tools-extra/clang-doc/MDGenerator.cpp b/clang-tools-extra/clang-doc/MDGenerator.cpp
index 9ad71e435a70c..58c2de96b298c 100644
--- a/clang-tools-extra/clang-doc/MDGenerator.cpp
+++ b/clang-tools-extra/clang-doc/MDGenerator.cpp
@@ -157,7 +157,7 @@ static void genMarkdown(const ClangDocContext &CDCtx, const FunctionInfo &I,
     First = false;
   }
   writeHeader(I.Name, 3, OS);
-  std::string Access = getAccess(I.Access);
+  std::string Access = getAccessSpelling(I.Access).str();
   if (Access != "")
     writeLine(genItalic(Access + " " + I.ReturnType.Type.Name + " " + I.Name +
                         "(" + Stream.str() + ")"),
@@ -250,7 +250,7 @@ static void genMarkdown(const ClangDocContext &CDCtx, const RecordInfo &I,
   if (!I.Members.empty()) {
     writeHeader("Members", 2, OS);
     for (const auto &Member : I.Members) {
-      std::string Access = getAccess(Member.Access);
+      std::string Access = getAccessSpelling(Member.Access).str();
       if (Access != "")
         writeLine(Access + " " + Member.Type.Name + " " + Member.Name, OS);
       else
diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp
index 3d0430b57931d..e2a3a0dd62f52 100644
--- a/clang-tools-extra/clangd/Hover.cpp
+++ b/clang-tools-extra/clangd/Hover.cpp
@@ -468,6 +468,7 @@ HoverInfo getHoverContents(const NamedDecl *D, const SymbolIndex *Index) {
   HoverInfo HI;
   const ASTContext &Ctx = D->getASTContext();
 
+  HI.AccessSpecifier = getAccessSpelling(D->getAccess()).str();
   HI.NamespaceScope = getNamespaceScope(D);
   if (!HI.NamespaceScope->empty())
     HI.NamespaceScope->append("::");
@@ -835,9 +836,12 @@ markup::Document HoverInfo::present() const {
       ScopeComment = "// In namespace " +
                      llvm::StringRef(*NamespaceScope).rtrim(':').str() + '\n';
     }
+    std::string DefinitionWithAccess = !AccessSpecifier.empty()
+                                           ? AccessSpecifier + ": " + Definition
+                                           : Definition;
     // Note that we don't print anything for global namespace, to not annoy
     // non-c++ projects or projects that are not making use of namespaces.
-    Output.addCodeBlock(ScopeComment + Definition);
+    Output.addCodeBlock(ScopeComment + DefinitionWithAccess);
   }
   return Output;
 }
diff --git a/clang-tools-extra/clangd/Hover.h b/clang-tools-extra/clangd/Hover.h
index 931e1c2363a45..b712d844e33d0 100644
--- a/clang-tools-extra/clangd/Hover.h
+++ b/clang-tools-extra/clangd/Hover.h
@@ -59,6 +59,9 @@ struct HoverInfo {
   /// Source code containing the definition of the symbol.
   std::string Definition;
 
+  /// Access specifier for declarations inside class/struct/unions, empty for
+  /// others.
+  std::string AccessSpecifier;
   /// Pretty-printed variable type.
   /// Set only for variables.
   llvm::Optional<std::string> Type;
diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
index e5ff0ee364d83..dc818ea661938 100644
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -12,6 +12,7 @@
 #include "TestIndex.h"
 #include "TestTU.h"
 #include "index/MemIndex.h"
+#include "clang/Basic/Specifiers.h"
 #include "clang/Index/IndexSymbol.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/StringRef.h"
@@ -79,6 +80,7 @@ TEST(Hover, Structured) {
          HI.Type = "char";
          HI.Offset = 0;
          HI.Size = 1;
+         HI.AccessSpecifier = "public";
        }},
       // Local to class method.
       {R"cpp(
@@ -115,6 +117,7 @@ TEST(Hover, Structured) {
          HI.Type = "char";
          HI.Offset = 0;
          HI.Size = 1;
+         HI.AccessSpecifier = "public";
        }},
       // Struct definition shows size.
       {R"cpp(
@@ -344,6 +347,7 @@ class Foo {})cpp";
          HI.Kind = index::SymbolKind::Constructor;
          HI.Definition = "X()";
          HI.Parameters.emplace();
+         HI.AccessSpecifier = "public";
        }},
       {"class X { [[^~]]X(); };", // FIXME: Should be [[~X]]()
        [](HoverInfo &HI) {
@@ -353,6 +357,7 @@ class Foo {})cpp";
          HI.Kind = index::SymbolKind::Destructor;
          HI.Definition = "~X()";
          HI.Parameters.emplace();
+         HI.AccessSpecifier = "private";
        }},
       {"class X { [[op^erator]] int(); };",
        [](HoverInfo &HI) {
@@ -362,6 +367,7 @@ class Foo {})cpp";
          HI.Kind = index::SymbolKind::ConversionFunction;
          HI.Definition = "operator int()";
          HI.Parameters.emplace();
+         HI.AccessSpecifier = "private";
        }},
       {"class X { operator [[^X]](); };",
        [](HoverInfo &HI) {
@@ -494,6 +500,7 @@ class Foo {})cpp";
          HI.NamespaceScope = "";
          HI.LocalScope = "Add<1, 2>::";
          HI.Value = "3";
+         HI.AccessSpecifier = "public";
        }},
       {R"cpp(
         constexpr int answer() { return 40 + 2; }
@@ -606,6 +613,7 @@ class Foo {})cpp";
          HI.Definition = "typename T = int";
          HI.LocalScope = "foo::";
          HI.Type = "typename";
+         HI.AccessSpecifier = "public";
        }},
       {// TemplateTemplate Type Parameter
        R"cpp(
@@ -618,6 +626,7 @@ class Foo {})cpp";
          HI.Definition = "template <typename> class T";
          HI.LocalScope = "foo::";
          HI.Type = "template <typename> class";
+         HI.AccessSpecifier = "public";
        }},
       {// NonType Template Parameter
        R"cpp(
@@ -630,6 +639,7 @@ class Foo {})cpp";
          HI.Definition = "int T = 5";
          HI.LocalScope = "foo::";
          HI.Type = "int";
+         HI.AccessSpecifier = "public";
        }},
 
       {// Getter
@@ -646,6 +656,7 @@ class Foo {})cpp";
          HI.Type = "float ()";
          HI.ReturnType = "float";
          HI.Parameters.emplace();
+         HI.AccessSpecifier = "public";
        }},
       {// Setter
        R"cpp(
@@ -664,6 +675,7 @@ class Foo {})cpp";
          HI.Parameters->emplace_back();
          HI.Parameters->back().Type = "float";
          HI.Parameters->back().Name = "v";
+         HI.AccessSpecifier = "public";
        }},
       {// Setter (builder)
        R"cpp(
@@ -682,6 +694,7 @@ class Foo {})cpp";
          HI.Parameters->emplace_back();
          HI.Parameters->back().Type = "float";
          HI.Parameters->back().Name = "v";
+         HI.AccessSpecifier = "public";
        }},
   };
   for (const auto &Case : Cases) {
@@ -715,6 +728,7 @@ class Foo {})cpp";
     EXPECT_EQ(H->Value, Expected.Value);
     EXPECT_EQ(H->Size, Expected.Size);
     EXPECT_EQ(H->Offset, Expected.Offset);
+    EXPECT_EQ(H->AccessSpecifier, Expected.AccessSpecifier);
   }
 }
 
@@ -1964,7 +1978,51 @@ Size: 4 bytes
 // In test::Bar
 def)",
       },
-  };
+      {
+          [](HoverInfo &HI) {
+            HI.Kind = index::SymbolKind::Field;
+            HI.AccessSpecifier = "public";
+            HI.Name = "foo";
+            HI.LocalScope = "test::Bar::";
+            HI.Definition = "def";
+          },
+          R"(field foo
+
+// In test::Bar
+public: def)",
+      },
+      {
+          [](HoverInfo &HI) {
+            HI.Definition = "int method()";
+            HI.AccessSpecifier = "protected";
+            HI.Kind = index::SymbolKind::InstanceMethod;
+            HI.NamespaceScope = "";
+            HI.LocalScope = "cls<int>::";
+            HI.Name = "method";
+            HI.Parameters.emplace();
+            HI.ReturnType = "int";
+            HI.Type = "int ()";
+          },
+          R"(instance-method method
+
+→ int
+
+// In cls<int>
+protected: int method())",
+      },
+      {
+          [](HoverInfo &HI) {
+            HI.Kind = index::SymbolKind::Union;
+            HI.AccessSpecifier = "private";
+            HI.Name = "foo";
+            HI.NamespaceScope = "ns1::";
+            HI.Definition = "union foo {}";
+          },
+          R"(union foo
+
+// In namespace ns1
+private: union foo {})",
+      }};
 
   for (const auto &C : Cases) {
     HoverInfo HI;
diff --git a/clang/include/clang/Basic/Specifiers.h b/clang/include/clang/Basic/Specifiers.h
index e6c2cb39566ce..2c80dd4fa8103 100644
--- a/clang/include/clang/Basic/Specifiers.h
+++ b/clang/include/clang/Basic/Specifiers.h
@@ -365,6 +365,20 @@ namespace clang {
   };
 
   llvm::StringRef getParameterABISpelling(ParameterABI kind);
+
+  inline llvm::StringRef getAccessSpelling(AccessSpecifier AS) {
+    switch (AS) {
+    case AccessSpecifier::AS_public:
+      return "public";
+    case AccessSpecifier::AS_protected:
+      return "protected";
+    case AccessSpecifier::AS_private:
+      return "private";
+    case AccessSpecifier::AS_none:
+      return {};
+    }
+    llvm_unreachable("Unknown AccessSpecifier");
+  }
 } // end namespace clang
 
 #endif // LLVM_CLANG_BASIC_SPECIFIERS_H
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index fc2a166e11b4c..4df6512e6c76c 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -289,12 +289,10 @@ void DeclPrinter::ProcessDeclGroup(SmallVectorImpl<Decl*>& Decls) {
 }
 
 void DeclPrinter::Print(AccessSpecifier AS) {
-  switch(AS) {
-  case AS_none:      llvm_unreachable("No access specifier!");
-  case AS_public:    Out << "public"; break;
-  case AS_protected: Out << "protected"; break;
-  case AS_private:   Out << "private"; break;
-  }
+  const auto AccessSpelling = getAccessSpelling(AS);
+  if (AccessSpelling.empty())
+    llvm_unreachable("No access specifier!");
+  Out << AccessSpelling;
 }
 
 void DeclPrinter::PrintConstructorInitializers(CXXConstructorDecl *CDecl,
diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
index 91281fb44bfa9..8edfed673ce21 100644
--- a/clang/lib/AST/JSONNodeDumper.cpp
+++ b/clang/lib/AST/JSONNodeDumper.cpp
@@ -1,5 +1,6 @@
 #include "clang/AST/JSONNodeDumper.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Basic/Specifiers.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/StringSwitch.h"
 
@@ -465,13 +466,10 @@ JSONNodeDumper::createCXXRecordDefinitionData(const CXXRecordDecl *RD) {
 #undef FIELD2
 
 std::string JSONNodeDumper::createAccessSpecifier(AccessSpecifier AS) {
-  switch (AS) {
-  case AS_none: return "none";
-  case AS_private: return "private";
-  case AS_protected: return "protected";
-  case AS_public: return "public";
-  }
-  llvm_unreachable("Unknown access specifier");
+  const auto AccessSpelling = getAccessSpelling(AS);
+  if (AccessSpelling.empty())
+    return "none";
+  return AccessSpelling.str();
 }
 
 llvm::json::Object
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 9dbe557075391..1b640a8cbe40e 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -17,6 +17,7 @@
 #include "clang/AST/LocInfoType.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Basic/Specifiers.h"
 
 using namespace clang;
 
@@ -436,19 +437,10 @@ void TextNodeDumper::dumpName(const NamedDecl *ND) {
 }
 
 void TextNodeDumper::dumpAccessSpecifier(AccessSpecifier AS) {
-  switch (AS) {
-  case AS_none:
-    break;
-  case AS_public:
-    OS << "public";
-    break;
-  case AS_protected:
-    OS << "protected";
-    break;
-  case AS_private:
-    OS << "private";
-    break;
-  }
+  const auto AccessSpelling = getAccessSpelling(AS);
+  if (AccessSpelling.empty())
+    return;
+  OS << AccessSpelling;
 }
 
 void TextNodeDumper::dumpCleanupObject(

From a2a3e9f0a6e91103a0d1fa73086dbdf109c48f69 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 10:21:59 -0700
Subject: [PATCH 252/770] [Driver] Support -fsanitize=shadow-call-stack on
 aarch64_be

Fixes https://bugs.llvm.org/show_bug.cgi?id=46076

Reviewed By: nickdesaulniers, pcc

Differential Revision: https://reviews.llvm.org/D80647
---
 clang/lib/Driver/SanitizerArgs.cpp | 3 +--
 clang/test/Driver/fsanitize.c      | 4 ++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 9beca156e93e5..6281991ebf04e 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -489,8 +489,7 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
         << lastArgumentForMask(D, Args, Kinds & NeedsLTO) << "-flto";
   }
 
-  if ((Kinds & SanitizerKind::ShadowCallStack) &&
-      TC.getTriple().getArch() == llvm::Triple::aarch64 &&
+  if ((Kinds & SanitizerKind::ShadowCallStack) && TC.getTriple().isAArch64() &&
       !llvm::AArch64::isX18ReservedByDefault(TC.getTriple()) &&
       !Args.hasArg(options::OPT_ffixed_x18)) {
     D.Diag(diag::err_drv_argument_only_allowed_with)
diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c
index 55a5e7a2e2eb9..dd32f8e466bd5 100644
--- a/clang/test/Driver/fsanitize.c
+++ b/clang/test/Driver/fsanitize.c
@@ -813,6 +813,10 @@
 // CHECK-SCS-MINIMAL: "-fsanitize=shadow-call-stack"
 // CHECK-SCS-MINIMAL: "-fsanitize-minimal-runtime"
 
+// RUN: %clang -target aarch64 -fsanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=AARCH64-SCS
+// RUN: %clang -target aarch64_be -fsanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=AARCH64-SCS
+// AARCH64-SCS: "-fsanitize=shadow-call-stack"
+
 // RUN: %clang -target aarch64-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
 // RUN: %clang -target arm-linux-androideabi -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO

From d37ce53ad30f3d5e7fb37b5bb9c49793ca33d2df Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 27 May 2020 11:22:46 -0400
Subject: [PATCH 253/770] AMDGPU: Set StackPointerRegisterToSaveRestore

This will enable selecting non-entry block allocas. Skip the SP write
check in the base isSchedulingBoundary implementation to preserve the
previous scheduling behavior and avoid test churn. It's apparently for
compile time reasons, but if we were to use this more work would be
needed since in some of the failing tests, we seem to incorrectly get
hazard nops inserted.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  3 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp    | 14 +++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2f5b6cdeec25..689aece39dee5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -842,6 +842,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
 
+  // FIXME: In other contexts we pretend this is a per-function property.
+  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
+
   setSchedulingPreference(Sched::RegPressure);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 06dd11fdbf618..18d08362512d4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2936,13 +2936,21 @@ static bool changesVGPRIndexingMode(const MachineInstr &MI) {
 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                        const MachineBasicBlock *MBB,
                                        const MachineFunction &MF) const {
-  // XXX - Do we want the SP check in the base implementation?
+  // Skipping the check for SP writes in the base implementation. The reason it
+  // was added was apparently due to compile time concerns.
+  //
+  // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
+  // but is probably avoidable.
+
+  // Copied from base implementation.
+  // Terminators and labels can't be scheduled around.
+  if (MI.isTerminator() || MI.isPosition())
+    return true;
 
   // Target-independent instructions do not have an implicit-use of EXEC, even
   // when they operate on VGPRs. Treating EXEC modifications as scheduling
   // boundaries prevents incorrect movements of such instructions.
-  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
-         MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+  return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
          MI.getOpcode() == AMDGPU::S_DENORM_MODE ||

From 8d9ff2318530d91be04ccced107c3ef04ba2255f Mon Sep 17 00:00:00 2001
From: jasonliu <jasonliu.development@gmail.com>
Date: Wed, 27 May 2020 17:52:21 +0000
Subject: [PATCH 254/770] [NFC][XCOFF][AIX] Return function entry point symbol
 with dedicated function

Use getFunctionEntryPointSymbol whenever possible to encapsulate the
implementation detail and reduce duplicated logic.

Differential Revision: https://reviews.llvm.org/D80402
---
 .../llvm/CodeGen/TargetLoweringObjectFileImpl.h        |  3 +++
 llvm/include/llvm/Target/TargetLoweringObjectFile.h    |  7 +++++++
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp             | 10 ++++------
 llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp      |  8 ++++++++
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 21a4635ca0ce1..6e2c0973e3547 100644
--- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -262,6 +262,9 @@ class TargetLoweringObjectFileXCOFF : public TargetLoweringObjectFile {
   /// For functions, this will always return a function descriptor symbol.
   MCSymbol *getTargetSymbol(const GlobalValue *GV,
                             const TargetMachine &TM) const override;
+
+  MCSymbol *getFunctionEntryPointSymbol(const Function *F,
+                                        const TargetMachine &TM) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
index 774b4470ca460..cc6c93b6ee2b9 100644
--- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
@@ -245,6 +245,13 @@ class TargetLoweringObjectFile : public MCObjectFileInfo {
     return nullptr;
   }
 
+  /// If supported, return the function entry point symbol.
+  /// Otherwise, returns nullptr.
+  virtual MCSymbol *getFunctionEntryPointSymbol(const Function *F,
+                                                const TargetMachine &TM) const {
+    return nullptr;
+  }
+
 protected:
   virtual MCSection *SelectSectionForGlobal(const GlobalObject *GO,
                                             SectionKind Kind,
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 1a2b3761b3a79..3be48935f2ab8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1503,6 +1503,8 @@ bool AsmPrinter::doFinalization(Module &M) {
   // Emit remaining GOT equivalent globals.
   emitGlobalGOTEquivs();
 
+  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+
   // Emit linkage(XCOFF) and visibility info for declarations
   for (const Function &F : M) {
     if (!F.isDeclarationForLinker())
@@ -1513,8 +1515,7 @@ bool AsmPrinter::doFinalization(Module &M) {
     if (TM.getTargetTriple().isOSBinFormatXCOFF() && !F.isIntrinsic()) {
 
       // Get the function entry point symbol.
-      MCSymbol *FnEntryPointSym = OutContext.getOrCreateSymbol(
-          "." + cast<MCSymbolXCOFF>(Name)->getUnqualifiedName());
+      MCSymbol *FnEntryPointSym = TLOF.getFunctionEntryPointSymbol(&F, TM);
       if (cast<MCSymbolXCOFF>(FnEntryPointSym)->hasRepresentedCsectSet())
         // Emit linkage for the function entry point.
         emitLinkage(&F, FnEntryPointSym);
@@ -1536,8 +1537,6 @@ bool AsmPrinter::doFinalization(Module &M) {
   if (remarks::RemarkStreamer *RS = M.getContext().getMainRemarkStreamer())
     emitRemarksSection(*RS);
 
-  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
-
   TLOF.emitModuleMetadata(*OutStreamer, M);
 
   if (TM.getTargetTriple().isOSBinFormatELF()) {
@@ -1786,8 +1785,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
                                " initalized first.");
 
     // Get the function entry point symbol.
-    CurrentFnSym = OutContext.getOrCreateSymbol(
-        "." + cast<MCSymbolXCOFF>(CurrentFnDescSym)->getUnqualifiedName());
+    CurrentFnSym = getObjFileLowering().getFunctionEntryPointSymbol(&F, TM);
   }
 
   CurrentFnSymForSize = CurrentFnSym;
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 38a0223688ba1..586de4fd97f0d 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -2150,6 +2150,14 @@ XCOFF::StorageClass TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(
   llvm_unreachable("Unknown linkage type!");
 }
 
+MCSymbol *TargetLoweringObjectFileXCOFF::getFunctionEntryPointSymbol(
+    const Function *F, const TargetMachine &TM) const {
+  SmallString<128> NameStr;
+  NameStr.push_back('.');
+  getNameWithPrefix(NameStr, F, TM);
+  return getContext().getOrCreateSymbol(NameStr);
+}
+
 MCSection *TargetLoweringObjectFileXCOFF::getSectionForFunctionDescriptor(
     const Function *F, const TargetMachine &TM) const {
   SmallString<128> NameStr;

From b9c6871a9570975827dc0bbeb39131c99c8daf8e Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 10:45:07 -0700
Subject: [PATCH 255/770] [Driver] Support -fsanitize=shadow-call-stack and
 cfi-icall on aarch64_be

D80647 did not fix https://bugs.llvm.org/show_bug.cgi?id=46076
This is the fix.
---
 clang/lib/Driver/ToolChain.cpp | 9 +++------
 clang/test/Driver/fsanitize.c  | 5 +++--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 6258276a00928..ad66e8e6b5d31 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -975,15 +975,12 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
   if (getTriple().getArch() == llvm::Triple::x86 ||
       getTriple().getArch() == llvm::Triple::x86_64 ||
       getTriple().getArch() == llvm::Triple::arm ||
-      getTriple().getArch() == llvm::Triple::aarch64 ||
       getTriple().getArch() == llvm::Triple::wasm32 ||
-      getTriple().getArch() == llvm::Triple::wasm64)
+      getTriple().getArch() == llvm::Triple::wasm64 || getTriple().isAArch64())
     Res |= SanitizerKind::CFIICall;
-  if (getTriple().getArch() == llvm::Triple::x86_64 ||
-      getTriple().getArch() == llvm::Triple::aarch64)
+  if (getTriple().getArch() == llvm::Triple::x86_64 || getTriple().isAArch64())
     Res |= SanitizerKind::ShadowCallStack;
-  if (getTriple().getArch() == llvm::Triple::aarch64 ||
-      getTriple().getArch() == llvm::Triple::aarch64_be)
+  if (getTriple().isAArch64())
     Res |= SanitizerKind::MemTag;
   return Res;
 }
diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c
index dd32f8e466bd5..9ff2bdf58d6cd 100644
--- a/clang/test/Driver/fsanitize.c
+++ b/clang/test/Driver/fsanitize.c
@@ -577,6 +577,7 @@
 // RUN: %clang -target aarch64-linux-gnu -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
 // RUN: %clang -target arm-linux-android -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
 // RUN: %clang -target aarch64-linux-android -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
+// RUN: %clang -target aarch64_be -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
 // CHECK-CFI: -emit-llvm-bc{{.*}}-fsanitize=cfi-derived-cast,cfi-icall,cfi-mfcall,cfi-unrelated-cast,cfi-nvcall,cfi-vcall
 // CHECK-CFI-NOMFCALL: -emit-llvm-bc{{.*}}-fsanitize=cfi-derived-cast,cfi-icall,cfi-unrelated-cast,cfi-nvcall,cfi-vcall
 // CHECK-CFI-DCAST: -emit-llvm-bc{{.*}}-fsanitize=cfi-derived-cast
@@ -813,8 +814,8 @@
 // CHECK-SCS-MINIMAL: "-fsanitize=shadow-call-stack"
 // CHECK-SCS-MINIMAL: "-fsanitize-minimal-runtime"
 
-// RUN: %clang -target aarch64 -fsanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=AARCH64-SCS
-// RUN: %clang -target aarch64_be -fsanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=AARCH64-SCS
+// RUN: %clang -target aarch64 -fsanitize=shadow-call-stack -ffixed-x18 %s -### 2>&1 | FileCheck %s --check-prefix=AARCH64-SCS
+// RUN: %clang -target aarch64_be -fsanitize=shadow-call-stack -ffixed-x18 %s -### 2>&1 | FileCheck %s --check-prefix=AARCH64-SCS
 // AARCH64-SCS: "-fsanitize=shadow-call-stack"
 
 // RUN: %clang -target aarch64-linux-gnu -fsanitize=scudo %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SCUDO

From eadf2959567c89bebff153feac873cbc1b71eb04 Mon Sep 17 00:00:00 2001
From: Rithik Sharma <rithiksh02@gmail.com>
Date: Wed, 27 May 2020 18:00:06 +0000
Subject: [PATCH 256/770] [CodeMoverUtils] Use dominator tree level to decide
 the direction of code motion

Summary: Currently isSafeToMoveBefore uses DFS numbering for determining
the relative position of instruction and insert point which is not
always correct. This PR proposes the use of Dominator Tree depth for the
same. If a node is at a higher level than the insert point then it is
safe to say that we want to move in the forward direction.
Authored By: RithikSharma
Reviewer: Whitney, nikic, bmahjour, etiotto, fhahn
Reviewed By: Whitney
Subscribers: fhahn, hiraditya, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D80084
---
 .../llvm/Analysis/OrderedInstructions.h       |  6 ++++
 llvm/lib/Analysis/OrderedInstructions.cpp     | 12 +++++++
 llvm/lib/Transforms/Utils/CodeMoverUtils.cpp  | 17 ++++-----
 .../Transforms/Utils/CodeMoverUtilsTest.cpp   | 36 +++++++++++++++++++
 4 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/Analysis/OrderedInstructions.h b/llvm/include/llvm/Analysis/OrderedInstructions.h
index 302509380a03a..e380a0045377c 100644
--- a/llvm/include/llvm/Analysis/OrderedInstructions.h
+++ b/llvm/include/llvm/Analysis/OrderedInstructions.h
@@ -45,6 +45,12 @@ class OrderedInstructions {
   /// or if the first instruction comes before the second in the same basic
   /// block.
   bool dfsBefore(const Instruction *, const Instruction *) const;
+
+  /// Return true if the first instruction comes before the second. When the
+  /// instructions are in different basic blocks, the order is decided by the
+  /// level of their blocks in the dominator tree (lower level first);
+  /// otherwise, by their relative position within the shared basic block.
+  bool domTreeLevelBefore(const Instruction *, const Instruction *) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Analysis/OrderedInstructions.cpp b/llvm/lib/Analysis/OrderedInstructions.cpp
index 11ab3e0927d2e..98a65f31a36cf 100644
--- a/llvm/lib/Analysis/OrderedInstructions.cpp
+++ b/llvm/lib/Analysis/OrderedInstructions.cpp
@@ -43,3 +43,15 @@ bool OrderedInstructions::dfsBefore(const Instruction *InstA,
   DomTreeNode *DB = DT->getNode(InstB->getParent());
   return DA->getDFSNumIn() < DB->getDFSNumIn();
 }
+
+bool OrderedInstructions::domTreeLevelBefore(const Instruction *InstA,
+                                             const Instruction *InstB) const {
+  // Use ordered basic block in case the 2 instructions are in the same basic
+  // block.
+  if (InstA->getParent() == InstB->getParent())
+    return localDominates(InstA, InstB);
+
+  DomTreeNode *DA = DT->getNode(InstA->getParent());
+  DomTreeNode *DB = DT->getNode(InstB->getParent());
+  return DA->getLevel() < DB->getLevel();
+}
diff --git a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
index 383e749f32c46..4583ff74167a5 100644
--- a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
@@ -317,25 +317,20 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
   if (!isControlFlowEquivalent(I, InsertPoint, DT, PDT))
     return reportInvalidCandidate(I, NotControlFlowEquivalent);
 
-  OrderedInstructions OI(&DT);
-  DT.updateDFSNumbers();
-  const bool MoveForward = OI.dfsBefore(&I, &InsertPoint);
-  if (MoveForward) {
-    // When I is being moved forward, we need to make sure the InsertPoint
-    // dominates every users. Or else, a user may be using an undefined I.
+  if (!DT.dominates(&InsertPoint, &I))
     for (const Use &U : I.uses())
       if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
         if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U))
           return false;
-  } else {
-    // When I is being moved backward, we need to make sure all its opernads
-    // dominates the InsertPoint. Or else, an operand may be undefined for I.
+  if (!DT.dominates(&I, &InsertPoint))
     for (const Value *Op : I.operands())
       if (auto *OpInst = dyn_cast<Instruction>(Op))
-        if (&InsertPoint == OpInst || !OI.dominates(OpInst, &InsertPoint))
+        if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint))
           return false;
-  }
 
+  OrderedInstructions OI(&DT);
+  DT.updateDFSNumbers();
+  const bool MoveForward = OI.domTreeLevelBefore(&I, &InsertPoint);
   Instruction &StartInst = (MoveForward ? I : InsertPoint);
   Instruction &EndInst = (MoveForward ? InsertPoint : I);
   SmallPtrSet<Instruction *, 10> InstsToCheck;
diff --git a/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp
index bfa269e8f6ee8..ceeb4071ab7d4 100644
--- a/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp
@@ -613,3 +613,39 @@ TEST(CodeMoverUtils, IsSafeToMoveTest3) {
         EXPECT_TRUE(isSafeToMoveBefore(*IncInst, *CmpInst, DT, PDT, DI));
       });
 }
+
+TEST(CodeMoverUtils, IsSafeToMoveTest4) {
+  LLVMContext C;
+
+  std::unique_ptr<Module> M =
+      parseIR(C, R"(define void @foo(i1 %cond, i32 %op0, i32 %op1) {
+                 entry:
+                   br i1 %cond, label %if.end.first, label %if.then.first
+                 if.then.first:
+                   %add = add i32 %op0, %op1
+                   %user = add i32 %add, 1
+                   br label %if.end.first
+                 if.end.first:
+                   br i1 %cond, label %if.end.second, label %if.then.second
+                 if.then.second:
+                   %sub_op0 = add i32 %op0, 1
+                   %sub = sub i32 %sub_op0, %op1
+                   br label %if.end.second
+                 if.end.second:
+                   ret void
+                 })");
+
+  run(*M, "foo",
+      [&](Function &F, DominatorTree &DT, PostDominatorTree &PDT,
+          DependenceInfo &DI) {
+        Instruction *AddInst = getInstructionByName(F, "add");
+        Instruction *SubInst = getInstructionByName(F, "sub");
+
+        // Cannot move as %user uses %add and %sub doesn't dominate %user.
+        EXPECT_FALSE(isSafeToMoveBefore(*AddInst, *SubInst, DT, PDT, DI));
+
+        // Cannot move as %sub_op0 is an operand of %sub and %add doesn't
+        // dominate %sub_op0.
+        EXPECT_FALSE(isSafeToMoveBefore(*SubInst, *AddInst, DT, PDT, DI));
+      });
+}

From c295a65da496f5e982402e8f83e417659c7dd166 Mon Sep 17 00:00:00 2001
From: aartbik <ajcbik@google.com>
Date: Wed, 27 May 2020 10:31:29 -0700
Subject: [PATCH 257/770] [mlir] [VectorOps] Add 'vector.flat_transpose'
 operation

Summary:
Provides a representation of the linearized LLVM intrinsic.
With tests and lowering implementation to LLVM IR dialect.
Prepares better lowering for 2-D vector.transpose.

Reviewers: nicolasvasilache, ftynse, reidtatge, bkramer, dcaballe

Reviewed By: ftynse, dcaballe

Subscribers: mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80419
---
 mlir/include/mlir/Dialect/Vector/VectorOps.td | 55 +++++++++++++++++++
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      | 22 ++++++++
 .../VectorToLLVM/vector-to-llvm.mlir          | 12 ++++
 mlir/test/Dialect/Vector/invalid.mlir         |  7 +++
 mlir/test/Dialect/Vector/ops.mlir             | 44 +++++++++++----
 5 files changed, 128 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td
index 1b978e44dd6ab..4065d19b6c8a9 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td
@@ -1482,6 +1482,9 @@ def Vector_PrintOp :
 
 //===----------------------------------------------------------------------===//
 // Ops used for supporting progressive lowering and conversion type changes.
+// The Ops are typically not used directly by higher level dialects, but are
+// used by intra-dialect rewriting rules to bring vector operations closer
+// to the hardware ISA.
 //===----------------------------------------------------------------------===//
 
 /// Vector dialect matrix multiplication op that operates on flattened 1-D
@@ -1510,12 +1513,20 @@ def Vector_MatmulOp : Vector_Op<"matrix_multiply", [NoSideEffect,
   let description = [{
     This is the counterpart of llvm.matrix.multiply in MLIR. It serves the
     purposes of more progressive lowering and localized type conversion.
+    Higher levels typically lower matrix multiplications into 'vector.contract'
+    operations. Subsequent rewriting rules progressively lower these operations
+    into 'vector.matrix_multiply' operations to bring the operations closer
+    to the hardware ISA.
 
     The ‘vector.matrix_multiply’ op treats `lhs` as matrix with <lhs_rows> rows
     and <lhs_columns> columns, `rhs` as matrix with <lhs_columns> rows and
     <rhs_columns> and multiplies them. The result matrix is returned embedded in
     the result vector.
 
+    Also see:
+
+    http://llvm.org/docs/LangRef.html#llvm-matrix-multiply-intrinsic
+
     Example:
 
     ```mlir
@@ -1541,4 +1552,48 @@ def Vector_MatmulOp : Vector_Op<"matrix_multiply", [NoSideEffect,
     "`:` `(` type($lhs) `,` type($rhs) `)` `->` type($res)";
 }
 
+/// Vector dialect matrix transposition op that operates on flattened 1-D
+/// MLIR vectors. This is the counterpart of llvm.matrix.transpose in MLIR.
+/// This may seem redundant with vector.transpose but it serves the purposes of
+/// more progressive lowering and localized type conversion on the path:
+///   `vector<...x...xf32> -> vector<...xf32> -> !llvm<... x float>`.
+def Vector_FlatTransposeOp : Vector_Op<"flat_transpose", [NoSideEffect,
+  PredOpTrait<"source operand and result have same element type",
+                 TCresVTEtIsSameAsOpBase<0, 0>>]>,
+    Arguments<(
+      // TODO(ntv, fhahn, ajcbik): tighten vector element types that make sense.
+      ins VectorOfRankAndType<[1],
+            [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$matrix,
+          I32Attr:$rows, I32Attr:$columns)>,
+    Results<(
+      outs VectorOfRankAndType<[1],
+             [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$res)> {
+  let summary = "Vector matrix transposition on flattened 1-D MLIR vectors";
+  let description = [{
+    This is the counterpart of llvm.matrix.transpose in MLIR. It serves
+    the purposes of more progressive lowering and localized type conversion.
+    Higher levels typically lower matrix transpositions into 'vector.transpose'
+    operations. Subsequent rewriting rules progressively lower these operations
+    into 'vector.flat_transpose' operations to bring the operations closer
+    to the hardware ISA.
+
+    The ‘vector.flat_transpose’ op treats the 1-D input `matrix` as
+    a 2-D matrix with <rows> rows and <columns> columns, and returns the
+    transposed matrix in flattened form in 'res'.
+
+    Also see:
+
+    http://llvm.org/docs/LangRef.html#llvm-matrix-transpose-intrinsic
+
+    Example:
+
+    ```mlir
+    %1 = vector.flat_transpose %0 { rows = 4: i32, columns = 4: i32 }
+       : vector<16xf32> -> vector<16xf32>
+    ```
+  }];
+  let verifier = ?;
+  let assemblyFormat = "$matrix attr-dict `:` type($matrix) `->` type($res)";
+}
+
 #endif // VECTOR_OPS
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 975807ca86712..5b3a01c7512f3 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -148,6 +148,27 @@ class VectorMatmulOpConversion : public ConvertToLLVMPattern {
   }
 };
 
+/// Conversion pattern for a vector.flat_transpose.
+/// This is lowered directly to the proper llvm.intr.matrix.transpose.
+class VectorFlatTransposeOpConversion : public ConvertToLLVMPattern {
+public:
+  explicit VectorFlatTransposeOpConversion(MLIRContext *context,
+                                           LLVMTypeConverter &typeConverter)
+      : ConvertToLLVMPattern(vector::FlatTransposeOp::getOperationName(),
+                             context, typeConverter) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto transOp = cast<vector::FlatTransposeOp>(op);
+    auto adaptor = vector::FlatTransposeOpOperandAdaptor(operands);
+    rewriter.replaceOpWithNewOp<LLVM::MatrixTransposeOp>(
+        transOp, typeConverter.convertType(transOp.res().getType()),
+        adaptor.matrix(), transOp.rows(), transOp.columns());
+    return success();
+  }
+};
+
 class VectorReductionOpConversion : public ConvertToLLVMPattern {
 public:
   explicit VectorReductionOpConversion(MLIRContext *context,
@@ -1157,6 +1178,7 @@ void mlir::populateVectorToLLVMMatrixConversionPatterns(
     LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
   MLIRContext *ctx = converter.getDialect()->getContext();
   patterns.insert<VectorMatmulOpConversion>(ctx, converter);
+  patterns.insert<VectorFlatTransposeOpConversion>(ctx, converter);
 }
 
 namespace {
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 26e3e9dbe2b1e..6150ac78fc2a5 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -952,3 +952,15 @@ func @genbool_1d() -> vector<8xi1> {
 // CHECK: %[[T8:.*]] = llvm.mlir.constant(3 : i64) : !llvm.i64
 // CHECK: %[[T9:.*]] = llvm.insertelement %[[T0]], %[[T7]][%[[T8]] : !llvm.i64] : !llvm<"<8 x i1>">
 // CHECK: llvm.return %9 : !llvm<"<8 x i1>">
+
+// CHECK-LABEL: func @flat_transpose
+// CHECK-SAME:  %[[A:.*]]: !llvm<"<16 x float>">
+// CHECK:       %[[T:.*]] = llvm.intr.matrix.transpose %[[A]]
+// CHECK-SAME:      {columns = 4 : i32, rows = 4 : i32} :
+// CHECK-SAME:      !llvm<"<16 x float>"> into !llvm<"<16 x float>">
+// CHECK:       llvm.return %[[T]] : !llvm<"<16 x float>">
+func @flat_transpose(%arg0: vector<16xf32>) -> vector<16xf32> {
+  %0 = vector.flat_transpose %arg0 { rows = 4: i32, columns = 4: i32 }
+     : vector<16xf32> -> vector<16xf32>
+  return %0 : vector<16xf32>
+}
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index cc72511a6e782..1f6da8190baeb 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1145,6 +1145,13 @@ func @transpose_dim_size_mismatch(%arg0: vector<11x7x3x2xi32>) {
 
 // -----
 
+func @flat_transpose_type_mismatch(%arg0: vector<16xf32>) {
+  // expected-error@+1 {{'vector.flat_transpose' op failed to verify that source operand and result have same element type}}
+  %0 = vector.flat_transpose %arg0 { rows = 4: i32, columns = 4: i32 } : vector<16xf32> -> vector<16xf64>
+}
+
+// -----
+
 func @type_cast_layout(%arg0: memref<4x3xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s0 + d1 * s1 + s2)>>) {
   // expected-error@+1 {{expects operand to be a memref with no layout}}
   %0 = vector.type_cast %arg0: memref<4x3xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s0 + d1 * s1 + s2)>> to memref<vector<4x3xf32>>
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 57c03c903fe89..dbffe4206f12a 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -140,7 +140,7 @@ func @extract_strided_slice(%arg0: vector<4x8x16xf32>) -> vector<2x2x16xf32> {
   indexing_maps = #contraction_to_scalar_accesses,
   iterator_types = ["reduction"]
 }
-// CHECK-LABEL: contraction_to_scalar
+// CHECK-LABEL: @contraction_to_scalar
 func @contraction_to_scalar(%arg0: vector<10xf32>, %arg1: vector<10xf32>) -> f32 {
   // CHECK:      %[[C0:.*]] = constant 0.000000e+00 : f32
   %f0 = constant 0.0: f32
@@ -172,7 +172,7 @@ func @contraction_to_scalar(%arg0: vector<10xf32>, %arg1: vector<10xf32>) -> f32
   iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction",
                     "reduction"]
 }
-// CHECK-LABEL: contraction
+// CHECK-LABEL: @contraction
 func @contraction(%arg0 : vector<7x8x16x15xf32>, %arg1 : vector<8x16x7x5xf32>,
                   %arg2 : vector<8x15x5xf32>, %arg3 : vector<8x8x15x5xf32>,
                   %arg4 : index) {
@@ -196,7 +196,7 @@ func @contraction(%arg0 : vector<7x8x16x15xf32>, %arg1 : vector<8x16x7x5xf32>,
   return
 }
 
-// CHECK-LABEL: create_vector_mask
+// CHECK-LABEL: @create_vector_mask
 func @create_vector_mask() {
   // CHECK:      %[[C2:.*]] = constant 2 : index
   %c2 = constant 2 : index
@@ -208,14 +208,14 @@ func @create_vector_mask() {
   return
 }
 
-// CHECK-LABEL: constant_vector_mask
+// CHECK-LABEL: @constant_vector_mask
 func @constant_vector_mask() {
   // CHECK: vector.constant_mask [3, 2] : vector<4x3xi1>
   %0 = vector.constant_mask [3, 2] : vector<4x3xi1>
   return
 }
 
-// CHECK-LABEL: extract_slices
+// CHECK-LABEL: @extract_slices
 func @extract_slices(%arg0 : vector<4x2xf32>)
   -> (tuple<vector<2x2xf32>, vector<2x2xf32>>) {
   // CHECK: vector.extract_slices %{{.*}}, [2, 2], [1, 1] : vector<4x2xf32> into tuple<vector<2x2xf32>, vector<2x2xf32>>
@@ -227,7 +227,7 @@ func @extract_slices(%arg0 : vector<4x2xf32>)
   return %3 : tuple<vector<2x2xf32>, vector<2x2xf32>>
 }
 
-// CHECK-LABEL: insert_slices
+// CHECK-LABEL: @insert_slices
 func @insert_slices(%arg0 : tuple<vector<2x2xf32>, vector<2x2xf32>>)
   -> (vector<4x2xf32>) {
   // CHECK: vector.insert_slices %{{.*}}, [2, 2], [1, 1] : tuple<vector<2x2xf32>, vector<2x2xf32>> into vector<4x2xf32>
@@ -243,7 +243,7 @@ func @vector_print(%arg0: vector<8x4xf32>) {
   return
 }
 
-// CHECK-LABEL: reshape
+// CHECK-LABEL: @reshape
 func @reshape(%arg0 : vector<3x2x4xf32>) -> (vector<2x3x4xf32>) {
   // CHECK:      %[[C2:.*]] = constant 2 : index
   %c2 = constant 2 : index
@@ -260,7 +260,7 @@ func @reshape(%arg0 : vector<3x2x4xf32>) -> (vector<2x3x4xf32>) {
   return %1 : vector<2x3x4xf32>
 }
 
-// CHECK-LABEL: shape_cast
+// CHECK-LABEL: @shape_cast
 func @shape_cast(%arg0 : vector<5x1x3x2xf32>,
                  %arg1 : tuple<vector<5x4x2xf32>, vector<3x4x2xf32>>)
   -> (vector<15x2xf32>, tuple<vector<20x2xf32>, vector<12x2xf32>>) {
@@ -284,7 +284,7 @@ func @vector_fma(%a: vector<8xf32>, %b: vector<8x4xf32>) {
   return
 }
 
-// CHECK-LABEL: reduce_fp
+// CHECK-LABEL: @reduce_fp
 func @reduce_fp(%arg0: vector<16xf32>, %arg1: f32) -> f32 {
   // CHECK:    vector.reduction "add", %{{.*}} : vector<16xf32> into f32
   vector.reduction "add", %arg0 : vector<16xf32> into f32
@@ -302,7 +302,7 @@ func @reduce_fp(%arg0: vector<16xf32>, %arg1: f32) -> f32 {
   return %0 : f32
 }
 
-// CHECK-LABEL: reduce_int
+// CHECK-LABEL: @reduce_int
 func @reduce_int(%arg0: vector<16xi32>) -> i32 {
   // CHECK:    vector.reduction "add", %{{.*}} : vector<16xi32> into i32
   vector.reduction "add", %arg0 : vector<16xi32> into i32
@@ -322,14 +322,34 @@ func @reduce_int(%arg0: vector<16xi32>) -> i32 {
   return %0 : i32
 }
 
-// CHECK-LABEL: transpose_fp
+// CHECK-LABEL: @transpose_fp
 func @transpose_fp(%arg0: vector<3x7xf32>) -> vector<7x3xf32> {
+  // CHECK: %[[X:.*]] = vector.transpose %{{.*}}, [1, 0] : vector<3x7xf32> to vector<7x3xf32>
   %0 = vector.transpose %arg0, [1, 0] : vector<3x7xf32> to vector<7x3xf32>
+  // CHECK: return %[[X]] : vector<7x3xf32>
   return %0 : vector<7x3xf32>
 }
 
-// CHECK-LABEL: transpose_int
+// CHECK-LABEL: @transpose_int
 func @transpose_int(%arg0: vector<11x7x3x2xi32>) -> vector<2x11x7x3xi32> {
+  // CHECK: %[[X:.*]] = vector.transpose %{{.*}}, [3, 0, 1, 2] : vector<11x7x3x2xi32> to vector<2x11x7x3xi32>
   %0 = vector.transpose %arg0, [3, 0, 1, 2] : vector<11x7x3x2xi32> to vector<2x11x7x3xi32>
+  // CHECK: return %[[X]] : vector<2x11x7x3xi32>
   return %0 : vector<2x11x7x3xi32>
 }
+
+// CHECK-LABEL: @flat_transpose_fp
+func @flat_transpose_fp(%arg0: vector<16xf32>) -> vector<16xf32> {
+  // CHECK: %[[X:.*]] = vector.flat_transpose %{{.*}} {columns = 4 : i32, rows = 4 : i32} : vector<16xf32> -> vector<16xf32>
+  %0 = vector.flat_transpose %arg0 { rows = 4: i32, columns = 4: i32 } : vector<16xf32> -> vector<16xf32>
+  // CHECK: return %[[X]] : vector<16xf32>
+  return %0 : vector<16xf32>
+}
+
+// CHECK-LABEL: @flat_transpose_int
+func @flat_transpose_int(%arg0: vector<16xi32>) -> vector<16xi32> {
+  // CHECK: %[[X:.*]] = vector.flat_transpose %{{.*}} {columns = 8 : i32, rows = 2 : i32} : vector<16xi32> -> vector<16xi32>
+  %0 = vector.flat_transpose %arg0 { rows = 2: i32, columns = 8: i32 } : vector<16xi32> -> vector<16xi32>
+  // CHECK: return %[[X]] : vector<16xi32>
+  return %0 : vector<16xi32>
+}

From 2368bf52cd7725a34f09f4b27a9c205cda06f478 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 27 May 2020 09:50:14 -0500
Subject: [PATCH 258/770] [PowerPC] Add support for -mcpu=pwr10 in both clang
 and llvm

Summary:
This patch simply adds support for the new CPU in anticipation of
Power10. There isn't really any functionality added so there are no
associated test cases at this time.

Reviewers: stefanp, nemanjai, amyk, hfinkel, power-llvm-team, #powerpc

Reviewed By: stefanp, nemanjai, amyk, #powerpc

Subscribers: NeHuang, steven.zhang, hiraditya, llvm-commits, wuzish, shchenz, cfe-commits, kbarton, echristo

Tags: #clang, #powerpc, #llvm

Differential Revision: https://reviews.llvm.org/D80020
---
 clang/lib/Basic/Targets/PPC.cpp               | 43 ++++++++++-----
 clang/lib/Basic/Targets/PPC.h                 | 44 ++++++++-------
 clang/lib/Driver/ToolChains/Arch/PPC.cpp      | 20 ++++---
 clang/test/Misc/target-invalid-cpu-note.c     |  2 +-
 clang/test/Preprocessor/init-ppc64.c          | 18 ++++++
 llvm/lib/Support/Host.cpp                     |  1 +
 llvm/lib/Target/PowerPC/PPC.td                | 27 +++++++--
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  3 +
 llvm/lib/Target/PowerPC/PPCSubtarget.cpp      |  1 +
 llvm/lib/Target/PowerPC/PPCSubtarget.h        | 55 ++++++++++---------
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |  9 ++-
 llvm/test/CodeGen/PowerPC/check-cpu.ll        |  6 +-
 12 files changed, 152 insertions(+), 77 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index 81c13a8104e8a..ad34c287b5188 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -151,6 +151,8 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("_ARCH_PWR8");
   if (ArchDefs & ArchDefinePwr9)
     Builder.defineMacro("_ARCH_PWR9");
+  if (ArchDefs & ArchDefinePwr10)
+    Builder.defineMacro("_ARCH_PWR10");
   if (ArchDefs & ArchDefineA2)
     Builder.defineMacro("_ARCH_A2");
   if (ArchDefs & ArchDefineA2q) {
@@ -313,10 +315,17 @@ bool PPCTargetInfo::initFeatureMap(
                         .Case("e500", true)
                         .Default(false);
 
-  // Future CPU should include all of the features of Power 9 as well as any
+  // Power10 includes all the same features as Power9 plus any features specific
+  // to the Power10 core.
+  if (CPU == "pwr10" || CPU == "power10") {
+    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
+    addP10SpecificFeatures(Features);
+  }
+
+  // Future CPU should include all of the features of Power 10 as well as any
   // additional features (yet to be determined) specific to it.
   if (CPU == "future") {
-    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
+    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
     addFutureSpecificFeatures(Features);
   }
 
@@ -333,6 +342,13 @@ bool PPCTargetInfo::initFeatureMap(
   return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
 }
 
+// Add any Power10 specific features.
+void PPCTargetInfo::addP10SpecificFeatures(
+    llvm::StringMap<bool> &Features) const {
+  Features["htm"] = false; // HTM was removed for P10.
+  return;
+}
+
 // Add features specific to the "Future" CPU.
 void PPCTargetInfo::addFutureSpecificFeatures(
     llvm::StringMap<bool> &Features) const {
@@ -463,18 +479,17 @@ ArrayRef<TargetInfo::AddlRegName> PPCTargetInfo::getGCCAddlRegNames() const {
 }
 
 static constexpr llvm::StringLiteral ValidCPUNames[] = {
-    {"generic"},   {"440"},       {"450"},         {"601"},         {"602"},
-    {"603"},       {"603e"},      {"603ev"},       {"604"},         {"604e"},
-    {"620"},       {"630"},       {"g3"},          {"7400"},        {"g4"},
-    {"7450"},      {"g4+"},       {"750"},         {"8548"},        {"970"},
-    {"g5"},        {"a2"},        {"a2q"},         {"e500"},        {"e500mc"},
-    {"e5500"},     {"power3"},    {"pwr3"},        {"power4"},      {"pwr4"},
-    {"power5"},    {"pwr5"},      {"power5x"},     {"pwr5x"},       {"power6"},
-    {"pwr6"},      {"power6x"},   {"pwr6x"},       {"power7"},      {"pwr7"},
-    {"power8"},    {"pwr8"},      {"power9"},      {"pwr9"},        {"powerpc"},
-    {"ppc"},       {"powerpc64"}, {"ppc64"},       {"powerpc64le"}, {"ppc64le"},
-    {"future"}
-};
+    {"generic"},     {"440"},     {"450"},     {"601"},       {"602"},
+    {"603"},         {"603e"},    {"603ev"},   {"604"},       {"604e"},
+    {"620"},         {"630"},     {"g3"},      {"7400"},      {"g4"},
+    {"7450"},        {"g4+"},     {"750"},     {"8548"},      {"970"},
+    {"g5"},          {"a2"},      {"a2q"},     {"e500"},      {"e500mc"},
+    {"e5500"},       {"power3"},  {"pwr3"},    {"power4"},    {"pwr4"},
+    {"power5"},      {"pwr5"},    {"power5x"}, {"pwr5x"},     {"power6"},
+    {"pwr6"},        {"power6x"}, {"pwr6x"},   {"power7"},    {"pwr7"},
+    {"power8"},      {"pwr8"},    {"power9"},  {"pwr9"},      {"power10"},
+    {"pwr10"},       {"powerpc"}, {"ppc"},     {"powerpc64"}, {"ppc64"},
+    {"powerpc64le"}, {"ppc64le"}, {"future"}};
 
 bool PPCTargetInfo::isValidCPUName(StringRef Name) const {
   return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames);
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 7c19a96a99c74..691fa5fdcc6d1 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -43,13 +43,13 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
     ArchDefinePwr7 = 1 << 11,
     ArchDefinePwr8 = 1 << 12,
     ArchDefinePwr9 = 1 << 13,
-    ArchDefineFuture = 1 << 14,
-    ArchDefineA2 = 1 << 15,
-    ArchDefineA2q = 1 << 16,
-    ArchDefineE500 = 1 << 17
+    ArchDefinePwr10 = 1 << 14,
+    ArchDefineFuture = 1 << 15,
+    ArchDefineA2 = 1 << 16,
+    ArchDefineA2q = 1 << 17,
+    ArchDefineE500 = 1 << 18
   } ArchDefineTypes;
 
-
   ArchDefineTypes ArchDefs = ArchDefineNone;
   static const Builtin::Info BuiltinInfo[];
   static const char *const GCCRegNames[];
@@ -119,20 +119,20 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
               .Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q)
               .Cases("power3", "pwr3", ArchDefinePpcgr)
               .Cases("power4", "pwr4",
-                    ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+                     ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power5", "pwr5",
-                    ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                        ArchDefinePpcsq)
+                     ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                         ArchDefinePpcsq)
               .Cases("power5x", "pwr5x",
-                    ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
-                        ArchDefinePpcgr | ArchDefinePpcsq)
+                     ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
+                         ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power6", "pwr6",
-                    ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
-                        ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+                     ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
+                         ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("power6x", "pwr6x",
-                    ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
-                        ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                        ArchDefinePpcsq)
+                     ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x |
+                         ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                         ArchDefinePpcsq)
               .Cases("power7", "pwr7",
                      ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
                          ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
@@ -146,11 +146,16 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                      ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 |
                          ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 |
                          ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq)
+              .Cases("power10", "pwr10",
+                     ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 |
+                         ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
+                         ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
+                         ArchDefinePpcsq)
               .Case("future",
-                    ArchDefineFuture | ArchDefinePwr9 | ArchDefinePwr8 |
-                        ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x |
-                        ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr |
-                        ArchDefinePpcsq)
+                    ArchDefineFuture | ArchDefinePwr10 | ArchDefinePwr9 |
+                        ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 |
+                        ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 |
+                        ArchDefinePpcgr | ArchDefinePpcsq)
               .Cases("8548", "e500", ArchDefineE500)
               .Default(ArchDefineNone);
     }
@@ -171,6 +176,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
                  StringRef CPU,
                  const std::vector<std::string> &FeaturesVec) const override;
 
+  void addP10SpecificFeatures(llvm::StringMap<bool> &Features) const;
   void addFutureSpecificFeatures(llvm::StringMap<bool> &Features) const;
 
   bool handleTargetFeatures(std::vector<std::string> &Features,
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
index e5130a9485de7..144e276a6bd87 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
@@ -70,6 +70,7 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
         .Case("power7", "pwr7")
         .Case("power8", "pwr8")
         .Case("power9", "pwr9")
+        .Case("power10", "pwr10")
         .Case("future", "future")
         .Case("pwr3", "pwr3")
         .Case("pwr4", "pwr4")
@@ -80,6 +81,7 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
         .Case("pwr7", "pwr7")
         .Case("pwr8", "pwr8")
         .Case("pwr9", "pwr9")
+        .Case("pwr10", "pwr10")
         .Case("powerpc", "ppc")
         .Case("powerpc64", "ppc64")
         .Case("powerpc64le", "ppc64le")
@@ -91,14 +93,16 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) {
 
 const char *ppc::getPPCAsmModeForCPU(StringRef Name) {
   return llvm::StringSwitch<const char *>(Name)
-        .Case("pwr7", "-mpower7")
-        .Case("power7", "-mpower7")
-        .Case("pwr8", "-mpower8")
-        .Case("power8", "-mpower8")
-        .Case("ppc64le", "-mpower8")
-        .Case("pwr9", "-mpower9")
-        .Case("power9", "-mpower9")
-        .Default("-many");
+      .Case("pwr7", "-mpower7")
+      .Case("power7", "-mpower7")
+      .Case("pwr8", "-mpower8")
+      .Case("power8", "-mpower8")
+      .Case("ppc64le", "-mpower8")
+      .Case("pwr9", "-mpower9")
+      .Case("power9", "-mpower9")
+      .Case("pwr10", "-mpower10")
+      .Case("power10", "-mpower10")
+      .Default("-many");
 }
 
 void ppc::getPPCTargetFeatures(const Driver &D, const llvm::Triple &Triple,
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 9f036c94c3f8e..5c571fb458ec5 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -81,7 +81,7 @@
 // PPC-SAME: 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750,
 // PPC-SAME: 8548, 970, g5, a2, a2q, e500, e500mc, e5500, power3, pwr3, power4,
 // PPC-SAME: pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x,
-// PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, powerpc, ppc, powerpc64,
+// PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, powerpc64,
 // PPC-SAME: ppc64, powerpc64le, ppc64le, future
 
 // RUN: not %clang_cc1 -triple mips--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix MIPS
diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c
index b24f8eb7050be..ed8601636554e 100644
--- a/clang/test/Preprocessor/init-ppc64.c
+++ b/clang/test/Preprocessor/init-ppc64.c
@@ -627,12 +627,30 @@
 // PPCPOWER9:#define _ARCH_PWR7 1
 // PPCPOWER9:#define _ARCH_PWR9 1
 //
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr10 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER10 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power10 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER10 %s
+//
+// PPCPOWER10:#define _ARCH_PPC 1
+// PPCPOWER10:#define _ARCH_PPC64 1
+// PPCPOWER10:#define _ARCH_PPCGR 1
+// PPCPOWER10:#define _ARCH_PPCSQ 1
+// PPCPOWER10:#define _ARCH_PWR10 1
+// PPCPOWER10:#define _ARCH_PWR4 1
+// PPCPOWER10:#define _ARCH_PWR5 1
+// PPCPOWER10:#define _ARCH_PWR5X 1
+// PPCPOWER10:#define _ARCH_PWR6 1
+// PPCPOWER10-NOT:#define _ARCH_PWR6X 1
+// PPCPOWER10:#define _ARCH_PWR7 1
+// PPCPOWER10:#define _ARCH_PWR8 1
+// PPCPOWER10:#define _ARCH_PWR9 1
+//
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu future -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCFUTURE %s
 //
 // PPCFUTURE:#define _ARCH_PPC 1
 // PPCFUTURE:#define _ARCH_PPC64 1
 // PPCFUTURE:#define _ARCH_PPCGR 1
 // PPCFUTURE:#define _ARCH_PPCSQ 1
+// PPCFUTURE:#define _ARCH_PWR10 1
 // PPCFUTURE:#define _ARCH_PWR4 1
 // PPCFUTURE:#define _ARCH_PWR5 1
 // PPCFUTURE:#define _ARCH_PWR5X 1
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index d9b3cac5e8dc0..da68464c4a3d9 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -142,6 +142,7 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
       .Case("POWER8E", "pwr8")
       .Case("POWER8NVL", "pwr8")
       .Case("POWER9", "pwr9")
+      .Case("POWER10", "pwr10")
       // FIXME: If we get a simulator or machine with the capabilities of
       // mcpu=future, we should revisit this and add the name reported by the
       // simulator/machine.
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 1d1f11e498c20..a6c7868f6ac25 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -51,6 +51,7 @@ def DirectivePwr6x
 def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">;
 def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">;
 def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">;
+def DirectivePwr10: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR10", "">;
 def DirectivePwrFuture
     : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">;
 
@@ -205,6 +206,9 @@ def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
 def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
                                      "true",
                                      "Enable instructions added in ISA 3.0.">;
+def FeatureISA3_1 : SubtargetFeature<"isa-v31-instructions", "IsISA3_1",
+                                     "true",
+                                     "Enable instructions added in ISA 3.1.">;
 def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
                                         "Enable POWER9 Altivec instructions",
                                         [FeatureISA3_0, FeatureP8Altivec]>;
@@ -328,14 +332,25 @@ def ProcessorFeatures {
   list<SubtargetFeature> P9Features =
     !listconcat(P9InheritableFeatures, P9SpecificFeatures);
 
+  // Power10
+  // For P10 CPU we assume that all of the existing features from Power9
+  // still exist with the exception of those we know are Power9 specific.
+  list<SubtargetFeature> P10AdditionalFeatures =
+    [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
+     FeaturePCRelativeMemops];
+  list<SubtargetFeature> P10SpecificFeatures = [];
+  list<SubtargetFeature> P10InheritableFeatures =
+    !listconcat(P9InheritableFeatures, P10AdditionalFeatures);
+  list<SubtargetFeature> P10Features =
+    !listconcat(P10InheritableFeatures, P10SpecificFeatures);
+
   // Future
-  // For future CPU we assume that all of the existing features from Power 9
-  // still exist with the exception of those we know are Power 9 specific.
+  // For future CPU we assume that all of the existing features from Power10
+  // still exist with the exception of those we know are Power10 specific.
   list<SubtargetFeature> FutureAdditionalFeatures = [];
-  list<SubtargetFeature> FutureSpecificFeatures =
-    [FeaturePrefixInstrs, FeaturePCRelativeMemops];
+  list<SubtargetFeature> FutureSpecificFeatures = [];
   list<SubtargetFeature> FutureInheritableFeatures =
-    !listconcat(P9InheritableFeatures, FutureAdditionalFeatures);
+    !listconcat(P10InheritableFeatures, FutureAdditionalFeatures);
   list<SubtargetFeature> FutureFeatures =
     !listconcat(FutureInheritableFeatures, FutureSpecificFeatures);
 }
@@ -540,6 +555,8 @@ def : ProcessorModel<"pwr6x", G5Model,
 def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
 def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
 def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
+// No scheduler model yet.
+def : ProcessorModel<"pwr10", NoSchedModel, ProcessorFeatures.P10Features>;
 // No scheduler model for future CPU.
 def : ProcessorModel<"future", NoSchedModel,
                   ProcessorFeatures.FutureFeatures>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 42df83831113a..53f9ac678c7b7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1306,6 +1306,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   case PPC::DIR_PWR7:
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
+  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE:
     setPrefLoopAlignment(Align(16));
     setPrefFunctionAlignment(Align(16));
@@ -14913,6 +14914,7 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
   case PPC::DIR_PWR7:
   case PPC::DIR_PWR8:
   case PPC::DIR_PWR9:
+  case PPC::DIR_PWR10:
   case PPC::DIR_PWR_FUTURE: {
     if (!ML)
       break;
@@ -16103,6 +16105,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
       // vector        7       2      2
       return true;
     case PPC::DIR_PWR9:
+    case PPC::DIR_PWR10:
     case PPC::DIR_PWR_FUTURE:
       //  type        mul     add    shl
       // scalar        5       2      2
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index cfc54df13f792..2f332715d8cac 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -115,6 +115,7 @@ void PPCSubtarget::initializeEnvironment() {
   HasAddiLoadFusion = false;
   HasAddisLoadFusion = false;
   IsISA3_0 = false;
+  IsISA3_1 = false;
   UseLongCalls = false;
   SecurePlt = false;
   VectorsUseTwoUnits = false;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index be1143f903e8b..bfe39814e4cc8 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -34,32 +34,33 @@ class StringRef;
 
 namespace PPC {
   // -m directive values.
-  enum {
-    DIR_NONE,
-    DIR_32,
-    DIR_440,
-    DIR_601,
-    DIR_602,
-    DIR_603,
-    DIR_7400,
-    DIR_750,
-    DIR_970,
-    DIR_A2,
-    DIR_E500,
-    DIR_E500mc,
-    DIR_E5500,
-    DIR_PWR3,
-    DIR_PWR4,
-    DIR_PWR5,
-    DIR_PWR5X,
-    DIR_PWR6,
-    DIR_PWR6X,
-    DIR_PWR7,
-    DIR_PWR8,
-    DIR_PWR9,
-    DIR_PWR_FUTURE,
-    DIR_64
-  };
+enum {
+  DIR_NONE,
+  DIR_32,
+  DIR_440,
+  DIR_601,
+  DIR_602,
+  DIR_603,
+  DIR_7400,
+  DIR_750,
+  DIR_970,
+  DIR_A2,
+  DIR_E500,
+  DIR_E500mc,
+  DIR_E5500,
+  DIR_PWR3,
+  DIR_PWR4,
+  DIR_PWR5,
+  DIR_PWR5X,
+  DIR_PWR6,
+  DIR_PWR6X,
+  DIR_PWR7,
+  DIR_PWR8,
+  DIR_PWR9,
+  DIR_PWR10,
+  DIR_PWR_FUTURE,
+  DIR_64
+};
 }
 
 class GlobalValue;
@@ -138,6 +139,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool HasAddiLoadFusion;
   bool HasAddisLoadFusion;
   bool IsISA3_0;
+  bool IsISA3_1;
   bool UseLongCalls;
   bool SecurePlt;
   bool VectorsUseTwoUnits;
@@ -308,6 +310,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool hasHTM() const { return HasHTM; }
   bool hasFloat128() const { return HasFloat128; }
   bool isISA3_0() const { return IsISA3_0; }
+  bool isISA3_1() const { return IsISA3_1; }
   bool useLongCalls() const { return UseLongCalls; }
   bool hasFusion() const { return HasFusion; }
   bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index a41c6b41a991b..46c5335a558f4 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -651,11 +651,12 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
   if (CacheLineSize.getNumOccurrences() > 0)
     return CacheLineSize;
 
-  // On P7, P8 or P9 we have a cache line size of 128.
+  // Starting with P7 we have a cache line size of 128.
   unsigned Directive = ST->getCPUDirective();
   // Assume that Future CPU has the same cache line size as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
-      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+      Directive == PPC::DIR_PWR_FUTURE)
     return 128;
 
   // On other processors return a default of 64 bytes.
@@ -687,9 +688,11 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // For P7 and P8, floating-point instructions have a 6-cycle latency and
   // there are two execution units, so unroll by 12x for latency hiding.
   // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
   // Assume that future is the same as the others.
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
-      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+      Directive == PPC::DIR_PWR_FUTURE)
     return 12;
 
   // For most things, modern systems have two execution units (and
diff --git a/llvm/test/CodeGen/PowerPC/check-cpu.ll b/llvm/test/CodeGen/PowerPC/check-cpu.ll
index baa39024ebe8d..132be3058216b 100644
--- a/llvm/test/CodeGen/PowerPC/check-cpu.ll
+++ b/llvm/test/CodeGen/PowerPC/check-cpu.ll
@@ -2,9 +2,13 @@
 ; RUN:     -mcpu=future < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:     -mcpu=future < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=power10 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr10 < %s | FileCheck %s
 
 
-; Test mcpu=future that should be recognized on PowerPC.
+; Test -mcpu=[pwr10|future] is recognized on PowerPC.
 
 ; CHECK-NOT: is not a recognized processor for this target
 ; CHECK:     .text

From 13f6c81c5d9a7a34a684363bcaad8eb7c65356fd Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 27 May 2020 11:18:16 -0700
Subject: [PATCH 259/770] [BPF] simplify zero extension with MOV_32_64

The current pattern matching for zext results in the following code snippet
being produced,

  w1 = w0
  r1 <<= 32
  r1 >>= 32

Because BPF implementations require zero extension on 32-bit loads, this
not only adds a few extra unneeded instructions but also makes it a bit
harder for the verifier to track the r1 register bounds. For example, in
this verifier trace we see at the end of the snippet R2 offset is unknown.
However, if we track this correctly we see w1 should have the same bounds
as r8. R8 smax is less than U32 max value so a zero extend load should keep
the same value. Adding a max value of 800 (R8=inv(id=0,smax_value=800)) to
an off=0, as seen in R7 should create a max offset of 800. However at the
end of the snippet we note the R2 max offset is 0xffffFFFF.

  R0=inv(id=0,smax_value=800)
  R1_w=inv(id=0,umax_value=2147483647,var_off=(0x0; 0x7fffffff))
  R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
  R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
  R9=inv800 R10=fp0 fp-8=mmmm????
 58: (1c) w9 -= w8
 59: (bc) w1 = w8
 60: (67) r1 <<= 32
 61: (77) r1 >>= 32
 62: (bf) r2 = r7
 63: (0f) r2 += r1
 64: (bf) r1 = r6
 65: (bc) w3 = w9
 66: (b7) r4 = 0
 67: (85) call bpf_get_stack#67
  R0=inv(id=0,smax_value=800)
  R1_w=ctx(id=0,off=0,imm=0)
  R2_w=map_value(id=0,off=0,ks=4,vs=1600,umax_value=4294967295,var_off=(0x0; 0xffffffff))
  R3_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
  R4_w=inv0 R6=ctx(id=0,off=0,imm=0)
  R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
  R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
  R9_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
  R10=fp0 fp-8=mmmm????

After this patch R1 bounds are not smashed by the <<=32 >>=32 shift and we
get correct bounds on R2 umax_value=800.

Further it reduces 3 insns to 1.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>

Differential Revision: https://reviews.llvm.org/D73985
---
 llvm/lib/Target/BPF/BPFISelLowering.cpp       |  6 +++++
 llvm/lib/Target/BPF/BPFInstrInfo.td           |  3 +--
 llvm/lib/Target/BPF/BPFMIPeephole.cpp         | 15 +++++-------
 .../CodeGen/BPF/32-bit-subreg-cond-select.ll  | 20 +++++++++-------
 .../BPF/32-bit-subreg-peephole-phi-1.ll       |  2 +-
 .../BPF/32-bit-subreg-peephole-phi-2.ll       |  4 ++--
 .../BPF/32-bit-subreg-peephole-phi-3.ll       |  5 ++--
 .../CodeGen/BPF/32-bit-subreg-peephole.ll     | 24 +++++++++++--------
 llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll   | 21 ++++++++++++++++
 9 files changed, 66 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll

diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index cc8a486775380..a02556a399098 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -604,6 +604,12 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
   DebugLoc DL = MI.getDebugLoc();
 
   MachineRegisterInfo &RegInfo = F->getRegInfo();
+
+  if (!isSigned) {
+    Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
+    return PromotedReg0;
+  }
   Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
   Register PromotedReg1 = RegInfo.createVirtualRegister(RC);
   Register PromotedReg2 = RegInfo.createVirtualRegister(RC);
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 6781d09b846e7..4298e2eaec046 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -732,8 +732,7 @@ let isCodeGenOnly = 1 in {
 def : Pat<(i64 (sext GPR32:$src)),
           (SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
 
-def : Pat<(i64 (zext GPR32:$src)),
-          (SRL_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+def : Pat<(i64 (zext GPR32:$src)), (MOV_32_64 GPR32:$src)>;
 
 // For i64 -> i32 truncation, use the 32-bit subregister directly.
 def : Pat<(i32 (trunc GPR:$src)),
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index a2ceade66800c..fe955fad04249 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -301,19 +301,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
       //
       //   MOV rA, rA
       //
-      // This is particularly possible to happen when sub-register support
-      // enabled. The special type cast insn MOV_32_64 involves different
-      // register class on src (i32) and dst (i64), RA could generate useless
-      // instruction due to this.
+      // Note that we cannot remove
+      //   MOV_32_64  rA, wA
+      //   MOV_rr_32  wA, wA
+      // as these two instructions have side effects, zeroing out
+      // top 32 bits of rA.
       unsigned Opcode = MI.getOpcode();
-      if (Opcode == BPF::MOV_32_64 ||
-          Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) {
+      if (Opcode == BPF::MOV_rr) {
         Register dst = MI.getOperand(0).getReg();
         Register src = MI.getOperand(1).getReg();
 
-        if (Opcode == BPF::MOV_32_64)
-          dst = TRI->getSubReg(dst, BPF::sub_32);
-
         if (dst != src)
           continue;
 
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
index 160be56c30a33..e8a4f81a0240e 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
@@ -56,8 +56,9 @@ entry:
   ret i32 %c.d
 }
 ; CHECK-LABEL: select_cc_32
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i64 @select_cc_32_64(i32 %a, i32 %b, i64 %c, i64 %d) local_unnamed_addr #0 {
@@ -67,8 +68,9 @@ entry:
   ret i64 %c.d
 }
 ; CHECK-LABEL: select_cc_32_64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i32 @select_cc_64_32(i64 %a, i64 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
@@ -88,8 +90,9 @@ entry:
   ret i32 %c.d
 }
 ; CHECK-LABEL: selecti_cc_32
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i64 @selecti_cc_32_64(i32 %a, i64 %c, i64 %d) local_unnamed_addr #0 {
@@ -99,8 +102,9 @@ entry:
   ret i64 %c.d
 }
 ; CHECK-LABEL: selecti_cc_32_64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i32 @selecti_cc_64_32(i64 %a, i32 %c, i32 %d) local_unnamed_addr #0 {
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
index 5a72f59593c68..2fc1e6c2783b3 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
@@ -27,7 +27,7 @@ entry:
   %call = tail call i32 @helper(i32 %conv)
   ret i32 %call
 }
-; CHECK: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
 ; CHECK-NOT: r{{[0-9]+}} >>= 32
 ; CHECK: if r{{[0-9]+}} == r{{[0-9]+}} goto
 
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
index 46a1b231c1f0a..da69657d02d00 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
@@ -27,8 +27,8 @@ entry:
   %call = tail call i32 @helper(i32 %conv)
   ret i32 %call
 }
-; CHECK: r{{[0-9]+}} >>= 32
-; CHECK: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
 ; CHECK: if r{{[0-9]+}} == r{{[0-9]+}} goto
 
 declare dso_local i32 @helper(i32) local_unnamed_addr
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll
index d46214032e6ee..3f3f9c8c4a55f 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll
@@ -44,8 +44,9 @@ for.body:                                         ; preds = %for.body, %entry
   %exitcond = icmp eq i64 %inc, 100
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !2
 }
-; CHECK: [[VAL:r[0-9]+]] <<= 32
-; CHECK: [[VAL]] >>= 32
+; CHECK: [[VAL:r[0-9]+]] = w{{[0-9]+}}
+; CHECK-NOT: [[VAL:r[0-9]+]] <<= 32
+; CHECK-NOT: [[VAL]] >>= 32
 ; CHECK: if [[VAL]] == 0 goto
 
 !2 = distinct !{!2, !3}
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
index 63a7c25ed33b7..7c5be7f1987a3 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
@@ -47,8 +47,9 @@ define dso_local i64 @select_u(i32 %a, i32 %b, i64 %c, i64 %d) local_unnamed_add
 entry:
   %cmp = icmp ugt i32 %a, %b
   %c.d = select i1 %cmp, i64 %c, i64 %d
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 ; CHECK: if r{{[0-9]+}} {{<|>}} r{{[0-9]+}} goto
   ret i64 %c.d
 }
@@ -58,8 +59,9 @@ define dso_local i64 @select_u_2(i32 %a, i64 %b, i64 %c, i64 %d) local_unnamed_a
 ; CHECK-LABEL: select_u_2:
 entry:
   %conv = zext i32 %a to i64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
   %cmp = icmp ugt i64 %conv, %b
   %c.d = select i1 %cmp, i64 %c, i64 %d
   ret i64 %c.d
@@ -84,10 +86,11 @@ entry:
   %call = tail call i64 bitcast (i64 (...)* @bar to i64 ()*)() #2
   %conv = trunc i64 %call to i32
   %cmp = icmp ult i32 %conv, 10
-; The shifts can't be optimized out because %call comes from function call
-; returning i64 so the high bits might be valid.
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; %call comes from function call returning i64 so the high bits will need
+; to be cleared.
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
   %b.c = select i1 %cmp, i32 %b, i32 %c
 ; CHECK: if r{{[0-9]+}} {{<|>}} {{[0-9]+}} goto
   ret i32 %b.c
@@ -100,8 +103,9 @@ define dso_local i32* @inc_p(i32* readnone %p, i32 %a) local_unnamed_addr #0 {
 ; CHECK-LABEL: inc_p:
 entry:
   %idx.ext = zext i32 %a to i64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
   %add.ptr = getelementptr inbounds i32, i32* %p, i64 %idx.ext
   ret i32* %add.ptr
 }
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll
new file mode 100644
index 0000000000000..57ea93a8fe6e6
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O2 -march=bpfel -mattr=+alu32 < %s | FileCheck %s
+; RUN: llc -O2 -march=bpfel -mcpu=v3 < %s | FileCheck %s
+; RUN: llc -O2 -march=bpfeb -mattr=+alu32 < %s | FileCheck %s
+; RUN: llc -O2 -march=bpfeb -mcpu=v3 < %s | FileCheck %s
+;
+; long zext(unsigned int a)
+; {
+;   long b = a;
+;   return b;
+; }
+
+; Function Attrs: norecurse nounwind
+define dso_local i64 @zext(i32 %a) local_unnamed_addr #0 {
+entry:
+  %conv = zext i32 %a to i64
+  ; CHECK-NOT: r[[#]] <<= 32
+  ; CHECK-NOT: r[[#]] >>= 32
+  ret i64 %conv
+}
+
+attributes #0 = { norecurse nounwind }

From 4b4496312e3380d8c427ef836f2b0a38d145652b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 27 May 2020 13:25:37 -0400
Subject: [PATCH 260/770] AMDGPU: Start adding MODE register uses to
 instructions

This is the groundwork required to implement strictfp. For now, this
should be NFC for regular instructions (many instructions just gain an
extra use of a reserved register). Regalloc won't rematerialize
instructions with reads of physical registers, but we were suffering
from that anyway with the exec reads.

Should add it for all the related FP uses (possibly with some
extras). I did not add it to either the gpr index mode instructions
(or every single VALU instruction) since it's a ridiculous feature
already modeled as an arbitrary side effect.

Also work towards marking instructions with FP exceptions. This
doesn't actually set the bit yet since this would start to change
codegen. It seems nofpexcept is currently not implied from the regular
IR FP operations. Add it to some MIR tests where I think it might
matter.
---
 llvm/lib/Target/AMDGPU/SIInstrFormats.td      |   3 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   4 +-
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  29 ++-
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |  34 +++-
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |  11 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  27 ++-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |   8 +-
 llvm/lib/Target/AMDGPU/VOPCInstructions.td    |   8 +-
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |  27 ++-
 .../GlobalISel/inst-select-amdgcn.cos.mir     |   4 +-
 .../GlobalISel/inst-select-amdgcn.cos.s16.mir |   4 +-
 .../inst-select-amdgcn.cvt.pkrtz.mir          |   6 +-
 .../inst-select-amdgcn.fmad.ftz.mir           |  18 +-
 .../GlobalISel/inst-select-amdgcn.fmed3.mir   |  16 +-
 .../inst-select-amdgcn.fmed3.s16.mir          |   4 +-
 .../GlobalISel/inst-select-amdgcn.fract.mir   |   8 +-
 .../inst-select-amdgcn.fract.s16.mir          |   6 +-
 .../GlobalISel/inst-select-amdgcn.ldexp.mir   |  12 +-
 .../inst-select-amdgcn.ldexp.s16.mir          |   6 +-
 .../inst-select-amdgcn.rcp.legacy.mir         |   4 +-
 .../GlobalISel/inst-select-amdgcn.rcp.mir     |   8 +-
 .../GlobalISel/inst-select-amdgcn.rcp.s16.mir |   5 +-
 .../inst-select-amdgcn.rsq.clamp.mir          |   4 +-
 .../inst-select-amdgcn.rsq.legacy.mir         |   4 +-
 .../GlobalISel/inst-select-amdgcn.rsq.mir     |   8 +-
 .../GlobalISel/inst-select-amdgcn.rsq.s16.mir |   4 +-
 .../GlobalISel/inst-select-amdgcn.sin.mir     |   4 +-
 .../GlobalISel/inst-select-amdgcn.sin.s16.mir |   4 +-
 .../GlobalISel/inst-select-fadd.s16.mir       |  16 +-
 .../GlobalISel/inst-select-fadd.s32.mir       |  24 +--
 .../GlobalISel/inst-select-fadd.s64.mir       |  22 +-
 .../GlobalISel/inst-select-fcanonicalize.mir  |  56 ++---
 .../AMDGPU/GlobalISel/inst-select-fceil.mir   |   8 +-
 .../GlobalISel/inst-select-fceil.s16.mir      |   6 +-
 .../AMDGPU/GlobalISel/inst-select-fcmp.mir    | 116 +++++------
 .../GlobalISel/inst-select-fcmp.s16.mir       |  56 ++---
 .../AMDGPU/GlobalISel/inst-select-fexp2.mir   |   4 +-
 .../GlobalISel/inst-select-ffloor.s16.mir     |   6 +-
 .../GlobalISel/inst-select-ffloor.s32.mir     |   8 +-
 .../GlobalISel/inst-select-ffloor.s64.mir     |   4 +-
 .../AMDGPU/GlobalISel/inst-select-fma.s32.mir |  36 ++--
 .../GlobalISel/inst-select-fmad.s32.mir       |  24 +--
 .../GlobalISel/inst-select-fmaxnum-ieee.mir   |  24 +--
 .../inst-select-fmaxnum-ieee.s16.mir          |   4 +-
 .../inst-select-fmaxnum-ieee.v2s16.mir        |   2 +-
 .../AMDGPU/GlobalISel/inst-select-fmaxnum.mir |  24 +--
 .../GlobalISel/inst-select-fmaxnum.s16.mir    |   4 +-
 .../GlobalISel/inst-select-fmaxnum.v2s16.mir  |   2 +-
 .../GlobalISel/inst-select-fminnum-ieee.mir   |  24 +--
 .../inst-select-fminnum-ieee.s16.mir          |   4 +-
 .../inst-select-fminnum-ieee.v2s16.mir        |   2 +-
 .../AMDGPU/GlobalISel/inst-select-fminnum.mir |  24 +--
 .../GlobalISel/inst-select-fminnum.s16.mir    |   4 +-
 .../GlobalISel/inst-select-fminnum.v2s16.mir  |   2 +-
 .../AMDGPU/GlobalISel/inst-select-fmul.mir    |  38 ++--
 .../GlobalISel/inst-select-fmul.v2s16.mir     |   6 +-
 .../AMDGPU/GlobalISel/inst-select-fptosi.mir  |  18 +-
 .../AMDGPU/GlobalISel/inst-select-fptoui.mir  |  16 +-
 .../AMDGPU/GlobalISel/inst-select-frint.mir   |  10 +-
 .../GlobalISel/inst-select-frint.s16.mir      |   6 +-
 .../inst-select-intrinsic-trunc.mir           |   8 +-
 .../inst-select-intrinsic-trunc.s16.mir       |   6 +-
 .../AMDGPU/GlobalISel/inst-select-sitofp.mir  |  24 +--
 .../AMDGPU/GlobalISel/inst-select-uitofp.mir  |  24 +--
 llvm/test/CodeGen/AMDGPU/bundle-latency.mir   |  16 +-
 .../AMDGPU/clamp-omod-special-case.mir        |  50 ++---
 .../CodeGen/AMDGPU/cluster-flat-loads.mir     |   2 +-
 ...scer-subranges-another-copymi-not-live.mir |  10 +-
 ...oalescer-subranges-another-prune-error.mir |  22 +-
 .../AMDGPU/coalescer-subregjoin-fullcopy.mir  |  32 +--
 .../coalescer-with-subregs-bad-identical.mir  |  28 +--
 .../AMDGPU/couldnt-join-subrange-3.mir        |  36 ++--
 llvm/test/CodeGen/AMDGPU/dead-lane.mir        |   6 +-
 .../AMDGPU/debug-value-scheduler-crash.mir    |  58 +++---
 llvm/test/CodeGen/AMDGPU/dpp_combine.mir      |  44 ++--
 llvm/test/CodeGen/AMDGPU/endpgm-dce.mir       |   8 +-
 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir  |   2 +-
 llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir |  76 +++----
 .../AMDGPU/fold-immediate-output-mods.mir     |  16 +-
 llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir   |   4 +-
 .../AMDGPU/fp-atomic-to-s_denormmode.mir      |  66 +++---
 .../AMDGPU/hazard-buffer-store-v-interp.mir   |   4 +-
 .../CodeGen/AMDGPU/hazard-hidden-bundle.mir   |   2 +-
 llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir |   2 +-
 llvm/test/CodeGen/AMDGPU/hazard-kill.mir      |   4 +-
 llvm/test/CodeGen/AMDGPU/hazard.mir           |   4 +-
 .../CodeGen/AMDGPU/insert-waitcnts-callee.mir |   2 +-
 .../CodeGen/AMDGPU/inserted-wait-states.mir   |  50 ++---
 .../CodeGen/AMDGPU/madak-inline-constant.mir  |  40 ++--
 llvm/test/CodeGen/AMDGPU/mai-hazards.mir      |  52 ++---
 llvm/test/CodeGen/AMDGPU/merge-m0.mir         |   2 +-
 llvm/test/CodeGen/AMDGPU/mode-register.mir    | 108 +++++-----
 llvm/test/CodeGen/AMDGPU/movrels-bug.mir      |   2 +-
 llvm/test/CodeGen/AMDGPU/omod-nsz-flag.mir    |  20 +-
 .../AMDGPU/power-sched-no-instr-sunit.mir     |   2 +-
 .../AMDGPU/regcoal-subrange-join-seg.mir      |  16 +-
 .../CodeGen/AMDGPU/regcoal-subrange-join.mir  |  16 +-
 .../test/CodeGen/AMDGPU/regcoalesce-prune.mir |   4 +-
 ...scing-remove-partial-redundancy-assert.mir | 114 +++++------
 ...ename-independent-subregs-mac-operands.mir |  28 +--
 ...ched-assert-onlydbg-value-empty-region.mir |  24 +--
 .../CodeGen/AMDGPU/sched-crash-dbg-value.mir  |  22 +-
 llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir        |   8 +-
 .../AMDGPU/sdwa-peephole-instr-gfx10.mir      |  96 ++++-----
 .../CodeGen/AMDGPU/sdwa-peephole-instr.mir    | 192 +++++++++---------
 llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir    |   8 +-
 .../AMDGPU/shrink-instructions-flags.mir      |   2 +-
 llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir  |  38 ++--
 .../AMDGPU/spill-empty-live-interval.mir      |   8 +-
 .../AMDGPU/subreg-split-live-in-error.mir     |  88 ++++----
 llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir      |  46 ++---
 llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir      |  46 ++---
 llvm/test/CodeGen/AMDGPU/v_swap_b32.mir       |   4 +-
 .../AMDGPU/vccz-corrupt-bug-workaround.mir    |   2 +-
 .../CodeGen/AMDGPU/vcmpx-permlane-hazard.mir  |   2 +-
 .../CodeGen/AMDGPU/vmem-to-salu-hazard.mir    |   2 +-
 .../CodeGen/AMDGPU/waitcnt-back-edge-loop.mir |   8 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir |  42 ++--
 llvm/test/CodeGen/AMDGPU/waitcnt-permute.mir  |   2 +-
 llvm/unittests/MI/LiveIntervalTest.cpp        |  12 +-
 120 files changed, 1318 insertions(+), 1224 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index a4d11780118f2..428c21c896d50 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -114,6 +114,9 @@ class InstSI <dag outs, dag ins, string asm = "",
   // FLAT_SCRATCH segment. Must be 0 for non-FLAT instructions.
   field bit IsNonFlatSeg = 0;
 
+  // Reads the mode register, usually for FP environment.
+  field bit ReadsModeReg = 0;
+
   // This bit indicates that this uses the floating point double precision
   // rounding mode flags
   field bit FPDPRounding = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 400bbe2c0ed4c..fff2189498e18 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -64,7 +64,7 @@ def : ExpComprPattern<v2f16, EXP_DONE, -1>;
 // Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
 def VINTRPDst : VINTRPDstOperand <VGPR_32>;
 
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
 
 // FIXME: Specify SchedRW for VINTRP instructions.
 
@@ -109,7 +109,7 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
   [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
 
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
 
 //===----------------------------------------------------------------------===//
 // Pseudo Instructions
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e9dbe93fa3cee..4f9aaa1bc604f 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -807,8 +807,10 @@ def S_SETREG_B32 : SOPK_Pseudo <
   "s_setreg_b32",
   (outs), (ins SReg_32:$sdst, hwreg:$simm16),
   "$simm16, $sdst",
-  [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
->;
+  [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]> {
+  let Defs = [MODE];
+  let Uses = [MODE];
+}
 
 // FIXME: Not on SI?
 //def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
@@ -819,6 +821,8 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
   "$simm16, $imm"> {
   let Size = 8; // Unlike every other SOPK instruction.
   let has_sdst = 0;
+  let Defs = [MODE];
+  let Uses = [MODE];
 }
 
 } // End hasSideEffects = 1
@@ -953,6 +957,10 @@ def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
 } // End SubtargetPredicate = isGFX8Plus
 
 let SubtargetPredicate = HasVGPRIndexMode in {
+// Setting the GPR index mode is really writing the fields in the mode
+// register. We don't want to add mode register uses to every
+// instruction, and it's too complicated to deal with anyway. This is
+// modeled just as a side effect.
 def S_SET_GPR_IDX_ON : SOPC <0x11,
   (outs),
   (ins SSrc_b32:$src0, GPRIdxMode:$src1),
@@ -1209,13 +1217,16 @@ let SubtargetPredicate = isGFX10Plus in {
     }
   def S_WAITCNT_DEPCTR :
     SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">;
-  def S_ROUND_MODE :
-    SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
-  def S_DENORM_MODE :
-    SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
-    [(SIdenorm_mode (i32 timm:$simm16))]> {
-      let hasSideEffects = 1;
-    }
+
+  let hasSideEffects = 1, Uses = [MODE], Defs = [MODE] in {
+    // FIXME: Should remove hasSideEffects
+    def S_ROUND_MODE :
+      SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
+    def S_DENORM_MODE :
+      SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
+      [(SIdenorm_mode (i32 timm:$simm16))]>;
+  }
+
   def S_TTRACEDATA_IMM :
     SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
 } // End SubtargetPredicate = isGFX10Plus
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 0c2b5fbf59fcc..e46d84d513cc1 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -48,9 +48,14 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
   let mayStore = 0;
   let hasSideEffects = 0;
 
+  let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+  // FIXME: Set this
+  // let mayRaiseFPException = ReadsModeReg;
+
   let VOP1 = 1;
   let VALU = 1;
-  let Uses = [EXEC];
+  let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
 
   let AsmVariantName = AMDGPUAsmVariants.Default;
 }
@@ -186,31 +191,51 @@ def V_READFIRSTLANE_B32 :
 
 let SchedRW = [WriteDoubleCvt] in {
 defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64,  fp_to_sint>;
+
+let mayRaiseFPException = 0 in {
 defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+} // End mayRaiseFPException = 0
+
 defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64,  fpround>;
 defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32,  fpextend>;
 defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64,  fp_to_uint>;
+
+let mayRaiseFPException = 0 in {
 defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
+} // End mayRaiseFPException = 0
+
 } // End SchedRW = [WriteDoubleCvt]
 
 let SchedRW = [WriteFloatCvt] in {
+
+// XXX: Does this really not raise exceptions? The manual claims the
+// 16-bit ones can.
+let mayRaiseFPException = 0 in {
 defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
 defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
+} // End mayRaiseFPException = 0
+
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
 let FPDPRounding = 1 in {
 defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
 } // End FPDPRounding = 1
+
 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
+
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
 defm V_CVT_OFF_F32_I4 : VOP1Inst  <"v_cvt_off_f32_i4", VOP1_F32_I32>;
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
 } // End SchedRW = [WriteFloatCvt]
 
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
 defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
 defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
 defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
 defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
 
 defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
 defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
@@ -417,8 +442,11 @@ let SubtargetPredicate = isGFX9Plus in {
   }
 
   defm V_SAT_PK_U8_I16    : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
-  defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
-  defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+
+  let mayRaiseFPException = 0 in {
+    defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+    defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+  } // End mayRaiseFPException = 0
 } // End SubtargetPredicate = isGFX9Plus
 
 let SubtargetPredicate = isGFX9Only in {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index c1ce1b755322d..4927c6c2f3f27 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -69,9 +69,14 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
   let mayStore = 0;
   let hasSideEffects = 0;
 
+  let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+  // FIXME: Set this
+  // let mayRaiseFPException = ReadsModeReg;
+
   let VOP2 = 1;
   let VALU = 1;
-  let Uses = [EXEC];
+  let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
 
   let AsmVariantName = AMDGPUAsmVariants.Default;
 }
@@ -529,8 +534,12 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
 defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
+
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
 defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
 defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>;
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
+
 defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>;
 defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
 defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 249eb69ba4c91..7e1ac7509719c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -290,8 +290,11 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
 
 let isCommutable = 1 in {
 
+let mayRaiseFPException = 0 in {
 def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
 def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+} // End mayRaiseFPException = 0
+
 def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
@@ -314,7 +317,7 @@ def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
 def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
 } // End SchedRW = [WriteQuarterRate32]
 
-let Uses = [VCC, EXEC] in {
+let Uses = [MODE, VCC, EXEC] in {
 // v_div_fmas_f32:
 //   result = src0 * src1 + src2
 //   if (vcc)
@@ -336,15 +339,20 @@ def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []>
 
 } // End isCommutable = 1
 
+let mayRaiseFPException = 0 in {
 def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
 def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
 def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
 def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
+} // End mayRaiseFPException = 0
+
 def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
 def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
 def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
 def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
 def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+
+let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
 def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
 def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
 def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
@@ -354,6 +362,8 @@ def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDG
 def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
 def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
 def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+} // End mayRaiseFPException = 0
+
 def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -366,6 +376,8 @@ def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_
 def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
 } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
+
+let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does.
 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
   let SchedRW = [WriteFloatFMA, WriteSALU];
   let AsmMatchConverter = "";
@@ -377,6 +389,7 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
   let AsmMatchConverter = "";
   let FPDPRounding = 1;
 }
+} // End mayRaiseFPException = 0
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
 
@@ -471,7 +484,7 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
 let FPDPRounding = 1 in {
 def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
 // For some reason the intrinsic operands are in a different order
 // from the instruction operands.
 def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
@@ -482,7 +495,7 @@ def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i3
                                    (i32 timm:$attr),
                                    (i1 timm:$high),
                                    M0))]>;
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
 } // End FPDPRounding = 1
 } // End renamedInGFX9 = 1
 
@@ -498,7 +511,7 @@ def V_MAD_I16_gfx9   : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9Plus
 
-let Uses = [M0, EXEC], FPDPRounding = 1 in {
+let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
        [(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers),
                                                   (i32 timm:$attrchan),
@@ -512,15 +525,15 @@ def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32
 
 
 def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
-} // End Uses = [M0, EXEC], FPDPRounding = 1
+} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
-let SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC] in {
+let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
 def V_INTERP_P1_F32_e64  : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
 def V_INTERP_P2_F32_e64  : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
 def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-} // End SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC]
+} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC]
 
 let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 25075e179847c..a3d973fc2f182 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -149,10 +149,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 }
 
 let SubtargetPredicate = HasMadMixInsts in {
+
 // These are VOP3a-like opcodes which accept no omod.
 // Size of src arguments (16/32) is controlled by op_sel.
 // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
-let isCommutable = 1 in {
+let isCommutable = 1, mayRaiseFPException = 0 in {
 def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
 let FPDPRounding = 1 in {
@@ -370,7 +371,8 @@ def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
   let isMoveImm = 1;
 }
 
-let isConvergent = 1 in {
+// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
+let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
 def V_MFMA_F32_4X4X1F32    : VOP3Inst<"v_mfma_f32_4x4x1f32",    VOPProfileMAI_F32_F32_X4,    int_amdgcn_mfma_f32_4x4x1f32>;
 def V_MFMA_F32_4X4X4F16    : VOP3Inst<"v_mfma_f32_4x4x4f16",    VOPProfileMAI_F32_V4F16_X4,  int_amdgcn_mfma_f32_4x4x4f16>;
 def V_MFMA_I32_4X4X4I8     : VOP3Inst<"v_mfma_i32_4x4x4i8",     VOPProfileMAI_I32_I32_X4,    int_amdgcn_mfma_i32_4x4x4i8>;
@@ -391,7 +393,7 @@ def V_MFMA_I32_32X32X4I8   : VOP3Inst<"v_mfma_i32_32x32x4i8",   VOPProfileMAI_I3
 def V_MFMA_I32_32X32X8I8   : VOP3Inst<"v_mfma_i32_32x32x8i8",   VOPProfileMAI_I32_I32_X16,   int_amdgcn_mfma_i32_32x32x8i8>;
 def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
 def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
-} // End isConvergent = 1
+} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
 
 } // End SubtargetPredicate = HasMAIInsts
 
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 003a4f73c1568..aa2fa260e7b52 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -92,9 +92,11 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
   let mayStore = 0;
   let hasSideEffects = 0;
 
+  let ReadsModeReg = isFloatType<P.Src0VT>.ret;
+
   let VALU = 1;
   let VOPC = 1;
-  let Uses = [EXEC];
+  let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
   let Defs = !if(DefVcc, [VCC], []);
 
   VOPProfile Pfl = P;
@@ -738,6 +740,9 @@ multiclass VOPC_CLASS_F64 <string opName> :
 multiclass VOPCX_CLASS_F64 <string opName> :
   VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>;
 
+// cmp_class ignores the FP mode and faithfully reports the unmodified
+// source value.
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
 defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
 defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
 defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
@@ -747,6 +752,7 @@ let SubtargetPredicate = Has16BitInsts in {
 defm V_CMP_CLASS_F16  : VOPC_CLASS_F16 <"v_cmp_class_f16">;
 defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
 }
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
 
 //===----------------------------------------------------------------------===//
 // V_ICMPIntrinsic Pattern.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 42a275c6c3651..d52ad7f92997c 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -8,6 +8,8 @@
 
 // dummies for outer let
 class LetDummies {
+  bit ReadsModeReg;
+  bit mayRaiseFPException;
   bit isCommutable;
   bit isConvertibleToThreeAddress;
   bit isMoveImm;
@@ -35,7 +37,7 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
   let VALU = 1;
-  let Uses = [EXEC];
+  let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
 }
 
 class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
@@ -118,7 +120,12 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   let ClampLo = P.HasClampLo;
   let ClampHi = P.HasClampHi;
 
-  let Uses = [EXEC];
+  let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+  // FIXME: Set this. Right now it seems regular IR operations don't
+  // automatically imply no FP exceptions.
+  // let mayRaiseFPException = ReadsModeReg;
+  let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
 
   let AsmVariantName = AMDGPUAsmVariants.VOP3;
   let AsmMatchConverter =
@@ -490,7 +497,13 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
 
   let VALU = 1;
   let SDWA = 1;
-  let Uses = [EXEC];
+
+  let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+  // FIXME: Set this. Right now it seems regular IR operations don't
+  // automatically imply no FP exceptions.
+  // let mayRaiseFPException = ReadsModeReg;
+  let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
 
   let SubtargetPredicate = HasSDWA;
   let AssemblerPredicate = HasSDWA;
@@ -607,7 +620,13 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let VALU = 1;
   let DPP = 1;
   let Size = 8;
-  let Uses = [EXEC];
+
+  let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+  // FIXME: Set this. Right now it seems regular IR operations don't
+  // automatically imply no FP exceptions.
+  // let mayRaiseFPException = ReadsModeReg;
+  let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
   let isConvergent = 1;
 
   string Mnemonic = OpName;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.mir
index 29e59cd4b2945..d010f7f441604 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.mir
@@ -15,7 +15,7 @@ body: |
     ; CHECK-LABEL: name: cos_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_COS_F32_e64_:%[0-9]+]]:vgpr_32 = V_COS_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_COS_F32_e64_:%[0-9]+]]:vgpr_32 = V_COS_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_COS_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), %0
@@ -35,7 +35,7 @@ body: |
     ; CHECK-LABEL: name: cos_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_COS_F32_e64_:%[0-9]+]]:vgpr_32 = V_COS_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_COS_F32_e64_:%[0-9]+]]:vgpr_32 = V_COS_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_COS_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir
index a18242c3e4483..86b782b27e823 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir
@@ -17,7 +17,7 @@ body: |
     ; CHECK-LABEL: name: cos_s16_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_COS_F16_e64_:%[0-9]+]]:vgpr_32 = V_COS_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_COS_F16_e64_:%[0-9]+]]:vgpr_32 = V_COS_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_COS_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -38,7 +38,7 @@ body: |
     ; CHECK-LABEL: name: cos_s16_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_COS_F16_e64_:%[0-9]+]]:vgpr_32 = V_COS_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_COS_F16_e64_:%[0-9]+]]:vgpr_32 = V_COS_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_COS_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cvt.pkrtz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cvt.pkrtz.mir
index bc987b0703758..2eef865fc85bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cvt.pkrtz.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cvt.pkrtz.mir
@@ -15,7 +15,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_CVT_PKRTZ_F16_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -37,7 +37,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_CVT_PKRTZ_F16_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -58,7 +58,7 @@ body: |
     ; GCN: liveins: $vgpr0, $vgpr1
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_CVT_PKRTZ_F16_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
index b9f88557678a9..fe51eb9a39387 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir
@@ -19,7 +19,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -43,7 +43,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -67,7 +67,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -92,7 +92,7 @@ body: |
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -117,7 +117,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -140,7 +140,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -163,7 +163,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -185,7 +185,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0
@@ -226,7 +226,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.mir
index 92f264f6b7ea7..13d268d68e942 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.mir
@@ -16,7 +16,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -40,7 +40,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -64,7 +64,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -88,7 +88,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -113,7 +113,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -135,7 +135,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -157,7 +157,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -178,7 +178,7 @@ body: |
     ; GCN-LABEL: name: fmed3_s32_vsss
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F32_:%[0-9]+]]:vgpr_32 = V_MED3_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %0, %0, %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir
index c47565181cfcb..7ffb251907707 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir
@@ -21,7 +21,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GCN: [[V_MED3_F16_:%[0-9]+]]:vgpr_32 = V_MED3_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F16_:%[0-9]+]]:vgpr_32 = V_MED3_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F16_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -48,7 +48,7 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_MED3_F16_:%[0-9]+]]:vgpr_32 = V_MED3_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MED3_F16_:%[0-9]+]]:vgpr_32 = V_MED3_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MED3_F16_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.mir
index f9e9978e9cada..8d62c2c490496 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.mir
@@ -15,7 +15,7 @@ body: |
     ; CHECK-LABEL: name: fract_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_FRACT_F32_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FRACT_F32_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_FRACT_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %0
@@ -35,7 +35,7 @@ body: |
     ; CHECK-LABEL: name: fract_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_FRACT_F32_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FRACT_F32_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_FRACT_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %0
@@ -55,7 +55,7 @@ body: |
     ; CHECK-LABEL: name: fract_s64_vs
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[V_FRACT_F64_e64_:%[0-9]+]]:vreg_64 = V_FRACT_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FRACT_F64_e64_:%[0-9]+]]:vreg_64 = V_FRACT_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_FRACT_F64_e64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %0
@@ -75,7 +75,7 @@ body: |
     ; CHECK-LABEL: name: fract_s64_vv
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_FRACT_F64_e64_:%[0-9]+]]:vreg_64 = V_FRACT_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FRACT_F64_e64_:%[0-9]+]]:vreg_64 = V_FRACT_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_FRACT_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir
index ecae749c163b3..8360aee9a83e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir
@@ -5,6 +5,8 @@
 # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %1:sgpr(s16) (in function: fract_s16_vs)
 # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %1:vgpr(s16) (in function: fract_s16_vv)
 
+---
+
 name: fract_s16_vs
 legalized: true
 regBankSelected: true
@@ -17,7 +19,7 @@ body: |
     ; CHECK-LABEL: name: fract_s16_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_FRACT_F16_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FRACT_F16_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_FRACT_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -38,7 +40,7 @@ body: |
     ; CHECK-LABEL: name: fract_s16_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_FRACT_F16_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FRACT_F16_e64_:%[0-9]+]]:vgpr_32 = V_FRACT_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_FRACT_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir
index 7371dbb998ffb..0fc2582f983f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir
@@ -14,7 +14,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_LDEXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -35,7 +35,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_LDEXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -56,7 +56,7 @@ body: |
     ; GCN: liveins: $vgpr0, $vgpr1
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_LDEXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -77,7 +77,7 @@ body: |
     ; GCN: liveins: $sgpr0_sgpr1, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_LDEXP_F64_:%[0-9]+]]:vreg_64 = V_LDEXP_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F64_:%[0-9]+]]:vreg_64 = V_LDEXP_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s32) = COPY $vgpr0
@@ -98,7 +98,7 @@ body: |
     ; GCN: liveins: $sgpr0_sgpr1, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_LDEXP_F64_:%[0-9]+]]:vreg_64 = V_LDEXP_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F64_:%[0-9]+]]:vreg_64 = V_LDEXP_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:sgpr(s32) = COPY $sgpr0
@@ -119,7 +119,7 @@ body: |
     ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
     ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GCN: [[V_LDEXP_F64_:%[0-9]+]]:vreg_64 = V_LDEXP_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F64_:%[0-9]+]]:vreg_64 = V_LDEXP_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s32) = COPY $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir
index ad63aa85e9a08..1a620849f3362 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir
@@ -19,7 +19,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -41,7 +41,7 @@ body: |
     ; GCN: liveins: $sgpr0, $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -63,7 +63,7 @@ body: |
     ; GCN: liveins: $vgpr0, $vgpr1
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_LDEXP_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir
index def9d91830b88..3dbed8a7b5dd1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir
@@ -20,7 +20,7 @@ body: |
     ; CHECK-LABEL: name: rcp_legacy_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_RCP_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_LEGACY_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), %0
@@ -40,7 +40,7 @@ body: |
     ; CHECK-LABEL: name: rcp_legacy_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_RCP_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_LEGACY_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.mir
index 6dfdec7792730..fce8864294080 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.mir
@@ -15,7 +15,7 @@ body: |
     ; CHECK-LABEL: name: rcp_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0
@@ -35,7 +35,7 @@ body: |
     ; CHECK-LABEL: name: rcp_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0
@@ -55,7 +55,7 @@ body: |
     ; CHECK-LABEL: name: rcp_s64_vs
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[V_RCP_F64_e64_:%[0-9]+]]:vreg_64 = V_RCP_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_F64_e64_:%[0-9]+]]:vreg_64 = V_RCP_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_F64_e64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0
@@ -75,7 +75,7 @@ body: |
     ; CHECK-LABEL: name: rcp_s64_vv
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_RCP_F64_e64_:%[0-9]+]]:vreg_64 = V_RCP_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_F64_e64_:%[0-9]+]]:vreg_64 = V_RCP_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir
index 90cf12ee37e7b..c69890ae5c85d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir
@@ -5,6 +5,7 @@
 # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:sgpr(s16) (in function: rcp_s16_vs)
 # SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:vgpr(s16) (in function: rcp_s16_vv)
 
+---
 name: rcp_s16_vs
 legalized: true
 regBankSelected: true
@@ -17,7 +18,7 @@ body: |
     ; CHECK-LABEL: name: rcp_s16_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_RCP_F16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_F16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -38,7 +39,7 @@ body: |
     ; CHECK-LABEL: name: rcp_s16_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_RCP_F16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RCP_F16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RCP_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir
index 6e514d217b56f..4b78bf341b52d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir
@@ -20,7 +20,7 @@ body: |
     ; CHECK-LABEL: name: rsq_clamp_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_RSQ_CLAMP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_CLAMP_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_CLAMP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_CLAMP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_CLAMP_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
@@ -40,7 +40,7 @@ body: |
     ; CHECK-LABEL: name: rsq_clamp_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_RSQ_CLAMP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_CLAMP_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_CLAMP_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_CLAMP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_CLAMP_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir
index 0df3078a0e700..2c129b08c651b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir
@@ -20,7 +20,7 @@ body: |
     ; CHECK-LABEL: name: rsq_legacy_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_RSQ_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_LEGACY_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.legacy), %0
@@ -40,7 +40,7 @@ body: |
     ; CHECK-LABEL: name: rsq_legacy_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_RSQ_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_LEGACY_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_LEGACY_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_LEGACY_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.legacy), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.mir
index 65fcb5deb44f2..4cf3fc5a9b427 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.mir
@@ -15,7 +15,7 @@ body: |
     ; CHECK-LABEL: name: rsq_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %0
@@ -35,7 +35,7 @@ body: |
     ; CHECK-LABEL: name: rsq_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %0
@@ -55,7 +55,7 @@ body: |
     ; CHECK-LABEL: name: rsq_s64_vs
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[V_RSQ_F64_e64_:%[0-9]+]]:vreg_64 = V_RSQ_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_F64_e64_:%[0-9]+]]:vreg_64 = V_RSQ_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_F64_e64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %0
@@ -75,7 +75,7 @@ body: |
     ; CHECK-LABEL: name: rsq_s64_vv
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_RSQ_F64_e64_:%[0-9]+]]:vreg_64 = V_RSQ_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_F64_e64_:%[0-9]+]]:vreg_64 = V_RSQ_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir
index be12c84cf75e4..8620efd9fdcee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir
@@ -17,7 +17,7 @@ body: |
     ; CHECK-LABEL: name: rsq_s16_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_RSQ_F16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_F16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -38,7 +38,7 @@ body: |
     ; CHECK-LABEL: name: rsq_s16_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_RSQ_F16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_RSQ_F16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_RSQ_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.mir
index b069bc7ead290..90e586c6888b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.mir
@@ -15,7 +15,7 @@ body: |
     ; CHECK-LABEL: name: sin_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_SIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_SIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_SIN_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), %0
@@ -35,7 +35,7 @@ body: |
     ; CHECK-LABEL: name: sin_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_SIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_SIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_SIN_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir
index ff049d1be98f1..903ee4a9a040f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir
@@ -17,7 +17,7 @@ body: |
     ; CHECK-LABEL: name: sin_s16_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_SIN_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -38,7 +38,7 @@ body: |
     ; CHECK-LABEL: name: sin_s16_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_SIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_SIN_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_SIN_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s16.mir
index c94e0665da35b..ed510864f3bbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s16.mir
@@ -13,7 +13,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_vvv
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -36,7 +36,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_vsv
     ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -59,7 +59,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_vvs
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -82,7 +82,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_vvv_fabs_lhs
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 2, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 2, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -106,7 +106,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_vvv_fabs_rhs
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 2, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 2, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -130,7 +130,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_vvv_fneg_fabs_lhs
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 3, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 3, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -155,7 +155,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_vvv_fneg_fabs_rhs
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 3, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 3, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -180,7 +180,7 @@ body: |
     ; GFX8-LABEL: name: fadd_s16_fneg_copy_sgpr
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $exec
+    ; GFX8: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_ADD_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir
index 064e06a684c3f..65482f3cee967 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir
@@ -13,7 +13,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s32_vvv
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -34,7 +34,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s32_vsv
     ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -55,7 +55,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s32_vvs
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -76,7 +76,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s32_vvv_fabs_lhs
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 2, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 2, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -97,7 +97,7 @@ body: |
     liveins: $vgpr0, $vgpr1
     ; GFX6-LABEL: name: fadd_s32_vvv_fabs_rhs
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -119,7 +119,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s32_vvv_fneg_fabs_lhs
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 3, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 3, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -141,7 +141,7 @@ body: |
     liveins: $vgpr0, $vgpr1
     ; GFX6-LABEL: name: fadd_s32_vvv_fneg_fabs_rhs
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -166,7 +166,7 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -191,7 +191,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s32_copy_fneg_copy_fabs
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 3, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[COPY]], 3, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:sgpr(s32) = COPY $sgpr0
@@ -222,7 +222,7 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
     ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 2, [[COPY2]], 2, [[COPY3]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 2, [[COPY2]], 2, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s32) = COPY $sgpr1
@@ -249,7 +249,7 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
     ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 1, [[COPY2]], 1, [[COPY3]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 1, [[COPY2]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s32) = COPY $sgpr1
@@ -276,7 +276,7 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
     ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 3, [[COPY2]], 3, [[COPY3]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 3, [[COPY2]], 3, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s32) = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s64.mir
index 0525e5ecc15c5..b4b9e2ce1385e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s64.mir
@@ -13,7 +13,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s64_vvv
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -34,7 +34,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s64_vsv
     ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = COPY $vgpr0_vgpr1
@@ -55,7 +55,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s64_vvs
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:sgpr(s64) = COPY $sgpr0_sgpr1
@@ -76,7 +76,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s64_vvv_fabs_lhs
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 2, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 2, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -97,7 +97,7 @@ body: |
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-LABEL: name: fadd_s64_vvv_fabs_rhs
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -119,7 +119,7 @@ body: |
     ; GFX6-LABEL: name: fadd_s64_vvv_fneg_fabs_lhs
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 3, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 3, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -141,7 +141,7 @@ body: |
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-LABEL: name: fadd_s64_vvv_fneg_fabs_rhs
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr0_vgpr1
@@ -167,7 +167,7 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:sgpr(s64) = COPY $sgpr0_sgpr1
@@ -196,7 +196,7 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
     ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]]
     ; GFX6: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 2, [[COPY2]], 2, [[COPY3]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 2, [[COPY2]], 2, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = COPY $sgpr2_sgpr3
@@ -223,7 +223,7 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
     ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]]
     ; GFX6: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 1, [[COPY2]], 1, [[COPY3]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 1, [[COPY2]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = COPY $sgpr2_sgpr3
@@ -250,7 +250,7 @@ body: |
     ; GFX6: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
     ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]]
     ; GFX6: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
-    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 3, [[COPY2]], 3, [[COPY3]], 0, 0, implicit $exec
+    ; GFX6: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 3, [[COPY2]], 3, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_ADD_F64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = COPY $sgpr2_sgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
index 75086984a142d..7bf63ebfa1fa3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
@@ -17,11 +17,11 @@ body: |
     liveins: $vgpr0
     ; GFX8-LABEL: name: fcanonicalize_f16_denorm
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_f16_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -44,11 +44,11 @@ body: |
     liveins: $vgpr0
     ; GFX8-LABEL: name: fcanonicalize_f16_flush
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, 15360, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, 15360, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_f16_flush
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -72,11 +72,11 @@ body: |
 
     ; GFX8-LABEL: name: fcanonicalize_f32_denorm
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_f32_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FCANONICALIZE %0
@@ -99,11 +99,11 @@ body: |
 
     ; GFX8-LABEL: name: fcanonicalize_f32_flush
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_f32_flush
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FCANONICALIZE %0
@@ -126,11 +126,11 @@ body: |
 
     ; GFX8-LABEL: name: fcanonicalize_v2f16_denorm
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX8: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
     ; GFX9-LABEL: name: fcanonicalize_v2f16_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = G_FCANONICALIZE %0
@@ -153,11 +153,11 @@ body: |
 
     ; GFX8-LABEL: name: fcanonicalize_v2f16_flush
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 0, 15360, 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX8: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 0, 15360, 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]]
     ; GFX9-LABEL: name: fcanonicalize_v2f16_flush
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = G_FCANONICALIZE %0
@@ -180,11 +180,11 @@ body: |
 
     ; GFX8-LABEL: name: fcanonicalize_f64_denorm
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MAX_F64_]]
     ; GFX9-LABEL: name: fcanonicalize_f64_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FCANONICALIZE %0
@@ -207,11 +207,11 @@ body: |
 
     ; GFX8-LABEL: name: fcanonicalize_f64_flush
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX8: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, 4607182418800017408, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, 4607182418800017408, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F64_]]
     ; GFX9-LABEL: name: fcanonicalize_f64_flush
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FCANONICALIZE %0
@@ -233,11 +233,11 @@ body: |
     liveins: $vgpr0
     ; GFX8-LABEL: name: fcanonicalize_fabs_f32_denorm
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_fabs_f32_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FABS %0
@@ -261,11 +261,11 @@ body: |
     liveins: $vgpr0
     ; GFX8-LABEL: name: fcanonicalize_fabs_f32_flush
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_fabs_f32_flush
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FABS %0
@@ -288,11 +288,11 @@ body: |
     liveins: $vgpr0
     ; GFX8-LABEL: name: fcanonicalize_fneg_f32_denorm
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_fneg_f32_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
@@ -315,11 +315,11 @@ body: |
     liveins: $vgpr0
     ; GFX8-LABEL: name: fcanonicalize_fneg_f32_flush
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_fneg_f32_flush
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
@@ -344,13 +344,13 @@ body: |
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
     ; GFX8: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_fneg_fabs_f32_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
     ; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
@@ -376,13 +376,13 @@ body: |
     ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
     ; GFX8: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
-    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
+    ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $mode, implicit $exec
     ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
     ; GFX9-LABEL: name: fcanonicalize_fneg_fabs_f32_flush
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
     ; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
-    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.mir
index 70c5b76d758fc..fdf6dcfb8d106 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: fceil_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_CEIL_F32_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_CEIL_F32_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_CEIL_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FCEIL %0
@@ -34,7 +34,7 @@ body: |
     ; CHECK-LABEL: name: fceil_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_CEIL_F32_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_CEIL_F32_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_CEIL_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_FCEIL %0
@@ -54,7 +54,7 @@ body: |
     ; CHECK-LABEL: name: fceil_s64_sv
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[V_CEIL_F64_e64_:%[0-9]+]]:vreg_64 = V_CEIL_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_CEIL_F64_e64_:%[0-9]+]]:vreg_64 = V_CEIL_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0_vgpr1 = COPY [[V_CEIL_F64_e64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = G_FCEIL %0
@@ -74,7 +74,7 @@ body: |
     ; CHECK-LABEL: name: fceil_s64_vv
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_CEIL_F64_e64_:%[0-9]+]]:vreg_64 = V_CEIL_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_CEIL_F64_e64_:%[0-9]+]]:vreg_64 = V_CEIL_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0_vgpr1 = COPY [[V_CEIL_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FCEIL %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index 92b615e8cf6eb..75a78190e62e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -38,7 +38,7 @@ body: |
     ; GCN-LABEL: name: fceil_s16_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -60,7 +60,7 @@ body: |
     ; GCN-LABEL: name: fceil_s16_vs
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -82,7 +82,7 @@ body: |
     ; GCN-LABEL: name: fceil_fneg_s16_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.mir
index 74f9154bd9663..c052f484bff3e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.mir
@@ -37,13 +37,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_oeq_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_EQ_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_oeq_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_EQ_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -62,13 +62,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ogt_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_GT_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_ogt_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_GT_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -87,13 +87,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_oge_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_GE_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_oge_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_GE_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -112,13 +112,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_olt_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_LT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LT_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_olt_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_LT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LT_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -137,13 +137,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ole_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_LE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LE_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_ole_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_LE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LE_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -162,13 +162,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_one_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_LG_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LG_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LG_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_one_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_LG_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LG_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LG_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -187,13 +187,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ord_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_O_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_O_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_O_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_ord_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_O_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_O_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_O_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_O_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -212,13 +212,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_uno_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_U_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_U_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_U_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_uno_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_U_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_U_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_U_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -237,13 +237,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ueq_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NLG_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLG_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLG_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_ueq_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NLG_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLG_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLG_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLG_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -262,13 +262,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ugt_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NLE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLE_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_ugt_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NLE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLE_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -287,13 +287,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_uge_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NLT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLT_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_uge_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NLT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLT_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -312,13 +312,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ult_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NGE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NGE_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NGE_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_ult_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NGE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NGE_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGE_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NGE_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -337,13 +337,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ule_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NGT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NGT_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NGT_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_ule_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NGT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NGT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGT_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NGT_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -362,13 +362,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_une_s32_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NEQ_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NEQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NEQ_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NEQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NEQ_F32_e64_]]
     ; WAVE32-LABEL: name: fcmp_une_s32_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NEQ_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NEQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NEQ_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NEQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NEQ_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -435,13 +435,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_oeq_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_EQ_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_EQ_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_EQ_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_oeq_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_EQ_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_EQ_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_EQ_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -460,13 +460,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ogt_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_GT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_GT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_GT_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_ogt_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_GT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_GT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_GT_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -485,13 +485,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_oge_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_GE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_GE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_GE_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_oge_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_GE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_GE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_GE_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -510,13 +510,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_olt_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_LT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LT_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_olt_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_LT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LT_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -535,13 +535,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ole_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_LE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LE_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_ole_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_LE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_LE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_LE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LE_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -560,13 +560,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_one_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_LG_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LG_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LG_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_one_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_LG_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LG_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LG_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -585,13 +585,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ord_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_O_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_O_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_O_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_O_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_O_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_ord_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_O_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_O_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_O_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_O_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_O_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -610,13 +610,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_uno_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_U_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_U_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_U_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_U_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_U_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_uno_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_U_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_U_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_U_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_U_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_U_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -635,13 +635,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ueq_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_NLG_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLG_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLG_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_ueq_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_NLG_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLG_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLG_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLG_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -660,13 +660,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ugt_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_NLE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLE_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_ugt_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_NLE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLE_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -685,13 +685,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_uge_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_NLT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLT_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_uge_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_NLT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLT_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -710,13 +710,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ult_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_NGE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NGE_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NGE_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_ult_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_NGE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NGE_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGE_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NGE_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -735,13 +735,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ule_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_NGT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NGT_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NGT_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_ule_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_NGT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NGT_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGT_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NGT_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -760,13 +760,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_une_s64_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE64: [[V_CMP_NEQ_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NEQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NEQ_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NEQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NEQ_F64_e64_]]
     ; WAVE32-LABEL: name: fcmp_une_s64_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; WAVE32: [[V_CMP_NEQ_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NEQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NEQ_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_NEQ_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NEQ_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
@@ -809,14 +809,14 @@ body: |
     ; WAVE64-LABEL: name: fcmp_oeq_s32_vv_select_user
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY1]], 0, [[COPY]], [[V_CMP_EQ_F32_e64_]], implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]]
     ; WAVE32-LABEL: name: fcmp_oeq_s32_vv_select_user
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY1]], 0, [[COPY]], [[V_CMP_EQ_F32_e64_]], implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
index 42b017b409a59..a0354d8403932 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
@@ -43,13 +43,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_oeq_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_oeq_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -70,13 +70,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ogt_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_GT_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_ogt_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_GT_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -97,13 +97,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_oge_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_GE_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_oge_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_GE_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -124,13 +124,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_olt_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LT_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_olt_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LT_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -151,13 +151,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ole_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LE_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_ole_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LE_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -177,13 +177,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_one_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_one_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -204,13 +204,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ord_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_ord_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -231,13 +231,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_uno_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_U_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_uno_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_U_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -258,13 +258,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ueq_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_ueq_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -285,13 +285,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ugt_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_ugt_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -312,13 +312,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_uge_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_uge_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -339,13 +339,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ult_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_ult_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -366,13 +366,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_ule_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_ule_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -393,13 +393,13 @@ body: |
     ; WAVE64-LABEL: name: fcmp_une_s16_vv
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE64: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE64: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_64 = V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_e64_]]
     ; WAVE32-LABEL: name: fcmp_une_s16_vv
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; WAVE32: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $exec
+    ; WAVE32: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_32 = V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
     ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir
index bb2ba24d411c5..a0339fa9551e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fexp2.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: fexp2_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_EXP_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_FEXP2 %0
@@ -34,7 +34,7 @@ body: |
     ; CHECK-LABEL: name: fexp2_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_EXP_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FEXP2 %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index fa462ac93b06b..68bde4c25b64d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -38,7 +38,7 @@ body: |
     ; VI-LABEL: name: ffloor_s16_vv
     ; VI: liveins: $vgpr0
     ; VI: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; VI: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; VI: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; VI: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -60,7 +60,7 @@ body: |
     ; VI-LABEL: name: ffloor_s16_vs
     ; VI: liveins: $sgpr0
     ; VI: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; VI: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; VI: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; VI: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -90,7 +90,7 @@ body: |
     ; VI-LABEL: name: ffloor_fneg_s16_vv
     ; VI: liveins: $vgpr0
     ; VI: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; VI: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; VI: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; VI: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir
index 611eab6bfa692..710a7927acd29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: ffloor_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FFLOOR %0
@@ -34,7 +34,7 @@ body: |
     ; CHECK-LABEL: name: ffloor_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_FFLOOR %0
@@ -54,7 +54,7 @@ body: |
     ; CHECK-LABEL: name: ffloor_fneg_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s32) = G_FNEG %0
@@ -74,7 +74,7 @@ body: |
     ; CHECK-LABEL: name: ffloor_fneg_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir
index 1af481c27a97d..276a1ffb9930d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: ffloor_s64_vv
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = V_FLOOR_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = V_FLOOR_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0_vgpr1 = COPY [[V_FLOOR_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FFLOOR %0
@@ -50,7 +50,7 @@ body: |
     ; CHECK-LABEL: name: ffloor_fneg_s64_vv
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = V_FLOOR_F64_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = V_FLOOR_F64_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0_vgpr1 = COPY [[V_FLOOR_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FNEG %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir
index c812fc48f1354..2034eb73fdf06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma.s32.mir
@@ -17,20 +17,20 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX9-DL-LABEL: name: fma_f32
     ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-DL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-DL: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-DL: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX9-DL: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-DL: S_ENDPGM 0, implicit [[V_FMAC_F32_e64_]]
     ; GFX10-LABEL: name: fma_f32
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_FMAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -54,20 +54,20 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX9-DL-LABEL: name: fma_f32_fneg_src0
     ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-DL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-DL: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-DL: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX9-DL: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-DL: S_ENDPGM 0, implicit [[V_FMAC_F32_e64_]]
     ; GFX10-LABEL: name: fma_f32_fneg_src0
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_FMAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -92,20 +92,20 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX9-DL-LABEL: name: fma_f32_fneg_src1
     ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-DL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-DL: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-DL: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX9-DL: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-DL: S_ENDPGM 0, implicit [[V_FMAC_F32_e64_]]
     ; GFX10-LABEL: name: fma_f32_fneg_src1
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_FMAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -130,20 +130,20 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX9-DL-LABEL: name: fma_f32_fneg_src2
     ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-DL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-DL: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-DL: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX9-DL: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-DL: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX10-LABEL: name: fma_f32_fneg_src2
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -168,20 +168,20 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX9-DL-LABEL: name: fma_f32_fabs_src2
     ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-DL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-DL: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-DL: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $exec
+    ; GFX9-DL: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-DL: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX10-LABEL: name: fma_f32_fabs_src2
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -206,20 +206,20 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX9-DL-LABEL: name: fma_f32_copy_fneg_src2
     ; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-DL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX9-DL: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-DL: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX9-DL: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX9-DL: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     ; GFX10-LABEL: name: fma_f32_copy_fneg_src2
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_FMA_F32_:%[0-9]+]]:vgpr_32 = V_FMA_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_FMA_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmad.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmad.s32.mir
index 019bcd5cf2f3c..99e776b5d0ff6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmad.s32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmad.s32.mir
@@ -16,14 +16,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     ; GFX10-LABEL: name: fmad_f32
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -47,14 +47,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     ; GFX10-LABEL: name: fmad_f32_fneg_src0
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -79,14 +79,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     ; GFX10-LABEL: name: fmad_f32_fneg_src1
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -111,14 +111,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     ; GFX10-LABEL: name: fmad_f32_fneg_src2
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -143,14 +143,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     ; GFX10-LABEL: name: fmad_f32_fabs_src2
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 2, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -175,14 +175,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX6: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX6: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     ; GFX10-LABEL: name: fmad_f32_copy_fneg_src2
     ; GFX10: $vcc_hi = IMPLICIT_DEF
     ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $exec
+    ; GFX10: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 1, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX10: S_ENDPGM 0, implicit [[V_MAD_F32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
index 636b1d2dda694..720f9285961a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
@@ -21,15 +21,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MAX_F64_]], implicit [[V_MAX_F64_1]], implicit [[V_MAX_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -89,15 +89,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MAX_F64_]], implicit [[V_MAX_F64_1]], implicit [[V_MAX_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir
index 32ef48fcf4daa..e94ab1c3cdc56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: fmaxnum_ieee_f16_vv
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -36,7 +36,7 @@ body: |
     ; CHECK-LABEL: name: fmaxnum_ieee_f16_v_fneg_v
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir
index 3028a1f1493fe..bc2e53d421c2c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir
@@ -13,7 +13,7 @@ body: |
     ; GFX9-LABEL: name: fmaxnum_ieee_v2f16_vv
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
index 020e171d3fd57..a440e801682f3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
@@ -22,15 +22,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MAX_F64_]], implicit [[V_MAX_F64_1]], implicit [[V_MAX_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -88,15 +88,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MAX_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_1:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MAX_F64_2:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MAX_F64_]], implicit [[V_MAX_F64_1]], implicit [[V_MAX_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir
index e1caa4cce7e87..1bf0c576adfeb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: fmaxnum_f16_vv
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -36,7 +36,7 @@ body: |
     ; CHECK-LABEL: name: fmaxnum_f16_v_fneg_v
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir
index 0b3b1a9ff9d64..bc83f90c8a113 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir
@@ -14,7 +14,7 @@ body: |
     ; GFX9-LABEL: name: fmaxnum_v2f16_vv
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
index d6ac32e415434..40b97460b2031 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
@@ -21,15 +21,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MIN_F64_]], implicit [[V_MIN_F64_1]], implicit [[V_MIN_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -89,15 +89,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MIN_F64_]], implicit [[V_MIN_F64_1]], implicit [[V_MIN_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir
index 432243ec9c9c2..cf00b1b1d80a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: fminnum_ieee_f16_vv
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MIN_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -36,7 +36,7 @@ body: |
     ; CHECK-LABEL: name: fminnum_ieee_f16_v_fneg_v
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MIN_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir
index 13853bf90c5c0..0bb68ef86ed98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir
@@ -13,7 +13,7 @@ body: |
     ; GFX9-LABEL: name: fminnum_ieee_v2f16_vv
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MIN_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
index 1f4decb7826a4..74350b247fc48 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
@@ -22,15 +22,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MIN_F64_]], implicit [[V_MIN_F64_1]], implicit [[V_MIN_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -88,15 +88,15 @@ body: |
     ; GFX7: [[COPY4:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr10_vgpr11
     ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY $vgpr12_vgpr13
-    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX7: FLAT_STORE_DWORD [[COPY3]], [[V_MIN_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
-    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $exec
-    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $exec
+    ; GFX7: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_1:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
+    ; GFX7: [[V_MIN_F64_2:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec
     ; GFX7: S_ENDPGM 0, implicit [[V_MIN_F64_]], implicit [[V_MIN_F64_1]], implicit [[V_MIN_F64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir
index 71d7b6e8c6df4..0a4f65544a467 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: fminnum_f16_vv
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MIN_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
@@ -36,7 +36,7 @@ body: |
     ; CHECK-LABEL: name: fminnum_f16_v_fneg_v
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $exec
+    ; CHECK: [[V_MIN_F16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_F16_e64 0, [[COPY]], 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit [[V_MIN_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir
index 84afe51ca3cfc..255d05d39f004 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir
@@ -13,7 +13,7 @@ body: |
     ; GFX9-LABEL: name: fminnum_v2f16_vv
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MIN_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
index c7dbeada2ca42..babbe653b9800 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir
@@ -15,9 +15,9 @@ body: |
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
-    ; GCN: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GCN: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GCN: FLAT_STORE_DWORD [[COPY3]], [[V_MUL_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
@@ -53,9 +53,9 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
     ; GCN: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GCN: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GCN: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F64_1:%[0-9]+]]:vreg_64 = V_MUL_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F64_2:%[0-9]+]]:vreg_64 = V_MUL_F64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+    ; GCN: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F64_1:%[0-9]+]]:vreg_64 = V_MUL_F64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F64_2:%[0-9]+]]:vreg_64 = V_MUL_F64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MUL_F64_]], implicit [[V_MUL_F64_1]], implicit [[V_MUL_F64_2]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = COPY $vgpr0_vgpr1
@@ -86,9 +86,9 @@ body: |
     ; GCN-LABEL: name: fmul_f16
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F16_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]], implicit [[V_MUL_F16_e64_1]], implicit [[V_MUL_F16_e64_2]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = COPY $vgpr0
@@ -123,16 +123,16 @@ body: |
     ; GCN-LABEL: name: fmul_modifiers_f32
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GCN: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 2, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 1, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 3, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_MUL_F32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 2, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 1, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 3, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_MUL_F32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GCN: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GCN: FLAT_STORE_DWORD [[COPY1]], [[V_MUL_F32_e64_2]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir
index 665a3589831d2..b70c8e25ccd1e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir
@@ -13,7 +13,7 @@ body: |
     ; GFX9-LABEL: name: fmul_v2f16_vv
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = COPY $vgpr1
@@ -33,7 +33,7 @@ body: |
     ; GFX9-LABEL: name: fmul_v2f16_fneg_v_fneg_v
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 11, [[COPY]], 11, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 11, [[COPY]], 11, [[COPY1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]]
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(<2 x s16>) = COPY $vgpr1
@@ -60,7 +60,7 @@ body: |
     ; GFX9: [[FNEG:%[0-9]+]]:vgpr(s16) = G_FNEG [[TRUNC]]
     ; GFX9: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[FNEG]](s16)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:vgpr_32(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[COPY2]](s32)
-    ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32(<2 x s16>) = V_PK_MUL_F16 8, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 8, [[COPY]](<2 x s16>), 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32(<2 x s16>) = V_PK_MUL_F16 8, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 8, [[COPY]](<2 x s16>), 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]](<2 x s16>)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir
index e1e8c0e250be2..64662d748cd17 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir
@@ -14,7 +14,7 @@ body: |
     ; GCN-LABEL: name: fptosi_s32_to_s32_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FPTOSI %0
@@ -34,7 +34,7 @@ body: |
     ; GCN-LABEL: name: fptosi_s32_to_s32_vs
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_FPTOSI %0
@@ -54,7 +54,7 @@ body: |
     ; GCN-LABEL: name: fptosi_s32_to_s32_fneg_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
@@ -75,8 +75,8 @@ body: |
     ; GCN-LABEL: name: fptosi_s16_to_s32_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $exec
-    ; GCN: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $exec
+    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $mode, implicit $exec
+    ; GCN: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -97,8 +97,8 @@ body: |
     ; GCN-LABEL: name: fptosi_s16_to_s32_vs
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $exec
-    ; GCN: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $exec
+    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $mode, implicit $exec
+    ; GCN: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -121,8 +121,8 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
     ; GCN: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
-    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[V_XOR_B32_e32_]], implicit $exec
-    ; GCN: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $exec
+    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[V_XOR_B32_e32_]], implicit $mode, implicit $exec
+    ; GCN: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
index e6736f2d71475..a13620ad94520 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
@@ -15,8 +15,8 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
-    ; GCN: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e64 0, [[COPY]], 0, 0, implicit $exec
-    ; GCN: [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e64 0, [[COPY1]], 0, 0, implicit $exec
+    ; GCN: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; GCN: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_U32_F32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GCN: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_U32_F32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     %0:sgpr(s32) = COPY $sgpr0
@@ -48,8 +48,8 @@ body: |
     ; GCN-LABEL: name: fptoui_s16_to_s32_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $exec
-    ; GCN: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $exec
+    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $mode, implicit $exec
+    ; GCN: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -70,8 +70,8 @@ body: |
     ; GCN-LABEL: name: fptoui_s16_to_s32_vs
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $exec
-    ; GCN: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $exec
+    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[COPY]], implicit $mode, implicit $exec
+    ; GCN: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -94,8 +94,8 @@ body: |
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768
     ; GCN: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
-    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[V_XOR_B32_e32_]], implicit $exec
-    ; GCN: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $exec
+    ; GCN: [[V_CVT_F32_F16_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_e32 [[V_XOR_B32_e32_]], implicit $mode, implicit $exec
+    ; GCN: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e32_]], implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir
index 45a8551ee47e3..316046edaad47 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir
@@ -15,7 +15,7 @@ body: |
     ; GCN-LABEL: name: frint_s32_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FRINT %0
@@ -35,7 +35,7 @@ body: |
     ; GCN-LABEL: name: frint_s32_vs
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_FRINT %0
@@ -55,7 +55,7 @@ body: |
     ; GCN-LABEL: name: frint_fneg_s32_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
@@ -76,7 +76,7 @@ body: |
     ; GCN-LABEL: name: frint_s64_vv
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0_vgpr1 = COPY [[V_RNDNE_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FRINT %0
@@ -96,7 +96,7 @@ body: |
     ; GCN-LABEL: name: frint_s64_fneg_vv
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0_vgpr1 = COPY [[V_RNDNE_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FNEG %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.s16.mir
index c72ea740a3986..e449a7b93baa0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.s16.mir
@@ -38,7 +38,7 @@ body: |
     ; GCN-LABEL: name: frint_s16_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_RNDNE_F16_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F16_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_RNDNE_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -60,7 +60,7 @@ body: |
     ; GCN-LABEL: name: frint_s16_vs
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_RNDNE_F16_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F16_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_RNDNE_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -82,7 +82,7 @@ body: |
     ; GCN-LABEL: name: frint_fneg_s16_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_RNDNE_F16_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F16_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_RNDNE_F16_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_RNDNE_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.mir
index 550f47c5471a5..a9cd8c51f62a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.mir
@@ -14,7 +14,7 @@ body: |
     ; CHECK-LABEL: name: intrinsic_trunc_s32_vv
     ; CHECK: liveins: $vgpr0
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_TRUNC_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_INTRINSIC_TRUNC %0
@@ -34,7 +34,7 @@ body: |
     ; CHECK-LABEL: name: intrinsic_trunc_s32_vs
     ; CHECK: liveins: $sgpr0
     ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; CHECK: [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F32_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0 = COPY [[V_TRUNC_F32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_INTRINSIC_TRUNC %0
@@ -54,7 +54,7 @@ body: |
     ; CHECK-LABEL: name: intrinsic_trunc_s64_sv
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
-    ; CHECK: [[V_TRUNC_F64_e64_:%[0-9]+]]:vreg_64 = V_TRUNC_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_TRUNC_F64_e64_:%[0-9]+]]:vreg_64 = V_TRUNC_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0_vgpr1 = COPY [[V_TRUNC_F64_e64_]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:vgpr(s64) = G_INTRINSIC_TRUNC %0
@@ -74,7 +74,7 @@ body: |
     ; CHECK-LABEL: name: intrinsic_trunc_s64_vv
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK: [[V_TRUNC_F64_e64_:%[0-9]+]]:vreg_64 = V_TRUNC_F64_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; CHECK: [[V_TRUNC_F64_e64_:%[0-9]+]]:vreg_64 = V_TRUNC_F64_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; CHECK: $vgpr0_vgpr1 = COPY [[V_TRUNC_F64_e64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_INTRINSIC_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.s16.mir
index 1bf97cac9602a..d2fb035c8b6b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-intrinsic-trunc.s16.mir
@@ -14,7 +14,7 @@ body: |
     ; GCN-LABEL: name: intrinsic_trunc_s16_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_TRUNC_F16_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_TRUNC_F16_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_TRUNC_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
@@ -36,7 +36,7 @@ body: |
     ; GCN-LABEL: name: intrinsic_trunc_s16_vs
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GCN: [[V_TRUNC_F16_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F16_e64 0, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_TRUNC_F16_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_TRUNC_F16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
@@ -58,7 +58,7 @@ body: |
     ; GCN-LABEL: name: intrinsic_trunc_fneg_s16_vv
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN: [[V_TRUNC_F16_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F16_e64 1, [[COPY]], 0, 0, implicit $exec
+    ; GCN: [[V_TRUNC_F16_e64_:%[0-9]+]]:vgpr_32 = V_TRUNC_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; GCN: $vgpr0 = COPY [[V_TRUNC_F16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
index e68fda19d493b..3cd2362b10934 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
@@ -16,8 +16,8 @@ body: |
     ; WAVE64: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE64: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
-    ; WAVE64: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $exec
-    ; WAVE64: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $exec
+    ; WAVE64: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; WAVE64: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; WAVE32-LABEL: name: sitofp
@@ -25,8 +25,8 @@ body: |
     ; WAVE32: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; WAVE32: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
-    ; WAVE32: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $exec
-    ; WAVE32: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $exec
+    ; WAVE32: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; WAVE32: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $mode, implicit $exec
     ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %0:sgpr(s32) = COPY $sgpr0
@@ -58,15 +58,15 @@ body: |
     ; WAVE64-LABEL: name: sitofp_s32_to_s16_vv
     ; WAVE64: liveins: $vgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; WAVE64: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $exec
-    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $exec
+    ; WAVE64: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $mode, implicit $exec
     ; WAVE64: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     ; WAVE32-LABEL: name: sitofp_s32_to_s16_vv
     ; WAVE32: liveins: $vgpr0
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; WAVE32: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $exec
-    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $exec
+    ; WAVE32: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $mode, implicit $exec
     ; WAVE32: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_SITOFP %0
@@ -87,15 +87,15 @@ body: |
     ; WAVE64-LABEL: name: sitofp_s32_to_s16_vs
     ; WAVE64: liveins: $sgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; WAVE64: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $exec
-    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $exec
+    ; WAVE64: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $mode, implicit $exec
     ; WAVE64: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     ; WAVE32-LABEL: name: sitofp_s32_to_s16_vs
     ; WAVE32: liveins: $sgpr0
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; WAVE32: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $exec
-    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $exec
+    ; WAVE32: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_I32_e32_]], implicit $mode, implicit $exec
     ; WAVE32: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s16) = G_SITOFP %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir
index d35f7c428a470..421b987f8f922 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir
@@ -15,13 +15,13 @@ body: |
     ; WAVE64-LABEL: name: uitofp_s32_to_s32_vv
     ; WAVE64: liveins: $vgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; WAVE64: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $exec
+    ; WAVE64: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; WAVE64: $vgpr0 = COPY [[V_CVT_F32_U32_e64_]]
     ; WAVE32-LABEL: name: uitofp_s32_to_s32_vv
     ; WAVE32: liveins: $vgpr0
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; WAVE32: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $exec
+    ; WAVE32: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; WAVE32: $vgpr0 = COPY [[V_CVT_F32_U32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_UITOFP %0
@@ -41,13 +41,13 @@ body: |
     ; WAVE64-LABEL: name: uitofp_s32_to_s32_vs
     ; WAVE64: liveins: $sgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; WAVE64: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $exec
+    ; WAVE64: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; WAVE64: $vgpr0 = COPY [[V_CVT_F32_U32_e64_]]
     ; WAVE32-LABEL: name: uitofp_s32_to_s32_vs
     ; WAVE32: liveins: $sgpr0
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; WAVE32: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $exec
+    ; WAVE32: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec
     ; WAVE32: $vgpr0 = COPY [[V_CVT_F32_U32_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s32) = G_UITOFP %0
@@ -67,15 +67,15 @@ body: |
     ; WAVE64-LABEL: name: uitofp_s32_to_s16_vv
     ; WAVE64: liveins: $vgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; WAVE64: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $exec
-    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $exec
+    ; WAVE64: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
     ; WAVE64: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     ; WAVE32-LABEL: name: uitofp_s32_to_s16_vv
     ; WAVE32: liveins: $vgpr0
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; WAVE32: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $exec
-    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $exec
+    ; WAVE32: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
     ; WAVE32: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_UITOFP %0
@@ -96,15 +96,15 @@ body: |
     ; WAVE64-LABEL: name: uitofp_s32_to_s16_vs
     ; WAVE64: liveins: $sgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; WAVE64: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $exec
-    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $exec
+    ; WAVE64: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE64: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
     ; WAVE64: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     ; WAVE32-LABEL: name: uitofp_s32_to_s16_vs
     ; WAVE32: liveins: $sgpr0
     ; WAVE32: $vcc_hi = IMPLICIT_DEF
     ; WAVE32: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; WAVE32: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $exec
-    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $exec
+    ; WAVE32: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec
+    ; WAVE32: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F16_F32_e32 [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
     ; WAVE32: $vgpr0 = COPY [[V_CVT_F16_F32_e32_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:vgpr(s16) = G_UITOFP %0
diff --git a/llvm/test/CodeGen/AMDGPU/bundle-latency.mir b/llvm/test/CodeGen/AMDGPU/bundle-latency.mir
index 603d0cf33f90c..2bb21dec55a25 100644
--- a/llvm/test/CodeGen/AMDGPU/bundle-latency.mir
+++ b/llvm/test/CodeGen/AMDGPU/bundle-latency.mir
@@ -13,14 +13,14 @@ body:             |
     ; GCN:   $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
     ; GCN:   $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
     ; GCN: }
-    ; GCN: $vgpr6 = V_ADD_F32_e32 killed $vgpr0, $vgpr0, implicit $exec
-    ; GCN: $vgpr5 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $exec
+    ; GCN: $vgpr6 = V_ADD_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+    ; GCN: $vgpr5 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
     $vgpr0, $vgpr1 = BUNDLE undef $vgpr3_vgpr4, implicit $exec {
       $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec
       $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec
     }
-    $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec
-    $vgpr6 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+    $vgpr6 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
 ...
 
 ---
@@ -29,14 +29,14 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     ; GCN-LABEL: name: dst_bundle_latency
-    ; GCN: $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $exec
-    ; GCN: $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $exec
+    ; GCN: $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $mode, implicit $exec
+    ; GCN: $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $mode, implicit $exec
     ; GCN: BUNDLE killed $vgpr0, killed $vgpr1, undef $vgpr3_vgpr4, implicit $exec {
     ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr1, 0, 0, 0, 0, implicit $exec
     ; GCN:   GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr0, 4, 0, 0, 0, implicit $exec
     ; GCN: }
-    $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $exec
-    $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $exec
+    $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $mode, implicit $exec
+    $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $mode, implicit $exec
     BUNDLE $vgpr0, $vgpr1, undef $vgpr3_vgpr4, implicit $exec {
       GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec
       GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
index f631bcd258115..f78ad501cebfd 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
+++ b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
@@ -1,8 +1,8 @@
 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands  %s -o - | FileCheck -check-prefix=GCN %s
 ---
 # GCN-LABEL: name: v_max_self_clamp_not_set_f32
-# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-# GCN-NEXT: %21:vgpr_32 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $exec
+# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+# GCN-NEXT: %21:vgpr_32 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $mode, implicit $exec
 
 name:            v_max_self_clamp_not_set_f32
 tracksRegLiveness: true
@@ -56,16 +56,16 @@ body:             |
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
     %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-    %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+    %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
 ---
 # GCN-LABEL: name: v_clamp_omod_already_set_f32
-# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-# GCN: %21:vgpr_32 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $exec
+# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+# GCN: %21:vgpr_32 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $mode, implicit $exec
 name:            v_clamp_omod_already_set_f32
 tracksRegLiveness: true
 registers:
@@ -118,8 +118,8 @@ body:             |
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
     %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-    %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+    %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -127,8 +127,8 @@ body:             |
 # Don't fold a mul that looks like an omod if itself has omod set
 
 # GCN-LABEL: name: v_omod_mul_omod_already_set_f32
-# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-# GCN-NEXT: %21:vgpr_32 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $exec
+# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+# GCN-NEXT: %21:vgpr_32 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $mode, implicit $exec
 name:            v_omod_mul_omod_already_set_f32
 tracksRegLiveness: true
 registers:
@@ -181,8 +181,8 @@ body:             |
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
     %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-    %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+    %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
@@ -191,8 +191,8 @@ body:             |
 # Don't fold a mul that looks like an omod if itself has clamp set
 # This might be OK, but would require folding the clamp at the same time.
 # GCN-LABEL: name: v_omod_mul_clamp_already_set_f32
-# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-# GCN-NEXT: %21:vgpr_32 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $exec
+# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+# GCN-NEXT: %21:vgpr_32 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $mode, implicit $exec
 
 name:            v_omod_mul_clamp_already_set_f32
 tracksRegLiveness: true
@@ -246,8 +246,8 @@ body:             |
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
     %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-    %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+    %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
@@ -269,8 +269,8 @@ body:             |
 # Don't fold a mul that looks like an omod if itself has omod set
 
 # GCN-LABEL: name: v_omod_add_omod_already_set_f32
-# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-# GCN-NEXT: %21:vgpr_32 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $exec
+# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+# GCN-NEXT: %21:vgpr_32 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $mode, implicit $exec
 name:            v_omod_add_omod_already_set_f32
 tracksRegLiveness: true
 registers:
@@ -323,8 +323,8 @@ body:             |
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
     %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-    %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+    %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
@@ -333,8 +333,8 @@ body:             |
 # Don't fold a mul that looks like an omod if itself has clamp set
 # This might be OK, but would require folding the clamp at the same time.
 # GCN-LABEL: name: v_omod_add_clamp_already_set_f32
-# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-# GCN-NEXT: %21:vgpr_32 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $exec
+# GCN: %20:vgpr_32 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+# GCN-NEXT: %21:vgpr_32 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $mode, implicit $exec
 
 name:            v_omod_add_clamp_already_set_f32
 tracksRegLiveness: true
@@ -388,8 +388,8 @@ body:             |
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
     %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
-    %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
+    %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
@@ -407,6 +407,6 @@ body:             |
     liveins: $vgpr0
 
     %0 = COPY $vgpr0
-    %1 = V_MAX_F32_e64 0, killed %0, 0, 1056964608, 1, 0, implicit $exec
+    %1 = V_MAX_F32_e64 0, killed %0, 0, 1056964608, 1, 0, implicit $mode, implicit $exec
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
index a187cd11ed919..80a201bbfdd00 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
+++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
@@ -15,6 +15,6 @@ body:             |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
-    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $exec
+    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %3 = FLAT_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
index 96f35605b1c9e..1ef5c1098b639 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
@@ -22,10 +22,10 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     successors: %bb.1(0x40000000), %bb.2(0x40000000)
-    %0:vgpr_32 = V_MUL_F32_e32 0, undef %1:vgpr_32, implicit $exec
-    %2:vgpr_32 = V_CVT_U32_F32_e32 killed %0, implicit $exec
-    %3:vgpr_32 = V_CVT_F32_I32_e32 killed %2, implicit $exec
-    %4:vgpr_32 = V_CVT_U32_F32_e32 killed %3, implicit $exec
+    %0:vgpr_32 = nofpexcept V_MUL_F32_e32 0, undef %1:vgpr_32, implicit $mode, implicit $exec
+    %2:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed %0, implicit $mode, implicit $exec
+    %3:vgpr_32 = nofpexcept V_CVT_F32_I32_e32 killed %2, implicit $mode, implicit $exec
+    %4:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed %3, implicit $mode, implicit $exec
     %5:vgpr_32 = V_LSHRREV_B32_e32 4, killed %4, implicit $exec
     S_CBRANCH_SCC0 %bb.2, implicit undef $scc
 
@@ -126,7 +126,7 @@ body:             |
     %27.sub6:sgpr_256 = COPY %26
     %27.sub7:sgpr_256 = COPY killed %26
     %28:vgpr_32 = IMAGE_LOAD_V1_V4 killed %25, killed %27, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
-    %29:vgpr_32 = V_ADD_F32_e32 0, killed %28, implicit $exec
+    %29:vgpr_32 = nofpexcept V_ADD_F32_e32 0, killed %28, implicit $mode, implicit $exec
     $m0 = S_MOV_B32 -1
     DS_WRITE_B32 undef %30:vgpr_32, killed %29, 0, 0, implicit $m0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`, addrspace 3)
     S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
index 8bcff8a99f45d..848011a8faac2 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
@@ -41,10 +41,10 @@ body:             |
 
   bb.2:
     successors: %bb.4(0x80000000)
-    %6:vgpr_32 = V_MUL_F32_e32 1031798784, undef %7:vgpr_32, implicit $exec
-    %8:vgpr_32 = V_FLOOR_F32_e32 killed %6, implicit $exec
-    %9:vgpr_32 = V_ADD_F32_e32 0, killed %8, implicit $exec
-    %10:vgpr_32 = V_CVT_U32_F32_e32 killed %9, implicit $exec
+    %6:vgpr_32 = nofpexcept V_MUL_F32_e32 1031798784, undef %7:vgpr_32, implicit $mode, implicit $exec
+    %8:vgpr_32 = nofpexcept V_FLOOR_F32_e32 killed %6, implicit $mode, implicit $exec
+    %9:vgpr_32 = nofpexcept V_ADD_F32_e32 0, killed %8, implicit $mode, implicit $exec
+    %10:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed %9, implicit $mode, implicit $exec
     %11:vgpr_32 = V_LSHLREV_B32_e32 1, killed %10, implicit $exec
     %12:sreg_64 = S_MOV_B64 0
     %13:sgpr_128 = COPY killed %2
@@ -243,8 +243,8 @@ body:             |
     S_BRANCH %bb.3
 
   bb.17:
-    %105:vgpr_32 = V_ADD_F32_e32 target-flags(amdgpu-rel32-lo) 0, %20.sub3, implicit $exec
-    %106:vgpr_32 = V_ADD_F32_e32 target-flags(amdgpu-gotprel32-hi) 0, killed %20.sub2, implicit $exec
+    %105:vgpr_32 = nofpexcept V_ADD_F32_e32 target-flags(amdgpu-rel32-lo) 0, %20.sub3, implicit $mode, implicit $exec
+    %106:vgpr_32 = nofpexcept V_ADD_F32_e32 target-flags(amdgpu-gotprel32-hi) 0, killed %20.sub2, implicit $mode, implicit $exec
     undef %107.sub0:vreg_64 = COPY killed %106
     %107.sub1:vreg_64 = COPY killed %105
     $exec = S_AND_B64 $exec, killed %0, implicit-def dead $scc
@@ -258,11 +258,11 @@ body:             |
     %109.sub6:sgpr_256 = COPY %108
     %109.sub7:sgpr_256 = COPY killed %108
     %110:vgpr_32 = IMAGE_SAMPLE_V1_V2 killed %107, killed %109, undef %111:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
-    %112:vgpr_32 = V_MUL_F32_e32 0, killed %110, implicit $exec
-    %113:vgpr_32 = V_MUL_F32_e32 0, killed %112, implicit $exec
-    %114:vgpr_32 = V_MAD_F32 0, killed %113, 0, 0, 0, 0, 0, 0, implicit $exec
-    %115:vgpr_32 = V_MAX_F32_e32 0, killed %114, implicit $exec
-    %116:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, killed %115, 0, 1065353216, 0, 0, implicit $exec
+    %112:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %110, implicit $mode, implicit $exec
+    %113:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %112, implicit $mode, implicit $exec
+    %114:vgpr_32 = nofpexcept V_MAD_F32 0, killed %113, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %115:vgpr_32 = nofpexcept V_MAX_F32_e32 0, killed %114, implicit $mode, implicit $exec
+    %116:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, killed %115, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     EXP 0, undef %117:vgpr_32, killed %116, undef %118:vgpr_32, undef %119:vgpr_32, -1, -1, 15, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
index c3a945716f77d..47ecb6c58538f 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
@@ -72,9 +72,9 @@ body: |
     %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     undef %13.sub1:vreg_128 = COPY %9.sub1
     %13.sub2:vreg_128 = COPY %9.sub2
-    %14:sreg_64 = V_CMP_GT_F32_e64 0, target-flags(amdgpu-rel32-lo) 0, 0, killed %12.sub3, 0, implicit $exec
-    %15:vgpr_32 = V_ADD_F32_e32 1065353216, undef %16:vgpr_32, implicit $exec
-    %17:sreg_64 = V_CMP_GT_F32_e64 0, 0, 0, killed %15, 0, implicit $exec
+    %14:sreg_64 = nofpexcept V_CMP_GT_F32_e64 0, target-flags(amdgpu-rel32-lo) 0, 0, killed %12.sub3, 0, implicit $mode, implicit $exec
+    %15:vgpr_32 = nofpexcept V_ADD_F32_e32 1065353216, undef %16:vgpr_32, implicit $mode, implicit $exec
+    %17:sreg_64 = nofpexcept V_CMP_GT_F32_e64 0, 0, 0, killed %15, 0, implicit $mode, implicit $exec
     %18:sreg_64 = S_AND_B64 killed %17, killed %14, implicit-def dead $scc
     %19:sreg_64 = COPY %10
     %20:vreg_128 = COPY %13
@@ -127,8 +127,8 @@ body: |
 
   bb.13:
     successors: %bb.14(0x80000000)
-    %32:vgpr_32 = V_MUL_F32_e32 undef %33:vgpr_32, killed %30.sub1, implicit $exec
-    %34:vgpr_32 = V_MUL_F32_e32 undef %35:vgpr_32, killed %32, implicit $exec
+    %32:vgpr_32 = nofpexcept V_MUL_F32_e32 undef %33:vgpr_32, killed %30.sub1, implicit $mode, implicit $exec
+    %34:vgpr_32 = nofpexcept V_MUL_F32_e32 undef %35:vgpr_32, killed %32, implicit $mode, implicit $exec
     undef %36.sub0:vreg_128 = COPY %34
     %31:vreg_128 = COPY killed %36
 
@@ -144,30 +144,30 @@ body: |
 
   bb.16:
     successors: %bb.17(0x80000000)
-    %39:vgpr_32 = V_FMA_F32 0, undef %40:vgpr_32, 0, killed %37.sub0, 0, undef %41:vgpr_32, 0, 0, implicit $exec
-    %42:vgpr_32 = V_FMA_F32 0, undef %43:vgpr_32, 0, undef %44:vgpr_32, 0, killed %39, 0, 0, implicit $exec
-    %45:vgpr_32 = V_FMA_F32 0, undef %46:vgpr_32, 0, undef %47:vgpr_32, 0, killed %42, 0, 0, implicit $exec
-    dead %48:vgpr_32 = V_MUL_F32_e32 undef %49:vgpr_32, killed %45, implicit $exec
-    %50:vgpr_32 = V_MUL_F32_e32 undef %51:vgpr_32, undef %52:vgpr_32, implicit $exec
+    %39:vgpr_32 = nofpexcept V_FMA_F32 0, undef %40:vgpr_32, 0, killed %37.sub0, 0, undef %41:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %42:vgpr_32 = nofpexcept V_FMA_F32 0, undef %43:vgpr_32, 0, undef %44:vgpr_32, 0, killed %39, 0, 0, implicit $mode, implicit $exec
+    %45:vgpr_32 = nofpexcept V_FMA_F32 0, undef %46:vgpr_32, 0, undef %47:vgpr_32, 0, killed %42, 0, 0, implicit $mode, implicit $exec
+    dead %48:vgpr_32 = nofpexcept V_MUL_F32_e32 undef %49:vgpr_32, killed %45, implicit $mode, implicit $exec
+    %50:vgpr_32 = nofpexcept V_MUL_F32_e32 undef %51:vgpr_32, undef %52:vgpr_32, implicit $mode, implicit $exec
     undef %53.sub1:vreg_128 = COPY %50
     %38:vreg_128 = COPY killed %53
 
   bb.17:
     %54:vreg_128 = COPY killed %38
-    %55:vgpr_32 = V_FMA_F32 0, killed %54.sub1, 0, target-flags(amdgpu-gotprel32-lo) 1056964608, 0, 1056964608, 0, 0, implicit $exec
+    %55:vgpr_32 = nofpexcept V_FMA_F32 0, killed %54.sub1, 0, target-flags(amdgpu-gotprel32-lo) 1056964608, 0, 1056964608, 0, 0, implicit $mode, implicit $exec
     EXP 1, undef %56:vgpr_32, killed %55, undef %57:vgpr_32, undef %58:vgpr_32, -1, 0, 15, implicit $exec
     S_ENDPGM 0
 
   bb.18:
     successors: %bb.7(0x80000000)
-    dead %59:vgpr_32 = V_FMA_F32 0, killed %9.sub2, 0, undef %60:vgpr_32, 0, undef %61:vgpr_32, 0, 0, implicit $exec
+    dead %59:vgpr_32 = nofpexcept V_FMA_F32 0, killed %9.sub2, 0, undef %60:vgpr_32, 0, undef %61:vgpr_32, 0, 0, implicit $mode, implicit $exec
     dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sgpr_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, implicit $exec
     undef %66.sub1:vreg_128 = COPY %13.sub1
     %66.sub2:vreg_128 = COPY %13.sub2
-    %67:sreg_64 = V_CMP_NGT_F32_e64 0, 0, 0, undef %68:vgpr_32, 0, implicit $exec
-    %69:vgpr_32 = V_ADD_F32_e32 1065353216, undef %70:vgpr_32, implicit $exec
-    %71:vgpr_32 = V_ADD_F32_e32 1065353216, killed %69, implicit $exec
-    %72:sreg_64 = V_CMP_NGT_F32_e64 0, 0, 0, killed %71, 0, implicit $exec
+    %67:sreg_64 = nofpexcept V_CMP_NGT_F32_e64 0, 0, 0, undef %68:vgpr_32, 0, implicit $mode, implicit $exec
+    %69:vgpr_32 = nofpexcept V_ADD_F32_e32 1065353216, undef %70:vgpr_32, implicit $mode, implicit $exec
+    %71:vgpr_32 = nofpexcept V_ADD_F32_e32 1065353216, killed %69, implicit $mode, implicit $exec
+    %72:sreg_64 = nofpexcept V_CMP_NGT_F32_e64 0, 0, 0, killed %71, 0, implicit $mode, implicit $exec
     %73:sreg_64 = S_OR_B64 killed %72, killed %67, implicit-def dead $scc
     %74:sreg_64 = S_OR_B64 killed %73, killed %10, implicit-def dead $scc
     %19:sreg_64 = COPY killed %74
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir b/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
index a666428ded919..85dcacb93ffc5 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
@@ -48,11 +48,11 @@ body: |
     %4.sub6:sgpr_256 = COPY %1
     %4.sub7:sgpr_256 = COPY killed %1
     %5:vgpr_32 = IMAGE_LOAD_V1_V4 killed %3, killed %4, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
-    %6:vgpr_32 = V_MAD_F32 0, killed %5, 0, 0, 0, 0, 0, 0, implicit $exec
-    %7:vgpr_32 = V_RCP_F32_e32 killed %6, implicit $exec
-    %8:vgpr_32 = V_MUL_F32_e32 0, killed %7, implicit $exec
-    %9:vgpr_32 = V_MAD_F32 0, killed %8, 0, 0, 0, 0, 0, 0, implicit $exec
-    dead %10:vgpr_32 = V_MAC_F32_e32 undef %11:vgpr_32, undef %12:vgpr_32, undef %10, implicit $exec
+    %6:vgpr_32 = nofpexcept V_MAD_F32 0, killed %5, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %7:vgpr_32 = nofpexcept V_RCP_F32_e32 killed %6, implicit $mode, implicit $exec
+    %8:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %7, implicit $mode, implicit $exec
+    %9:vgpr_32 = nofpexcept V_MAD_F32 0, killed %8, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    dead %10:vgpr_32 = nofpexcept V_MAC_F32_e32 undef %11:vgpr_32, undef %12:vgpr_32, undef %10, implicit $mode, implicit $exec
     undef %13.sub0:vreg_128 = COPY %9
     %14:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     S_CBRANCH_SCC0 %bb.4, implicit undef $scc
@@ -65,12 +65,12 @@ body: |
 
   bb.4:
     successors: %bb.5(0x40000000), %bb.7(0x40000000)
-    %17:vgpr_32 = V_MAD_F32 0, killed %9, 0, 0, 0, 0, 0, 0, implicit $exec
-    %18:vgpr_32 = V_MIN_F32_e32 1065353216, killed %17, implicit $exec
-    %19:sreg_64_xexec = V_CMP_NEQ_F32_e64 0, 1065353216, 0, killed %18, 0, implicit $exec
+    %17:vgpr_32 = nofpexcept V_MAD_F32 0, killed %9, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = nofpexcept V_MIN_F32_e32 1065353216, killed %17, implicit $mode, implicit $exec
+    %19:sreg_64_xexec = nofpexcept V_CMP_NEQ_F32_e64 0, 1065353216, 0, killed %18, 0, implicit $mode, implicit $exec
     %20:vgpr_32 = V_MOV_B32_e32 2143289344, implicit $exec
     %21:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %20, killed %19, implicit $exec
-    %22:sreg_64 = V_CMP_LT_F32_e64 0, 0, 0, killed %21, 0, implicit $exec
+    %22:sreg_64 = nofpexcept V_CMP_LT_F32_e64 0, 0, 0, killed %21, 0, implicit $mode, implicit $exec
     %23:sreg_64 = COPY $exec, implicit-def $exec
     %24:sreg_64 = S_AND_B64 %23, %22, implicit-def dead $scc
     $exec = S_MOV_B64_term killed %24
@@ -140,11 +140,11 @@ body: |
 
   bb.14:
     successors: %bb.15(0x40000000), %bb.16(0x40000000)
-    %38:vgpr_32 = V_MAD_F32 0, killed %36.sub0, 0, target-flags(amdgpu-gotprel) 0, 0, 0, 0, 0, implicit $exec
-    %39:vgpr_32 = V_MAD_F32 0, killed %38, 0, 0, 0, 0, 0, 0, implicit $exec
-    %40:vgpr_32 = V_MAD_F32 0, killed %39, 0, -1090519040, 0, 1056964608, 0, 0, implicit $exec
-    %41:vgpr_32 = V_MAD_F32 0, killed %40, 0, 0, 0, -1090519040, 0, 0, implicit $exec
-    %42:vgpr_32 = V_CVT_I32_F32_e32 killed %41, implicit $exec
+    %38:vgpr_32 = nofpexcept V_MAD_F32 0, killed %36.sub0, 0, target-flags(amdgpu-gotprel) 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %39:vgpr_32 = nofpexcept V_MAD_F32 0, killed %38, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vgpr_32 = nofpexcept V_MAD_F32 0, killed %39, 0, -1090519040, 0, 1056964608, 0, 0, implicit $mode, implicit $exec
+    %41:vgpr_32 = nofpexcept V_MAD_F32 0, killed %40, 0, 0, 0, -1090519040, 0, 0, implicit $mode, implicit $exec
+    %42:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 killed %41, implicit $mode, implicit $exec
     %43:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %44:sgpr_128, 12, 0, 0 :: (dereferenceable invariant load 4)
     %45:vgpr_32 = V_MUL_LO_I32 killed %42, killed %43, implicit $exec
     %46:vgpr_32 = V_LSHLREV_B32_e32 2, killed %45, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
index 1605880a59e41..91901b7233762 100644
--- a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
+++ b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
@@ -17,9 +17,9 @@ body:             |
   ; GCN-LABEL: name: _amdgpu_ps_main
   ; GCN: bb.0:
   ; GCN:   successors: %bb.1(0x80000000)
-  ; GCN:   [[V_TRUNC_F32_e32_:%[0-9]+]]:vgpr_32 = V_TRUNC_F32_e32 undef %4:vgpr_32, implicit $exec
-  ; GCN:   [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e32 [[V_TRUNC_F32_e32_]], implicit $exec
-  ; GCN:   [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 4, [[V_CVT_U32_F32_e32_]], implicit $exec
+  ; GCN:   %3:vgpr_32 = nofpexcept V_TRUNC_F32_e32 undef %4:vgpr_32, implicit $mode, implicit $exec
+  ; GCN:   %5:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 %3, implicit $mode, implicit $exec
+  ; GCN:   [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 4, %5, implicit $exec
   ; GCN:   undef %11.sub0:vreg_128 = V_MUL_LO_I32 [[V_LSHRREV_B32_e32_]], 3, implicit $exec
   ; GCN:   %11.sub3:vreg_128 = COPY %11.sub0
   ; GCN:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
@@ -47,19 +47,19 @@ body:             |
   ; GCN:   S_CBRANCH_VCCNZ %bb.4, implicit killed $vcc
   ; GCN:   S_BRANCH %bb.6
   ; GCN: bb.5:
-  ; GCN:   [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, %11.sub0, implicit $exec
-  ; GCN:   [[V_MIN_F32_e32_:%[0-9]+]]:vgpr_32 = V_MIN_F32_e32 1106771968, [[V_MUL_F32_e32_]], implicit $exec
-  ; GCN:   [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, [[V_MIN_F32_e32_]], 0, 0, 0, 0, 0, 0, implicit $exec
-  ; GCN:   [[V_MAD_F32_1:%[0-9]+]]:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, [[V_MAD_F32_]], 0, 0, 0, 0, 0, 0, implicit $exec
-  ; GCN:   [[V_MAD_F32_2:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[V_MAD_F32_1]], 0, 0, 0, 0, 0, 0, implicit $exec
-  ; GCN:   [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, [[V_MAD_F32_2]], 0, undef %27:vgpr_32, 0, 0, implicit $exec
-  ; GCN:   EXP_DONE 0, [[V_CVT_PKRTZ_F16_F32_e64_]], undef %28:vgpr_32, undef %29:vgpr_32, undef %30:vgpr_32, -1, -1, 15, implicit $exec
+  ; GCN:   %21:vgpr_32 = nofpexcept V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, %11.sub0, implicit $mode, implicit $exec
+  ; GCN:   %22:vgpr_32 = nofpexcept V_MIN_F32_e32 1106771968, %21, implicit $mode, implicit $exec
+  ; GCN:   %23:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32 0, %22, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN:   %24:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32 0, %23, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN:   %25:vgpr_32 = nofpexcept V_MAD_F32 0, %24, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN:   %26:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, %25, 0, undef %27:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  ; GCN:   EXP_DONE 0, %26, undef %28:vgpr_32, undef %29:vgpr_32, undef %30:vgpr_32, -1, -1, 15, implicit $exec
   ; GCN:   S_ENDPGM 0
   ; GCN: bb.6:
   ; GCN:   S_ENDPGM 0
   bb.0:
-    %10:vgpr_32 = V_TRUNC_F32_e32 undef %11:vgpr_32, implicit $exec
-    %12:vgpr_32 = V_CVT_U32_F32_e32 killed %10, implicit $exec
+    %10:vgpr_32 = nofpexcept V_TRUNC_F32_e32 undef %11:vgpr_32, implicit $mode, implicit $exec
+    %12:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed %10, implicit $mode, implicit $exec
     %50:vgpr_32 = V_LSHRREV_B32_e32 4, killed %12, implicit $exec
     %51:vgpr_32 = V_MUL_LO_I32 killed %50, 3, implicit $exec
     undef %52.sub0:vreg_128 = COPY %51
@@ -102,12 +102,12 @@ body:             |
     S_BRANCH %bb.6
 
   bb.5:
-    %39:vgpr_32 = V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, killed %55.sub0, implicit $exec
-    %41:vgpr_32 = V_MIN_F32_e32 1106771968, killed %39, implicit $exec
-    %42:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, killed %41, 0, 0, 0, 0, 0, 0, implicit $exec
-    %43:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, killed %42, 0, 0, 0, 0, 0, 0, implicit $exec
-    %44:vgpr_32 = V_MAD_F32 0, killed %43, 0, 0, 0, 0, 0, 0, implicit $exec
-    %45:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, killed %44, 0, undef %46:vgpr_32, 0, 0, implicit $exec
+    %39:vgpr_32 = nofpexcept V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, killed %55.sub0, implicit $mode, implicit $exec
+    %41:vgpr_32 = nofpexcept V_MIN_F32_e32 1106771968, killed %39, implicit $mode, implicit $exec
+    %42:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32 0, killed %41, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %43:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32 0, killed %42, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %44:vgpr_32 = nofpexcept V_MAD_F32 0, killed %43, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %45:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, killed %44, 0, undef %46:vgpr_32, 0, 0, implicit $mode, implicit $exec
     EXP_DONE 0, killed %45, undef %47:vgpr_32, undef %48:vgpr_32, undef %49:vgpr_32, -1, -1, 15, implicit $exec
     S_ENDPGM 0
 
diff --git a/llvm/test/CodeGen/AMDGPU/dead-lane.mir b/llvm/test/CodeGen/AMDGPU/dead-lane.mir
index 1477c3302c340..8e95009e72c6c 100644
--- a/llvm/test/CodeGen/AMDGPU/dead-lane.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead-lane.mir
@@ -2,15 +2,15 @@
 
 # GCN-LABEL: name: dead_lane
 # GCN:      bb.0:
-# GCN-NEXT: undef %3.sub0:vreg_64 = V_MAC_F32_e32 undef %1:vgpr_32, undef %1:vgpr_32, undef %3.sub0, implicit $exec
+# GCN-NEXT: undef %3.sub0:vreg_64 = nofpexcept V_MAC_F32_e32 undef %1:vgpr_32, undef %1:vgpr_32, undef %3.sub0, implicit $mode, implicit $exec
 # GCN-NEXT: FLAT_STORE_DWORD undef %4:vreg_64, %3.sub0,
 ---
 name:            dead_lane
 tracksRegLiveness: true
 body:             |
   bb.0:
-    %1:vgpr_32 = V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $exec
-    %2:vgpr_32 = V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $exec
+    %1:vgpr_32 = nofpexcept V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $mode, implicit $exec
+    %2:vgpr_32 = nofpexcept V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $mode, implicit $exec
     %3:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, %2:vgpr_32, %subreg.sub1
     FLAT_STORE_DWORD undef %4:vreg_64, %3.sub0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
index 3a35c558e6ac4..2743f766ddeeb 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
@@ -32,7 +32,7 @@ body:             |
   ; CHECK:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK:   [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 1082130432, [[DEF1]], implicit $exec
+  ; CHECK:   %9:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec
   ; CHECK:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK: bb.1:
@@ -48,29 +48,29 @@ body:             |
   ; CHECK:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]]
-  ; CHECK:   [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $exec
-  ; CHECK:   [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $exec
-  ; CHECK:   [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $exec
+  ; CHECK:   %16:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
+  ; CHECK:   %17:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
+  ; CHECK:   %18:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
   ; CHECK:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
   ; CHECK:   [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $exec
-  ; CHECK:   [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $exec
-  ; CHECK:   dead %23:vgpr_32 = V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $exec
-  ; CHECK:   dead [[V_MOV_B32_e32_1]]:vgpr_32 = V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $exec
+  ; CHECK:   %21:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
+  ; CHECK:   %22:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
+  ; CHECK:   dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 %22, [[DEF13]], implicit $mode, implicit $exec
+  ; CHECK:   dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 %21, [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec
   ; CHECK:   [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; CHECK:   $sgpr4 = IMPLICIT_DEF
   ; CHECK:   $vgpr0 = COPY [[DEF11]]
   ; CHECK:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
   ; CHECK:   $vgpr1 = COPY [[DEF7]]
-  ; CHECK:   $vgpr0 = COPY [[V_MUL_F32_e32_1]]
-  ; CHECK:   $vgpr1 = COPY [[V_MUL_F32_e32_2]]
-  ; CHECK:   $vgpr2 = COPY [[V_MUL_F32_e32_3]]
+  ; CHECK:   $vgpr0 = COPY %16
+  ; CHECK:   $vgpr1 = COPY %17
+  ; CHECK:   $vgpr2 = COPY %18
   ; CHECK:   dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu_highregs, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
-  ; CHECK:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $exec
-  ; CHECK:   [[V_MAC_F32_e32_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_MAC_F32_e32_]], implicit $exec
-  ; CHECK:   dead %26:vgpr_32 = V_MAD_F32 0, [[V_MAC_F32_e32_]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $exec
-  ; CHECK:   dead %27:vgpr_32 = V_MAD_F32 0, [[V_MAC_F32_e32_]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $exec
-  ; CHECK:   dead %28:vgpr_32 = V_MAD_F32 0, [[V_MAC_F32_e32_]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $exec
+  ; CHECK:   %25:vgpr_32 = nofpexcept V_ADD_F32_e32 %9, [[DEF8]], implicit $mode, implicit $exec
+  ; CHECK:   %25:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], %25, implicit $mode, implicit $exec
+  ; CHECK:   dead %26:vgpr_32 = nofpexcept V_MAD_F32 0, %25, 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK:   dead %27:vgpr_32 = nofpexcept V_MAD_F32 0, %25, 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK:   dead %28:vgpr_32 = nofpexcept V_MAD_F32 0, %25, 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
   ; CHECK:   GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, 0, 0, implicit $exec
   ; CHECK:   S_ENDPGM 0
   bb.0:
@@ -85,7 +85,7 @@ body:             |
     %6:vgpr_32 = IMPLICIT_DEF
     %7:vgpr_32 = IMPLICIT_DEF
     %8:vgpr_32 = IMPLICIT_DEF
-    %9:vgpr_32 = V_MUL_F32_e32 1082130432, %1, implicit $exec
+    %9:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, %1, implicit $mode, implicit $exec
     %10:vgpr_32 = IMPLICIT_DEF
     %11:vgpr_32 = IMPLICIT_DEF
 
@@ -106,15 +106,15 @@ body:             |
     %13:vgpr_32 = COPY %12
     %14:vgpr_32 = IMPLICIT_DEF
     %15:vgpr_32 = IMPLICIT_DEF
-    %16:vgpr_32 = V_MUL_F32_e32 %7, %7, implicit $exec
-    %17:vgpr_32 = V_MUL_F32_e32 %7, %7, implicit $exec
-    %18:vgpr_32 = V_MUL_F32_e32 %12, %12, implicit $exec
+    %16:vgpr_32 = nofpexcept V_MUL_F32_e32 %7, %7, implicit $mode, implicit $exec
+    %17:vgpr_32 = nofpexcept V_MUL_F32_e32 %7, %7, implicit $mode, implicit $exec
+    %18:vgpr_32 = nofpexcept V_MUL_F32_e32 %12, %12, implicit $mode, implicit $exec
     %19:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %20:vgpr_32 = IMPLICIT_DEF
-    %21:vgpr_32 = V_ADD_F32_e32 %12, %12, implicit $exec
-    %22:vgpr_32 = V_MUL_F32_e32 %7, %7, implicit $exec
-    %23:vgpr_32 = V_MUL_F32_e32 %22, %20, implicit $exec
-    %19:vgpr_32 = V_MAC_F32_e32 %21, %13, %19, implicit $exec
+    %21:vgpr_32 = nofpexcept V_ADD_F32_e32 %12, %12, implicit $mode, implicit $exec
+    %22:vgpr_32 = nofpexcept V_MUL_F32_e32 %7, %7, implicit $mode, implicit $exec
+    %23:vgpr_32 = nofpexcept V_MUL_F32_e32 %22, %20, implicit $mode, implicit $exec
+    %19:vgpr_32 = nofpexcept V_MAC_F32_e32 %21, %13, %19, implicit $mode, implicit $exec
     %24:sreg_64 = IMPLICIT_DEF
     $vgpr0 = COPY %14
     $vgpr0 = COPY %12
@@ -124,11 +124,11 @@ body:             |
     $vgpr2 = COPY %18
     $sgpr4 = IMPLICIT_DEF
     dead $sgpr30_sgpr31 = SI_CALL %24, @foo, csr_amdgpu_highregs, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit-def $vgpr0
-    %25:vgpr_32 = V_ADD_F32_e32 %9, %8, implicit $exec
-    %25:vgpr_32 = V_MAC_F32_e32 %15, %10, %25, implicit $exec
-    %26:vgpr_32 = V_MAD_F32 0, %25, 0, %4, 0, %1, 0, 0, implicit $exec
-    %27:vgpr_32 = V_MAD_F32 0, %25, 0, %5, 0, %2, 0, 0, implicit $exec
-    %28:vgpr_32 = V_MAD_F32 0, %25, 0, %6, 0, %3, 0, 0, implicit $exec
+    %25:vgpr_32 = nofpexcept V_ADD_F32_e32 %9, %8, implicit $mode, implicit $exec
+    %25:vgpr_32 = nofpexcept V_MAC_F32_e32 %15, %10, %25, implicit $mode, implicit $exec
+    %26:vgpr_32 = nofpexcept V_MAD_F32 0, %25, 0, %4, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %27:vgpr_32 = nofpexcept V_MAD_F32 0, %25, 0, %5, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %28:vgpr_32 = nofpexcept V_MAD_F32 0, %25, 0, %6, 0, %3, 0, 0, implicit $mode, implicit $exec
     GLOBAL_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
index 859c21d8842fb..358a331da1a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -298,10 +298,10 @@ body:             |
 # check for floating point modifiers
 # GCN-LABEL: name: add_f32_e64
 # GCN: %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
-# GCN: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
-# GCN: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $exec
-# GCN: %8:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 15, 15, 1, implicit $exec
-# GCN: %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $exec
+# GCN: %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $mode, implicit $exec
+# GCN: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %8:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $mode, implicit $exec
 
 name: add_f32_e64
 tracksRegLiveness: true
@@ -315,19 +315,19 @@ body:             |
 
     ; this shouldn't be combined as omod is set
     %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
-    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $exec
+    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $mode, implicit $exec
 
     ; this should be combined as all modifiers are default
     %5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
-    %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $mode, implicit $exec
 
     ; this should be combined as modifiers other than abs|neg are default
     %7:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
-    %8:vgpr_32 = V_ADD_F32_e64 1, %7, 2, %0, 0, 0, implicit $exec
+    %8:vgpr_32 = V_ADD_F32_e64 1, %7, 2, %0, 0, 0, implicit $mode, implicit $exec
 
     ; this shouldn't be combined as modifiers aren't abs|neg
     %9:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
-    %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $mode, implicit $exec
 ...
 
 # check for e64 modifiers
@@ -532,73 +532,73 @@ body: |
 
 # Test instruction which does not have modifiers in VOP1 form but does in DPP form.
 # GCN-LABEL: name: dpp_vop1
-# GCN: %3:vgpr_32 = V_CEIL_F32_dpp %0, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $exec
+# GCN: %3:vgpr_32 = V_CEIL_F32_dpp %0, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
 name: dpp_vop1
 tracksRegLiveness: true
 body: |
   bb.0:
     %1:vgpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
-    %3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $exec
+    %3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec
 ...
 
 # Test instruction which does not have modifiers in VOP2 form but does in DPP form.
 # GCN-LABEL: name: dpp_min
-# GCN: %3:vgpr_32 = V_MIN_F32_dpp %0, 0, undef %2:vgpr_32, 0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $exec
+# GCN: %3:vgpr_32 = V_MIN_F32_dpp %0, 0, undef %2:vgpr_32, 0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
 name: dpp_min
 tracksRegLiveness: true
 body: |
   bb.0:
     %1:vgpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
-    %4:vgpr_32 = V_MIN_F32_e32 %2, undef %3:vgpr_32, implicit $exec
+    %4:vgpr_32 = V_MIN_F32_e32 %2, undef %3:vgpr_32, implicit $mode, implicit $exec
 ...
 
 # Test an undef old operand
 # GCN-LABEL: name: dpp_undef_old
-# GCN: %3:vgpr_32 = V_CEIL_F32_dpp undef %1:vgpr_32, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $exec
+# GCN: %3:vgpr_32 = V_CEIL_F32_dpp undef %1:vgpr_32, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec
 name: dpp_undef_old
 tracksRegLiveness: true
 body: |
   bb.0:
     %2:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
-    %3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $exec
+    %3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec
 ...
 
 # Do not combine a dpp mov which writes a physreg.
 # GCN-LABEL: name: phys_dpp_mov_dst
 # GCN: $vgpr0 = V_MOV_B32_dpp undef %0:vgpr_32, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec
-# GCN: %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $exec
+# GCN: %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec
 name: phys_dpp_mov_dst
 tracksRegLiveness: true
 body: |
   bb.0:
     $vgpr0 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
-    %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $exec
+    %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec
 ...
 
 # Do not combine a dpp mov which reads a physreg.
 # GCN-LABEL: name: phys_dpp_mov_old_src
 # GCN: %0:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec
-# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $exec
+# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec
 name: phys_dpp_mov_old_src
 tracksRegLiveness: true
 body: |
   bb.0:
     %1:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec
-    %2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $exec
+    %2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec
 ...
 
 # Do not combine a dpp mov which reads a physreg.
 # GCN-LABEL: name: phys_dpp_mov_src
 # GCN: %0:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec
-# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $exec
+# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec
 name: phys_dpp_mov_src
 tracksRegLiveness: true
 body: |
   bb.0:
     %1:vgpr_32 = V_MOV_B32_dpp undef %0:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec
-    %2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $exec
+    %2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec
 ...
 
 # GCN-LABEL: name: dpp_reg_sequence_both_combined
@@ -817,7 +817,7 @@ body: |
 
 # Make sure flags aren't dropped
 # GCN-LABEL: name: flags_add_f32_e64
-# GCN: %4:vgpr_32 = nnan nofpexcept V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $exec
+# GCN: %4:vgpr_32 = nnan nofpexcept V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec
 name: flags_add_f32_e64
 tracksRegLiveness: true
 body:             |
@@ -829,7 +829,7 @@ body:             |
     %2:vgpr_32 = IMPLICIT_DEF
 
     %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec
-    %4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $exec
+    %4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
     S_ENDPGM 0, implicit %4
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir b/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir
index baa54b492f610..95a878c1997ff 100644
--- a/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir
+++ b/llvm/test/CodeGen/AMDGPU/endpgm-dce.mir
@@ -18,7 +18,7 @@ body:             |
     %3 = IMPLICIT_DEF
     $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
     %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
-    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $exec
+    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %4 = S_ADD_U32 %3, 1, implicit-def $scc
     S_ENDPGM 0
 ...
@@ -42,7 +42,7 @@ body:             |
     %3 = IMPLICIT_DEF
     $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
     %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
-    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $exec
+    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %4 = S_ADD_U32 %3, 1, implicit-def $scc
     S_ENDPGM 0
 ...
@@ -66,7 +66,7 @@ body:             |
     %3 = IMPLICIT_DEF
     $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc
     %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4)
-    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $exec
+    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %4 = S_ADD_U32 %3, 1, implicit-def $scc
     S_ENDPGM 0
 ...
@@ -173,7 +173,7 @@ body:             |
   bb.1:
     %0 = IMPLICIT_DEF
     %2 = IMPLICIT_DEF
-    %1 = V_ADD_F32_e64 0, killed %0, 0, 1, 0, 0, implicit $exec
+    %1 = V_ADD_F32_e64 0, killed %0, 0, 1, 0, 0, implicit $mode, implicit $exec
     %3 = S_ADD_U32 %2, 1, implicit-def $scc
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
index 394df72a1c830..7bc14939624d8 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
@@ -11,7 +11,7 @@ body:               |
     %1:sreg_32 = IMPLICIT_DEF
     %2:sreg_32 = IMPLICIT_DEF
     %3:sreg_32 = IMPLICIT_DEF
-    %4:vgpr_32 = V_CVT_U32_F32_e64 0, %0:vgpr_32, 0, 0, implicit $exec
+    %4:vgpr_32 = V_CVT_U32_F32_e64 0, %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
     %5:sreg_32 = COPY %4:vgpr_32
     %6:sreg_32 = S_ADD_I32 %2:sreg_32, %5:sreg_32, implicit-def $scc
     %7:sreg_32 = S_ADDC_U32 %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $scc
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
index e76f1be6c485b..b81556c94ccea 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
@@ -111,7 +111,7 @@
 #  literal constant.
 
 # CHECK-LABEL: name: add_f32_1.0_one_f16_use
-# CHECK: %13:vgpr_32 = V_ADD_F16_e32  1065353216, killed %11, implicit $exec
+# CHECK: %13:vgpr_32 = V_ADD_F16_e32  1065353216, killed %11, implicit $mode, implicit $exec
 
 name:            add_f32_1.0_one_f16_use
 alignment:       1
@@ -160,7 +160,7 @@ body:             |
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
     %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %12 = V_MOV_B32_e32 1065353216, implicit $exec
-    %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $exec
+    %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
@@ -171,8 +171,8 @@ body:             |
 
 # CHECK-LABEL: name: add_f32_1.0_multi_f16_use
 # CHECK: %13:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit $mode, implicit $exec
 
 
 name:            add_f32_1.0_multi_f16_use
@@ -225,8 +225,8 @@ body:             |
     %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 1065353216, implicit $exec
-    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec
-    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec
+    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $mode, implicit $exec
+    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
@@ -238,8 +238,8 @@ body:             |
 #  immediate, and folded into the single f16 use as a literal constant
 
 # CHECK-LABEL: name: add_f32_1.0_one_f32_use_one_f16_use
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit $exec
-# CHECK: %16:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit $mode, implicit $exec
+# CHECK: %16:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $mode, implicit $exec
 
 name:            add_f32_1.0_one_f32_use_one_f16_use
 alignment:       1
@@ -293,8 +293,8 @@ body:             |
     %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 1065353216, implicit $exec
-    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec
-    %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec
+    %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
@@ -307,9 +307,9 @@ body:             |
 
 # CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
 # CHECK: %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32  %11, %14, implicit $exec
-# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12,  %14, implicit $exec
-# CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32  %11, %14, implicit $mode, implicit $exec
+# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12,  %14, implicit $mode, implicit $exec
+# CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $mode, implicit $exec
 
 name:            add_f32_1.0_one_f32_use_multi_f16_use
 alignment:       1
@@ -364,9 +364,9 @@ body:             |
     %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 1065353216, implicit $exec
-    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec
-    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec
-    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec
+    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $mode, implicit $exec
+    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
@@ -376,8 +376,8 @@ body:             |
 ---
 # CHECK-LABEL: name: add_i32_1_multi_f16_use
 # CHECK: %13:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1, killed %11, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1, killed %12, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1, killed %11, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1, killed %12, implicit $mode, implicit $exec
 
 
 name:            add_i32_1_multi_f16_use
@@ -430,8 +430,8 @@ body:             |
     %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 1, implicit $exec
-    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec
-    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec
+    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $mode, implicit $exec
+    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
@@ -441,9 +441,9 @@ body:             |
 
 # CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use
 # CHECK: %14:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 -2, %11, implicit $exec
-# CHECK: %16:vgpr_32 = V_ADD_F16_e32 -2, %12, implicit $exec
-# CHECK: %17:vgpr_32 = V_ADD_F32_e32 -2, killed %13, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 -2, %11, implicit $mode, implicit $exec
+# CHECK: %16:vgpr_32 = V_ADD_F16_e32 -2, %12, implicit $mode, implicit $exec
+# CHECK: %17:vgpr_32 = V_ADD_F32_e32 -2, killed %13, implicit $mode, implicit $exec
 
 name:            add_i32_m2_one_f32_use_multi_f16_use
 alignment:       1
@@ -498,9 +498,9 @@ body:             |
     %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 -2, implicit $exec
-    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec
-    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec
-    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec
+    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $mode, implicit $exec
+    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
@@ -514,8 +514,8 @@ body:             |
 
 # CHECK-LABEL: name: add_f16_1.0_multi_f32_use
 # CHECK: %13:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
 
 name:            add_f16_1.0_multi_f32_use
 alignment:       1
@@ -567,8 +567,8 @@ body:             |
     %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 15360, implicit $exec
-    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec
-    %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit $exec
+    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec
+    %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
@@ -581,8 +581,8 @@ body:             |
 
 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
 # CHECK: %13:vgpr_32 = V_MOV_B32_e32 80886784, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec
 
 name:            add_f16_1.0_other_high_bits_multi_f16_use
 alignment:       1
@@ -634,8 +634,8 @@ body:             |
     %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 80886784, implicit $exec
-    %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit $exec
-    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec
+    %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec
+    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
@@ -648,8 +648,8 @@ body:             |
 
 # CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
 # CHECK: %13:vgpr_32 = V_MOV_B32_e32 305413120, implicit $exec
-# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $exec
-# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec
 name:            add_f16_1.0_other_high_bits_use_f16_f32
 alignment:       1
 exposesReturnsTwice: false
@@ -700,8 +700,8 @@ body:             |
     %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 305413120, implicit $exec
-    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec
-    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec
+    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec
+    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec
     BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
index e26f0c934fce4..4eef4d6477010 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -2,7 +2,7 @@
 ...
 # GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32
 # GCN: %23:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec
-# GCN-NEXT: %24:vgpr_32 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec
+# GCN-NEXT: %24:vgpr_32 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec
 
 name:            no_fold_imm_madak_mac_clamp_f32
 tracksRegLiveness: true
@@ -64,7 +64,7 @@ body:             |
     %22 = COPY %29
     %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
-    %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec
+    %24 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec
     %26 = COPY %29
     BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
@@ -73,7 +73,7 @@ body:             |
 ---
 # GCN-LABEL: name: no_fold_imm_madak_mac_omod_f32
 # GCN: %23:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec
-# GCN: %24:vgpr_32 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $exec
+# GCN: %24:vgpr_32 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $mode, implicit $exec
 
 name:            no_fold_imm_madak_mac_omod_f32
 tracksRegLiveness: true
@@ -135,7 +135,7 @@ body:             |
     %22 = COPY %29
     %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
-    %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $exec
+    %24 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $mode, implicit $exec
     %26 = COPY %29
     BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
@@ -144,7 +144,7 @@ body:             |
 ---
 # GCN: name: no_fold_imm_madak_mad_clamp_f32
 # GCN: %23:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec
-# GCN: %24:vgpr_32 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec
+# GCN: %24:vgpr_32 = nofpexcept V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec
 
 name:            no_fold_imm_madak_mad_clamp_f32
 tracksRegLiveness: true
@@ -206,7 +206,7 @@ body:             |
     %22 = COPY %29
     %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
-    %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec
+    %24 = nofpexcept V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec
     %26 = COPY %29
     BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
@@ -215,7 +215,7 @@ body:             |
 ---
 # GCN: name: no_fold_imm_madak_mad_omod_f32
 # GCN: %23:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec
-# GCN: %24:vgpr_32 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $exec
+# GCN: %24:vgpr_32 = nofpexcept V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $mode, implicit $exec
 
 name:            no_fold_imm_madak_mad_omod_f32
 tracksRegLiveness: true
@@ -277,7 +277,7 @@ body:             |
     %22 = COPY %29
     %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
-    %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $exec
+    %24 = nofpexcept V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $mode, implicit $exec
     %26 = COPY %29
     BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
index 6f0e6e39eea80..1b87ef241cace 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir
@@ -39,7 +39,7 @@ body:             |
 
 # GCN-LABEL: name: fma_sgpr_use
 # GCN:      %0:sreg_64_xexec = IMPLICIT_DEF
-# GCN-NEXT: %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMA_F32 2, %0.sub0, 0, 1073741824, 0, %0.sub1, 0, 0, implicit $exec
+# GCN-NEXT: %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMA_F32 2, %0.sub0, 0, 1073741824, 0, %0.sub1, 0, 0, implicit $mode, implicit $exec
 ---
 name:            fma_sgpr_use
 body:             |
@@ -48,6 +48,6 @@ body:             |
     %1:sgpr_32 = COPY %0.sub0
     %2:sgpr_32 = COPY %0.sub1
     %3:vgpr_32 = COPY %2
-    %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMAC_F32_e64 2, %1, 0, 1073741824, 0, %3, 0, 0, implicit $exec
+    %4:vgpr_32 = nnan ninf nsz arcp contract afn reassoc V_FMAC_F32_e64 2, %1, 0, 1073741824, 0, %3, 0, 0, implicit $mode, implicit $exec
     DS_WRITE2_B32_gfx9 undef %5:vgpr_32, killed %4, undef %6:vgpr_32, 0, 1, 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir b/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
index f1b5ee3524d95..5f4e6830eb44f 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
@@ -11,7 +11,7 @@ name:            flat_atomic_fcmpswap_to_s_denorm_mode
 body:            |
   bb.0:
     FLAT_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fcmpswap_x2_to_s_denorm_mode
@@ -25,7 +25,7 @@ name:            flat_atomic_fcmpswap_x2_to_s_denorm_mode
 body:            |
   bb.0:
     FLAT_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_128, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmax_to_s_denorm_mode
@@ -39,7 +39,7 @@ name:            flat_atomic_fmax_to_s_denorm_mode
 body:            |
   bb.0:
     FLAT_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmax_x2_to_s_denorm_mode
@@ -53,7 +53,7 @@ name:            flat_atomic_fmax_x2_to_s_denorm_mode
 body:            |
   bb.0:
     FLAT_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmin_to_s_denorm_mode
@@ -67,7 +67,7 @@ name:            flat_atomic_fmin_to_s_denorm_mode
 body:            |
   bb.0:
     FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmin_x2_to_s_denorm_mode
@@ -81,7 +81,7 @@ name:            flat_atomic_fmin_x2_to_s_denorm_mode
 body:            |
   bb.0:
     FLAT_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fcmpswap_x2_rtn_to_s_denorm_mode
@@ -95,7 +95,7 @@ name:            flat_atomic_fcmpswap_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = FLAT_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_128, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmax_rtn_to_s_denorm_mode
@@ -109,7 +109,7 @@ name:            flat_atomic_fmax_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = FLAT_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmax_x2_rtn_to_s_denorm_mode
@@ -123,7 +123,7 @@ name:            flat_atomic_fmax_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = FLAT_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmin_rtn_to_s_denorm_mode
@@ -137,7 +137,7 @@ name:            flat_atomic_fmin_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = FLAT_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fmin_x2_rtn_to_s_denorm_mode
@@ -151,7 +151,7 @@ name:            flat_atomic_fmin_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = FLAT_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_atomic_fcmpswap_rtn_to_s_denorm_mode
@@ -165,7 +165,7 @@ name:            flat_atomic_fcmpswap_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = FLAT_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fcmpswap_to_s_denorm_mode
@@ -179,7 +179,7 @@ name:            global_atomic_fcmpswap_to_s_denorm_mode
 body:            |
   bb.0:
     GLOBAL_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fcmpswap_x2_to_s_denorm_mode
@@ -193,7 +193,7 @@ name:            global_atomic_fcmpswap_x2_to_s_denorm_mode
 body:            |
   bb.0:
     GLOBAL_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmax_to_s_denorm_mode
@@ -207,7 +207,7 @@ name:            global_atomic_fmax_to_s_denorm_mode
 body:            |
   bb.0:
     GLOBAL_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmax_x2_to_s_denorm_mode
@@ -221,7 +221,7 @@ name:            global_atomic_fmax_x2_to_s_denorm_mode
 body:            |
   bb.0:
     GLOBAL_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmin_to_s_denorm_mode
@@ -235,7 +235,7 @@ name:            global_atomic_fmin_to_s_denorm_mode
 body:            |
   bb.0:
     GLOBAL_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmin_x2_to_s_denorm_mode
@@ -249,7 +249,7 @@ name:            global_atomic_fmin_x2_to_s_denorm_mode
 body:            |
   bb.0:
     GLOBAL_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fcmpswap_rtn_to_s_denorm_mode
@@ -263,7 +263,7 @@ name:            global_atomic_fcmpswap_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = GLOBAL_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fcmpswap_x2_rtn_to_s_denorm_mode
@@ -277,7 +277,7 @@ name:            global_atomic_fcmpswap_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmax_rtn_to_s_denorm_mode
@@ -291,7 +291,7 @@ name:            global_atomic_fmax_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmax_x2_rtn_to_s_denorm_mode
@@ -305,7 +305,7 @@ name:            global_atomic_fmax_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmin_rtn_to_s_denorm_mode
@@ -319,7 +319,7 @@ name:            global_atomic_fmin_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmin_x2_rtn_to_s_denorm_mode
@@ -333,7 +333,7 @@ name:            global_atomic_fmin_x2_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fcmpswap_saddr_to_s_denorm_mode
@@ -347,7 +347,7 @@ name:            global_atomic_fcmpswap_saddr_to_s_denorm_mode
 body:            |
   bb.0:
     GLOBAL_ATOMIC_FCMPSWAP_SADDR undef %0:vreg_64, undef %1:vgpr_32, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fcmpswap_x2_saddr_rtn_to_s_denorm_mode
@@ -361,7 +361,7 @@ name:            global_atomic_fcmpswap_x2_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_SADDR_RTN undef %0:vreg_64, undef %1:vreg_64, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmax_saddr_rtn_to_s_denorm_mode
@@ -375,7 +375,7 @@ name:            global_atomic_fmax_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_SADDR_RTN undef %0:vreg_64, undef %1:vgpr_32, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmax_x2_saddr_rtn_to_s_denorm_mode
@@ -389,7 +389,7 @@ name:            global_atomic_fmax_x2_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_SADDR_RTN undef %0:vreg_64, undef %1:vreg_64, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmin_saddr_rtn_to_s_denorm_mode
@@ -403,7 +403,7 @@ name:            global_atomic_fmin_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_SADDR_RTN undef %0:vreg_64, undef %1:vgpr_32, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: global_atomic_fmin_x2_saddr_rtn_to_s_denorm_mode
@@ -417,7 +417,7 @@ name:            global_atomic_fmin_x2_saddr_rtn_to_s_denorm_mode
 body:            |
   bb.0:
     %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_SADDR_RTN undef %0:vreg_64, undef %1:vreg_64, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_fp_atomic_to_s_denorm_mode_waitcnt
@@ -430,7 +430,7 @@ body:            |
   bb.0:
     FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
     S_WAITCNT 0
-    S_DENORM_MODE 0
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
 
 # GCN-LABEL: name: flat_fp_atomic_to_s_denorm_mode_valu
@@ -442,6 +442,6 @@ name:            flat_fp_atomic_to_s_denorm_mode_valu
 body:            |
   bb.0:
     FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
-    %2:vgpr_32 = V_ADD_F32_e32 undef %1:vgpr_32, undef %1:vgpr_32, implicit $exec
-    S_DENORM_MODE 0
+    %2:vgpr_32 = V_ADD_F32_e32 undef %1:vgpr_32, undef %1:vgpr_32, implicit $mode, implicit $exec
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir b/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
index bd6244127e6f9..a8c82e6cf2545 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
@@ -11,9 +11,9 @@ name:            hazard_buffer_store_v_interp
 body:             |
   bb.0.entry:
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr7, $vgpr8, $vgpr9, $vgpr10
-  
+
     BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr7 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+    $vgpr7 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_ENDPGM 0
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
index d0f32f287473c..830e9aa340fd8 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
@@ -32,7 +32,7 @@ body:	|
     S_SENDMSG 3, implicit $exec, implicit $m0
     $m0 = S_MOV_B32 $sgpr8
     BUNDLE implicit-def $vgpr0 {
-      $vgpr0 = V_INTERP_P1_F32 killed $vgpr4, 0, 0, implicit $m0, implicit $exec
+      $vgpr0 = V_INTERP_P1_F32 killed $vgpr4, 0, 0, implicit $mode, implicit $m0, implicit $exec
     }
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
index c88be5fdaba3b..8d02f7a60add2 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
@@ -38,7 +38,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     BUNDLE implicit-def $sgpr0_sgpr1 {
       $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
-      $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+      $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     }
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-kill.mir b/llvm/test/CodeGen/AMDGPU/hazard-kill.mir
index 5f4b55132112f..6602c079986e6 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-kill.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-kill.mir
@@ -19,12 +19,12 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $sgpr2, $sgpr3, $sgpr4
-  
+
     $sgpr6 = S_MOV_B32 killed $sgpr3
     renamable $sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 16, 0, 0
     $m0 = S_MOV_B32 killed renamable $sgpr4
     dead renamable $sgpr0 = KILL undef renamable $sgpr2
-    renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $m0, implicit $exec
+    renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec
     renamable $sgpr0 = S_MOV_B32 0
 
     S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/hazard.mir b/llvm/test/CodeGen/AMDGPU/hazard.mir
index bc62bd9ef087e..1b53aac3646be 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard.mir
@@ -27,7 +27,7 @@ body:             |
 
     $m0 = S_MOV_B32 killed $sgpr7
     $vgpr5 = IMPLICIT_DEF
-    $vgpr0 = V_INTERP_P1_F32 killed $vgpr4, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P1_F32 killed $vgpr4, 0, 0, implicit $mode, implicit $m0, implicit $exec
     SI_RETURN_TO_EPILOG killed $vgpr5, killed $vgpr0
 
 ...
@@ -56,7 +56,7 @@ body:             |
 
     $m0 = S_MOV_B32 killed $sgpr7
     INLINEASM &"; no-op", 1, 327690, def $vgpr5
-    $vgpr0 = V_INTERP_P1_F32 killed $vgpr4, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P1_F32 killed $vgpr4, 0, 0, implicit $mode, implicit $m0, implicit $exec
     SI_RETURN_TO_EPILOG killed $vgpr5, killed $vgpr0
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-callee.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-callee.mir
index b9f6c2f79db0f..0ffed0ae4bfd5 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-callee.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-callee.mir
@@ -19,7 +19,7 @@ liveins:
 name: entry_callee_wait
 body:             |
   bb.0:
-    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec
     S_SETPC_B64 killed $sgpr0_sgpr1
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
index c8778c73aea38..a8c930d27c9be 100644
--- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -79,22 +79,22 @@ name: div_fmas
 body: |
   bb.0:
     $vcc = S_MOV_B64 0
-    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $vcc, implicit $exec
+    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
     implicit $vcc = V_CMP_EQ_I32_e32 $vgpr1, $vgpr2, implicit $exec
-    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $vcc, implicit $exec
+    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
     $vcc = V_CMP_EQ_I32_e64 $vgpr1, $vgpr2, implicit $exec
-    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $vcc, implicit $exec
+    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
-    $vgpr4, $vcc = V_DIV_SCALE_F32 $vgpr1, $vgpr1, $vgpr3, implicit $exec
-    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $vcc, implicit $exec
+    $vgpr4, $vcc = V_DIV_SCALE_F32 $vgpr1, $vgpr1, $vgpr3, implicit $mode, implicit $exec
+    $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -128,24 +128,24 @@ name: s_getreg
 
 body: |
   bb.0:
-    S_SETREG_B32 $sgpr0, 1
-    $sgpr1 = S_GETREG_B32 1
+    S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode
+    $sgpr1 = S_GETREG_B32 1, implicit-def $mode, implicit $mode
     S_BRANCH %bb.1
 
   bb.1:
-    S_SETREG_IMM32_B32 0, 1
-    $sgpr1 = S_GETREG_B32 1
+    S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
+    $sgpr1 = S_GETREG_B32 1, implicit-def $mode, implicit $mode
     S_BRANCH %bb.2
 
   bb.2:
-    S_SETREG_B32 $sgpr0, 1
+    S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode
     $sgpr1 = S_MOV_B32 0
-    $sgpr2 = S_GETREG_B32 1
+    $sgpr2 = S_GETREG_B32 1, implicit-def $mode, implicit $mode
     S_BRANCH %bb.3
 
   bb.3:
-    S_SETREG_B32 $sgpr0, 0
-    $sgpr1 = S_GETREG_B32 1
+    S_SETREG_B32 $sgpr0, 0, implicit-def $mode, implicit $mode
+    $sgpr1 = S_GETREG_B32 1, implicit-def $mode, implicit $mode
     S_ENDPGM 0
 ...
 
@@ -173,18 +173,18 @@ name: s_setreg
 
 body: |
   bb.0:
-    S_SETREG_B32 $sgpr0, 1
-    S_SETREG_B32 $sgpr1, 1
+    S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode
+    S_SETREG_B32 $sgpr1, 1, implicit-def $mode, implicit $mode
     S_BRANCH %bb.1
 
   bb.1:
-    S_SETREG_B32 $sgpr0, 64
-    S_SETREG_B32 $sgpr1, 128
+    S_SETREG_B32 $sgpr0, 64, implicit-def $mode, implicit $mode
+    S_SETREG_B32 $sgpr1, 128, implicit-def $mode, implicit $mode
     S_BRANCH %bb.2
 
   bb.2:
-    S_SETREG_B32 $sgpr0, 1
-    S_SETREG_B32 $sgpr1, 0
+    S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode
+    S_SETREG_B32 $sgpr1, 0, implicit-def $mode, implicit $mode
     S_ENDPGM 0
 ...
 
@@ -342,12 +342,12 @@ name: rfe
 
 body: |
   bb.0:
-    S_SETREG_B32 $sgpr0, 3
+    S_SETREG_B32 $sgpr0, 3, implicit-def $mode, implicit $mode
     S_RFE_B64 $sgpr2_sgpr3
     S_BRANCH %bb.1
 
   bb.1:
-    S_SETREG_B32 $sgpr0, 0
+    S_SETREG_B32 $sgpr0, 0, implicit-def $mode, implicit $mode
     S_RFE_B64 $sgpr2_sgpr3
     S_ENDPGM 0
 
@@ -461,22 +461,22 @@ name: v_interp
 body: |
   bb.0:
     $m0 = S_MOV_B32 0
-    $vgpr0 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
     $m0 = S_MOV_B32 0
-    $vgpr0 = V_INTERP_P2_F32 $vgpr0, $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F32 $vgpr0, $vgpr1, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
     $m0 = S_MOV_B32 0
-    $vgpr0 = V_INTERP_P1_F32_16bank $vgpr0, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P1_F32_16bank $vgpr0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
     $m0 = S_MOV_B32 0
-    $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_ENDPGM 0
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir b/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir
index 473c69e383950..935e91a3a864b 100644
--- a/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir
@@ -4,7 +4,7 @@
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192
 # GCN:  S_MOV_B32 1082130432
-# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $exec
+# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test src1-inlined
@@ -15,7 +15,7 @@ body:             |
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %18:sreg_32 = S_MOV_B32 1082130432
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed %18, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed %18, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
@@ -23,7 +23,7 @@ body:             |
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192
 # GCN:  S_MOV_B32 1082130432
-# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $exec
+# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test src0-inlined
@@ -34,14 +34,14 @@ body:             |
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %18:sreg_32 = S_MOV_B32 1082130432
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %18, 0, killed %0, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %18, 0, killed %0, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192
 # GCN:  S_MOV_B32 1082130432
-# GCN:  %3:vgpr_32 = V_MADAK_F32 killed %0, killed %0, 1092616192, implicit $exec
+# GCN:  %3:vgpr_32 = V_MADAK_F32 killed %0, killed %0, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test none-inlined
@@ -52,14 +52,14 @@ body:             |
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %18:sreg_32 = S_MOV_B32 1082130432
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed %0, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed %0, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192
 # GCN:  V_MOV_B32_e32 1082130432
-# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $exec
+# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test src1-2vgprs-inlined
@@ -70,7 +70,7 @@ body:             |
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 1082130432, implicit $exec
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed %18, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed %18, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
@@ -78,7 +78,7 @@ body:             |
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192
 # GCN:  V_MOV_B32_e32 1082130432
-# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $exec
+# GCN:  %3:vgpr_32 = V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test src0-2vgprs-inlined
@@ -89,14 +89,14 @@ body:             |
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 1082130432, implicit $exec
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %18, 0, killed %0, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %18, 0, killed %0, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192, implicit $exec
 # GCN:  S_MOV_B32 1082130432
-# GCN:  V_MADAK_F32 1082130432, killed $vgpr1, 1092616192, implicit $exec
+# GCN:  V_MADAK_F32 1082130432, killed $vgpr1, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test src0-phys-vgpr
@@ -108,14 +108,14 @@ body:             |
     $vgpr1 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %18:sgpr_32 = S_MOV_B32 1082130432
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed $vgpr1, 0, killed %18, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed $vgpr1, 0, killed %18, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192, implicit $exec
 # GCN:  S_MOV_B32 1082130432
-# GCN:  V_MADAK_F32 1082130432, killed $vgpr0, 1092616192, implicit $exec
+# GCN:  V_MADAK_F32 1082130432, killed $vgpr0, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test src1-phys-vgpr
@@ -127,13 +127,13 @@ body:             |
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     %18:sgpr_32 = S_MOV_B32 1082130432
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %18, 0, killed $vgpr0, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %18, 0, killed $vgpr0, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192, implicit $exec
-# GCN:  V_MAC_F32_e64 0, killed $sgpr2, 0, killed %0, 0, %1, 0, 0, implicit $exec
+# GCN:  V_MAC_F32_e64 0, killed $sgpr2, 0, killed %0, 0, %1, 0, 0, implicit $mode, implicit $exec
 
 ---
 name:            test src0-phys-sgpr
@@ -144,13 +144,13 @@ body:             |
 
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed $sgpr2, 0, killed %0, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed $sgpr2, 0, killed %0, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192, implicit $exec
-# GCN:  V_MAC_F32_e64 0, killed %0, 0, killed $sgpr2, 0, %1, 0, 0, implicit $exec
+# GCN:  V_MAC_F32_e64 0, killed %0, 0, killed $sgpr2, 0, %1, 0, 0, implicit $mode, implicit $exec
 
 ---
 name:            test src1-phys-sgpr
@@ -161,14 +161,14 @@ body:             |
 
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed $sgpr2, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed $sgpr2, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: bb.0:
 # GCN:  V_MOV_B32_e32 1092616192, implicit $exec
 # GCN:  $sgpr2 = S_MOV_B32 1082130432
-# GCN:  V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $exec
+# GCN:  V_MADAK_F32 1082130432, killed %0, 1092616192, implicit $mode, implicit $exec
 
 ---
 name:            test src1-phys-sgpr-move
@@ -180,6 +180,6 @@ body:             |
     %0:vgpr_32 = COPY $vgpr0
     %17:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec
     $sgpr2 = S_MOV_B32 1082130432
-    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed $sgpr2, 0, %17, 0, 0, implicit $exec
+    %19:vgpr_32 = V_MAC_F32_e64 0, killed %0, 0, killed $sgpr2, 0, %17, 0, 0, implicit $mode, implicit $exec
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
index 9f49f9fd58526..59ce256dc0127 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir
@@ -11,7 +11,7 @@ body:             |
   bb.0:
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -34,8 +34,8 @@ body:             |
 name:            mfma_write_agpr_mfma_read_same_agpr
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -47,8 +47,8 @@ body:             |
 name:            mfma_write_agpr_mfma_read_overlap
 body:             |
   bb.0:
-    $agpr1_agpr2_agpr3_agpr4 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr1_agpr2_agpr3_agpr4 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -60,8 +60,8 @@ body:             |
 name:            mfma_write_agpr_mfma_read_partial
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -75,8 +75,8 @@ body:             |
 name:            mfma_write_agpr_mfma_srca_read_overlap
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $agpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $agpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -90,8 +90,8 @@ body:             |
 name:            mfma_write_agpr_mfma_srcb_read_overlap
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $agpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $agpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -105,7 +105,7 @@ body:             |
 name:            mfma_4x4_write_agpr_accvgpr_read
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
 ...
 ---
@@ -126,7 +126,7 @@ body:             |
 name:            mfma_16x16_write_agpr_accvgpr_read
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
 ...
 ---
@@ -155,7 +155,7 @@ body:             |
 name:            mfma_32x32_write_agpr_accvgpr_read
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec
 ...
 ---
@@ -167,7 +167,7 @@ body:             |
 name:            mfma_4x4_write_agpr_accvgpr_write
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
 ...
 ---
@@ -185,7 +185,7 @@ body:             |
 name:            mfma_16x16_write_agpr_accvgpr_write
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
 ...
 ---
@@ -211,7 +211,7 @@ body:             |
 name:            mfma_32x32_write_agpr_accvgpr_write
 body:             |
   bb.0:
-    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
 ...
 ---
@@ -222,7 +222,7 @@ body:             |
 name:            mfma_4x4_read_srcc_accvgpr_write
 body:             |
   bb.0:
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec
 ...
 ---
@@ -238,7 +238,7 @@ body:             |
 name:            mfma_16x16_read_srcc_accvgpr_write
 body:             |
   bb.0:
-    $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_16X16X1F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec
 ...
 ---
@@ -262,7 +262,7 @@ body:             |
 name:            mfma_32x32_read_srcc_accvgpr_write
 body:             |
   bb.0:
-    $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $exec
+    $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X2F32 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec
 ...
 ---
@@ -274,7 +274,7 @@ name:            accvgpr_read_write_vgpr_valu_read
 body:             |
   bb.0:
     $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr4, implicit $exec
-    $vgpr1 = V_ADD_F32_e32 0, killed $vgpr0, implicit $exec
+    $vgpr1 = V_ADD_F32_e32 0, killed $vgpr0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -287,7 +287,7 @@ name:            accvgpr_read_write_vgpr_mfma_read
 body:             |
   bb.0:
     $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr4, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr0, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32 killed $vgpr0, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -312,7 +312,7 @@ name:            accvgpr_write_agpr_mfma_read_srcc
 body:             |
   bb.0:
     $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr2, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $vgpr2, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -326,7 +326,7 @@ name:            accvgpr_write_agpr_mfma_read_srca
 body:             |
   bb.0:
     $agpr8 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -340,7 +340,7 @@ name:            accvgpr_write_agpr_mfma_read_srcb
 body:             |
   bb.0:
     $agpr8 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $agpr8, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $vgpr1, killed $agpr8, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
@@ -369,7 +369,7 @@ name:            vcmpx_write_exec_mfma
 body:             |
   bb.0:
     implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
 ...
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/merge-m0.mir b/llvm/test/CodeGen/AMDGPU/merge-m0.mir
index 9c6ff0b0a628f..0afc5d1cb1a1e 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-m0.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-m0.mir
@@ -291,7 +291,7 @@ body:             |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
-    S_SETREG_IMM32_B32 0, 1
+    S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
     SI_INIT_M0 -1, implicit-def $m0
     DS_WRITE_B32 %0, %1, 0, 0, implicit $m0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/mode-register.mir b/llvm/test/CodeGen/AMDGPU/mode-register.mir
index a6324410b4888..753e6a3ce0a75 100644
--- a/llvm/test/CodeGen/AMDGPU/mode-register.mir
+++ b/llvm/test/CodeGen/AMDGPU/mode-register.mir
@@ -17,12 +17,12 @@ body: |
     liveins: $sgpr0, $sgpr1, $sgpr2
     $m0 = S_MOV_B32 killed $sgpr2
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
-    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
-    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
-    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -41,14 +41,14 @@ body: |
   bb.0:
     liveins: $sgpr0, $sgpr1, $sgpr2
     $m0 = S_MOV_B32 killed $sgpr2
-    S_SETREG_IMM32_B32 3, 2177
+    S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
-    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
-    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
-    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -68,13 +68,13 @@ body: |
     liveins: $sgpr0, $sgpr1, $sgpr2
     $m0 = S_MOV_B32 killed $sgpr2
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
-    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
-    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
-    S_SETREG_IMM32_B32 0, 2177
-    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $mode, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 0, 2177, implicit-def $mode, implicit $mode
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -89,7 +89,7 @@ name: rtn_default
 body: |
   bb.0:
     liveins: $vgpr1_vgpr2
-    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -106,8 +106,8 @@ name: rtn_from_rtz
 body: |
   bb.0:
     liveins: $vgpr1_vgpr2
-    S_SETREG_IMM32_B32 3, 2177
-    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -122,11 +122,11 @@ body: |
   bb.0:
     successors: %bb.1
     liveins: $vgpr1_vgpr2
-    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $mode, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -150,13 +150,13 @@ body: |
     liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
     $m0 = S_MOV_B32 killed $sgpr2
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
-    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
-    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
-    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
-    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $mode, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -179,14 +179,14 @@ body: |
     liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
     $m0 = S_MOV_B32 killed $sgpr2
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
-    S_SETREG_IMM32_B32 2, 2049
-    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
-    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
-    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
-    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
-    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 2, 2049, implicit-def $mode, implicit $mode
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $mode, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $mode, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -212,13 +212,13 @@ body: |
 
   bb.1:
     successors: %bb.2
-    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $mode, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
     successors: %bb.1, %bb.3
-    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_CBRANCH_VCCZ %bb.1, implicit $vcc
     S_BRANCH %bb.3
 
@@ -251,7 +251,7 @@ body: |
 
   bb.2:
     successors: %bb.1, %bb.3
-    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $mode, implicit $exec
     S_CBRANCH_VCCZ %bb.1, implicit $vcc
     S_BRANCH %bb.3
 
@@ -267,7 +267,7 @@ body: |
 
   bb.5:
     successors: %bb.1, %bb.6
-    S_SETREG_IMM32_B32 3, 2177
+    S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
     S_CBRANCH_VCCZ %bb.1, implicit $vcc
     S_BRANCH %bb.6
 
@@ -306,7 +306,7 @@ body: |
   bb.3:
     successors: %bb.1, %bb.4
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_CBRANCH_VCCZ %bb.1, implicit $vcc
     S_BRANCH %bb.4
 
@@ -337,12 +337,12 @@ body: |
 
   bb.2:
     successors: %bb.3
-    S_SETREG_IMM32_B32 3, 2177
+    S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
     S_BRANCH %bb.3
 
   bb.3:
     successors: %bb.4
-    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $mode, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
@@ -373,7 +373,7 @@ body: |
 
   bb.2:
     successors: %bb.3
-    S_SETREG_IMM32_B32 3, 2177
+    S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
     S_BRANCH %bb.3
 
   bb.3:
@@ -383,7 +383,7 @@ body: |
 
   bb.4:
     successors: %bb.5
-    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $mode, implicit $exec
     S_BRANCH %bb.5
 
   bb.5:
@@ -402,8 +402,8 @@ body: |
   bb.0:
     successors: %bb.1
     liveins: $vgpr1_vgpr2
-    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -419,7 +419,7 @@ body: |
     S_BRANCH %bb.4
 
   bb.4:
-    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $mode, implicit $m0, implicit $exec
     S_ENDPGM 0
 ...
 ---
@@ -446,12 +446,12 @@ body: |
 
   bb.2:
     successors: %bb.3
-    S_SETREG_IMM32_B32 3, 2177
+    S_SETREG_IMM32_B32 3, 2177, implicit-def $mode, implicit $mode
     S_BRANCH %bb.3
 
   bb.3:
     successors: %bb.4
-    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $mode, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/movrels-bug.mir b/llvm/test/CodeGen/AMDGPU/movrels-bug.mir
index c5575b2b7387e..6d2b9ab4422aa 100644
--- a/llvm/test/CodeGen/AMDGPU/movrels-bug.mir
+++ b/llvm/test/CodeGen/AMDGPU/movrels-bug.mir
@@ -24,7 +24,7 @@ body:             |
     V_MOVRELD_B32_e32 undef $vgpr2, 0, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit undef $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8(tied-def 4)
     $m0 = S_MOV_B32 undef $sgpr0
     $vgpr1 = V_MOVRELS_B32_e32 undef $vgpr1, implicit $m0, implicit $exec, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
-    $vgpr4 = V_MAC_F32_e32 undef $vgpr0, undef $vgpr0, undef $vgpr4, implicit $exec
+    $vgpr4 = nofpexcept V_MAC_F32_e32 undef $vgpr0, undef $vgpr0, undef $vgpr4, implicit $mode, implicit $exec
     EXP_DONE 15, undef $vgpr0, killed $vgpr1, killed $vgpr4, undef $vgpr0, 0, 0, 12, implicit $exec
     S_ENDPGM 0
 
diff --git a/llvm/test/CodeGen/AMDGPU/omod-nsz-flag.mir b/llvm/test/CodeGen/AMDGPU/omod-nsz-flag.mir
index 01d95ad4c70d1..f8a140d732066 100644
--- a/llvm/test/CodeGen/AMDGPU/omod-nsz-flag.mir
+++ b/llvm/test/CodeGen/AMDGPU/omod-nsz-flag.mir
@@ -4,8 +4,8 @@
 
 # FIXME: Is it OK to fold omod for this?
 # GCN-LABEL: name: omod_inst_flag_nsz_src
-# GCN: %0:vgpr_32 = nsz V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $exec
-# GCN-NEXT: %1:vgpr_32 = V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $exec
+# GCN: %0:vgpr_32 = nsz nofpexcept V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+# GCN-NEXT: %1:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $mode, implicit $exec
 # GCN-NEXT: S_ENDPGM 0, implicit %1
 name: omod_inst_flag_nsz_src
 tracksRegLiveness: true
@@ -18,15 +18,15 @@ body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-  %0:vgpr_32 = nsz V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $exec
-  %1:vgpr_32 = V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $exec
+  %0:vgpr_32 = nsz nofpexcept V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+  %1:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $mode, implicit $exec
   S_ENDPGM 0, implicit %1
 
 ...
 ---
 
 # GCN-LABEL: name: omod_inst_flag_nsz_result
-# GCN: %0:vgpr_32 = V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1, implicit $exec
+# GCN: %0:vgpr_32 = nofpexcept V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1, implicit $mode, implicit $exec
 # GCN-NEXT: S_ENDPGM 0, implicit %0
 
 name: omod_inst_flag_nsz_result
@@ -40,15 +40,15 @@ body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-  %0:vgpr_32 = V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $exec
-  %1:vgpr_32 = nsz V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $exec
+  %0:vgpr_32 = nofpexcept V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+  %1:vgpr_32 = nsz nofpexcept V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $mode, implicit $exec
   S_ENDPGM 0, implicit %1
 ...
 
 ---
 
 # GCN-LABEL: name: omod_inst_flag_nsz_both
-# GCN: %0:vgpr_32 = nsz V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1, implicit $exec
+# GCN: %0:vgpr_32 = nsz nofpexcept V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1, implicit $mode, implicit $exec
 # GCN-NEXT: S_ENDPGM 0, implicit %0
 
 name: omod_inst_flag_nsz_both
@@ -62,7 +62,7 @@ body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1
 
-  %0:vgpr_32 = nsz V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $exec
-  %1:vgpr_32 = nsz V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $exec
+  %0:vgpr_32 = nsz nofpexcept V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+  %1:vgpr_32 = nsz nofpexcept V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0, implicit $mode, implicit $exec
   S_ENDPGM 0, implicit %1
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
index 66bd4c163c669..837389d6aa7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
+++ b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
@@ -15,7 +15,7 @@ body:             |
     $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3
     $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
     S_BARRIER
-    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32 undef $vgpr0, undef $vgpr0, 0, 0, 0, 2, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32 undef $vgpr0, undef $vgpr0, 0, 0, 0, 2, implicit $mode, implicit $exec
     $vgpr0 = V_ACCVGPR_READ_B32 $agpr31, implicit $exec
     BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, implicit $exec
 
diff --git a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
index 9b1bb7f2fb7e5..e4e33026da4b0 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
@@ -188,7 +188,7 @@ body:             |
     %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
     %46 = V_AND_B32_e32 1, killed %45, implicit $exec
     %21 = S_BUFFER_LOAD_DWORD_SGPR undef %22, undef %23, 0, 0 :: (dereferenceable invariant load 4)
-    %25 = V_CMP_GE_F32_e64 0, 0, 0, killed %21, 0, implicit $exec
+    %25 = nofpexcept V_CMP_GE_F32_e64 0, 0, 0, killed %21, 0, implicit $mode, implicit $exec
     %26 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %25, implicit $exec
     %62 = IMPLICIT_DEF
 
@@ -211,13 +211,13 @@ body:             |
     S_BRANCH %bb.31
 
   bb.30:
-    %33 = V_MAD_F32 1, killed %53.sub0, 0, undef %34, 0, 0, 0, 0, implicit $exec
-    %35 = V_MAC_F32_e32 killed %33, undef %36, undef %35, implicit $exec
-    %38 = V_MAX_F32_e32 0, killed %35, implicit $exec
-    %39 = V_LOG_F32_e32 killed %38, implicit $exec
-    %40 = V_MUL_F32_e32 killed %39, undef %41, implicit $exec
-    %42 = V_EXP_F32_e32 killed %40, implicit $exec
-    dead %43 = V_MUL_F32_e32 killed %42, undef %44, implicit $exec
+    %33 = nofpexcept V_MAD_F32 1, killed %53.sub0, 0, undef %34, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %35 = nofpexcept V_MAC_F32_e32 killed %33, undef %36, undef %35, implicit $mode, implicit $exec
+    %38 = nofpexcept V_MAX_F32_e32 0, killed %35, implicit $mode, implicit $exec
+    %39 = nofpexcept V_LOG_F32_e32 killed %38, implicit $mode, implicit $exec
+    %40 = nofpexcept V_MUL_F32_e32 killed %39, undef %41, implicit $mode, implicit $exec
+    %42 = nofpexcept V_EXP_F32_e32 killed %40, implicit $mode, implicit $exec
+    dead %43 = nofpexcept V_MUL_F32_e32 killed %42, undef %44, implicit $mode, implicit $exec
     %63 = COPY killed %51
 
   bb.31:
diff --git a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join.mir b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
index ad56ba08583ef..6d1df163ec824 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
@@ -113,10 +113,10 @@ body:             |
 
   bb.1:
     %30 = V_MOV_B32_e32 1036831949, implicit $exec
-    %31 = V_ADD_F32_e32 %30, %1.sub3, implicit $exec
-    %33 = V_ADD_F32_e32 %30, %1.sub2, implicit $exec
-    %35 = V_ADD_F32_e32 %30, %1.sub1, implicit $exec
-    %37 = V_ADD_F32_e32 killed %30, killed %1.sub0, implicit $exec
+    %31 = nofpexcept V_ADD_F32_e32 %30, %1.sub3, implicit $mode, implicit $exec
+    %33 = nofpexcept V_ADD_F32_e32 %30, %1.sub2, implicit $mode, implicit $exec
+    %35 = nofpexcept V_ADD_F32_e32 %30, %1.sub1, implicit $mode, implicit $exec
+    %37 = nofpexcept V_ADD_F32_e32 killed %30, killed %1.sub0, implicit $mode, implicit $exec
     undef %56.sub0 = COPY killed %37
     %56.sub1 = COPY killed %35
     %56.sub2 = COPY killed %33
@@ -141,10 +141,10 @@ body:             |
     %7 = COPY killed %61
     %6 = COPY killed %60
     %8 = S_ADD_I32 killed %6, 1, implicit-def dead $scc
-    %44 = V_ADD_F32_e32 %43, %7.sub3, implicit $exec
-    %46 = V_ADD_F32_e32 %43, %7.sub2, implicit $exec
-    %48 = V_ADD_F32_e32 %43, %7.sub1, implicit $exec
-    %50 = V_ADD_F32_e32 %43, killed %7.sub0, implicit $exec
+    %44 = nofpexcept V_ADD_F32_e32 %43, %7.sub3, implicit $mode, implicit $exec
+    %46 = nofpexcept V_ADD_F32_e32 %43, %7.sub2, implicit $mode, implicit $exec
+    %48 = nofpexcept V_ADD_F32_e32 %43, %7.sub1, implicit $mode, implicit $exec
+    %50 = nofpexcept V_ADD_F32_e32 %43, killed %7.sub0, implicit $mode, implicit $exec
     undef %57.sub0 = COPY killed %50
     %57.sub1 = COPY killed %48
     %57.sub2 = COPY %46
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-prune.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-prune.mir
index 96bc78cbbd545..5664c7005b5dd 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalesce-prune.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-prune.mir
@@ -23,9 +23,9 @@ body: |
     %6 : vreg_64 = COPY killed %4
 
   bb.2:
-    %2 : vgpr_32 = V_CVT_F32_I32_e32 killed %5.sub1, implicit $exec
+    %2 : vgpr_32 = V_CVT_F32_I32_e32 killed %5.sub1, implicit $mode, implicit $exec
 
   bb.3:
-    %3 : vgpr_32 = V_CVT_F32_I32_e32 killed %6.sub1, implicit $exec
+    %3 : vgpr_32 = V_CVT_F32_I32_e32 killed %6.sub1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalescing-remove-partial-redundancy-assert.mir b/llvm/test/CodeGen/AMDGPU/regcoalescing-remove-partial-redundancy-assert.mir
index 9693f61a45ff0..7f45c4058221c 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalescing-remove-partial-redundancy-assert.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalescing-remove-partial-redundancy-assert.mir
@@ -11,68 +11,68 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     successors: %bb.1, %bb.2
-  
-    %21:vgpr_32 = V_TRUNC_F32_e32 undef %22:vgpr_32, implicit $exec
-    %23:vgpr_32 = V_CVT_U32_F32_e32 killed %21, implicit $exec
+
+    %21:vgpr_32 = nofpexcept V_TRUNC_F32_e32 undef %22:vgpr_32, implicit $mode, implicit $exec
+    %23:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed %21, implicit $mode, implicit $exec
     %108:vgpr_32 = V_LSHRREV_B32_e32 4, killed %23, implicit $exec
     undef %109.sub1:vreg_128 = COPY %108
     %28:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %29:sgpr_128, 3044, 0, 0 :: (dereferenceable invariant load 4)
     S_CMP_EQ_U32 killed %28, 0, implicit-def $scc
     S_CBRANCH_SCC0 %bb.2, implicit killed $scc
-  
+
   bb.1:
     %138:vreg_128 = COPY killed %109
     S_BRANCH %bb.9
-  
+
   bb.2:
     successors: %bb.3, %bb.4
-  
+
     S_CBRANCH_SCC0 %bb.4, implicit undef $scc
-  
+
   bb.3:
     %136:vreg_128 = COPY killed %109
     S_BRANCH %bb.5
-  
+
   bb.4:
     %136:vreg_128 = COPY killed %109
-  
+
   bb.5:
     successors: %bb.6, %bb.8
-  
+
     %110:vreg_128 = COPY killed %136
     dead %32:sreg_32_xm0 = S_MOV_B32 0
     %111:vreg_128 = COPY %110
     %111.sub3:vreg_128 = COPY undef %32
     S_CBRANCH_SCC1 %bb.8, implicit undef $scc
     S_BRANCH %bb.6
-  
+
   bb.6:
     %36:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %37:sgpr_128, 2708, 0, 0 :: (dereferenceable invariant load 4)
-    %39:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, killed %110.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 0, 0, 0, 0, implicit $exec
-    %40:vgpr_32 = V_MAD_F32 0, %111.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 0, 0, 0, 0, implicit $exec
-    %41:vgpr_32 = V_MUL_F32_e64 0, 0, 0, killed %40, 1, 0, implicit $exec
-    %43:vgpr_32 = V_MUL_F32_e32 0, %39, implicit $exec
+    %39:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32 0, killed %110.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vgpr_32 = nofpexcept V_MAD_F32 0, %111.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vgpr_32 = nofpexcept V_MUL_F32_e64 0, 0, 0, killed %40, 1, 0, implicit $mode, implicit $exec
+    %43:vgpr_32 = nofpexcept V_MUL_F32_e32 0, %39, implicit $mode, implicit $exec
     %44:vgpr_32 = COPY killed %43
-    %44:vgpr_32 = V_MAC_F32_e32 0, killed %41, %44, implicit $exec
+    %44:vgpr_32 = nofpexcept V_MAC_F32_e32 0, killed %41, %44, implicit $mode, implicit $exec
     %47:vgpr_32 = V_MOV_B32_e32 2143289344, implicit $exec
     %46:vgpr_32 = COPY killed %47
-    %46:vgpr_32 = V_MAC_F32_e32 0, killed %39, %46, implicit $exec
+    %46:vgpr_32 = nofpexcept V_MAC_F32_e32 0, killed %39, %46, implicit $mode, implicit $exec
     undef %115.sub0:vreg_128 = COPY %46
     %115.sub1:vreg_128 = COPY killed %46
     %115.sub2:vreg_128 = COPY killed %44
     %50:sreg_64_xexec = V_CMP_NE_U32_e64 0, killed %36, implicit $exec
     dead %118:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %137:vreg_128 = IMPLICIT_DEF
-  
+
   bb.7:
     successors: %bb.7, %bb.8
-  
+
     %119:vreg_128 = COPY killed %137
     %121:vreg_128 = COPY killed %119
     %121.sub3:vreg_128 = COPY undef %32
-    %56:vgpr_32 = V_ADD_F32_e32 %115.sub2, %121.sub2, implicit $exec
-    %59:vgpr_32 = V_ADD_F32_e32 %115.sub1, %121.sub1, implicit $exec
-    %62:vgpr_32 = V_ADD_F32_e32 %115.sub0, killed %121.sub0, implicit $exec
+    %56:vgpr_32 = nofpexcept V_ADD_F32_e32 %115.sub2, %121.sub2, implicit $mode, implicit $exec
+    %59:vgpr_32 = nofpexcept V_ADD_F32_e32 %115.sub1, %121.sub1, implicit $mode, implicit $exec
+    %62:vgpr_32 = nofpexcept V_ADD_F32_e32 %115.sub0, killed %121.sub0, implicit $mode, implicit $exec
     undef %117.sub0:vreg_128 = COPY killed %62
     %117.sub1:vreg_128 = COPY killed %59
     %117.sub2:vreg_128 = COPY killed %56
@@ -81,118 +81,118 @@ body:             |
     %137:vreg_128 = COPY killed %117
     S_CBRANCH_VCCNZ %bb.7, implicit killed $vcc
     S_BRANCH %bb.8
-  
+
   bb.8:
     dead %66:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %67:sgpr_128, 2704, 0, 0 :: (dereferenceable invariant load 4)
     %138:vreg_128 = COPY killed %111
-  
+
   bb.9:
     %113:vreg_128 = COPY killed %138
     S_CBRANCH_SCC1 %bb.18, implicit undef $scc
     S_BRANCH %bb.10
-  
+
   bb.10:
     S_CBRANCH_SCC1 %bb.12, implicit undef $scc
     S_BRANCH %bb.11
-  
+
   bb.11:
-  
+
   bb.12:
     successors: %bb.13, %bb.18
-  
+
     S_CBRANCH_SCC1 %bb.18, implicit undef $scc
     S_BRANCH %bb.13
-  
+
   bb.13:
     successors: %bb.14, %bb.17
-  
+
     S_CBRANCH_SCC1 %bb.17, implicit undef $scc
     S_BRANCH %bb.14
-  
+
   bb.14:
     S_CBRANCH_SCC1 %bb.16, implicit undef $scc
     S_BRANCH %bb.15
-  
+
   bb.15:
-  
+
   bb.16:
-  
+
   bb.17:
-  
+
   bb.18:
     S_CBRANCH_SCC1 %bb.26, implicit undef $scc
     S_BRANCH %bb.19
-  
+
   bb.19:
     S_CBRANCH_SCC1 %bb.26, implicit undef $scc
     S_BRANCH %bb.20
-  
+
   bb.20:
     S_CBRANCH_SCC1 %bb.25, implicit undef $scc
     S_BRANCH %bb.21
-  
+
   bb.21:
     successors: %bb.22, %bb.24
-  
+
     S_CBRANCH_SCC1 %bb.24, implicit undef $scc
     S_BRANCH %bb.22
-  
+
   bb.22:
     successors: %bb.23, %bb.24
-  
+
     S_CBRANCH_SCC1 %bb.24, implicit undef $scc
     S_BRANCH %bb.23
-  
+
   bb.23:
-  
+
   bb.24:
-  
+
   bb.25:
-  
+
   bb.26:
     S_CBRANCH_SCC1 %bb.33, implicit undef $scc
     S_BRANCH %bb.27
-  
+
   bb.27:
     S_CBRANCH_SCC1 %bb.33, implicit undef $scc
     S_BRANCH %bb.28
-  
+
   bb.28:
     dead %77:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %78:vgpr_32 = nnan arcp contract reassoc V_MAD_F32 0, killed %113.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 0, 1065353216, 0, 0, implicit $exec
+    %78:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32 0, killed %113.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 0, 1065353216, 0, 0, implicit $mode, implicit $exec
     dead %80:sreg_32_xm0 = S_MOV_B32 0
-    dead %82:vgpr_32 = V_MUL_F32_e32 killed %78, %78, implicit $exec
+    dead %82:vgpr_32 = nofpexcept V_MUL_F32_e32 killed %78, %78, implicit $mode, implicit $exec
     dead %126:vgpr_32 = V_MOV_B32_e32 2143289344, implicit $exec
     dead %125:vreg_128 = IMPLICIT_DEF
     dead %91:sreg_32_xm0 = S_MOV_B32 2143289344
     %96:sreg_64 = S_AND_B64 $exec, 0, implicit-def dead $scc
     %139:vreg_128 = IMPLICIT_DEF
-  
+
   bb.29:
     successors: %bb.30, %bb.31
-  
+
     dead %127:vreg_128 = COPY killed %139
     S_CBRANCH_SCC0 %bb.31, implicit undef $scc
-  
+
   bb.30:
     S_BRANCH %bb.32
-  
+
   bb.31:
     successors: %bb.32, %bb.34
-  
+
     $vcc = COPY %96
     S_CBRANCH_VCCNZ %bb.34, implicit killed $vcc
     S_BRANCH %bb.32
-  
+
   bb.32:
     dead %130:vreg_128 = IMPLICIT_DEF
     dead %128:vreg_128 = COPY undef %130
     %139:vreg_128 = IMPLICIT_DEF
     S_BRANCH %bb.29
-  
+
   bb.33:
     S_ENDPGM 0
-  
+
   bb.34:
     S_ENDPGM 0
 
diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
index d7892a0c97592..d03f60cc68355 100644
--- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
@@ -2,7 +2,7 @@
 ---
 
 # GCN-LABEL: name: mac_invalid_operands
-# GCN: undef %18.sub0:vreg_128 = V_MAC_F32_e32 undef %3:vgpr_32, undef %9:vgpr_32, undef %18.sub0, implicit $exec
+# GCN: undef %18.sub0:vreg_128 = nofpexcept V_MAC_F32_e32 undef %3:vgpr_32, undef %9:vgpr_32, undef %18.sub0, implicit $mode, implicit $exec
 
 name:            mac_invalid_operands
 alignment:       1
@@ -38,14 +38,14 @@ body:             |
   bb.0:
     successors: %bb.2, %bb.1
 
-    %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, implicit $exec
+    %7 = nofpexcept V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, implicit $mode, implicit $exec
     $vcc = COPY killed %7
     S_CBRANCH_VCCZ %bb.2, implicit killed $vcc
 
   bb.1:
     successors: %bb.3
 
-    %4 = V_ADD_F32_e32 undef %6, undef %5, implicit $exec
+    %4 = nofpexcept V_ADD_F32_e32 undef %6, undef %5, implicit $mode, implicit $exec
     undef %12.sub0 = COPY killed %4
     %17 = COPY killed %12
     S_BRANCH %bb.3
@@ -53,7 +53,7 @@ body:             |
   bb.2:
     successors: %bb.3
 
-    %8 = V_MAC_F32_e32 undef %3, undef %9, undef %8, implicit $exec
+    %8 = nofpexcept V_MAC_F32_e32 undef %3, undef %9, undef %8, implicit $mode, implicit $exec
     undef %13.sub0 = COPY %8
     %13.sub1 = COPY %8
     %13.sub2 = COPY killed %8
@@ -77,13 +77,13 @@ body:             |
 
 # GCN-LABEL: name: vreg_does_not_dominate
 
-# GCN: undef %8.sub1:vreg_128 = V_MAC_F32_e32 undef %2:vgpr_32, undef %1:vgpr_32, undef %8.sub1, implicit $exec
+# GCN: undef %8.sub1:vreg_128 = nofpexcept V_MAC_F32_e32 undef %2:vgpr_32, undef %1:vgpr_32, undef %8.sub1, implicit $mode, implicit $exec
 # GCN: undef %7.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
 # GCN: undef %9.sub2:vreg_128 = COPY %7.sub0
 
-# GCN: undef %6.sub3:vreg_128 = V_ADD_F32_e32 undef %3:vgpr_32, undef %3:vgpr_32, implicit $exec
-# GCN: undef %7.sub0:vreg_128 = V_ADD_F32_e64 0, 0, 0, 0, 0, 0, implicit $exec
-# GCN: %8.sub1:vreg_128 = V_ADD_F32_e32 %8.sub1, %8.sub1, implicit $exec
+# GCN: undef %6.sub3:vreg_128 = nofpexcept V_ADD_F32_e32 undef %3:vgpr_32, undef %3:vgpr_32, implicit $mode, implicit $exec
+# GCN: undef %7.sub0:vreg_128 = nofpexcept V_ADD_F32_e64 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+# GCN: %8.sub1:vreg_128 = nofpexcept V_ADD_F32_e32 %8.sub1, %8.sub1, implicit $mode, implicit $exec
 
 # GCN: BUFFER_STORE_DWORD_OFFEN %6.sub3, %0,
 # GCN: BUFFER_STORE_DWORD_OFFEN %9.sub2, %0,
@@ -117,7 +117,7 @@ body:             |
 
     %5 = COPY $sgpr30_sgpr31
     %0 = COPY $vgpr0
-    undef %6.sub1 = V_MAC_F32_e32 undef %2, undef %1, undef %6.sub1, implicit $exec
+    undef %6.sub1 = nofpexcept V_MAC_F32_e32 undef %2, undef %1, undef %6.sub1, implicit $mode, implicit $exec
     %6.sub0 = V_MOV_B32_e32 0, implicit $exec
     %6.sub2 = COPY %6.sub0
     S_CBRANCH_VCCNZ %bb.2, implicit undef $vcc
@@ -126,9 +126,9 @@ body:             |
   bb.1:
     successors: %bb.2
 
-    %6.sub3 = V_ADD_F32_e32 undef %3, undef %3, implicit $exec
-    %6.sub0 = V_ADD_F32_e64 0, 0, 0, 0, 0, 0, implicit $exec
-    %6.sub1 = V_ADD_F32_e32 %6.sub1, %6.sub1, implicit $exec
+    %6.sub3 = nofpexcept V_ADD_F32_e32 undef %3, undef %3, implicit $mode, implicit $exec
+    %6.sub0 = nofpexcept V_ADD_F32_e64 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %6.sub1 = nofpexcept V_ADD_F32_e32 %6.sub1, %6.sub1, implicit $mode, implicit $exec
     %6.sub2 = COPY %6.sub0
 
   bb.2:
@@ -143,7 +143,7 @@ body:             |
 
 # GCN-LABEL: name: inf_loop_tied_operand
 # GCN: bb.0:
-# GCN-NEXT: undef %2.sub0:vreg_128 = V_MAC_F32_e32 1073741824, undef %0:vgpr_32, undef %2.sub0, implicit $exec
+# GCN-NEXT: undef %2.sub0:vreg_128 = nofpexcept V_MAC_F32_e32 1073741824, undef %0:vgpr_32, undef %2.sub0, implicit $mode, implicit $exec
 # GCN-NEXT: dead undef %3.sub1:vreg_128 = COPY %2.sub0
 
 name:            inf_loop_tied_operand
@@ -154,7 +154,7 @@ registers:
   - { id: 2, class: vreg_128, preferred-register: '' }
 body:             |
   bb.0:
-    %1 = V_MAC_F32_e32 1073741824, undef %0, undef %1, implicit $exec
+    %1 = nofpexcept V_MAC_F32_e32 1073741824, undef %0, undef %1, implicit $mode, implicit $exec
     undef %2.sub0 = COPY %1
     %2.sub1 = COPY %1
 
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
index 88cb57ca0cdc7..fd435d4adbe64 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
@@ -33,13 +33,13 @@ body:             |
   ; CHECK:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
-  ; CHECK:   undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec
-  ; CHECK:   dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec
+  ; CHECK:   undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
+  ; CHECK:   dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
   ; CHECK:   [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
-  ; CHECK:   undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
+  ; CHECK:   undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec
   ; CHECK:   [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK:   %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec
+  ; CHECK:   %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
   ; CHECK:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
   ; CHECK:   GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec
@@ -55,11 +55,11 @@ body:             |
   ; CHECK:   GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.2(0x80000000)
-  ; CHECK:   S_SETREG_IMM32_B32 0, 1
+  ; CHECK:   S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
   ; CHECK:   DBG_VALUE
   ; CHECK:   DBG_VALUE
   ; CHECK:   DBG_VALUE
-  ; CHECK:   S_SETREG_IMM32_B32 0, 1
+  ; CHECK:   S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
   ; CHECK: bb.2:
   ; CHECK:   S_NOP 0, implicit [[COPY]]
   ; CHECK:   S_NOP 0, implicit [[DEF8]]
@@ -74,8 +74,8 @@ body:             |
     undef %4.sub1:vreg_64 = V_ADD_U32_e32 %0, %0, implicit $exec
     %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
     %5:vreg_64 = COPY %2
-    undef %6.sub0:vreg_64 = V_ADD_F32_e32 %1.sub0, %5.sub0, implicit $exec
-    %6.sub1:vreg_64 = V_ADD_F32_e32 %1.sub1, %5.sub0, implicit $exec
+    undef %6.sub0:vreg_64 = V_ADD_F32_e32 %1.sub0, %5.sub0, implicit $mode, implicit $exec
+    %6.sub1:vreg_64 = V_ADD_F32_e32 %1.sub1, %5.sub0, implicit $mode, implicit $exec
     %7:vgpr_32 = GLOBAL_LOAD_DWORD %5, 0, 0, 0, 0, implicit $exec
     %8:vreg_64 = IMPLICIT_DEF
     %9:vreg_64 = IMPLICIT_DEF
@@ -88,8 +88,8 @@ body:             |
     %16:vgpr_32 = IMPLICIT_DEF
     %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    undef %19.sub0:vreg_64 = V_ADD_F32_e32 %7, %2.sub0, implicit $exec
-    %19.sub1:vreg_64 = V_ADD_F32_e32 %3, %3, implicit $exec
+    undef %19.sub0:vreg_64 = V_ADD_F32_e32 %7, %2.sub0, implicit $mode, implicit $exec
+    %19.sub1:vreg_64 = V_ADD_F32_e32 %3, %3, implicit $mode, implicit $exec
     GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec
     %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD %9, 0, 0, 0, 0, implicit $exec
     %8.sub0:vreg_64 = GLOBAL_LOAD_DWORD %10, 0, 0, 0, 0, implicit $exec
@@ -101,11 +101,11 @@ body:             |
     GLOBAL_STORE_DWORD %15, %18, 0, 0, 0, 0, implicit $exec
 
   bb.1:
-    S_SETREG_IMM32_B32 0, 1
+    S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
     DBG_VALUE
     DBG_VALUE
     DBG_VALUE
-    S_SETREG_IMM32_B32 0, 1
+    S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode
 
   bb.2:
     S_NOP 0, implicit %0
diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
index 9d3144196eb17..aac40b73a41e0 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@@ -275,7 +275,7 @@ body:             |
     %76:vreg_128 = GLOBAL_LOAD_DWORDX4 %72, 0, 0, 0, 0, implicit $exec
     %77:vgpr_32 = IMPLICIT_DEF
     %78:vgpr_32 = IMPLICIT_DEF
-    %79:vgpr_32 = V_MUL_F32_e32 0, %77, implicit $exec
+    %79:vgpr_32 = nofpexcept V_MUL_F32_e32 0, %77, implicit $mode, implicit $exec
     %80:vgpr_32 = IMPLICIT_DEF
     %81:vgpr_32 = IMPLICIT_DEF
     %84:vgpr_32 = IMPLICIT_DEF
@@ -288,9 +288,9 @@ body:             |
     %87:vgpr_32 = IMPLICIT_DEF
     %88:vgpr_32 = IMPLICIT_DEF
     %90:vgpr_32 = IMPLICIT_DEF
-    %91:vgpr_32, dead %92:sreg_64 = V_DIV_SCALE_F32 %90, %90, 1065353216, implicit $exec
-    %95:vgpr_32 = V_FMA_F32 0, 0, 0, 0, 0, undef %93:vgpr_32, 0, 0, implicit $exec
-    %96:vgpr_32, %97:sreg_64 = V_DIV_SCALE_F32 1065353216, %90, 1065353216, implicit $exec
+    %91:vgpr_32, dead %92:sreg_64 = nofpexcept V_DIV_SCALE_F32 %90, %90, 1065353216, implicit $mode, implicit $exec
+    %95:vgpr_32 = nofpexcept V_FMA_F32 0, 0, 0, 0, 0, undef %93:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %96:vgpr_32, %97:sreg_64 = nofpexcept V_DIV_SCALE_F32 1065353216, %90, 1065353216, implicit $mode, implicit $exec
     %98:vgpr_32 = IMPLICIT_DEF
     %99:vgpr_32 = IMPLICIT_DEF
     %100:vgpr_32 = IMPLICIT_DEF
@@ -299,18 +299,18 @@ body:             |
     %103:vgpr_32 = IMPLICIT_DEF
     %104:vgpr_32 = IMPLICIT_DEF
     %105:vgpr_32 = IMPLICIT_DEF
-    %106:vgpr_32, dead %107:sreg_64 = V_DIV_SCALE_F32 %90, %90, %105, implicit $exec
-    %108:vgpr_32 = V_RCP_F32_e32 0, implicit $exec
+    %106:vgpr_32, dead %107:sreg_64 = nofpexcept V_DIV_SCALE_F32 %90, %90, %105, implicit $mode, implicit $exec
+    %108:vgpr_32 = nofpexcept V_RCP_F32_e32 0, implicit $mode, implicit $exec
     %109:vgpr_32 = IMPLICIT_DEF
-    %110:vgpr_32 = V_FMA_F32 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %111:vgpr_32, %112:sreg_64 = V_DIV_SCALE_F32 0, 0, 0, implicit $exec
-    %113:vgpr_32 = V_MUL_F32_e32 0, %110, implicit $exec
+    %110:vgpr_32 = nofpexcept V_FMA_F32 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %111:vgpr_32, %112:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, 0, 0, implicit $mode, implicit $exec
+    %113:vgpr_32 = nofpexcept V_MUL_F32_e32 0, %110, implicit $mode, implicit $exec
     %114:vgpr_32 = IMPLICIT_DEF
     %115:vgpr_32 = IMPLICIT_DEF
     %116:vgpr_32 = IMPLICIT_DEF
     $vcc = IMPLICIT_DEF
-    %117:vgpr_32 = V_DIV_FMAS_F32 0, %116, 0, %110, 0, %115, 0, 0, implicit killed $vcc, implicit $exec
-    %118:vgpr_32 = V_DIV_FIXUP_F32 0, %117, 0, %90, 0, %105, 0, 0, implicit $exec
+    %117:vgpr_32 = nofpexcept V_DIV_FMAS_F32 0, %116, 0, %110, 0, %115, 0, 0, implicit killed $vcc, implicit $mode, implicit $exec
+    %118:vgpr_32 = nofpexcept V_DIV_FIXUP_F32 0, %117, 0, %90, 0, %105, 0, 0, implicit $mode, implicit $exec
     %119:vgpr_32 = IMPLICIT_DEF
     %120:vgpr_32 = IMPLICIT_DEF
     %121:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
index b4ac7cf8732c4..192bce362c4f9 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
@@ -50,12 +50,12 @@ body:             |
 # GCN-LABEL: {{^}}name: trunc_shr_f32
 
 # CI: [[SHIFT:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
-# CI: %{{[0-9]+}}:vgpr_32 = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def $vcc, implicit $exec
+# CI: %{{[0-9]+}}:vgpr_32 = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit $mode, implicit $exec, implicit-def $vcc
 
 # VI: [[SHIFT:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def $vcc, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit $mode, implicit $exec, implicit-def $vcc
 
-#GFX9: %{{[0-9]+}}:vgpr_32 = V_TRUNC_F32_sdwa 0, %{{[0-9]+}}, 1, 2, 6, 0, 5, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_TRUNC_F32_sdwa 0, %{{[0-9]+}}, 1, 2, 6, 0, 5, implicit $mode, implicit $exec
 
 ---
 name:            trunc_shr_f32
@@ -82,7 +82,7 @@ body:             |
     %0 = COPY $vgpr0_vgpr1
     %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
     %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
-    %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit-def $vcc, implicit $exec
+    %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit $mode, implicit $exec, implicit-def $vcc
     FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
index bd518924bb588..688e039b16640 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
@@ -3,21 +3,21 @@
 # GCN-LABEL: {{^}}name: vop1_instructions
 
 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
 
 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 6, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
 
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $mode, implicit $exec
 
 ---
 name:            vop1_instructions
@@ -88,43 +88,43 @@ body:             |
     %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
     %11 = V_MOV_B32_e32 %10, implicit $exec
     %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
-    %14 = V_FRACT_F32_e32 123, implicit $exec
+    %14 = V_FRACT_F32_e32 123, implicit $mode, implicit $exec
     %15 = V_LSHLREV_B32_e64 16, %14, implicit $exec
     %16 = V_LSHRREV_B32_e64 16, %15, implicit $exec
-    %17 = V_SIN_F32_e32 %16, implicit $exec
+    %17 = V_SIN_F32_e32 %16, implicit $mode, implicit $exec
     %18 = V_LSHLREV_B32_e64 16, %17, implicit $exec
     %19 = V_LSHRREV_B32_e64 16, %18, implicit $exec
-    %20 = V_CVT_U32_F32_e32 %19, implicit $exec
+    %20 = V_CVT_U32_F32_e32 %19, implicit $mode, implicit $exec
     %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
-    %23 = V_CVT_F32_I32_e32 123, implicit $exec
+    %23 = V_CVT_F32_I32_e32 123, implicit $mode, implicit $exec
     %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
 
     %25 = V_LSHRREV_B32_e64 16, %3, implicit $exec
     %26 = V_MOV_B32_e64 %25, implicit $exec
     %26 = V_LSHLREV_B32_e64 16, %26, implicit $exec
-    %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $exec
+    %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec
     %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
     %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
-    %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $exec
+    %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $mode, implicit $exec
     %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
     %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
-    %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $exec
+    %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $mode, implicit $exec
     %34 = V_LSHLREV_B32_e64 16, %33, implicit $exec
-    %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $exec
+    %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $mode, implicit $exec
     %36 = V_LSHLREV_B32_e64 16, %35, implicit $exec
 
 
     %37 = V_LSHRREV_B32_e64 16, %36, implicit $exec
-    %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $exec
+    %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $mode, implicit $exec
     %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
     %40 = V_LSHRREV_B32_e64 16, %39, implicit $exec
-    %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $exec
+    %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $mode, implicit $exec
     %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
     %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
-    %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $exec
+    %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $mode, implicit $exec
     %45 = V_LSHLREV_B32_e64 16, %44, implicit $exec
     %46 = V_LSHRREV_B32_e64 16, %45, implicit $exec
-    %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $exec
+    %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $mode, implicit $exec
     %48 = V_LSHLREV_B32_e64 16, %47, implicit $exec
 
 
@@ -139,21 +139,21 @@ body:             |
 # GCN-LABEL: {{^}}name: vop2_instructions
 
 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
 
 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $mode, implicit $exec
 
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $mode, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $mode, implicit $exec
 
 name:            vop2_instructions
 tracksRegLiveness: true
@@ -237,18 +237,18 @@ body:             |
     %13 = V_LSHLREV_B32_e64 16, %12, implicit $exec
     %14 = V_LSHRREV_B32_e64 16, %13, implicit $exec
     %15 = V_BFE_U32 %13, 8, 8, implicit $exec
-    %16 = V_ADD_F32_e32 %14, %15, implicit $exec
+    %16 = V_ADD_F32_e32 %14, %15, implicit $mode, implicit $exec
     %17 = V_LSHLREV_B32_e64 16, %16, implicit $exec
     %18 = V_LSHRREV_B32_e64 16, %17, implicit $exec
     %19 = V_BFE_U32 %17, 8, 8, implicit $exec
-    %20 = V_SUB_F16_e32 %18, %19, implicit $exec
+    %20 = V_SUB_F16_e32 %18, %19, implicit $mode, implicit $exec
     %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
     %22 = V_BFE_U32 %20, 8, 8, implicit $exec
-    %23 = V_FMAC_F32_e32 %21, %22, %22, implicit $exec
+    %23 = V_FMAC_F32_e32 %21, %22, %22, implicit $mode, implicit $exec
     %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
     %25 = V_LSHRREV_B32_e64 16, %24, implicit $exec
     %26 = V_BFE_U32 %24, 8, 8, implicit $exec
-    %27 = V_FMAC_F16_e32 %25, %26, %26, implicit $exec
+    %27 = V_FMAC_F16_e32 %25, %26, %26, implicit $mode, implicit $exec
     %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
 
     %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
@@ -256,32 +256,32 @@ body:             |
     %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
     %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
     %33 = V_BFE_U32 %31, 8, 8, implicit $exec
-    %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $exec
+    %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $mode, implicit $exec
     %35 = V_LSHLREV_B32_e64 16, %34, implicit $exec
     %37 = V_BFE_U32 %35, 8, 8, implicit $exec
-    %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $exec
+    %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $mode, implicit $exec
     %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
     %40 = V_BFE_U32 %39, 8, 8, implicit $exec
-    %41 = V_FMAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $exec
+    %41 = V_FMAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $mode, implicit $exec
     %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
     %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
     %44 = V_BFE_U32 %42, 8, 8, implicit $exec
-    %45 = V_FMAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $exec
+    %45 = V_FMAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $mode, implicit $exec
     %46 = V_LSHLREV_B32_e64 16, %45, implicit $exec
 
     %47 = V_LSHRREV_B32_e64 16, %46, implicit $exec
     %48 = V_BFE_U32 %46, 8, 8, implicit $exec
-    %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $exec
+    %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $mode, implicit $exec
     %50 = V_LSHLREV_B32_e64 16, %49, implicit $exec
     %51 = V_BFE_U32 %50, 8, 8, implicit $exec
-    %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $exec
+    %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $mode, implicit $exec
     %53 = V_LSHLREV_B32_e64 16, %52, implicit $exec
     %54 = V_BFE_U32 %53, 8, 8, implicit $exec
-    %55 = V_FMAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $exec
+    %55 = V_FMAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $mode, implicit $exec
     %56 = V_LSHLREV_B32_e64 16, %55, implicit $exec
     %57 = V_LSHRREV_B32_e64 16, %56, implicit $exec
     %58 = V_BFE_U32 %56, 8, 8, implicit $exec
-    %59 = V_FMAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $exec
+    %59 = V_FMAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $mode, implicit $exec
     %60 = V_LSHLREV_B32_e64 16, %59, implicit $exec
 
     %100 = V_MOV_B32_e32 %60, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
index 8ba20b4a66ddb..fa55e1be8a3f7 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -4,28 +4,28 @@
 # GFX89-LABEL: {{^}}name: vop1_instructions
 
 # GFX89: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
 
 
 # GFX89: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 6, 0, 5, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX89: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $mode, implicit $exec
 
 
-# VI: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_e64 %{{[0-9]+}}, 0, 1, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_e64 %{{[0-9]+}}, 0, 1, implicit $mode, implicit $exec
 
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $mode, implicit $exec
 
 
 ---
@@ -97,43 +97,43 @@ body:             |
     %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
     %11 = V_MOV_B32_e32 %10, implicit $exec
     %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
-    %14 = V_FRACT_F32_e32 123, implicit $exec
+    %14 = V_FRACT_F32_e32 123, implicit $mode, implicit $exec
     %15 = V_LSHLREV_B32_e64 16, %14, implicit $exec
     %16 = V_LSHRREV_B32_e64 16, %15, implicit $exec
-    %17 = V_SIN_F32_e32 %16, implicit $exec
+    %17 = V_SIN_F32_e32 %16, implicit $mode, implicit $exec
     %18 = V_LSHLREV_B32_e64 16, %17, implicit $exec
     %19 = V_LSHRREV_B32_e64 16, %18, implicit $exec
-    %20 = V_CVT_U32_F32_e32 %19, implicit $exec
+    %20 = V_CVT_U32_F32_e32 %19, implicit $mode, implicit $exec
     %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
-    %23 = V_CVT_F32_I32_e32 123, implicit $exec
+    %23 = V_CVT_F32_I32_e32 123, implicit $mode, implicit $exec
     %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
 
     %25 = V_LSHRREV_B32_e64 16, %3, implicit $exec
     %26 = V_MOV_B32_e64 %25, implicit $exec
     %26 = V_LSHLREV_B32_e64 16, %26, implicit $exec
-    %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $exec
+    %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec
     %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
     %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
-    %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $exec
+    %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $mode, implicit $exec
     %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
     %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
-    %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $exec
+    %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $mode, implicit $exec
     %34 = V_LSHLREV_B32_e64 16, %33, implicit $exec
-    %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $exec
+    %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $mode, implicit $exec
     %36 = V_LSHLREV_B32_e64 16, %35, implicit $exec
 
 
     %37 = V_LSHRREV_B32_e64 16, %36, implicit $exec
-    %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $exec
+    %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $mode, implicit $exec
     %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
     %40 = V_LSHRREV_B32_e64 16, %39, implicit $exec
-    %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $exec
+    %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $mode, implicit $exec
     %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
     %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
-    %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $exec
+    %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $mode, implicit $exec
     %45 = V_LSHLREV_B32_e64 16, %44, implicit $exec
     %46 = V_LSHRREV_B32_e64 16, %45, implicit $exec
-    %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $exec
+    %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $mode, implicit $exec
     %48 = V_LSHLREV_B32_e64 16, %47, implicit $exec
 
 
@@ -149,40 +149,40 @@ body:             |
 
 
 # VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
 
 # GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
 
 
 # VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
 
 # GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $mode, implicit $exec
 
 
-# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, %{{[0-9]+}}, 1, 0, 6, 0, 6, 1, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, %{{[0-9]+}}, 1, 0, 6, 0, 6, 1, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $mode, implicit $exec
 
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $mode, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $mode, implicit $exec
 
 name:            vop2_instructions
 tracksRegLiveness: true
@@ -266,18 +266,18 @@ body:             |
     %13 = V_LSHLREV_B32_e64 16, %12, implicit $exec
     %14 = V_LSHRREV_B32_e64 16, %13, implicit $exec
     %15 = V_BFE_U32 %13, 8, 8, implicit $exec
-    %16 = V_ADD_F32_e32 %14, %15, implicit $exec
+    %16 = V_ADD_F32_e32 %14, %15, implicit $mode, implicit $exec
     %17 = V_LSHLREV_B32_e64 16, %16, implicit $exec
     %18 = V_LSHRREV_B32_e64 16, %17, implicit $exec
     %19 = V_BFE_U32 %17, 8, 8, implicit $exec
-    %20 = V_SUB_F16_e32 %18, %19, implicit $exec
+    %20 = V_SUB_F16_e32 %18, %19, implicit $mode, implicit $exec
     %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
     %22 = V_BFE_U32 %20, 8, 8, implicit $exec
-    %23 = V_MAC_F32_e32 %21, %22, %22, implicit $exec
+    %23 = V_MAC_F32_e32 %21, %22, %22, implicit $mode, implicit $exec
     %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
     %25 = V_LSHRREV_B32_e64 16, %24, implicit $exec
     %26 = V_BFE_U32 %24, 8, 8, implicit $exec
-    %27 = V_MAC_F16_e32 %25, %26, %26, implicit $exec
+    %27 = V_MAC_F16_e32 %25, %26, %26, implicit $mode, implicit $exec
     %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
 
     %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
@@ -285,32 +285,32 @@ body:             |
     %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
     %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
     %33 = V_BFE_U32 %31, 8, 8, implicit $exec
-    %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $exec
+    %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $mode, implicit $exec
     %35 = V_LSHLREV_B32_e64 16, %34, implicit $exec
     %37 = V_BFE_U32 %35, 8, 8, implicit $exec
-    %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $exec
+    %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $mode, implicit $exec
     %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
     %40 = V_BFE_U32 %39, 8, 8, implicit $exec
-    %41 = V_MAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $exec
+    %41 = V_MAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $mode, implicit $exec
     %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
     %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
     %44 = V_BFE_U32 %42, 8, 8, implicit $exec
-    %45 = V_MAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $exec
+    %45 = V_MAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $mode, implicit $exec
     %46 = V_LSHLREV_B32_e64 16, %45, implicit $exec
 
     %47 = V_LSHRREV_B32_e64 16, %46, implicit $exec
     %48 = V_BFE_U32 %46, 8, 8, implicit $exec
-    %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $exec
+    %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $mode, implicit $exec
     %50 = V_LSHLREV_B32_e64 16, %49, implicit $exec
     %51 = V_BFE_U32 %50, 8, 8, implicit $exec
-    %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $exec
+    %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $mode, implicit $exec
     %53 = V_LSHLREV_B32_e64 16, %52, implicit $exec
     %54 = V_BFE_U32 %53, 8, 8, implicit $exec
-    %55 = V_MAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $exec
+    %55 = V_MAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $mode, implicit $exec
     %56 = V_LSHLREV_B32_e64 16, %55, implicit $exec
     %57 = V_LSHRREV_B32_e64 16, %56, implicit $exec
     %58 = V_BFE_U32 %56, 8, 8, implicit $exec
-    %59 = V_MAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $exec
+    %59 = V_MAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $mode, implicit $exec
     %60 = V_LSHLREV_B32_e64 16, %59, implicit $exec
 
     %100 = V_MOV_B32_e32 %60, implicit $exec
@@ -325,40 +325,40 @@ body:             |
 # GCN-LABEL: {{^}}name: vopc_instructions
 
 # GFX89: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
-# GFX89: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $exec
-# GFX89: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
+# GFX89: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $mode, implicit $exec
+# GFX89: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
 # GFX89: $vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $exec
 # GFX89: $vcc = V_CMPX_EQ_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
 
 
-# VI: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $exec
-# VI: %{{[0-9]+}}:sreg_64 = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, implicit-def $exec, implicit $exec
+# VI: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $mode, implicit $exec
+# VI: %{{[0-9]+}}:sreg_64 = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, implicit-def $exec, implicit $mode, implicit $exec
 # VI: $vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %3, 0, 6, 4, implicit-def $vcc, implicit $exec
 # VI: %{{[0-9]+}}:sreg_64 = V_CMPX_EQ_I32_e64 23, killed %{{[0-9]+}}, implicit-def $exec, implicit $exec
 
-# GFX9: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $exec
+# GFX9: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $mode, implicit $exec
 # GFX9: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 23, implicit $exec
-# GFX9: %{{[0-9]+}}:sreg_64 = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
+# GFX9: %{{[0-9]+}}:sreg_64 = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
 # GFX9: $vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit $exec
 # GFX9: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 23, implicit $exec
 # GFX9: %{{[0-9]+}}:sreg_64 = V_CMPX_EQ_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
 
 
-# VI: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def $vcc, implicit $exec
-# VI: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# VI: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def $vcc, implicit $exec
-# VI: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# VI: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# VI: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# VI: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
+# VI: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def $vcc, implicit $mode, implicit $exec
+# VI: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# VI: $vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def $vcc, implicit $mode, implicit $exec
+# VI: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# VI: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# VI: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# VI: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
 
-# GFX9: $vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit $exec
-# GFX9: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# GFX9: $vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit $exec
-# GFX9: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# GFX9: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# GFX9: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $exec
-# GFX9: $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, implicit-def $exec, implicit $exec
+# GFX9: $vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit $mode, implicit $exec
+# GFX9: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# GFX9: $vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit $mode, implicit $exec
+# GFX9: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# GFX9: $vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# GFX9: $vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def $vcc, implicit-def $exec, implicit $mode, implicit $exec
+# GFX9: $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, implicit-def $exec, implicit $mode, implicit $exec
 
 
 name:            vopc_instructions
@@ -406,37 +406,37 @@ body:             |
     %6 = S_MOV_B32 65535
 
     %10 = V_AND_B32_e64 %5, %3, implicit $exec
-    V_CMP_EQ_F32_e32 123, killed %10, implicit-def $vcc, implicit $exec
+    V_CMP_EQ_F32_e32 123, killed %10, implicit-def $vcc, implicit $mode, implicit $exec
     %11 = V_AND_B32_e64 %5, %3, implicit $exec
-    V_CMPX_GT_F32_e32 123, killed %11, implicit-def $vcc, implicit-def $exec, implicit $exec
+    V_CMPX_GT_F32_e32 123, killed %11, implicit-def $vcc, implicit $mode, implicit-def $exec, implicit $exec
     %12 = V_AND_B32_e64 %5, %3, implicit $exec
     V_CMP_LT_I32_e32 123, killed %12, implicit-def $vcc, implicit $exec
     %13 = V_AND_B32_e64 %5, %3, implicit $exec
     V_CMPX_EQ_I32_e32 123, killed %13, implicit-def $vcc, implicit-def $exec, implicit $exec
 
     %14 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, implicit $exec
+    $vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, implicit $mode, implicit $exec
     %15 = V_AND_B32_e64 %5, %3, implicit $exec
-    %18 = V_CMPX_GT_F32_e64 0, 23, 0, killed %15, 0, implicit-def $exec, implicit $exec
+    %18 = V_CMPX_GT_F32_e64 0, 23, 0, killed %15, 0, implicit-def $exec, implicit $mode, implicit $exec
     %16 = V_AND_B32_e64 %5, %3, implicit $exec
     $vcc = V_CMP_LT_I32_e64 %6, killed %16, implicit $exec
     %17 = V_AND_B32_e64 %5, %3, implicit $exec
     %19 = V_CMPX_EQ_I32_e64 23, killed %17, implicit-def $exec, implicit $exec
 
     %20 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, implicit $exec
+    $vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, implicit $mode, implicit $exec
     %21 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, implicit-def $exec, implicit $exec
+    $vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, implicit-def $exec, implicit $mode, implicit $exec
     %23 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, implicit $exec
+    $vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, implicit $mode, implicit $exec
     %24 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, implicit-def $exec, implicit $exec
+    $vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, implicit-def $exec, implicit $mode, implicit $exec
     %25 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, implicit-def $exec, implicit $exec
+    $vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, implicit-def $exec, implicit $mode, implicit $exec
     %26 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, implicit-def $exec, implicit $exec
+    $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, implicit-def $exec, implicit $mode, implicit $exec
     %27 = V_AND_B32_e64 %5, %3, implicit $exec
-    $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, implicit-def $exec, implicit $exec
+    $vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, implicit-def $exec, implicit $mode, implicit $exec
 
 
     %100 = V_MOV_B32_e32 $vcc_lo, implicit $exec
@@ -447,7 +447,7 @@ body:             |
 ...
 
 # GCN-LABEL: name: preserve_flags
-# GCN: = nnan nofpexcept V_ADD_F32_sdwa 0, %4, 0, %4, 0, 0, 6, 0, 5, 1, implicit $exec
+# GCN: = nnan nofpexcept V_ADD_F32_sdwa 0, %4, 0, %4, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
 
 ---
 name: preserve_flags
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index e91268364a653..33802ad21fdd4 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -14,7 +14,7 @@
 ---
 name:            add_f16_u32_preserve
 tracksRegLiveness: true
-registers:       
+registers:
   - { id: 0, class: vreg_64 }
   - { id: 1, class: vreg_64 }
   - { id: 2, class: sreg_64 }
@@ -32,7 +32,7 @@ registers:
 body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
-  
+
     %2 = COPY $sgpr30_sgpr31
     %1 = COPY $vgpr2_vgpr3
     %0 = COPY $vgpr0_vgpr1
@@ -44,9 +44,9 @@ body:             |
     %7 = V_BFE_U32 %3, 8, 8, implicit $exec
     %8 = V_LSHRREV_B32_e32 24, %4, implicit $exec
 
-    %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $exec
+    %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
     %10 = V_LSHLREV_B16_e64 8, %9, implicit $exec
-    %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $exec
+    %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
     %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
 
     %13 = V_OR_B32_e64 %10, %12, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
index b8c36bc77148f..6a4e942e07a96 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
@@ -14,7 +14,7 @@ body:             |
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK: %2:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $exec
+    ; CHECK: %2:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
     ; CHECK: S_NOP 0
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
index 531cca11e47bf..a3747ac6ac42b 100644
--- a/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/smem-war-hazard.mir
@@ -10,7 +10,7 @@ body: |
   bb.0:
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -25,7 +25,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $sgpr4, $sgpr5, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     $sgpr3 = S_ADD_U32 $sgpr4, $sgpr5, implicit-def $scc
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -42,7 +42,7 @@ body: |
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT 0
     $sgpr3 = S_ADD_U32 $sgpr2, $sgpr4, implicit-def $scc
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -59,7 +59,7 @@ body: |
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT 0
     $sgpr3 = S_ADD_U32 $sgpr5, $sgpr4, implicit-def $scc
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -75,7 +75,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $sgpr6, $sgpr7, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     $sgpr5 = S_LOAD_DWORD_IMM $sgpr6_sgpr7, 0, 0, 0
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -90,7 +90,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT 0
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -106,7 +106,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT 3952
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -122,7 +122,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT 53007
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -137,7 +137,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT 49279
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -152,7 +152,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT_LGKMCNT $sgpr_null, 0
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -168,7 +168,7 @@ body: |
     liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
     S_WAITCNT_LGKMCNT $sgpr_null, 1
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -187,7 +187,7 @@ body: |
 
   bb.1:
     liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -212,12 +212,12 @@ body: |
 
   bb.1:
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr4, $sgpr5, $vgpr0, $vgpr1
-    $sgpr4_sgpr5 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr4_sgpr5 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 
   bb.2:
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr4, $sgpr5, $vgpr0, $vgpr1
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -244,17 +244,17 @@ body: |
 
   bb.1:
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr4, $sgpr5, $vgpr0, $vgpr1
-    $sgpr4_sgpr5 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr4_sgpr5 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 
   bb.2:
     successors: %bb.3
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr4, $sgpr5, $vgpr0, $vgpr1
-    $sgpr4_sgpr5 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr4_sgpr5 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
 
   bb.3:
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr4, $sgpr5, $vgpr0, $vgpr1
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -268,7 +268,7 @@ body: |
   bb.0:
     liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1
     successors: %bb.1
-    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $exec
+    $sgpr0_sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
 
   bb.1:
     liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1
@@ -286,7 +286,7 @@ body: |
   bb.0:
     liveins: $vcc, $vgpr0
     $sgpr0 = S_LOAD_DWORD_IMM $vcc, 0, 0, 0
-    V_CMP_EQ_F32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec
+    V_CMP_EQ_F32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
index fd0debda403c2..2be645954aed6 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -7,10 +7,10 @@
 
 # CHECK-LABEL: name: expecting_non_empty_interval
 
-# CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $exec
+# CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec
 # CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
 # CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
-# CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $exec
+# CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $mode, implicit $exec
 
 # CHECK: S_NOP 0, implicit %6.sub1
 # CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
@@ -26,9 +26,9 @@ body:             |
   bb.0:
     successors: %bb.1
 
-    undef %0.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %0.sub1, implicit $exec
+    undef %0.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %0.sub1, implicit $mode, implicit $exec
     undef %2.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
-    dead %3:vgpr_32 = V_MUL_F32_e32 0, %2.sub1, implicit $exec
+    dead %3:vgpr_32 = V_MUL_F32_e32 0, %2.sub1, implicit $mode, implicit $exec
 
   bb.1:
     S_NOP 0, implicit %2.sub1
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
index 1bac81699edd5..0fa0ddab4e11f 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
@@ -85,10 +85,10 @@ body: |
   bb.6:
     successors: %bb.8(0x40000000), %bb.11(0x40000000)
     %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    dead %6:vgpr_32 = V_MUL_F32_e32 0, undef %7:vgpr_32, implicit $exec
-    dead %8:vgpr_32 = V_MUL_F32_e32 0, %2, implicit $exec
-    undef %9.sub1:vreg_64 = V_MUL_F32_e32 0, %1, implicit $exec
-    undef %10.sub0:vreg_128 = V_MUL_F32_e32 0, %0, implicit $exec
+    dead %6:vgpr_32 = V_MUL_F32_e32 0, undef %7:vgpr_32, implicit $mode, implicit $exec
+    dead %8:vgpr_32 = V_MUL_F32_e32 0, %2, implicit $mode, implicit $exec
+    undef %9.sub1:vreg_64 = V_MUL_F32_e32 0, %1, implicit $mode, implicit $exec
+    undef %10.sub0:vreg_128 = V_MUL_F32_e32 0, %0, implicit $mode, implicit $exec
     undef %11.sub0:sgpr_256 = S_MOV_B32 0
     %11.sub1:sgpr_256 = COPY %11.sub0
     %11.sub2:sgpr_256 = COPY %11.sub0
@@ -161,31 +161,31 @@ body: |
   bb.13:
     successors: %bb.15(0x40000000), %bb.14(0x40000000)
 
-    %18:vgpr_32 = V_MAD_F32 0, %10.sub0, 0, target-flags(amdgpu-gotprel) 1073741824, 0, -1082130432, 0, 0, implicit $exec
-    %19:vgpr_32 = V_MAD_F32 0, %12.sub0, 0, target-flags(amdgpu-gotprel) 0, 0, 0, 0, 0, implicit $exec
+    %18:vgpr_32 = V_MAD_F32 0, %10.sub0, 0, target-flags(amdgpu-gotprel) 1073741824, 0, -1082130432, 0, 0, implicit $mode, implicit $exec
+    %19:vgpr_32 = V_MAD_F32 0, %12.sub0, 0, target-flags(amdgpu-gotprel) 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     %20:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM undef %21:sgpr_128, 1040, 0, 0 :: (dereferenceable invariant load 16)
-    %22:vgpr_32 = V_ADD_F32_e32 0, %19, implicit $exec
-    %23:vgpr_32 = V_MAD_F32 0, %18, 0, 0, 0, 0, 0, 0, implicit $exec
+    %22:vgpr_32 = V_ADD_F32_e32 0, %19, implicit $mode, implicit $exec
+    %23:vgpr_32 = V_MAD_F32 0, %18, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     %24:vgpr_32 = COPY %20.sub3
-    %25:vgpr_32 = V_MUL_F32_e64 0, target-flags(amdgpu-gotprel32-lo) 0, 0, %20.sub1, 0, 0, implicit $exec
+    %25:vgpr_32 = V_MUL_F32_e64 0, target-flags(amdgpu-gotprel32-lo) 0, 0, %20.sub1, 0, 0, implicit $mode, implicit $exec
     %26:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM undef %27:sgpr_128, 1056, 0, 0 :: (dereferenceable invariant load 16)
-    %28:vgpr_32 = V_MAD_F32 0, %18, 0, %26.sub0, 0, 0, 0, 0, implicit $exec
-    %29:vgpr_32 = V_ADD_F32_e32 %28, %19, implicit $exec
-    %30:vgpr_32 = V_RCP_F32_e32 %29, implicit $exec
-    %25:vgpr_32 = V_MAC_F32_e32 0, %18, %25, implicit $exec
-    %31:vgpr_32 = V_MAD_F32 0, target-flags(amdgpu-gotprel) 0, 0, %12.sub0, 0, %24, 0, 0, implicit $exec
-    %32:vgpr_32 = V_ADD_F32_e32 %25, %31, implicit $exec
-    %33:vgpr_32 = V_MUL_F32_e32 %22, %30, implicit $exec
-    %34:vgpr_32 = V_MUL_F32_e32 %23, %30, implicit $exec
-    %35:vgpr_32 = V_MUL_F32_e32 %32, %30, implicit $exec
-    %36:vgpr_32 = V_MUL_F32_e32 0, %34, implicit $exec
-    %36:vgpr_32 = V_MAC_F32_e32 0, %33, %36, implicit $exec
-    %37:vgpr_32 = V_MAD_F32 0, %35, 0, 0, 0, 0, 0, 0, implicit $exec
+    %28:vgpr_32 = V_MAD_F32 0, %18, 0, %26.sub0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %29:vgpr_32 = V_ADD_F32_e32 %28, %19, implicit $mode, implicit $exec
+    %30:vgpr_32 = V_RCP_F32_e32 %29, implicit $mode, implicit $exec
+    %25:vgpr_32 = V_MAC_F32_e32 0, %18, %25, implicit $mode, implicit $exec
+    %31:vgpr_32 = V_MAD_F32 0, target-flags(amdgpu-gotprel) 0, 0, %12.sub0, 0, %24, 0, 0, implicit $mode, implicit $exec
+    %32:vgpr_32 = V_ADD_F32_e32 %25, %31, implicit $mode, implicit $exec
+    %33:vgpr_32 = V_MUL_F32_e32 %22, %30, implicit $mode, implicit $exec
+    %34:vgpr_32 = V_MUL_F32_e32 %23, %30, implicit $mode, implicit $exec
+    %35:vgpr_32 = V_MUL_F32_e32 %32, %30, implicit $mode, implicit $exec
+    %36:vgpr_32 = V_MUL_F32_e32 0, %34, implicit $mode, implicit $exec
+    %36:vgpr_32 = V_MAC_F32_e32 0, %33, %36, implicit $mode, implicit $exec
+    %37:vgpr_32 = V_MAD_F32 0, %35, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     %38:sreg_64_xexec = V_CMP_NE_U32_e64 0, %5, implicit $exec
     %39:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %38, implicit $exec
     V_CMP_NE_U32_e32 1, %39, implicit-def $vcc, implicit $exec
     $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
-    %40:vgpr_32 = V_ADD_F32_e32 %36, %37, implicit $exec
+    %40:vgpr_32 = V_ADD_F32_e32 %36, %37, implicit $mode, implicit $exec
     S_CBRANCH_VCCZ %bb.15, implicit $vcc
 
   bb.14:
@@ -194,9 +194,9 @@ body: |
 
   bb.15:
     successors: %bb.16(0x40000000), %bb.18(0x40000000)
-    %41:vgpr_32 = V_MAD_F32 0, %40, 0, 0, 0, 0, 0, 0, implicit $exec
-    %42:sreg_64 = V_CMP_LE_F32_e64 0, 0, 0, %41, 0, implicit $exec
-    %43:sreg_64 = V_CMP_GE_F32_e64 0, 1065353216, 0, %41, 0, implicit $exec
+    %41:vgpr_32 = V_MAD_F32 0, %40, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %42:sreg_64 = V_CMP_LE_F32_e64 0, 0, 0, %41, 0, implicit $mode, implicit $exec
+    %43:sreg_64 = V_CMP_GE_F32_e64 0, 1065353216, 0, %41, 0, implicit $mode, implicit $exec
     %44:sreg_64 = S_AND_B64 %43, %43, implicit-def dead $scc
     %45:sreg_64 = S_AND_B64 %42, %42, implicit-def dead $scc
     %46:sreg_64 = S_AND_B64 %45, %44, implicit-def dead $scc
@@ -222,15 +222,15 @@ body: |
   bb.18:
     successors: %bb.20(0x40000000), %bb.19(0x40000000)
     $exec = S_OR_B64 $exec, %47, implicit-def $scc
-    %52:vgpr_32 = V_MAD_F32 0, %3.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 1, %3.sub0, 0, 0, implicit $exec
-    %53:vgpr_32 = V_MUL_F32_e32 -2147483648, %3.sub1, implicit $exec
-    %53:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel32-hi) 1065353216, %3.sub2, %53, implicit $exec
-    %54:vgpr_32 = V_MUL_F32_e32 %53, %53, implicit $exec
-    %54:vgpr_32 = V_MAC_F32_e32 %52, %52, %54, implicit $exec
-    %55:vgpr_32 = V_SQRT_F32_e32 %54, implicit $exec
+    %52:vgpr_32 = V_MAD_F32 0, %3.sub1, 0, target-flags(amdgpu-gotprel32-lo) 0, 1, %3.sub0, 0, 0, implicit $mode, implicit $exec
+    %53:vgpr_32 = V_MUL_F32_e32 -2147483648, %3.sub1, implicit $mode, implicit $exec
+    %53:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel32-hi) 1065353216, %3.sub2, %53, implicit $mode, implicit $exec
+    %54:vgpr_32 = V_MUL_F32_e32 %53, %53, implicit $mode, implicit $exec
+    %54:vgpr_32 = V_MAC_F32_e32 %52, %52, %54, implicit $mode, implicit $exec
+    %55:vgpr_32 = V_SQRT_F32_e32 %54, implicit $mode, implicit $exec
     %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %56:vgpr_32 = V_MOV_B32_e32 981668463, implicit $exec
-    %57:sreg_64 = V_CMP_NGT_F32_e64 0, %55, 0, %56, 0, implicit $exec
+    %57:sreg_64 = V_CMP_NGT_F32_e64 0, %55, 0, %56, 0, implicit $mode, implicit $exec
     %58:sreg_64 = S_AND_B64 $exec, %57, implicit-def dead $scc
     $vcc = COPY %58
     S_CBRANCH_VCCZ %bb.20, implicit $vcc
@@ -255,8 +255,8 @@ body: |
 
   bb.23:
     successors: %bb.22(0x80000000)
-    undef %60.sub1:vreg_64 = V_CVT_I32_F32_e32 %1, implicit $exec
-    %60.sub0:vreg_64 = V_CVT_I32_F32_e32 %0, implicit $exec
+    undef %60.sub1:vreg_64 = V_CVT_I32_F32_e32 %1, implicit $mode, implicit $exec
+    %60.sub0:vreg_64 = V_CVT_I32_F32_e32 %0, implicit $mode, implicit $exec
     undef %61.sub0:sgpr_256 = S_MOV_B32 0
     %61.sub1:sgpr_256 = COPY %61.sub0
     %61.sub2:sgpr_256 = COPY %61.sub0
@@ -266,20 +266,20 @@ body: |
     %61.sub6:sgpr_256 = COPY %61.sub0
     %61.sub7:sgpr_256 = COPY %61.sub0
     %62:vgpr_32 = V_MOV_B32_e32 1033100696, implicit $exec
-    %63:vgpr_32 = V_MUL_F32_e32 1060575065, %15.sub1, implicit $exec
-    %63:vgpr_32 = V_MAC_F32_e32 1046066128, %15.sub0, %63, implicit $exec
+    %63:vgpr_32 = V_MUL_F32_e32 1060575065, %15.sub1, implicit $mode, implicit $exec
+    %63:vgpr_32 = V_MAC_F32_e32 1046066128, %15.sub0, %63, implicit $mode, implicit $exec
     %64:vgpr_32 = IMAGE_LOAD_V1_V2 %60, %61, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4)
-    %64:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel) 0, %51.sub0, %64, implicit $exec
-    %65:vgpr_32 = V_MUL_F32_e32 0, %64, implicit $exec
-    %66:vgpr_32 = V_MUL_F32_e32 0, %65, implicit $exec
-    %67:vgpr_32 = V_MAD_F32 0, %66, 0, %62, 0, 0, 0, 0, implicit $exec
-    %63:vgpr_32 = V_MAC_F32_e32 %15.sub2, %62, %63, implicit $exec
-    %4:vgpr_32 = V_ADD_F32_e32 %63, %67, implicit $exec
+    %64:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel) 0, %51.sub0, %64, implicit $mode, implicit $exec
+    %65:vgpr_32 = V_MUL_F32_e32 0, %64, implicit $mode, implicit $exec
+    %66:vgpr_32 = V_MUL_F32_e32 0, %65, implicit $mode, implicit $exec
+    %67:vgpr_32 = V_MAD_F32 0, %66, 0, %62, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %63:vgpr_32 = V_MAC_F32_e32 %15.sub2, %62, %63, implicit $mode, implicit $exec
+    %4:vgpr_32 = V_ADD_F32_e32 %63, %67, implicit $mode, implicit $exec
     S_BRANCH %bb.22
 
   bb.24:
-    %68:vgpr_32 = V_MUL_F32_e32 0, %4, implicit $exec
-    %69:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, undef %70:vgpr_32, 0, %68, 0, 0, implicit $exec
+    %68:vgpr_32 = V_MUL_F32_e32 0, %4, implicit $mode, implicit $exec
+    %69:vgpr_32 = V_CVT_PKRTZ_F16_F32_e64 0, undef %70:vgpr_32, 0, %68, 0, 0, implicit $mode, implicit $exec
     EXP 0, undef %71:vgpr_32, %69, undef %72:vgpr_32, undef %73:vgpr_32, -1, -1, 15, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir
index b0aeb91787cd2..343864c4cd678 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: name: test_fmamk_reg_imm_f32
-# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_fmamk_reg_imm_f32
 registers:
@@ -15,12 +15,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_FMAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $exec
+    %3 = V_FMAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmamk_imm_reg_f32
-# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_fmamk_imm_reg_f32
 registers:
@@ -34,12 +34,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_FMAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $exec
+    %3 = V_FMAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmaak_f32
-# GCN: V_FMAAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
+# GCN: V_FMAAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $mode, implicit $exec
 ---
 name:            test_fmaak_f32
 registers:
@@ -51,12 +51,12 @@ body:             |
 
     %0 = IMPLICIT_DEF
     %1 = V_MOV_B32_e32 1078523331, implicit $exec
-    %2 = V_FMAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
+    %2 = V_FMAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmamk_reg_imm_f16
-# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_fmamk_reg_imm_f16
 registers:
@@ -70,12 +70,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_FMAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $exec
+    %3 = V_FMAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmamk_imm_reg_f16
-# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_fmamk_imm_reg_f16
 registers:
@@ -89,12 +89,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_FMAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $exec
+    %3 = V_FMAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmaak_f16
-# GCN: V_FMAAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
+# GCN: V_FMAAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $mode, implicit $exec
 ---
 name:            test_fmaak_f16
 registers:
@@ -106,11 +106,11 @@ body:             |
 
     %0 = IMPLICIT_DEF
     %1 = V_MOV_B32_e32 1078523331, implicit $exec
-    %2 = V_FMAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
+    %2 = V_FMAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec
 ...
 
 # GCN-LABEL: name: test_fmaak_sgpr_src0_f32
-# GCN: %2:vgpr_32 = V_FMAMK_F32 killed %0, 1078523331, %3:vgpr_32, implicit $exec
+# GCN: %2:vgpr_32 = V_FMAMK_F32 killed %0, 1078523331, %3:vgpr_32, implicit $mode, implicit $exec
 
 ---
 name:            test_fmaak_sgpr_src0_f32
@@ -124,12 +124,12 @@ body:             |
 
     %0 = IMPLICIT_DEF
     %1 = V_MOV_B32_e32 1078523331, implicit $exec
-    %2 = V_FMAC_F32_e32 killed %0, %1, %3, implicit $exec
+    %2 = V_FMAC_F32_e32 killed %0, %1, %3, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmaak_inlineimm_src0_f32
-# GCN: %1:vgpr_32 = V_FMAMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $exec
+# GCN: %1:vgpr_32 = V_FMAMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $mode, implicit $exec
 
 ---
 name:            test_fmaak_inlineimm_src0_f32
@@ -141,12 +141,12 @@ body:             |
   bb.0:
 
     %0 = V_MOV_B32_e32 1078523331, implicit $exec
-    %1 = V_FMAC_F32_e32 1073741824, %0, %2, implicit $exec
+    %1 = V_FMAC_F32_e32 1073741824, %0, %2, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmaak_otherimm_src0_f32
-# GCN: %1:vgpr_32 = V_FMAC_F32_e32 1120403456, %0, %1, implicit $exec
+# GCN: %1:vgpr_32 = V_FMAC_F32_e32 1120403456, %0, %1, implicit $mode, implicit $exec
 
 ---
 name:            test_fmaak_otherimm_src0_f32
@@ -158,12 +158,12 @@ body:             |
   bb.0:
 
     %0 = V_MOV_B32_e32 1078523331, implicit $exec
-    %1 = V_FMAC_F32_e32 1120403456, %0, %2, implicit $exec
+    %1 = V_FMAC_F32_e32 1120403456, %0, %2, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmaak_other_constantlike_src0_f32
-# GCN: %1:vgpr_32 = V_FMAC_F32_e32 %stack.0, %0, %1, implicit $exec
+# GCN: %1:vgpr_32 = V_FMAC_F32_e32 %stack.0, %0, %1, implicit $mode, implicit $exec
 ---
 name:            test_fmaak_other_constantlike_src0_f32
 registers:
@@ -178,12 +178,12 @@ body:             |
   bb.0:
 
     %0 = V_MOV_B32_e32 1078523331, implicit $exec
-    %1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $exec
+    %1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_fmaak_inline_literal_f16
-# GCN: %2:vgpr_32 = V_FMAAK_F16 16384, killed %0, 49664, implicit $exec
+# GCN: %2:vgpr_32 = V_FMAAK_F16 16384, killed %0, 49664, implicit $mode, implicit $exec
 
 ---
 name:            test_fmaak_inline_literal_f16
@@ -192,11 +192,11 @@ liveins:
 body:             |
   bb.0:
     liveins: $vgpr0
-  
+
     %3:vgpr_32 = COPY killed $vgpr0
 
     %26:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
-    %28:vgpr_32 = V_FMAC_F16_e32 16384, killed %3, %26, implicit $exec
+    %28:vgpr_32 = V_FMAC_F16_e32 16384, killed %3, %26, implicit $mode, implicit $exec
     S_ENDPGM 0
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir
index e75194bf4b8f0..75949e6a2476e 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-mad.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: name: test_madmk_reg_imm_f32
-# GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_madmk_reg_imm_f32
 registers:
@@ -15,12 +15,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_MAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $exec
+    %3 = V_MAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_madmk_imm_reg_f32
-# GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_MADMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_madmk_imm_reg_f32
 registers:
@@ -34,12 +34,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_MAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $exec
+    %3 = V_MAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_madak_f32
-# GCN: V_MADAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
+# GCN: V_MADAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $mode, implicit $exec
 ---
 name:            test_madak_f32
 registers:
@@ -51,12 +51,12 @@ body:             |
 
     %0 = IMPLICIT_DEF
     %1 = V_MOV_B32_e32 1078523331, implicit $exec
-    %2 = V_MAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
+    %2 = V_MAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_madmk_reg_imm_f16
-# GCN: V_MADMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_MADMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_madmk_reg_imm_f16
 registers:
@@ -70,12 +70,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_MAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $exec
+    %3 = V_MAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_madmk_imm_reg_f16
-# GCN: V_MADMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
+# GCN: V_MADMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $mode, implicit $exec
 ---
 name:            test_madmk_imm_reg_f16
 registers:
@@ -89,12 +89,12 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = V_MOV_B32_e32 1078523331, implicit $exec
-    %3 = V_MAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $exec
+    %3 = V_MAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_madak_f16
-# GCN: V_MADAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
+# GCN: V_MADAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $mode, implicit $exec
 ---
 name:            test_madak_f16
 registers:
@@ -106,14 +106,14 @@ body:             |
 
     %0 = IMPLICIT_DEF
     %1 = V_MOV_B32_e32 1078523331, implicit $exec
-    %2 = V_MAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
+    %2 = V_MAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $mode, implicit $exec
 ...
 
 # Make sure constant bus restriction isn't violated if src0 is an SGPR.
 
 # GCN-LABEL: name: test_madak_sgpr_src0_f32
 # GCN: %1:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
-# GCN: %2:vgpr_32 = V_MAD_F32 0, killed %0, 0, %1, 0, %3:vgpr_32, 0, 0, implicit $exec
+# GCN: %2:vgpr_32 = V_MAD_F32 0, killed %0, 0, %1, 0, %3:vgpr_32, 0, 0, implicit $mode, implicit $exec
 
 ---
 name:            test_madak_sgpr_src0_f32
@@ -127,14 +127,14 @@ body:             |
 
     %0 = IMPLICIT_DEF
     %1 = V_MOV_B32_e32 1078523331, implicit $exec
-    %2 = V_MAC_F32_e32 killed %0, %1, %3, implicit $exec
+    %2 = V_MAC_F32_e32 killed %0, %1, %3, implicit $mode, implicit $exec
 
 ...
 
 # This can still fold if this is an inline immediate.
 
 # GCN-LABEL: name: test_madak_inlineimm_src0_f32
-# GCN: %1:vgpr_32 = V_MADMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $exec
+# GCN: %1:vgpr_32 = V_MADMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $mode, implicit $exec
 
 ---
 name:            test_madak_inlineimm_src0_f32
@@ -146,13 +146,13 @@ body:             |
   bb.0:
 
     %0 = V_MOV_B32_e32 1078523331, implicit $exec
-    %1 = V_MAC_F32_e32 1073741824, %0, %2, implicit $exec
+    %1 = V_MAC_F32_e32 1073741824, %0, %2, implicit $mode, implicit $exec
 
 ...
 # Non-inline immediate uses constant bus already.
 
 # GCN-LABEL: name: test_madak_otherimm_src0_f32
-# GCN: %1:vgpr_32 = V_MAC_F32_e32 1120403456, %0, %1, implicit $exec
+# GCN: %1:vgpr_32 = V_MAC_F32_e32 1120403456, %0, %1, implicit $mode, implicit $exec
 
 ---
 name:            test_madak_otherimm_src0_f32
@@ -164,13 +164,13 @@ body:             |
   bb.0:
 
     %0 = V_MOV_B32_e32 1078523331, implicit $exec
-    %1 = V_MAC_F32_e32 1120403456, %0, %2, implicit $exec
+    %1 = V_MAC_F32_e32 1120403456, %0, %2, implicit $mode, implicit $exec
 
 ...
 # Non-inline immediate uses constant bus already.
 
 # GCN-LABEL: name: test_madak_other_constantlike_src0_f32
-# GCN: %1:vgpr_32 = V_MAC_F32_e32 %stack.0, %0, %1, implicit $exec
+# GCN: %1:vgpr_32 = V_MAC_F32_e32 %stack.0, %0, %1, implicit $mode, implicit $exec
 ---
 name:            test_madak_other_constantlike_src0_f32
 registers:
@@ -185,12 +185,12 @@ body:             |
   bb.0:
 
     %0 = V_MOV_B32_e32 1078523331, implicit $exec
-    %1 = V_MAC_F32_e32 %stack.0, %0, %2, implicit $exec
+    %1 = V_MAC_F32_e32 %stack.0, %0, %2, implicit $mode, implicit $exec
 
 ...
 
 # GCN-LABEL: name: test_madak_inline_literal_f16
-# GCN: %2:vgpr_32 = V_MADAK_F16 16384, killed %0, 49664, implicit $exec
+# GCN: %2:vgpr_32 = V_MADAK_F16 16384, killed %0, 49664, implicit $mode, implicit $exec
 
 ---
 name:            test_madak_inline_literal_f16
@@ -199,11 +199,11 @@ liveins:
 body:             |
   bb.0:
     liveins: $vgpr0
-  
+
     %3:vgpr_32 = COPY killed $vgpr0
 
     %26:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
-    %28:vgpr_32 = V_MAC_F16_e32 16384, killed %3, %26, implicit $exec
+    %28:vgpr_32 = V_MAC_F16_e32 16384, killed %3, %26, implicit $mode, implicit $exec
     S_ENDPGM 0
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
index 9f36e0b5d6854..3190641ae6910 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir
@@ -66,7 +66,7 @@ body:             |
 # GCN-LABEL: name: swap_phys_overlap_x
 # GCN: bb.0:
 # GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
-# GCN-NEXT: $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $exec
+# GCN-NEXT: $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $mode, implicit $exec
 # GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
 # GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
 ---
@@ -74,7 +74,7 @@ name:            swap_phys_overlap_x
 body:             |
   bb.0:
     $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
-    $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $exec
+    $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $mode, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
     $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 20c21aae6c759..55453c7d263ab 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -17,7 +17,7 @@ body: |
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 11, 0, 0
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vcc = V_CMP_EQ_F32_e64 0, 0, 0, undef $sgpr2, 0, implicit $exec
+    $vcc = V_CMP_EQ_F32_e64 0, 0, 0, undef $sgpr2, 0, implicit $mode, implicit $exec
     S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
 
   bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
index 5b98a82216d7f..96b6eaf1967d5 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
@@ -117,7 +117,7 @@ body:            |
     $vgpr1 = IMPLICIT_DEF
     $sgpr0 = IMPLICIT_DEF
     $sgpr1 = IMPLICIT_DEF
-    $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1,  implicit $exec
+    $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1,  implicit $mode, implicit $exec
     $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
index 1f30d91753a5f..761ef6054b81b 100644
--- a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
@@ -94,7 +94,7 @@ body:             |
     $sgpr4 = IMPLICIT_DEF
     $vgpr0 = IMPLICIT_DEF
     $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec
+    $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
     $sgpr0 = S_MOV_B32 0
 ...
 # GCN-LABEL: name: vmem_swait0_write_sgpr
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
index e3b56d3f9619a..0e6aecb7ad706 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
@@ -3,7 +3,7 @@
 # GCN-LABEL: waitcnt-back-edge-loop
 # GCN: bb.2
 # GCN: S_WAITCNT 112
-# GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+# GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $mode, implicit $exec
 
 ---
 name: waitcnt-back-edge-loop
@@ -28,7 +28,7 @@ body:             |
   bb.1:
     successors: %bb.5, %bb.2
 
-    $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+    $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $mode, implicit $exec
     V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
     $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
     S_CBRANCH_VCCZ %bb.5, implicit killed $vcc
@@ -44,7 +44,7 @@ body:             |
     successors: %bb.3, %bb.1
 
     $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
-    $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $exec
+    $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $mode, implicit $exec
     V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec
     $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
     $vgpr4 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec
@@ -53,7 +53,7 @@ body:             |
 
   bb.5:
 
-    $vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec
+    $vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec
     EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
index eae38031047f6..e06e3031c3d2a 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
@@ -16,21 +16,21 @@ body:             |
     ; GFX9: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
     ; GFX9-NOT: S_WAITCNT 53119
     ; GFX9-NEXT: S_WAITCNT 52863
-    ; GFX9-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $exec
-    ; GFX9-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $exec
-    ; GFX9-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $exec
-    ; GFX9-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $exec
+    ; GFX9-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX9-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    ; GFX9-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
+    ; GFX9-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
     ; GFX9-NEXT: S_ENDPGM 0
     ; GFX10-LABEL: name: max-counter-lgkmcnt
     ; GFX10: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
     ; GFX10-NEXT: S_WAITCNT 53631
-    ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $exec
+    ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
     ; GFX10-NEXT: S_WAITCNT 53375
-    ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $exec
+    ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
     ; GFX10-NEXT: S_WAITCNT 53119
-    ; GFX10-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $exec
+    ; GFX10-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
     ; GFX10-NEXT: S_WAITCNT 52863
-    ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $exec
+    ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
     ; GFX10-NEXT: S_ENDPGM 0
     $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
     $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec
@@ -50,10 +50,10 @@ body:             |
     $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec
     $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec
     $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
-    $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $exec
-    $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $exec
-    $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $exec
-    $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $exec
+    $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
+    $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -70,10 +70,10 @@ body:             |
     ; GFX10-NOT: S_WAITCNT 65407
     ; GFX9-NEXT: S_WAITCNT 53118
     ; GFX10-NEXT: S_WAITCNT 65406
-    ; GFX9_10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $exec
-    ; GFX9_10-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $exec
-    ; GFX9_10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $exec
-    ; GFX9_10-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $exec
+    ; GFX9_10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; GFX9_10-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec
+    ; GFX9_10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    ; GFX9_10-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec
     ; GFX9_10-NEXT: S_ENDPGM 0
     $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
@@ -142,10 +142,10 @@ body:             |
     $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, implicit $exec
     $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, implicit $exec
     $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $exec
-    $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $exec
-    $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $exec
-    $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $exec
+    $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec
+    $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
+    $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
 
@@ -167,6 +167,6 @@ body:             |
     EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
-    $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $exec
+    $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-permute.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-permute.mir
index 7a70ae5464dd4..2f453c6156b2e 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-permute.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-permute.mir
@@ -15,7 +15,7 @@ body:             |
     liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
 
     $vgpr0 = DS_BPERMUTE_B32 killed $vgpr0, killed $vgpr1, 0, implicit $exec
-    $vgpr0 = V_ADD_F32_e32 1065353216, killed $vgpr0, implicit $exec
+    $vgpr0 = V_ADD_F32_e32 1065353216, killed $vgpr0, implicit $mode, implicit $exec
     S_SETPC_B64_return killed $sgpr30_sgpr31, implicit killed $vgpr0
 
 ...
diff --git a/llvm/unittests/MI/LiveIntervalTest.cpp b/llvm/unittests/MI/LiveIntervalTest.cpp
index 6faa8abd4cd81..ea8476db1e656 100644
--- a/llvm/unittests/MI/LiveIntervalTest.cpp
+++ b/llvm/unittests/MI/LiveIntervalTest.cpp
@@ -432,12 +432,12 @@ TEST(LiveIntervalTest, DeadSubRegMoveUp) {
     %54:vgpr_32 = V_MOV_B32_e32 1742342378, implicit $exec
     %57:vgpr_32 = V_MOV_B32_e32 3168768712, implicit $exec
     %59:vgpr_32 = V_MOV_B32_e32 1039972644, implicit $exec
-    %60:vgpr_32 = V_MAD_F32 0, %52, 0, undef %61:vgpr_32, 0, %59, 0, 0, implicit $exec
-    %63:vgpr_32 = V_ADD_F32_e32 %51.sub3, undef %64:vgpr_32, implicit $exec
-    dead %66:vgpr_32 = V_MAD_F32 0, %60, 0, undef %67:vgpr_32, 0, %125.sub2, 0, 0, implicit $exec
-    undef %124.sub1:vreg_128 = V_MAD_F32 0, %57, 0, undef %70:vgpr_32, 0, %125.sub1, 0, 0, implicit $exec
-    %124.sub0:vreg_128 = V_MAD_F32 0, %54, 0, undef %73:vgpr_32, 0, %125.sub0, 0, 0, implicit $exec
-    dead undef %125.sub3:vreg_128 = V_MAC_F32_e32 %63, undef %76:vgpr_32, %125.sub3, implicit $exec
+    %60:vgpr_32 = nofpexcept V_MAD_F32 0, %52, 0, undef %61:vgpr_32, 0, %59, 0, 0, implicit $mode, implicit $exec
+    %63:vgpr_32 = nofpexcept V_ADD_F32_e32 %51.sub3, undef %64:vgpr_32, implicit $mode, implicit $exec
+    dead %66:vgpr_32 = nofpexcept V_MAD_F32 0, %60, 0, undef %67:vgpr_32, 0, %125.sub2, 0, 0, implicit $mode, implicit $exec
+    undef %124.sub1:vreg_128 = nofpexcept V_MAD_F32 0, %57, 0, undef %70:vgpr_32, 0, %125.sub1, 0, 0, implicit $mode, implicit $exec
+    %124.sub0:vreg_128 = nofpexcept V_MAD_F32 0, %54, 0, undef %73:vgpr_32, 0, %125.sub0, 0, 0, implicit $mode, implicit $exec
+    dead undef %125.sub3:vreg_128 = nofpexcept V_MAC_F32_e32 %63, undef %76:vgpr_32, %125.sub3, implicit $mode, implicit $exec
 )MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 15, 12);
   });

From 48cb380abdca27d177520aea4fe4dfe8d628b466 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 27 May 2020 10:26:31 -0400
Subject: [PATCH 261/770] [InstCombine] add tests for vector demanded elements
 of select condition; NFC

---
 .../InstCombine/vec_demanded_elts.ll          | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
index 932dcf8e56271..f444404d14d0e 100644
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -672,3 +672,64 @@ bb:
   %tmp2 = extractelement <4 x i32*> %tmp, i64 0
   ret i32* %tmp2
 }
+
+; Elements 1-3 of the result are always taken from %y, so the splat of the condition is unnecessary.
+
+define <4 x i8> @select_cond_with_eq_true_false_elts(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_eq_true_false_elts(
+; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[SPLAT]], <4 x i8> [[TVAL]], <4 x i8> [[Y]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %splat = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> zeroinitializer
+  %r = select <4 x i1> %splat, <4 x i8> %tval, <4 x i8> %y
+  ret <4 x i8> %r
+}
+
+; First element of the result is always x[0], so first element of select condition is unnecessary.
+
+define <4 x i8> @select_cond_with_eq_true_false_elts2(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_eq_true_false_elts2(
+; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x i8> [[TVAL]], <4 x i8> [[X]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %r = select <4 x i1> %cond, <4 x i8> %tval, <4 x i8> %x
+  ret <4 x i8> %r
+}
+
+; Second element of the result is always x[3], so second element of select condition is unnecessary.
+; Fourth element of the result is always undef, so fourth element of select condition is unnecessary.
+
+define <4 x float> @select_cond_with_eq_true_false_elts3(<4 x float> %x, <4 x float> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_eq_true_false_elts3(
+; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x i32> <i32 1, i32 3, i32 5, i32 undef>
+; CHECK-NEXT:    [[FVAL:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[X]], <4 x i32> <i32 0, i32 7, i32 6, i32 undef>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x float> [[TVAL]], <4 x float> [[FVAL]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %tval = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 undef>
+  %fval = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 7, i32 6, i32 undef>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+  %r = select <4 x i1> %cond, <4 x float> %tval, <4 x float> %fval
+  ret <4 x float> %r
+}
+
+define <4 x i8> @select_cond_with_undef_true_false_elts(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
+; CHECK-LABEL: @select_cond_with_undef_true_false_elts(
+; CHECK-NEXT:    [[TVAL:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i1> [[CMP:%.*]], <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x i8> [[TVAL]], <4 x i8> [[X:%.*]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %tval = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 undef, i32 5, i32 6, i32 7>
+  %cond = shufflevector <4 x i1> %cmp, <4 x i1> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %r = select <4 x i1> %cond, <4 x i8> %tval, <4 x i8> %x
+  ret <4 x i8> %r
+}

From fa3b587196dbc04e445257ae38e7906e5c0c4888 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 13 May 2020 19:32:40 -0700
Subject: [PATCH 262/770] [llvm][NFC] Simplify ProfileSummaryInfo state
 transitions

ProfileSummaryInfo is updated seldom, as a result of very specific
triggers. This patch clearly demarcates state updates from read-only uses.
This, arguably, improves readability and maintainability.
---
 .../llvm/Analysis/ProfileSummaryInfo.h        | 13 ++---
 llvm/lib/Analysis/ProfileSummaryInfo.cpp      | 53 +++++++------------
 llvm/lib/Transforms/IPO/SampleProfile.cpp     |  5 +-
 .../Instrumentation/PGOInstrumentation.cpp    |  1 +
 4 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
index 8fbc9e8990b2e..e650e1c9d6890 100644
--- a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -40,7 +40,6 @@ class ProfileSummaryInfo {
 private:
   Module &M;
   std::unique_ptr<ProfileSummary> Summary;
-  bool computeSummary();
   void computeThresholds();
   // Count thresholds to answer isHotCount and isColdCount queries.
   Optional<uint64_t> HotCountThreshold, ColdCountThreshold;
@@ -56,15 +55,17 @@ class ProfileSummaryInfo {
   Optional<uint64_t> computeThreshold(int PercentileCutoff);
   // The map that caches the threshold values. The keys are the percentile
   // cutoff values and the values are the corresponding threshold values.
-  DenseMap<int, uint64_t> ThresholdCache;
+  mutable DenseMap<int, uint64_t> ThresholdCache;
 
 public:
-  ProfileSummaryInfo(Module &M) : M(M) {}
-  ProfileSummaryInfo(ProfileSummaryInfo &&Arg)
-      : M(Arg.M), Summary(std::move(Arg.Summary)) {}
+  ProfileSummaryInfo(Module &M) : M(M) { refresh(); }
+  ProfileSummaryInfo(ProfileSummaryInfo &&Arg) = default;
+
+  /// If no summary is present, attempt to refresh.
+  void refresh();
 
   /// Returns true if profile summary is available.
-  bool hasProfileSummary() { return computeSummary(); }
+  bool hasProfileSummary() const { return Summary != nullptr; }
 
   /// Returns true if module \c M has sample profile.
   bool hasSampleProfile() {
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index ef33b9b1de5a3..ec7649c516e04 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -86,23 +86,24 @@ static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS,
 // The profile summary metadata may be attached either by the frontend or by
 // any backend passes (IR level instrumentation, for example). This method
 // checks if the Summary is null and if so checks if the summary metadata is now
-// available in the module and parses it to get the Summary object. Returns true
-// if a valid Summary is available.
-bool ProfileSummaryInfo::computeSummary() {
-  if (Summary)
-    return true;
+// available in the module and parses it to get the Summary object.
+void ProfileSummaryInfo::refresh() {
+  if (hasProfileSummary())
+    return;
   // First try to get context sensitive ProfileSummary.
   auto *SummaryMD = M.getProfileSummary(/* IsCS */ true);
-  if (SummaryMD) {
+  if (SummaryMD)
     Summary.reset(ProfileSummary::getFromMD(SummaryMD));
-    return true;
+
+  if (!hasProfileSummary()) {
+    // This will actually return PSK_Instr or PSK_Sample summary.
+    SummaryMD = M.getProfileSummary(/* IsCS */ false);
+    if (SummaryMD)
+      Summary.reset(ProfileSummary::getFromMD(SummaryMD));
   }
-  // This will actually return PSK_Instr or PSK_Sample summary.
-  SummaryMD = M.getProfileSummary(/* IsCS */ false);
-  if (!SummaryMD)
-    return false;
-  Summary.reset(ProfileSummary::getFromMD(SummaryMD));
-  return true;
+  if (!hasProfileSummary())
+    return;
+  computeThresholds();
 }
 
 Optional<uint64_t> ProfileSummaryInfo::getProfileCount(const CallBase &Call,
@@ -129,7 +130,7 @@ Optional<uint64_t> ProfileSummaryInfo::getProfileCount(const CallBase &Call,
 /// either means it is not hot or it is unknown whether it is hot or not (for
 /// example, no profile data is available).
 bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
-  if (!F || !computeSummary())
+  if (!F || !hasProfileSummary())
     return false;
   auto FunctionCount = F->getEntryCount();
   // FIXME: The heuristic used below for determining hotness is based on
@@ -145,7 +146,7 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
 /// (for example, no profile data is available).
 bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
                                                   BlockFrequencyInfo &BFI) {
-  if (!F || !computeSummary())
+  if (!F || !hasProfileSummary())
     return false;
   if (auto FunctionCount = F->getEntryCount())
     if (isHotCount(FunctionCount.getCount()))
@@ -174,7 +175,7 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
 /// (for example, no profile data is available).
 bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
                                                    BlockFrequencyInfo &BFI) {
-  if (!F || !computeSummary())
+  if (!F || !hasProfileSummary())
     return false;
   if (auto FunctionCount = F->getEntryCount())
     if (!isColdCount(FunctionCount.getCount()))
@@ -204,7 +205,7 @@ bool ProfileSummaryInfo::isFunctionHotnessUnknown(const Function &F) {
 template<bool isHot>
 bool ProfileSummaryInfo::isFunctionHotOrColdInCallGraphNthPercentile(
     int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) {
-  if (!F || !computeSummary())
+  if (!F || !hasProfileSummary())
     return false;
   if (auto FunctionCount = F->getEntryCount()) {
     if (isHot &&
@@ -256,7 +257,7 @@ bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
     return false;
   if (F->hasFnAttribute(Attribute::Cold))
     return true;
-  if (!computeSummary())
+  if (!hasProfileSummary())
     return false;
   auto FunctionCount = F->getEntryCount();
   // FIXME: The heuristic used below for determining coldness is based on
@@ -267,8 +268,6 @@ bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
 
 /// Compute the hot and cold thresholds.
 void ProfileSummaryInfo::computeThresholds() {
-  if (!computeSummary())
-    return;
   auto &DetailedSummary = Summary->getDetailedSummary();
   auto &HotEntry =
       getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffHot);
@@ -289,7 +288,7 @@ void ProfileSummaryInfo::computeThresholds() {
 }
 
 Optional<uint64_t> ProfileSummaryInfo::computeThreshold(int PercentileCutoff) {
-  if (!computeSummary())
+  if (!hasProfileSummary())
     return None;
   auto iter = ThresholdCache.find(PercentileCutoff);
   if (iter != ThresholdCache.end()) {
@@ -304,26 +303,18 @@ Optional<uint64_t> ProfileSummaryInfo::computeThreshold(int PercentileCutoff) {
 }
 
 bool ProfileSummaryInfo::hasHugeWorkingSetSize() {
-  if (!HasHugeWorkingSetSize)
-    computeThresholds();
   return HasHugeWorkingSetSize && HasHugeWorkingSetSize.getValue();
 }
 
 bool ProfileSummaryInfo::hasLargeWorkingSetSize() {
-  if (!HasLargeWorkingSetSize)
-    computeThresholds();
   return HasLargeWorkingSetSize && HasLargeWorkingSetSize.getValue();
 }
 
 bool ProfileSummaryInfo::isHotCount(uint64_t C) {
-  if (!HotCountThreshold)
-    computeThresholds();
   return HotCountThreshold && C >= HotCountThreshold.getValue();
 }
 
 bool ProfileSummaryInfo::isColdCount(uint64_t C) {
-  if (!ColdCountThreshold)
-    computeThresholds();
   return ColdCountThreshold && C <= ColdCountThreshold.getValue();
 }
 
@@ -346,14 +337,10 @@ bool ProfileSummaryInfo::isColdCountNthPercentile(int PercentileCutoff, uint64_t
 }
 
 uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() {
-  if (!HotCountThreshold)
-    computeThresholds();
   return HotCountThreshold ? HotCountThreshold.getValue() : UINT64_MAX;
 }
 
 uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() {
-  if (!ColdCountThreshold)
-    computeThresholds();
   return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
 }
 
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 697341443273a..475f6bc8e9b73 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1848,10 +1848,11 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
   GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
 
   PSI = _PSI;
-  if (M.getProfileSummary(/* IsCS */ false) == nullptr)
+  if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
     M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
                         ProfileSummary::PSK_Sample);
-
+    PSI->refresh();
+  }
   // Compute the total number of samples collected in this profile.
   for (const auto &I : Reader->getProfiles())
     TotalCollectedSamples += I.second.getTotalSamples();
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 72eb5cd61b003..7579139231423 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1613,6 +1613,7 @@ static bool annotateAllFunctions(
   M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()),
                       IsCS ? ProfileSummary::PSK_CSInstr
                            : ProfileSummary::PSK_Instr);
+  PSI->refresh();
 
   std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
   collectComdatMembers(M, ComdatMembers);

From 8e7e6a8d6bae19c5a18e0d0daa0614272b85598c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Wed, 27 May 2020 11:12:05 -0700
Subject: [PATCH 263/770] [X86] Restore selection of MULX on BMI2 targets.

Looking back over gcc and icc behavior it looks like icc does
use mulx32 on 32-bit targets and mulx64 on 64-bit targets. It's
also used when dividing i32 by constant on 32-bit targets and
i64 by constant on 64-bit targets.

gcc uses it for multiplies producing a 64-bit result on 32-bit targets
and a 128-bit result on 64-bit targets. gcc does not appear to use
it for division by constant.

After this patch clang is closer to the icc behavior. This
basically reverts d1c61861ddc94457b08a5a653d3908b7b38ebb22, but
there were no strong feelings at the time.

Fixes PR45518.

Differential Revision: https://reviews.llvm.org/D80498
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp       |  69 ++-
 llvm/test/CodeGen/X86/atomic-unordered.ll     |  85 ++--
 llvm/test/CodeGen/X86/bmi2-x86_64.ll          |   8 +-
 llvm/test/CodeGen/X86/bmi2.ll                 |  12 +-
 llvm/test/CodeGen/X86/hoist-invariant-load.ll |  21 +-
 llvm/test/CodeGen/X86/i128-mul.ll             | 415 ++++++++++++------
 llvm/test/CodeGen/X86/mulx32.ll               |   8 +-
 llvm/test/CodeGen/X86/mulx64.ll               |   8 +-
 llvm/test/CodeGen/X86/pr35636.ll              |  20 +-
 9 files changed, 400 insertions(+), 246 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 91e0cdb80386a..a5fa98ec8d926 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4758,17 +4758,24 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     unsigned Opc, MOpc;
     unsigned LoReg, HiReg;
     bool IsSigned = Opcode == ISD::SMUL_LOHI;
+    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
     switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i32:
-      Opc  = IsSigned ? X86::IMUL32r : X86::MUL32r;
-      MOpc = IsSigned ? X86::IMUL32m : X86::MUL32m;
-      LoReg = X86::EAX; HiReg = X86::EDX;
+      Opc  = UseMULX ? X86::MULX32rr :
+             IsSigned ? X86::IMUL32r : X86::MUL32r;
+      MOpc = UseMULX ? X86::MULX32rm :
+             IsSigned ? X86::IMUL32m : X86::MUL32m;
+      LoReg = UseMULX ? X86::EDX : X86::EAX;
+      HiReg = X86::EDX;
       break;
     case MVT::i64:
-      Opc  = IsSigned ? X86::IMUL64r : X86::MUL64r;
-      MOpc = IsSigned ? X86::IMUL64m : X86::MUL64m;
-      LoReg = X86::RAX; HiReg = X86::RDX;
+      Opc  = UseMULX ? X86::MULX64rr :
+             IsSigned ? X86::IMUL64r : X86::MUL64r;
+      MOpc = UseMULX ? X86::MULX64rm :
+             IsSigned ? X86::IMUL64m : X86::MUL64m;
+      LoReg = UseMULX ? X86::RDX : X86::RAX;
+      HiReg = X86::RDX;
       break;
     }
 
@@ -4783,15 +4790,24 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 
     SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                           N0, SDValue()).getValue(1);
+    SDValue ResHi, ResLo;
     if (foldedLoad) {
       SDValue Chain;
       MachineSDNode *CNode = nullptr;
       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                         InFlag };
-      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
-      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
-      Chain = SDValue(CNode, 0);
-      InFlag = SDValue(CNode, 1);
+      if (UseMULX) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
+        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        Chain = SDValue(CNode, 2);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        Chain = SDValue(CNode, 0);
+        InFlag = SDValue(CNode, 1);
+      }
 
       // Update the chain.
       ReplaceUses(N1.getValue(1), Chain);
@@ -4799,27 +4815,38 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     } else {
       SDValue Ops[] = { N1, InFlag };
-      SDVTList VTs = CurDAG->getVTList(MVT::Glue);
-      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
-      InFlag = SDValue(CNode, 0);
+      if (UseMULX) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        InFlag = SDValue(CNode, 0);
+      }
     }
 
     // Copy the low half of the result, if it is needed.
     if (!SDValue(Node, 0).use_empty()) {
-      assert(LoReg && "Register for low half is not defined!");
-      SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
-                                             NVT, InFlag);
-      InFlag = ResLo.getValue(2);
+      if (!ResLo) {
+        assert(LoReg && "Register for low half is not defined!");
+        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
+                                       NVT, InFlag);
+        InFlag = ResLo.getValue(2);
+      }
       ReplaceUses(SDValue(Node, 0), ResLo);
       LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                  dbgs() << '\n');
     }
     // Copy the high half of the result, if it is needed.
     if (!SDValue(Node, 1).use_empty()) {
-      assert(HiReg && "Register for high half is not defined!");
-      SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
-                                             NVT, InFlag);
-      InFlag = ResHi.getValue(2);
+      if (!ResHi) {
+        assert(HiReg && "Register for high half is not defined!");
+        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
+                                       NVT, InFlag);
+        InFlag = ResHi.getValue(2);
+      }
       ReplaceUses(SDValue(Node, 1), ResHi);
       LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                  dbgs() << '\n');
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 9843bf81e9053..b321820cf506a 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -837,18 +837,16 @@ define i64 @load_fold_udiv1(i64* %p) {
 ;
 ; CHECK-O3-CUR-LABEL: load_fold_udiv1:
 ; CHECK-O3-CUR:       # %bb.0:
-; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
-; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT:    mulq %rcx
-; CHECK-O3-CUR-NEXT:    movq %rdx, %rax
+; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
+; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-CUR-NEXT:    mulxq %rax, %rcx, %rax
 ; CHECK-O3-CUR-NEXT:    shrq $3, %rax
 ; CHECK-O3-CUR-NEXT:    retq
 ;
 ; CHECK-O3-EX-LABEL: load_fold_udiv1:
 ; CHECK-O3-EX:       # %bb.0:
-; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT:    mulq (%rdi)
-; CHECK-O3-EX-NEXT:    movq %rdx, %rax
+; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rcx, %rax
 ; CHECK-O3-EX-NEXT:    shrq $3, %rax
 ; CHECK-O3-EX-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
@@ -1033,15 +1031,14 @@ define i64 @load_fold_urem1(i64* %p) {
 ;
 ; CHECK-O3-LABEL: load_fold_urem1:
 ; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movq (%rdi), %rcx
-; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-NEXT:    movq %rcx, %rax
-; CHECK-O3-NEXT:    mulq %rdx
+; CHECK-O3-NEXT:    movq (%rdi), %rax
+; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; CHECK-O3-NEXT:    movq %rax, %rdx
+; CHECK-O3-NEXT:    mulxq %rcx, %rcx, %rdx
 ; CHECK-O3-NEXT:    shrq $3, %rdx
-; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rax
-; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
-; CHECK-O3-NEXT:    subq %rax, %rcx
-; CHECK-O3-NEXT:    movq %rcx, %rax
+; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rcx
+; CHECK-O3-NEXT:    leaq (%rcx,%rcx,2), %rcx
+; CHECK-O3-NEXT:    subq %rcx, %rax
 ; CHECK-O3-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
   %ret = urem i64 %v, 15
@@ -1694,28 +1691,28 @@ define void @rmw_fold_sdiv2(i64* %p, i64 %v) {
 define void @rmw_fold_udiv1(i64* %p, i64 %v) {
 ; CHECK-O0-LABEL: rmw_fold_udiv1:
 ; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movq (%rdi), %rax
-; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O0-NEXT:    mulq %rcx
-; CHECK-O0-NEXT:    shrq $3, %rdx
-; CHECK-O0-NEXT:    movq %rdx, (%rdi)
+; CHECK-O0-NEXT:    movq (%rdi), %rdx
+; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O0-NEXT:    mulxq %rax, %rcx, %rax
+; CHECK-O0-NEXT:    shrq $3, %rax
+; CHECK-O0-NEXT:    movq %rax, (%rdi)
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-CUR-LABEL: rmw_fold_udiv1:
 ; CHECK-O3-CUR:       # %bb.0:
-; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
-; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT:    mulq %rcx
-; CHECK-O3-CUR-NEXT:    shrq $3, %rdx
-; CHECK-O3-CUR-NEXT:    movq %rdx, (%rdi)
+; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
+; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rcx
+; CHECK-O3-CUR-NEXT:    shrq $3, %rcx
+; CHECK-O3-CUR-NEXT:    movq %rcx, (%rdi)
 ; CHECK-O3-CUR-NEXT:    retq
 ;
 ; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
 ; CHECK-O3-EX:       # %bb.0:
-; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT:    mulq (%rdi)
-; CHECK-O3-EX-NEXT:    shrq $3, %rdx
-; CHECK-O3-EX-NEXT:    movq %rdx, (%rdi)
+; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rcx
+; CHECK-O3-EX-NEXT:    shrq $3, %rcx
+; CHECK-O3-EX-NEXT:    movq %rcx, (%rdi)
 ; CHECK-O3-EX-NEXT:    retq
   %prev = load atomic i64, i64* %p unordered, align 8
   %val = udiv i64 %prev, 15
@@ -1842,27 +1839,25 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movq (%rdi), %rax
 ; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT:    mulq %rcx
-; CHECK-O0-NEXT:    shrq $3, %rdx
-; CHECK-O0-NEXT:    leaq (%rdx,%rdx,4), %rax
-; CHECK-O0-NEXT:    leaq (%rax,%rax,2), %rax
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-O0-NEXT:    subq %rax, %rcx
-; CHECK-O0-NEXT:    movq %rcx, (%rdi)
+; CHECK-O0-NEXT:    movq %rax, %rdx
+; CHECK-O0-NEXT:    mulxq %rcx, %rdx, %rcx
+; CHECK-O0-NEXT:    shrq $3, %rcx
+; CHECK-O0-NEXT:    leaq (%rcx,%rcx,4), %rcx
+; CHECK-O0-NEXT:    leaq (%rcx,%rcx,2), %rcx
+; CHECK-O0-NEXT:    subq %rcx, %rax
+; CHECK-O0-NEXT:    movq %rax, (%rdi)
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-LABEL: rmw_fold_urem1:
 ; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movq (%rdi), %rcx
-; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-NEXT:    movq %rcx, %rax
-; CHECK-O3-NEXT:    mulq %rdx
-; CHECK-O3-NEXT:    shrq $3, %rdx
-; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rax
+; CHECK-O3-NEXT:    movq (%rdi), %rdx
+; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-NEXT:    mulxq %rax, %rax, %rcx
+; CHECK-O3-NEXT:    shrq $3, %rcx
+; CHECK-O3-NEXT:    leaq (%rcx,%rcx,4), %rax
 ; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
-; CHECK-O3-NEXT:    subq %rax, %rcx
-; CHECK-O3-NEXT:    movq %rcx, (%rdi)
+; CHECK-O3-NEXT:    subq %rax, %rdx
+; CHECK-O3-NEXT:    movq %rdx, (%rdi)
 ; CHECK-O3-NEXT:    retq
   %prev = load atomic i64, i64* %p unordered, align 8
   %val = urem i64 %prev, 15
diff --git a/llvm/test/CodeGen/X86/bmi2-x86_64.ll b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
index 6333732ae0f26..bb03138ccf763 100644
--- a/llvm/test/CodeGen/X86/bmi2-x86_64.ll
+++ b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
@@ -68,8 +68,8 @@ define i64 @mulx64(i64 %x, i64 %y, i64* %p)   {
 ; CHECK-LABEL: mulx64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq %rsi
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    mulxq %rsi, %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, (%rcx)
 ; CHECK-NEXT:    retq
   %x1 = zext i64 %x to i128
@@ -86,8 +86,8 @@ define i64 @mulx64_load(i64 %x, i64* %y, i64* %p)   {
 ; CHECK-LABEL: mulx64_load:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq (%rsi)
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    mulxq (%rsi), %rax, %rdx
 ; CHECK-NEXT:    movq %rdx, (%rcx)
 ; CHECK-NEXT:    retq
   %y1 = load i64, i64* %y
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index 114f9ac5479af..bf78cb4f72efb 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -120,11 +120,11 @@ define i32 @mulx32(i32 %x, i32 %y, i32* %p)   {
 ; X86-LABEL: mulx32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    mulxl %eax, %eax, %edx
 ; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
@@ -156,10 +156,10 @@ define i32 @mulx32_load(i32 %x, i32* %y, i32* %p)   {
 ; X86-LABEL: mulx32_load:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    mull (%edx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    mulxl (%eax), %eax, %edx
 ; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/hoist-invariant-load.ll b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
index 13b72bdfc6dc7..73cf898223bc6 100644
--- a/llvm/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
@@ -215,22 +215,21 @@ declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
 define void @test_multi_def(i64* dereferenceable(8) %x1,
 ; CHECK-LABEL: test_multi_def:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq %rdx, %r8
-; CHECK-NEXT:    xorl %r9d, %r9d
-; CHECK-NEXT:    movq (%rdi), %rdi
-; CHECK-NEXT:    movq (%rsi), %rsi
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    xorl %r8d, %r8d
+; CHECK-NEXT:    movq (%rdi), %rdx
+; CHECK-NEXT:    movq (%rsi), %r9
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB4_2: ## %for.body
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq %rsi
-; CHECK-NEXT:    addq %rax, (%r8)
-; CHECK-NEXT:    adcq %rdx, 8(%r8)
+; CHECK-NEXT:    mulxq %r9, %rsi, %rdi
+; CHECK-NEXT:    addq %rsi, (%rax)
+; CHECK-NEXT:    adcq %rdi, 8(%rax)
 ; CHECK-NEXT:  ## %bb.1: ## %for.check
 ; CHECK-NEXT:    ## in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT:    incq %r9
-; CHECK-NEXT:    addq $16, %r8
-; CHECK-NEXT:    cmpq %rcx, %r9
+; CHECK-NEXT:    incq %r8
+; CHECK-NEXT:    addq $16, %rax
+; CHECK-NEXT:    cmpq %rcx, %r8
 ; CHECK-NEXT:    jl LBB4_2
 ; CHECK-NEXT:  ## %bb.3: ## %exit
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index e40f10a67dd1b..45834f2eeecd3 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -7,48 +7,86 @@
 ; PR1198
 
 define i64 @foo(i64 %x, i64 %y) nounwind {
-; X86-LABEL: foo:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
+; X86-NOBMI-LABEL: foo:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %ecx, %eax
+; X86-NOBMI-NEXT:    mull %ebx
+; X86-NOBMI-NEXT:    movl %edx, %edi
+; X86-NOBMI-NEXT:    movl %ebp, %eax
+; X86-NOBMI-NEXT:    mull %ebx
+; X86-NOBMI-NEXT:    movl %edx, %ebx
+; X86-NOBMI-NEXT:    movl %eax, %ebp
+; X86-NOBMI-NEXT:    addl %edi, %ebp
+; X86-NOBMI-NEXT:    adcl $0, %ebx
+; X86-NOBMI-NEXT:    movl %ecx, %eax
+; X86-NOBMI-NEXT:    mull %esi
+; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    addl %ebp, %eax
+; X86-NOBMI-NEXT:    adcl %ebx, %ecx
+; X86-NOBMI-NEXT:    setb %al
+; X86-NOBMI-NEXT:    movzbl %al, %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    mull %esi
+; X86-NOBMI-NEXT:    addl %ecx, %eax
+; X86-NOBMI-NEXT:    adcl %edi, %edx
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
+; X86-NOBMI-NEXT:    retl
 ;
-; X64-LABEL: foo:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    retq
+; X86-BMI-LABEL: foo:
+; X86-BMI:       # %bb.0:
+; X86-BMI-NEXT:    pushl %ebp
+; X86-BMI-NEXT:    pushl %ebx
+; X86-BMI-NEXT:    pushl %edi
+; X86-BMI-NEXT:    pushl %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    movl %eax, %edx
+; X86-BMI-NEXT:    mulxl %esi, %edx, %ebx
+; X86-BMI-NEXT:    movl %ecx, %edx
+; X86-BMI-NEXT:    mulxl %esi, %esi, %ebp
+; X86-BMI-NEXT:    addl %ebx, %esi
+; X86-BMI-NEXT:    adcl $0, %ebp
+; X86-BMI-NEXT:    movl %eax, %edx
+; X86-BMI-NEXT:    mulxl %edi, %eax, %ebx
+; X86-BMI-NEXT:    addl %esi, %eax
+; X86-BMI-NEXT:    adcl %ebp, %ebx
+; X86-BMI-NEXT:    setb %al
+; X86-BMI-NEXT:    movzbl %al, %esi
+; X86-BMI-NEXT:    movl %ecx, %edx
+; X86-BMI-NEXT:    mulxl %edi, %eax, %edx
+; X86-BMI-NEXT:    addl %ebx, %eax
+; X86-BMI-NEXT:    adcl %esi, %edx
+; X86-BMI-NEXT:    popl %esi
+; X86-BMI-NEXT:    popl %edi
+; X86-BMI-NEXT:    popl %ebx
+; X86-BMI-NEXT:    popl %ebp
+; X86-BMI-NEXT:    retl
+;
+; X64-NOBMI-LABEL: foo:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    mulq %rsi
+; X64-NOBMI-NEXT:    movq %rdx, %rax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI-LABEL: foo:
+; X64-BMI:       # %bb.0:
+; X64-BMI-NEXT:    movq %rdi, %rdx
+; X64-BMI-NEXT:    mulxq %rsi, %rcx, %rax
+; X64-BMI-NEXT:    retq
   %tmp0 = zext i64 %x to i128
   %tmp1 = zext i64 %y to i128
   %tmp2 = mul i128 %tmp0, %tmp1
@@ -62,107 +100,202 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; zero-extended value.
 
 define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind {
-; X86-LABEL: mul1:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    je .LBB1_3
-; X86-NEXT:  # %bb.1: # %for.body.preheader
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB1_2: # %for.body
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax,%ebp,8), %esi
-; X86-NEXT:    movl 4(%eax,%ebp,8), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %bl, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, (%esi,%ebp,8)
-; X86-NEXT:    movl %edi, 4(%esi,%ebp,8)
-; X86-NEXT:    addl $1, %ebp
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    jne .LBB1_2
-; X86-NEXT:  .LBB1_3: # %for.end
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
+; X86-NOBMI-LABEL: mul1:
+; X86-NOBMI:       # %bb.0: # %entry
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $24, %esp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    orl %ecx, %eax
+; X86-NOBMI-NEXT:    je .LBB1_3
+; X86-NOBMI-NEXT:  # %bb.1: # %for.body.preheader
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    xorl %ebp, %ebp
+; X86-NOBMI-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NOBMI-NEXT:    .p2align 4, 0x90
+; X86-NOBMI-NEXT:  .LBB1_2: # %for.body
+; X86-NOBMI-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NOBMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl (%eax,%ebp,8), %esi
+; X86-NOBMI-NEXT:    movl 4(%eax,%ebp,8), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    mull %edi
+; X86-NOBMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl %ecx, %eax
+; X86-NOBMI-NEXT:    mull %edi
+; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    movl %eax, %ebx
+; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NOBMI-NEXT:    adcl $0, %ecx
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    mull %edx
+; X86-NOBMI-NEXT:    movl %edx, %esi
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    addl %ebx, %edi
+; X86-NOBMI-NEXT:    adcl %ecx, %esi
+; X86-NOBMI-NEXT:    setb %bl
+; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOBMI-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    addl %esi, %eax
+; X86-NOBMI-NEXT:    movzbl %bl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    adcl %esi, %edx
+; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NOBMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NOBMI-NEXT:    adcl $0, %eax
+; X86-NOBMI-NEXT:    adcl $0, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %ecx, (%esi,%ebp,8)
+; X86-NOBMI-NEXT:    movl %edi, 4(%esi,%ebp,8)
+; X86-NOBMI-NEXT:    addl $1, %ebp
+; X86-NOBMI-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-NOBMI-NEXT:    adcl $0, %edi
+; X86-NOBMI-NEXT:    movl %ebp, %esi
+; X86-NOBMI-NEXT:    xorl %ebx, %esi
+; X86-NOBMI-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NOBMI-NEXT:    xorl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    orl %esi, %edi
+; X86-NOBMI-NEXT:    jne .LBB1_2
+; X86-NOBMI-NEXT:  .LBB1_3: # %for.end
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    addl $24, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI-LABEL: mul1:
+; X86-BMI:       # %bb.0: # %entry
+; X86-BMI-NEXT:    pushl %ebp
+; X86-BMI-NEXT:    pushl %ebx
+; X86-BMI-NEXT:    pushl %edi
+; X86-BMI-NEXT:    pushl %esi
+; X86-BMI-NEXT:    subl $16, %esp
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    orl %ecx, %eax
+; X86-BMI-NEXT:    je .LBB1_3
+; X86-BMI-NEXT:  # %bb.1: # %for.body.preheader
+; X86-BMI-NEXT:    xorl %ecx, %ecx
+; X86-BMI-NEXT:    xorl %edx, %edx
+; X86-BMI-NEXT:    xorl %ebx, %ebx
+; X86-BMI-NEXT:    xorl %ebp, %ebp
+; X86-BMI-NEXT:    .p2align 4, 0x90
+; X86-BMI-NEXT:  .LBB1_2: # %for.body
+; X86-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-BMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl (%ecx,%ebx,8), %eax
+; X86-BMI-NEXT:    movl 4(%ecx,%ebx,8), %esi
+; X86-BMI-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-BMI-NEXT:    movl %eax, %edx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    mulxl %ecx, %edx, %edi
+; X86-BMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    movl %esi, %edx
+; X86-BMI-NEXT:    mulxl %ecx, %esi, %ecx
+; X86-BMI-NEXT:    addl %edi, %esi
+; X86-BMI-NEXT:    adcl $0, %ecx
+; X86-BMI-NEXT:    movl %eax, %edx
+; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %edi, %eax
+; X86-BMI-NEXT:    addl %esi, %edi
+; X86-BMI-NEXT:    adcl %ecx, %eax
+; X86-BMI-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %ecx, %edx
+; X86-BMI-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-BMI-NEXT:    addl %eax, %ecx
+; X86-BMI-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-BMI-NEXT:    adcl %eax, %edx
+; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-BMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-BMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-BMI-NEXT:    adcl $0, %ecx
+; X86-BMI-NEXT:    adcl $0, %edx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    movl %esi, (%eax,%ebx,8)
+; X86-BMI-NEXT:    movl %edi, 4(%eax,%ebx,8)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    addl $1, %ebx
+; X86-BMI-NEXT:    adcl $0, %ebp
+; X86-BMI-NEXT:    movl %ebx, %eax
+; X86-BMI-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    movl %ebp, %esi
+; X86-BMI-NEXT:    xorl %edi, %esi
+; X86-BMI-NEXT:    orl %eax, %esi
+; X86-BMI-NEXT:    jne .LBB1_2
+; X86-BMI-NEXT:  .LBB1_3: # %for.end
+; X86-BMI-NEXT:    xorl %eax, %eax
+; X86-BMI-NEXT:    xorl %edx, %edx
+; X86-BMI-NEXT:    addl $16, %esp
+; X86-BMI-NEXT:    popl %esi
+; X86-BMI-NEXT:    popl %edi
+; X86-BMI-NEXT:    popl %ebx
+; X86-BMI-NEXT:    popl %ebp
+; X86-BMI-NEXT:    retl
+;
+; X64-NOBMI-LABEL: mul1:
+; X64-NOBMI:       # %bb.0: # %entry
+; X64-NOBMI-NEXT:    testq %rdi, %rdi
+; X64-NOBMI-NEXT:    je .LBB1_3
+; X64-NOBMI-NEXT:  # %bb.1: # %for.body.preheader
+; X64-NOBMI-NEXT:    movq %rcx, %r8
+; X64-NOBMI-NEXT:    movq %rdx, %r9
+; X64-NOBMI-NEXT:    xorl %r10d, %r10d
+; X64-NOBMI-NEXT:    xorl %ecx, %ecx
+; X64-NOBMI-NEXT:    .p2align 4, 0x90
+; X64-NOBMI-NEXT:  .LBB1_2: # %for.body
+; X64-NOBMI-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NOBMI-NEXT:    movq %r8, %rax
+; X64-NOBMI-NEXT:    mulq (%r9,%rcx,8)
+; X64-NOBMI-NEXT:    addq %r10, %rax
+; X64-NOBMI-NEXT:    adcq $0, %rdx
+; X64-NOBMI-NEXT:    movq %rax, (%rsi,%rcx,8)
+; X64-NOBMI-NEXT:    incq %rcx
+; X64-NOBMI-NEXT:    cmpq %rcx, %rdi
+; X64-NOBMI-NEXT:    movq %rdx, %r10
+; X64-NOBMI-NEXT:    jne .LBB1_2
+; X64-NOBMI-NEXT:  .LBB1_3: # %for.end
+; X64-NOBMI-NEXT:    xorl %eax, %eax
+; X64-NOBMI-NEXT:    retq
 ;
-; X64-LABEL: mul1:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    testq %rdi, %rdi
-; X64-NEXT:    je .LBB1_3
-; X64-NEXT:  # %bb.1: # %for.body.preheader
-; X64-NEXT:    movq %rcx, %r8
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    xorl %r10d, %r10d
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB1_2: # %for.body
-; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq (%r9,%rcx,8)
-; X64-NEXT:    addq %r10, %rax
-; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    movq %rax, (%rsi,%rcx,8)
-; X64-NEXT:    incq %rcx
-; X64-NEXT:    cmpq %rcx, %rdi
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    jne .LBB1_2
-; X64-NEXT:  .LBB1_3: # %for.end
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
+; X64-BMI-LABEL: mul1:
+; X64-BMI:       # %bb.0: # %entry
+; X64-BMI-NEXT:    testq %rdi, %rdi
+; X64-BMI-NEXT:    je .LBB1_3
+; X64-BMI-NEXT:  # %bb.1: # %for.body.preheader
+; X64-BMI-NEXT:    movq %rcx, %r8
+; X64-BMI-NEXT:    movq %rdx, %r9
+; X64-BMI-NEXT:    xorl %r10d, %r10d
+; X64-BMI-NEXT:    xorl %ecx, %ecx
+; X64-BMI-NEXT:    .p2align 4, 0x90
+; X64-BMI-NEXT:  .LBB1_2: # %for.body
+; X64-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-BMI-NEXT:    movq %r8, %rdx
+; X64-BMI-NEXT:    mulxq (%r9,%rcx,8), %rax, %rdx
+; X64-BMI-NEXT:    addq %r10, %rax
+; X64-BMI-NEXT:    adcq $0, %rdx
+; X64-BMI-NEXT:    movq %rax, (%rsi,%rcx,8)
+; X64-BMI-NEXT:    incq %rcx
+; X64-BMI-NEXT:    cmpq %rcx, %rdi
+; X64-BMI-NEXT:    movq %rdx, %r10
+; X64-BMI-NEXT:    jne .LBB1_2
+; X64-BMI-NEXT:  .LBB1_3: # %for.end
+; X64-BMI-NEXT:    xorl %eax, %eax
+; X64-BMI-NEXT:    retq
 entry:
   %conv = zext i64 %y to i128
   %cmp11 = icmp eq i64 %n, 0
diff --git a/llvm/test/CodeGen/X86/mulx32.ll b/llvm/test/CodeGen/X86/mulx32.ll
index faf299f3a2dfa..872e72d503aa3 100644
--- a/llvm/test/CodeGen/X86/mulx32.ll
+++ b/llvm/test/CodeGen/X86/mulx32.ll
@@ -5,8 +5,8 @@
 define i64 @f1(i32 %a, i32 %b) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    mull {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    mulxl {{[0-9]+}}(%esp), %eax, %edx
 ; CHECK-NEXT:    retl
   %x = zext i32 %a to i64
   %y = zext i32 %b to i64
@@ -17,9 +17,9 @@ define i64 @f1(i32 %a, i32 %b) {
 define i64 @f2(i32 %a, i32* %p) {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    mull (%ecx)
+; CHECK-NEXT:    mulxl (%eax), %eax, %edx
 ; CHECK-NEXT:    retl
   %b = load i32, i32* %p
   %x = zext i32 %a to i64
diff --git a/llvm/test/CodeGen/X86/mulx64.ll b/llvm/test/CodeGen/X86/mulx64.ll
index 38f1d3ea5ab32..e038f33000937 100644
--- a/llvm/test/CodeGen/X86/mulx64.ll
+++ b/llvm/test/CodeGen/X86/mulx64.ll
@@ -5,8 +5,8 @@
 define i128 @f1(i64 %a, i64 %b) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq %rsi
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    mulxq %rsi, %rax, %rdx
 ; CHECK-NEXT:    retq
   %x = zext i64 %a to i128
   %y = zext i64 %b to i128
@@ -17,8 +17,8 @@ define i128 @f1(i64 %a, i64 %b) {
 define i128 @f2(i64 %a, i64* %p) {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq (%rsi)
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    mulxq (%rsi), %rax, %rdx
 ; CHECK-NEXT:    retq
   %b = load i64, i64* %p
   %x = zext i64 %a to i128
diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll
index a97af6a1ac67f..07fb37f4b62a8 100644
--- a/llvm/test/CodeGen/X86/pr35636.ll
+++ b/llvm/test/CodeGen/X86/pr35636.ll
@@ -5,11 +5,11 @@
 define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; HSW-LABEL: _Z15uint64_to_asciimPc:
 ; HSW:       # %bb.0: # %bb
-; HSW-NEXT:    movq %rdi, %rax
-; HSW-NEXT:    movabsq $811296384146066817, %rcx # imm = 0xB424DC35095CD81
-; HSW-NEXT:    mulq %rcx
-; HSW-NEXT:    shrq $42, %rdx
-; HSW-NEXT:    imulq $281474977, %rdx, %rax # imm = 0x10C6F7A1
+; HSW-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; HSW-NEXT:    movq %rdi, %rdx
+; HSW-NEXT:    mulxq %rax, %rax, %rcx
+; HSW-NEXT:    shrq $42, %rcx
+; HSW-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
 ; HSW-NEXT:    shrq $20, %rax
 ; HSW-NEXT:    leal (%rax,%rax,4), %eax
 ; HSW-NEXT:    addl $5, %eax
@@ -22,11 +22,11 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ;
 ; ZN-LABEL: _Z15uint64_to_asciimPc:
 ; ZN:       # %bb.0: # %bb
-; ZN-NEXT:    movq %rdi, %rax
-; ZN-NEXT:    movabsq $811296384146066817, %rcx # imm = 0xB424DC35095CD81
-; ZN-NEXT:    mulq %rcx
-; ZN-NEXT:    shrq $42, %rdx
-; ZN-NEXT:    imulq $281474977, %rdx, %rax # imm = 0x10C6F7A1
+; ZN-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; ZN-NEXT:    movq %rdi, %rdx
+; ZN-NEXT:    mulxq %rax, %rax, %rcx
+; ZN-NEXT:    shrq $42, %rcx
+; ZN-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
 ; ZN-NEXT:    shrq $20, %rax
 ; ZN-NEXT:    leal 5(%rax,%rax,4), %eax
 ; ZN-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF

From fe9d8442e0dfc8c83e9a0a31f5079e7a70b54d9d Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 27 May 2020 12:01:36 -0700
Subject: [PATCH 264/770] [lldb/Test] Generate YAML binary in build directory

Although it's not entirely clear to me why, this test was generating its
binary in the source directory instead of the build directory. This
patch fixes that following the same approach as other tests.
---
 .../show_location/TestShowLocationDwarf5.py            | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/lldb/test/API/functionalities/show_location/TestShowLocationDwarf5.py b/lldb/test/API/functionalities/show_location/TestShowLocationDwarf5.py
index 1d4bc6f134500..76d24d5d4e521 100644
--- a/lldb/test/API/functionalities/show_location/TestShowLocationDwarf5.py
+++ b/lldb/test/API/functionalities/show_location/TestShowLocationDwarf5.py
@@ -14,17 +14,9 @@ class TestTargetSourceMap(TestBase):
     def test_source_map(self):
         # Set the target soure map to map "./" to the current test directory.
         yaml_path = os.path.join(self.getSourceDir(), "a.yaml")
-        yaml_base, ext = os.path.splitext(yaml_path)
-        obj_path = self.getBuildArtifact(yaml_base)
+        obj_path = self.getBuildArtifact('a.out')
         self.yaml2obj(yaml_path, obj_path)
 
-        def cleanup():
-            if os.path.exists(obj_path):
-                os.unlink(obj_path)
-
-        # Execute the cleanup function during test case tear down.
-        self.addTearDownHook(cleanup)
-
         # Create a target with the object file we just created from YAML
         target = self.dbg.CreateTarget(obj_path)
 

From c30c2368c77f05a1447bb7442c6ac2fad2912a57 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 27 May 2020 12:07:08 -0700
Subject: [PATCH 265/770] [lldb/Reproducers] Skip tests relying on timeouts

The reproducers don't model timeouts, so tests that rely on them end up
with unexpected packets during replay. Skip them until we can handle
this scenario.
---
 .../API/commands/expression/no-deadlock/TestExprDoesntBlock.py   | 1 +
 lldb/test/API/commands/expression/timeout/TestCallWithTimeout.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py b/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py
index d7d963390b051..3423ec6e6ab9b 100644
--- a/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py
+++ b/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py
@@ -17,6 +17,7 @@ class ExprDoesntDeadlockTestCase(TestBase):
 
     @expectedFailureAll(oslist=['freebsd'], bugnumber='llvm.org/pr17946')
     @add_test_categories(["basic_process"])
+    @skipIfReproducer # Timeouts are not currently modeled.
     def test_with_run_command(self):
         """Test that expr will time out and allow other threads to run if it blocks."""
         self.build()
diff --git a/lldb/test/API/commands/expression/timeout/TestCallWithTimeout.py b/lldb/test/API/commands/expression/timeout/TestCallWithTimeout.py
index 42e28a5a440a8..36ed7ce26de13 100644
--- a/lldb/test/API/commands/expression/timeout/TestCallWithTimeout.py
+++ b/lldb/test/API/commands/expression/timeout/TestCallWithTimeout.py
@@ -26,6 +26,7 @@ def setUp(self):
         oslist=[
             "windows"],
         bugnumber="llvm.org/pr21765")
+    @skipIfReproducer # Timeouts are not currently modeled.
     def test(self):
         """Test calling std::String member function."""
         self.build()

From 334552150770faaa407fecab42f5333bb2a898a6 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Tue, 26 May 2020 15:40:43 -0700
Subject: [PATCH 266/770] Also cache negative results in GetXcodeSDKPath (NFC)

This fixes a performance issue in the failure case.

rdar://63547920

Differential Revision: https://reviews.llvm.org/D80595
---
 lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
index 79ccc5277d2e0..cb6f03465ef70 100644
--- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
@@ -367,8 +367,10 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
   static std::mutex g_sdk_path_mutex;
 
   std::lock_guard<std::mutex> guard(g_sdk_path_mutex);
-  std::string &path = g_sdk_path[sdk.GetString()];
-  if (path.empty())
-    path = GetXcodeSDK(sdk);
+  auto it = g_sdk_path.find(sdk.GetString());
+  if (it != g_sdk_path.end())
+    return it->second;
+  std::string path = GetXcodeSDK(sdk);
+  g_sdk_path.insert({sdk.GetString(), path});
   return path;
 }

From eb1092ada32d6855dcb4f763ce48ede21f4d7441 Mon Sep 17 00:00:00 2001
From: alex-t <alexander.timofeev@amd.com>
Date: Mon, 18 May 2020 23:40:27 +0300
Subject: [PATCH 267/770] [AMDGPU] Fix for the lost CarryOut/CarryIn register
 operands in S_ADD/SUB_CO_PSEUDO.

Summary: This fixes the 5b898bddff51 bug in which the carry-in and carry-out registers were lost when lowering S_ADD/SUB_CO_PSEUDO.

Reviewers: rampitec, arsenm

Reviewed By: arsenm

Subscribers: msearles, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80158
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 18 ++++---
 .../AMDGPU/s_add_co_pseudo_lowering.mir       | 50 +++++++++++++++++++
 2 files changed, 62 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 18d08362512d4..5392abfa8f6e5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5248,18 +5248,24 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
                          ? AMDGPU::V_ADDC_U32_e64
                          : AMDGPU::V_SUBB_U32_e64;
       const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-      Register DummyCReg = MRI.createVirtualRegister(CarryRC);
-      Register CarryReg = MRI.createVirtualRegister(CarryRC);
+
+      Register CarryInReg = Inst.getOperand(4).getReg();
+      if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
+        Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
+        BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
+            .addReg(CarryInReg);
+      }
+
+      Register CarryOutReg = Inst.getOperand(1).getReg();
+
       Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
           MRI.getRegClass(Inst.getOperand(0).getReg())));
-      BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), CarryReg)
-          .addReg(Inst.getOperand(4).getReg());
       MachineInstr *CarryOp =
           BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
-              .addReg(DummyCReg, RegState::Define | RegState::Dead)
+              .addReg(CarryOutReg, RegState::Define)
               .add(Inst.getOperand(2))
               .add(Inst.getOperand(3))
-              .addReg(CarryReg, RegState::Kill)
+              .addReg(CarryInReg)
               .addImm(0);
       legalizeOperands(*CarryOp);
       MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
diff --git a/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir b/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir
new file mode 100644
index 0000000000000..40bdf8e643175
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir
@@ -0,0 +1,50 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-sgpr-copies  %s -o - | FileCheck -check-prefix=GCN %s
+---
+name:            s_add_co_pseudo_test
+tracksRegLiveness: true
+body:             |
+
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr0, $sgpr1, $sgpr2
+    ; GCN-LABEL: name: s_add_co_pseudo_test
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr0, $sgpr1, $sgpr2
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GCN: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GCN: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+    ; GCN: [[COPY6:%[0-9]+]]:sgpr_32 = COPY [[COPY3]]
+    ; GCN: [[V_MUL_LO_U32_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[COPY]], [[COPY4]], implicit $exec
+    ; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 killed [[V_MUL_LO_U32_]], [[COPY6]], 0, implicit $exec
+    ; GCN: [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[COPY4]], [[COPY5]]
+    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -614296167
+    ; GCN: [[V_MUL_LO_U32_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[COPY]], [[COPY3]], implicit $exec
+    ; GCN: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_]]
+    ; GCN: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 killed [[V_MUL_LO_U32_1]], [[COPY7]], [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GCN: [[V_MUL_HI_U32_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32 [[COPY4]], [[V_ADDC_U32_e64_]], implicit $exec
+    ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -181084736
+    ; GCN: [[V_MUL_LO_U32_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_MUL_HI_U32_]], [[S_MOV_B32_1]], implicit $exec
+    ; GCN: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_1]]
+    ; GCN: [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY8]], killed [[V_MUL_LO_U32_2]], [[V_ADDC_U32_e64_1]], 0, implicit $exec
+    %0:vgpr_32 = COPY $vgpr0
+    %6:sreg_32 = COPY %0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+    %3:sreg_32 = COPY $sgpr0
+    %4:sreg_32 = COPY $sgpr1
+    %5:sreg_32 = COPY $sgpr2
+    %20:vgpr_32 = COPY %3
+    %7:sreg_32 = S_MUL_I32 %6, %4
+    %9:vgpr_32, %10:sreg_64_xexec = V_ADD_I32_e64 killed %7, %20, 0, implicit $exec
+    %8:sreg_32 = S_MUL_HI_U32 %4, %5
+    %11:sreg_32 = S_MOV_B32 -614296167
+    %12:sreg_32 = S_MUL_I32 %6, %3
+    %14:sreg_32, %13:sreg_64_xexec = S_ADD_CO_PSEUDO killed %12, killed %11, killed %10, implicit-def dead $scc
+    %15:sreg_32 = S_MUL_HI_U32 %4, %14
+    %16:sreg_32 = S_MOV_B32 -181084736
+    %17:sreg_32 = S_MUL_I32 %15, %16
+    %19:sreg_32, %18:sreg_64_xexec = S_ADD_CO_PSEUDO killed %16, killed %17, killed %13, implicit-def dead $scc
+...

From d24dd2b279ffe60d579b425fb74f6e4904323a34 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Wed, 27 May 2020 21:47:13 +0200
Subject: [PATCH 268/770] tsan: fix test in debug mode

sanitizer-x86_64-linux-autoconf has failed after the previous tsan commit:

FAIL: ThreadSanitizer-x86_64 :: java_finalizer2.cpp (245 of 403)
******************** TEST 'ThreadSanitizer-x86_64 :: java_finalizer2.cpp' FAILED ********************
Script:
--
: 'RUN: at line 1';      /b/sanitizer-x86_64-linux-autoconf/build/tsan_debug_build/./bin/clang  --driver-mode=g++ -fsanitize=thread -Wall  -m64   -gline-tables-only -I/b/sanitizer-x86_64-linux-autoconf/build/llvm-project/compiler-rt/test/tsan/../ -std=c++11 -I/b/sanitizer-x86_64-linux-autoconf/build/llvm-project/compiler-rt/test/tsan/../ -nostdinc++ -I/b/sanitizer-x86_64-linux-autoconf/build/tsan_debug_build/tools/clang/runtime/compiler-rt-bins/lib/tsan/libcxx_tsan_x86_64/include/c++/v1 -O1 /b/sanitizer-x86_64-linux-autoconf/build/llvm-project/compiler-rt/test/tsan/java_finalizer2.cpp -o /b/sanitizer-x86_64-linux-autoconf/build/tsan_debug_build/tools/clang/runtime/compiler-rt-bins/test/tsan/X86_64Config/Output/java_finalizer2.cpp.tmp &&  /b/sanitizer-x86_64-linux-autoconf/build/tsan_debug_build/tools/clang/runtime/compiler-rt-bins/test/tsan/X86_64Config/Output/java_finalizer2.cpp.tmp 2>&1 | FileCheck /b/sanitizer-x86_64-linux-autoconf/build/llvm-project/compiler-rt/test/tsan/java_finalizer2.cpp
--
Exit Code: 1

Command Output (stderr):
--
/b/sanitizer-x86_64-linux-autoconf/build/llvm-project/compiler-rt/test/tsan/java_finalizer2.cpp:82:11: error: CHECK: expected string not found in input
// CHECK: DONE
          ^
<stdin>:1:1: note: scanning from here
FATAL: ThreadSanitizer CHECK failed: /b/sanitizer-x86_64-linux-autoconf/build/llvm-project/compiler-rt/lib/tsan/rtl/tsan_sync.cpp:69 "((*meta)) == ((0))" (0x4000003e, 0x0)
^
<stdin>:5:12: note: possible intended match here
 #3 __tsan::OnUserAlloc(__tsan::ThreadState*, unsigned long, unsigned long, unsigned long, bool) /b/sanitizer-x86_64-linux-autoconf/build/llvm-project/compiler-rt/lib/tsan/rtl/tsan_mman.cpp:225:16 (java_finalizer2.cpp.tmp+0x4af407)
           ^

http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-autoconf/builds/51143/steps/test%20tsan%20in%20debug%20compiler-rt%20build/logs/stdio

Fix the heap object overlap by offsetting the Java heap, as other tests do.
---
 compiler-rt/test/tsan/java_finalizer2.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/tsan/java_finalizer2.cpp b/compiler-rt/test/tsan/java_finalizer2.cpp
index f2590f7c40b9d..0cacf3f9adfd7 100644
--- a/compiler-rt/test/tsan/java_finalizer2.cpp
+++ b/compiler-rt/test/tsan/java_finalizer2.cpp
@@ -47,7 +47,7 @@ void *Ballast(void *p) {
 }
 
 int main() {
-  Heap* heap = (Heap*)calloc(sizeof(Heap), 1);
+  Heap* heap = (Heap*)calloc(sizeof(Heap), 2) + 1;
   __tsan_java_init((jptr)heap, sizeof(*heap));
   __tsan_java_alloc((jptr)heap, sizeof(*heap));
   // Ballast threads merely make the bug a bit easier to trigger.

From c593bf534222f2206f89b6a61993125b2475b954 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Fri, 15 May 2020 17:37:12 -0700
Subject: [PATCH 269/770] [GlobalISel] Don't combine instructions which are fed
 by memory instructions.

If we have a memory instruction (e.g. a load), we shouldn't combine it away in
some trivial combine.

It's possible that, say, a call lives between the instructions. This could
modify the value loaded, making the load instructions not safe to fold.

Differential Revision: https://reviews.llvm.org/D80053
---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 24 +++++-
 ...galizercombiner-not-really-equiv-insts.mir | 82 +++++++++++++++++++
 2 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-not-really-equiv-insts.mir

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1d888245af9fb..45b7d991ae727 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1534,8 +1534,28 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
   if (!I2)
     return false;
 
-  // Check for physical registers on the instructions first to avoid cases like
-  // this:
+  // If we have an instruction which loads or stores, we can't guarantee that
+  // it is identical.
+  //
+  // For example, we may have
+  //
+  // %x1 = G_LOAD %addr (load N from @somewhere)
+  // ...
+  // call @foo
+  // ...
+  // %x2 = G_LOAD %addr (load N from @somewhere)
+  // ...
+  // %or = G_OR %x1, %x2
+  //
+  // It's possible that @foo will modify whatever lives at the address we're
+  // loading from. To be safe, let's just assume that all loads and stores
+  // are different (unless we have something which is guaranteed to not
+  // change.)
+  if (I1->mayLoadOrStore() && !I1->isDereferenceableInvariantLoad(nullptr))
+    return false;
+
+  // Check for physical registers on the instructions first to avoid cases
+  // like this:
   //
   // %a = COPY $physreg
   // ...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-not-really-equiv-insts.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-not-really-equiv-insts.mir
new file mode 100644
index 0000000000000..e387c5e58d6fb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-not-really-equiv-insts.mir
@@ -0,0 +1,82 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  @g = external hidden unnamed_addr global i32, align 4
+  define void @not_necessarily_equiv_loads() { ret void }
+  define void @invariant_loads() { ret void }
+  define void @both_have_to_be_invariant() { ret void }
+...
+---
+name:            not_necessarily_equiv_loads
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+
+    ; %load1 || %load2 == %load1 is not necessarily true, even though they
+    ; both load from the same address. Whatever is in that address may be
+    ; changed by another instruction which appears between them.
+    ;
+    ; Check that we don't remove the G_OR.
+
+    ; CHECK-LABEL: name: not_necessarily_equiv_loads
+    ; CHECK: %ptr:_(p0) = G_GLOBAL_VALUE @g
+    ; CHECK: %load1:_(s32) = G_LOAD %ptr(p0) :: (load 4 from @g)
+    ; CHECK: %load2:_(s32) = G_LOAD %ptr(p0) :: (load 4 from @g)
+    ; CHECK: %or:_(s32) = G_OR %load2, %load1
+    ; CHECK: G_STORE %or(s32), %ptr(p0) :: (store 4 into @g)
+    ; CHECK: RET_ReallyLR
+    %ptr:_(p0) = G_GLOBAL_VALUE @g
+    %load1:_(s32) = G_LOAD %ptr(p0) :: (load 4 from @g)
+    %load2:_(s32) = G_LOAD %ptr(p0) :: (load 4 from @g)
+    %or:_(s32) = G_OR %load2, %load1
+    G_STORE %or(s32), %ptr(p0) :: (store 4 into @g)
+    RET_ReallyLR
+
+...
+---
+name:            invariant_loads
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+
+    ; %load1 || %load2 == %load1 is fine here, because the loads are invariant.
+
+    ; CHECK-LABEL: name: invariant_loads
+    ; CHECK: %ptr:_(p0) = G_GLOBAL_VALUE @g
+    ; CHECK: %load2:_(s32) = G_LOAD %ptr(p0) :: (dereferenceable invariant load 4 from @g)
+    ; CHECK: G_STORE %load2(s32), %ptr(p0) :: (store 4 into @g)
+    ; CHECK: RET_ReallyLR
+    %ptr:_(p0) = G_GLOBAL_VALUE @g
+    %load1:_(s32) = G_LOAD %ptr(p0) :: (dereferenceable invariant load 4 from @g)
+    %load2:_(s32) = G_LOAD %ptr(p0) :: (dereferenceable invariant load 4 from @g)
+    %or:_(s32) = G_OR %load2, %load1
+    G_STORE %or(s32), %ptr(p0) :: (store 4 into @g)
+    RET_ReallyLR
+
+...
+---
+name:            both_have_to_be_invariant
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+
+    ; We shouldn't combine here, because the loads both have to be invariant.
+
+    ; CHECK-LABEL: name: both_have_to_be_invariant
+    ; CHECK: %ptr:_(p0) = G_GLOBAL_VALUE @g
+    ; CHECK: %load1:_(s32) = G_LOAD %ptr(p0) :: (dereferenceable invariant load 4 from @g)
+    ; CHECK: %load2:_(s32) = G_LOAD %ptr(p0) :: (dereferenceable load 4 from @g)
+    ; CHECK: %or:_(s32) = G_OR %load2, %load1
+    ; CHECK: G_STORE %or(s32), %ptr(p0) :: (store 4 into @g)
+    ; CHECK: RET_ReallyLR
+    %ptr:_(p0) = G_GLOBAL_VALUE @g
+    %load1:_(s32) = G_LOAD %ptr(p0) :: (dereferenceable invariant load 4 from @g)
+    %load2:_(s32) = G_LOAD %ptr(p0) :: (dereferenceable load 4 from @g)
+    %or:_(s32) = G_OR %load2, %load1
+    G_STORE %or(s32), %ptr(p0) :: (store 4 into @g)
+    RET_ReallyLR
+...

From 49688b3c306d0bf918c0abeee030cfd56a17c348 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Wed, 27 May 2020 15:57:03 -0400
Subject: [PATCH 270/770] Fix `-Wpedantic` warning. NFC.

---
 llvm/tools/llvm-cov/CoverageFilters.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-cov/CoverageFilters.h b/llvm/tools/llvm-cov/CoverageFilters.h
index ccaa7a9df5905..33fd9929c59a2 100644
--- a/llvm/tools/llvm-cov/CoverageFilters.h
+++ b/llvm/tools/llvm-cov/CoverageFilters.h
@@ -23,7 +23,7 @@ class SpecialCaseList;
 namespace coverage {
 class CoverageMapping;
 struct FunctionRecord;
-}; // namespace coverage
+} // namespace coverage
 
 /// Matches specific functions that pass the requirement of this filter.
 class CoverageFilter {

From c6fa2efd481a58c979a8e9f95119b4278b13d99a Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <ravishankarm@google.com>
Date: Wed, 27 May 2020 12:42:14 -0700
Subject: [PATCH 271/770] [mlir][Linalg] Fix build failure from D80188

Differential Revision: https://reviews.llvm.org/D80657
---
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index cd8b17650bb11..c48b87aaa4e44 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -129,25 +129,29 @@ template struct mlir::linalg::GenerateLoopNest<scf::ForOp>;
 template struct mlir::linalg::GenerateLoopNest<scf::ParallelOp>;
 template struct mlir::linalg::GenerateLoopNest<AffineForOp>;
 
+namespace mlir {
+namespace linalg {
 /// Specialization of loop nest generator for scf.parallel loops to handle
 /// iterator types that are not parallel. These are generated as sequential
 /// loops.
 template <>
-void mlir::linalg::GenerateLoopNest<scf::ForOp>::doit(
-    MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
-    ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
+void GenerateLoopNest<scf::ForOp>::doit(MutableArrayRef<Value> allIvs,
+                                        ArrayRef<SubViewOp::Range> loopRanges,
+                                        ArrayRef<Attribute> iteratorTypes,
+                                        std::function<void(void)> fun) {
   edsc::GenericLoopNestRangeBuilder<scf::ForOp>(allIvs, loopRanges)(fun);
 }
 
 template <>
-void mlir::linalg::GenerateLoopNest<AffineForOp>::doit(
-    MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
-    ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
+void GenerateLoopNest<AffineForOp>::doit(MutableArrayRef<Value> allIvs,
+                                         ArrayRef<SubViewOp::Range> loopRanges,
+                                         ArrayRef<Attribute> iteratorTypes,
+                                         std::function<void(void)> fun) {
   edsc::GenericLoopNestRangeBuilder<AffineForOp>(allIvs, loopRanges)(fun);
 }
 
 template <>
-void mlir::linalg::GenerateLoopNest<scf::ParallelOp>::doit(
+void GenerateLoopNest<scf::ParallelOp>::doit(
     MutableArrayRef<Value> allIvs, ArrayRef<SubViewOp::Range> loopRanges,
     ArrayRef<Attribute> iteratorTypes, std::function<void(void)> fun) {
   // Check if there is nothing to do here. This is also the recursion
@@ -190,3 +194,5 @@ void mlir::linalg::GenerateLoopNest<scf::ParallelOp>::doit(
       allIvs.take_front(nOuterPar), loopRanges.take_front(nOuterPar),
       iteratorTypes.take_front(nOuterPar), nestedFn);
 }
+} // namespace linalg
+} // namespace mlir

From 79aa9bfdb819c02faa3c6c78e307b20ae7f69057 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Wed, 27 May 2020 16:14:12 -0400
Subject: [PATCH 272/770] [mlir] Fix RunnerUtils template specialization

Undoing a spurious change that broke SFINAE for some out of core use
cases.
---
 mlir/include/mlir/ExecutionEngine/CRunnerUtils.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
index bc59d3de20860..604e90258ca03 100644
--- a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
@@ -106,6 +106,12 @@ using Vector3D = Vector<T, D1, D2, D3>;
 template <int D1, int D2, int D3, int D4, typename T>
 using Vector4D = Vector<T, D1, D2, D3, D4>;
 
+template <int N>
+void dropFront(int64_t arr[N], int64_t *res) {
+  for (unsigned i = 1; i < N; ++i)
+    *(res + i - 1) = arr[i];
+}
+
 //===----------------------------------------------------------------------===//
 // Codegen-compatible structures for StridedMemRef type.
 //===----------------------------------------------------------------------===//
@@ -123,10 +129,6 @@ struct StridedMemRefType {
     res.basePtr = basePtr;
     res.data = data;
     res.offset = offset + idx * strides[0];
-    auto dropFront = [](const int64_t *arr, int64_t *res) {
-      for (unsigned i = 1; i < N; ++i)
-        res[i - 1] = arr[i];
-    };
     dropFront<N>(sizes, res.sizes);
     dropFront<N>(strides, res.strides);
     return res;
@@ -209,3 +211,4 @@ extern "C" MLIR_CRUNNERUTILS_EXPORT void print_comma();
 extern "C" MLIR_CRUNNERUTILS_EXPORT void print_newline();
 
 #endif // EXECUTIONENGINE_CRUNNERUTILS_H_
+

From 54b64572407c8305c7bb8cc20c46a5e0c66b2979 Mon Sep 17 00:00:00 2001
From: Juneyoung Lee <aqjune@gmail.com>
Date: Thu, 28 May 2020 01:48:37 +0900
Subject: [PATCH 273/770] [TargetPassConfig] Add CanonicalizeFreezeInLoops
 before LSR

Summary:
This patch adds CanonicalizeFreezeInLoops before LSR.
Relevant patch: https://reviews.llvm.org/D77523

Reviewers: spatel, efriedma, jdoerfert, fhahn, nikic, reames, xbolva00

Reviewed By: nikic

Subscribers: xbolva00, nikic, lebedev.ri, hiraditya, llvm-commits, sanwou01, nlopes

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D77524
---
 llvm/lib/CodeGen/TargetPassConfig.cpp                     | 1 +
 llvm/test/CodeGen/AArch64/O3-pipeline.ll                  | 1 +
 llvm/test/CodeGen/ARM/O3-pipeline.ll                      | 1 +
 llvm/test/CodeGen/X86/O3-pipeline.ll                      | 1 +
 llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll | 7 +++----
 5 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index d8da6431bff16..241357be53941 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -669,6 +669,7 @@ void TargetPassConfig::addIRPasses() {
 
     // Run loop strength reduction before anything else.
     if (!DisableLSR) {
+      addPass(createCanonicalizeFreezeInLoopsPass());
       addPass(createLoopStrengthReducePass());
       if (PrintLSR)
         addPass(createPrintFunctionPass(dbgs(),
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index eeee6d0f6049c..401a0ac8df66b 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -35,6 +35,7 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index c31c6694cb24c..f137f715ee420 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -15,6 +15,7 @@
 ; CHECK-NEXT:      Canonicalize natural loops
 ; CHECK-NEXT:      Scalar Evolution Analysis
 ; CHECK-NEXT:      Loop Pass Manager
+; CHECK-NEXT:        Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:        Induction Variable Users
 ; CHECK-NEXT:        Loop Strength Reduction
 ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll
index 9087c69fac96c..c91b8143e09c7 100644
--- a/llvm/test/CodeGen/X86/O3-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O3-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll b/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
index c41d044c26bf8..3b2f98335652e 100644
--- a/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
+++ b/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
@@ -7,13 +7,12 @@
 define void @f(i8* %p, i32 %n, i32 %m) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    add w8, w2, #1 // =1
 ; CHECK-NEXT:  .LBB0_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add w9, w2, w8
-; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    strb wzr, [x0, w8, sxtw]
+; CHECK-NEXT:    subs w1, w1, #1 // =1
 ; CHECK-NEXT:    add w8, w8, #1 // =1
-; CHECK-NEXT:    strb wzr, [x0, w9, sxtw]
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit
 ; CHECK-NEXT:    ret

From b2773823116157aa73ea4ac01270b22042d6bb42 Mon Sep 17 00:00:00 2001
From: Sean Silva <silvasean@google.com>
Date: Tue, 26 May 2020 20:14:34 -0700
Subject: [PATCH 274/770] Remove error-prone mlir::ExecutionEngine::invoke
 overload.

I just spent a bunch of time debugging a mysterious bug that ended up being due to my SmallVector getting passed to the Args&... overload instead of the MutableArrayRef overload, with disastrous results.

I appreciate the intent of this API, but for a function that does a bunch of unsafe casts, adding in potential overload confusion is just too much C++ footgun. If we end up needing this functionality, having something like a separate `packArgs(Args&...) -> SmallVector` overload would be preferable.

Turns out this API is unused and untested (even out of tree as far as I can tell, modulo the optional passing of no args to the other invoke as I fixed in this patch), so it's an easy fix -- just delete it and touch up the other overload.

Differential Revision: https://reviews.llvm.org/D80607
---
 .../mlir/ExecutionEngine/ExecutionEngine.h    | 24 ++-----------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
index 914cab78dee74..d0ad8326bac89 100644
--- a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
+++ b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
@@ -94,16 +94,9 @@ class ExecutionEngine {
   /// pointer to it.  Propagates errors in case of failure.
   llvm::Expected<void (*)(void **)> lookup(StringRef name) const;
 
-  /// Invokes the function with the given name passing it the list of arguments.
-  /// The arguments are accepted by lvalue-reference since the packed function
-  /// interface expects a list of non-null pointers.
-  template <typename... Args>
-  llvm::Error invoke(StringRef name, Args &... args);
-
   /// Invokes the function with the given name passing it the list of arguments
-  /// as a list of opaque pointers. This is the arity-agnostic equivalent of
-  /// the templated `invoke`.
-  llvm::Error invoke(StringRef name, MutableArrayRef<void *> args);
+  /// as a list of opaque pointers.
+  llvm::Error invoke(StringRef name, MutableArrayRef<void *> args = llvm::None);
 
   /// Set the target triple on the module. This is implicitly done when creating
   /// the engine.
@@ -135,19 +128,6 @@ class ExecutionEngine {
   llvm::JITEventListener *perfListener;
 };
 
-template <typename... Args>
-llvm::Error ExecutionEngine::invoke(StringRef name, Args &... args) {
-  auto expectedFPtr = lookup(name);
-  if (!expectedFPtr)
-    return expectedFPtr.takeError();
-  auto fptr = *expectedFPtr;
-
-  SmallVector<void *, 8> packedArgs{static_cast<void *>(&args)...};
-  (*fptr)(packedArgs.data());
-
-  return llvm::Error::success();
-}
-
 } // end namespace mlir
 
 #endif // MLIR_EXECUTIONENGINE_EXECUTIONENGINE_H_

From 14f33575868556f928434192bd6141f4be16a7a4 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 13:22:16 -0700
Subject: [PATCH 275/770] [StackSafety] Bailout more aggressively Many edge
 cases, e.g. wrapped ranges, can be processed precisely without bailout.
 However it's very unlikely that memory access with min/max integer offsets
 will be classified as safe anyway. Early bailout may help with ThinLTO where
 we can drop unsafe parameters from summaries.

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 38 ++++++++++++++++-------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index a44732613c0fa..d544f71c7308d 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -85,7 +85,11 @@ struct UseInfo {
 
   explicit UseInfo(unsigned PointerSize) : Range{PointerSize, false} {}
 
-  void updateRange(ConstantRange R) { Range = Range.unionWith(R); }
+  void updateRange(const ConstantRange &R) {
+    assert(!R.isUpperSignWrapped());
+    Range = Range.unionWith(R);
+    assert(!Range.isUpperSignWrapped());
+  }
 };
 
 raw_ostream &operator<<(raw_ostream &OS, const UseInfo &U) {
@@ -189,6 +193,11 @@ StackSafetyInfo makeSSI(FunctionInfo Info) {
 
 namespace {
 
+// Check if we should bailout for such ranges.
+bool isUnsafe(const ConstantRange &R) {
+  return R.isEmptySet() || R.isFullSet() || R.isUpperSignWrapped();
+}
+
 class StackSafetyLocalAnalysis {
   Function &F;
   const DataLayout &DL;
@@ -227,7 +236,7 @@ ConstantRange StackSafetyLocalAnalysis::offsetFrom(Value *Addr, Value *Base) {
   AllocaOffsetRewriter Rewriter(SE, Base);
   const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
   ConstantRange Offset = SE.getSignedRange(Expr);
-  if (Offset.isEmptySet() || Offset.isFullSet() || Offset.isSignWrappedSet())
+  if (isUnsafe(Offset))
     return UnknownRange;
   return Offset.sextOrTrunc(PointerSize);
 }
@@ -238,18 +247,26 @@ StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
   // Zero-size loads and stores do not access memory.
   if (SizeRange.isEmptySet())
     return ConstantRange::getEmpty(PointerSize);
+  assert(!isUnsafe(SizeRange));
+
+  ConstantRange AccessRange = offsetFrom(Addr, Base);
+  if (isUnsafe(AccessRange))
+    return UnknownRange;
 
-  ConstantRange AccessStartRange = offsetFrom(Addr, Base);
-  ConstantRange AccessRange = AccessStartRange.add(SizeRange);
-  assert(!AccessRange.isEmptySet());
+  if (AccessRange.signedAddMayOverflow(SizeRange) !=
+      ConstantRange::OverflowResult::NeverOverflows)
+    return UnknownRange;
+  AccessRange = AccessRange.add(SizeRange);
+  if (isUnsafe(AccessRange))
+    return UnknownRange;
   return AccessRange;
 }
 
 ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
                                                        TypeSize Size) {
-  ConstantRange SizeRange =
-      Size.isScalable() ? UnknownRange : getRange(0, Size.getFixedSize());
-  return getAccessRange(Addr, Base, SizeRange);
+  if (Size.isScalable())
+    return UnknownRange;
+  return getAccessRange(Addr, Base, getRange(0, Size.getFixedSize()));
 }
 
 ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
@@ -268,9 +285,8 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
   const SCEV *Expr =
       SE.getTruncateOrZeroExtend(SE.getSCEV(MI->getLength()), CalculationTy);
   ConstantRange LenRange = SE.getSignedRange(Expr);
-  assert(!LenRange.isEmptySet());
-  if (LenRange.isSignWrappedSet() || LenRange.isFullSet() ||
-      LenRange.getUpper().isNegative())
+  assert(!isUnsafe(LenRange));
+  if (LenRange.getUpper().isNegative())
     return UnknownRange;
   LenRange = LenRange.sextOrTrunc(PointerSize);
   ConstantRange SizeRange(APInt::getNullValue(PointerSize),

From 804a39a201567f5f615246bf99cf8e8ff7e006c8 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 13:32:58 -0700
Subject: [PATCH 276/770] [NFC,StackSafety] Rename some variables

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index d544f71c7308d..aead4ec9fd160 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -249,17 +249,17 @@ StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
     return ConstantRange::getEmpty(PointerSize);
   assert(!isUnsafe(SizeRange));
 
-  ConstantRange AccessRange = offsetFrom(Addr, Base);
-  if (isUnsafe(AccessRange))
+  ConstantRange Offsets = offsetFrom(Addr, Base);
+  if (isUnsafe(Offsets))
     return UnknownRange;
 
-  if (AccessRange.signedAddMayOverflow(SizeRange) !=
+  if (Offsets.signedAddMayOverflow(SizeRange) !=
       ConstantRange::OverflowResult::NeverOverflows)
     return UnknownRange;
-  AccessRange = AccessRange.add(SizeRange);
-  if (isUnsafe(AccessRange))
+  Offsets = Offsets.add(SizeRange);
+  if (isUnsafe(Offsets))
     return UnknownRange;
-  return AccessRange;
+  return Offsets;
 }
 
 ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
@@ -284,13 +284,13 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
 
   const SCEV *Expr =
       SE.getTruncateOrZeroExtend(SE.getSCEV(MI->getLength()), CalculationTy);
-  ConstantRange LenRange = SE.getSignedRange(Expr);
-  assert(!isUnsafe(LenRange));
-  if (LenRange.getUpper().isNegative())
+  ConstantRange Sizes = SE.getSignedRange(Expr);
+  assert(!isUnsafe(Sizes));
+  if (Sizes.getUpper().isNegative())
     return UnknownRange;
-  LenRange = LenRange.sextOrTrunc(PointerSize);
+  Sizes = Sizes.sextOrTrunc(PointerSize);
   ConstantRange SizeRange(APInt::getNullValue(PointerSize),
-                          LenRange.getUpper() - 1);
+                          Sizes.getUpper() - 1);
   return getAccessRange(U, Base, SizeRange);
 }
 

From 03481287ca530494512d128cbbdc9c87f2d84921 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Wed, 27 May 2020 15:58:07 -0400
Subject: [PATCH 277/770] Refactor argument attribute specification in
 intrinsic definition. NFC.

- Argument attributes now need to be specified through `ArgIndex<n>`
  (corresponding to `FirstArgIndex`) to distinguish them explicitly from
  the index numbers in the overloaded type list.
- In addition, `RetIndex` (corresponding to `ReturnIndex`) and
  `FuncIndex` (corresponding to `FunctionIndex`) are introduced for us
  to associate attributes on the return value and potentially function
  itself.

Differential Revision: https://reviews.llvm.org/D80422
---
 llvm/include/llvm/IR/Intrinsics.td            | 250 +++---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  74 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 174 ++---
 llvm/include/llvm/IR/IntrinsicsARM.td         |  64 +-
 llvm/include/llvm/IR/IntrinsicsBPF.td         |   2 +-
 llvm/include/llvm/IR/IntrinsicsHexagon.td     |  26 +-
 llvm/include/llvm/IR/IntrinsicsHexagonDep.td  | 370 ++++-----
 llvm/include/llvm/IR/IntrinsicsMips.td        | 254 +++----
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |  26 +-
 llvm/include/llvm/IR/IntrinsicsPowerPC.td     |  24 +-
 llvm/include/llvm/IR/IntrinsicsRISCV.td       |   4 +-
 llvm/include/llvm/IR/IntrinsicsSystemZ.td     |  36 +-
 llvm/include/llvm/IR/IntrinsicsWebAssembly.td |  16 +-
 llvm/include/llvm/IR/IntrinsicsX86.td         | 716 +++++++++---------
 llvm/include/llvm/IR/IntrinsicsXCore.td       |  72 +-
 .../GlobalISelEmitter-SDNodeXForm-timm.td     |   4 +-
 ...lobalISelEmitter-immarg-literal-pattern.td |   2 +-
 llvm/test/TableGen/immarg.td                  |   4 +-
 llvm/utils/TableGen/CodeGenIntrinsics.h       |  16 +-
 llvm/utils/TableGen/CodeGenTarget.cpp         |  17 +-
 llvm/utils/TableGen/IntrinsicEmitter.cpp      |   7 +-
 21 files changed, 1111 insertions(+), 1047 deletions(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index a2553cdeec6c2..f6df3faba83f5 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -62,44 +62,52 @@ def Commutative : IntrinsicProperty;
 // Throws - This intrinsic can throw.
 def Throws : IntrinsicProperty;
 
+// Attribute index needs to match `AttrIndex` defined in `Attributes.h`.
+class AttrIndex<int idx> {
+  int Value = idx;
+}
+def FuncIndex : AttrIndex<-1>;
+def RetIndex : AttrIndex<0>;
+class ArgIndex<int argNo> : AttrIndex<!add(argNo, 1)>;
+
 // NoCapture - The specified argument pointer is not captured by the intrinsic.
-class NoCapture<int argNo> : IntrinsicProperty {
-  int ArgNo = argNo;
+class NoCapture<AttrIndex idx> : IntrinsicProperty {
+  int ArgNo = idx.Value;
 }
 
 // NoAlias - The specified argument pointer is not aliasing other "noalias" pointer
 // arguments of the intrinsic wrt. the intrinsic scope.
-class NoAlias<int argNo> : IntrinsicProperty {
-  int ArgNo = argNo;
+class NoAlias<AttrIndex idx> : IntrinsicProperty {
+  int ArgNo = idx.Value;
 }
 
 // Returned - The specified argument is always the return value of the
 // intrinsic.
-class Returned<int argNo> : IntrinsicProperty {
-  int ArgNo = argNo;
+class Returned<AttrIndex idx> : IntrinsicProperty {
+  int ArgNo = idx.Value;
 }
 
 // ImmArg - The specified argument must be an immediate.
-class ImmArg<int argNo> : IntrinsicProperty {
-  int ArgNo = argNo;
+class ImmArg<AttrIndex idx> : IntrinsicProperty {
+  int ArgNo = idx.Value;
 }
 
 // ReadOnly - The specified argument pointer is not written to through the
 // pointer by the intrinsic.
-class ReadOnly<int argNo> : IntrinsicProperty {
-  int ArgNo = argNo;
+class ReadOnly<AttrIndex idx> : IntrinsicProperty {
+  int ArgNo = idx.Value;
 }
 
 // WriteOnly - The intrinsic does not read memory through the specified
 // argument pointer.
-class WriteOnly<int argNo> : IntrinsicProperty {
-  int ArgNo = argNo;
+class WriteOnly<AttrIndex idx> : IntrinsicProperty {
+  int ArgNo = idx.Value;
 }
 
 // ReadNone - The specified argument pointer is not dereferenced by the
 // intrinsic.
-class ReadNone<int argNo> : IntrinsicProperty {
-  int ArgNo = argNo;
+class ReadNone<AttrIndex idx> : IntrinsicProperty {
+  int ArgNo = idx.Value;
 }
 
 def IntrNoReturn : IntrinsicProperty;
@@ -356,7 +364,8 @@ def int_gcread  : Intrinsic<[llvm_ptr_ty],
                             [IntrReadMem, IntrArgMemOnly]>;
 def int_gcwrite : Intrinsic<[],
                             [llvm_ptr_ty, llvm_ptr_ty, llvm_ptrptr_ty],
-                            [IntrArgMemOnly, NoCapture<1>, NoCapture<2>]>;
+                            [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
+                             NoCapture<ArgIndex<2>>]>;
 
 //===------------------- ObjC ARC runtime Intrinsics --------------------===//
 //
@@ -432,9 +441,11 @@ def int_objc_arc_annotation_bottomup_bbend  : Intrinsic<[],
 
 //===--------------------- Code Generator Intrinsics ----------------------===//
 //
-def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>;
+def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty],
+                                  [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 def int_addressofreturnaddress : Intrinsic<[llvm_anyptr_ty], [], [IntrNoMem]>;
-def int_frameaddress : Intrinsic<[llvm_anyptr_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>;
+def int_frameaddress : Intrinsic<[llvm_anyptr_ty], [llvm_i32_ty],
+                                 [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 def int_sponentry  : Intrinsic<[llvm_anyptr_ty], [], [IntrNoMem]>;
 def int_read_register  : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
                                    [IntrReadMem], "llvm.read_register">;
@@ -452,7 +463,7 @@ def int_localescape : Intrinsic<[], [llvm_vararg_ty]>;
 // to an escaped allocation indicated by the index.
 def int_localrecover : Intrinsic<[llvm_ptr_ty],
                                  [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
-                                 [IntrNoMem, ImmArg<2>]>;
+                                 [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 // Given the frame pointer passed into an SEH filter function, returns a
 // pointer to the local variable area suitable for use with llvm.localrecover.
@@ -478,8 +489,9 @@ def int_thread_pointer : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>,
 // memory while not impeding optimization.
 def int_prefetch
     : Intrinsic<[], [ llvm_anyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ],
-                [ IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, ReadOnly<0>, NoCapture<0>,
-                  ImmArg<1>, ImmArg<2>]>;
+                [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+                 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
+                 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 def int_pcmarker      : Intrinsic<[], [llvm_i32_ty]>;
 
 def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>;
@@ -520,10 +532,13 @@ def int_call_preallocated_arg : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i3
 //
 
 def int_memcpy  : Intrinsic<[],
-                             [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty,
-                              llvm_i1_ty],
-                            [IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>,
-                             NoAlias<0>, NoAlias<1>, WriteOnly<0>, ReadOnly<1>, ImmArg<3>]>;
+                            [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty,
+                             llvm_i1_ty],
+                            [IntrArgMemOnly, IntrWillReturn,
+                             NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+                             NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+                             WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+                             ImmArg<ArgIndex<3>>]>;
 
 // Memcpy semantic that is guaranteed to be inlined.
 // In particular this means that the generated code is not allowed to call any
@@ -531,23 +546,25 @@ def int_memcpy  : Intrinsic<[],
 // The third argument (specifying the size) must be a constant.
 def int_memcpy_inline
     : Intrinsic<[],
-      [ llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i1_ty ],
-      [ IntrArgMemOnly, IntrWillReturn,
-      NoCapture<0>, NoCapture<1>,
-      NoAlias<0>, NoAlias<1>,
-      WriteOnly<0>, ReadOnly<1>,
-      ImmArg<2>, ImmArg<3> ]>;
+      [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i1_ty],
+      [IntrArgMemOnly, IntrWillReturn,
+       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+       NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+       ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 def int_memmove : Intrinsic<[],
                             [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty,
                              llvm_i1_ty],
-                            [IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>,
-                             ReadOnly<1>, ImmArg<3>]>;
+                            [IntrArgMemOnly, IntrWillReturn,
+                             NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+                             ReadOnly<ArgIndex<1>>, ImmArg<ArgIndex<3>>]>;
 def int_memset  : Intrinsic<[],
                             [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty,
                              llvm_i1_ty],
-                            [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, NoCapture<0>,
-                             WriteOnly<0>, ImmArg<3>]>;
+                            [IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
+                             NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+                             ImmArg<ArgIndex<3>>]>;
 
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
@@ -614,7 +631,9 @@ def int_maximum : Intrinsic<[llvm_anyfloat_ty],
 def int_objectsize : Intrinsic<[llvm_anyint_ty],
                                [llvm_anyptr_ty, llvm_i1_ty,
                                 llvm_i1_ty, llvm_i1_ty],
-                               [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<1>, ImmArg<2>, ImmArg<3>]>,
+                               [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+                                ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
+                                ImmArg<ArgIndex<3>>]>,
                                GCCBuiltin<"__builtin_object_size">;
 
 //===--------------- Access to Floating Point Environment -----------------===//
@@ -827,7 +846,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
       [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 }
 
-let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<1>] in {
+let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+                      ImmArg<ArgIndex<1>>] in {
   def int_ctlz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>;
   def int_cttz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>;
 }
@@ -917,12 +937,12 @@ def int_codeview_annotation : Intrinsic<[], [llvm_metadata_ty],
 //
 def int_init_trampoline : Intrinsic<[],
                                     [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty],
-                                    [IntrArgMemOnly, NoCapture<0>]>,
-                                   GCCBuiltin<"__builtin_init_trampoline">;
+                                    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
+                                    GCCBuiltin<"__builtin_init_trampoline">;
 
 def int_adjust_trampoline : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty],
                                       [IntrReadMem, IntrArgMemOnly]>,
-                                     GCCBuiltin<"__builtin_adjust_trampoline">;
+                                      GCCBuiltin<"__builtin_adjust_trampoline">;
 
 //===------------------------ Overflow Intrinsics -------------------------===//
 //
@@ -969,52 +989,64 @@ def int_usub_sat : Intrinsic<[llvm_anyint_ty],
 //
 def int_smul_fix : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                             [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
+                             [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+                              Commutative, ImmArg<ArgIndex<2>>]>;
 
 def int_umul_fix : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                             [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
+                             [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+                              Commutative, ImmArg<ArgIndex<2>>]>;
 
 def int_sdiv_fix : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                             [IntrNoMem, ImmArg<2>]>;
+                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_udiv_fix : Intrinsic<[llvm_anyint_ty],
                              [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                             [IntrNoMem, ImmArg<2>]>;
+                             [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 //===------------------- Fixed Point Saturation Arithmetic Intrinsics ----------------===//
 //
 def int_smul_fix_sat : Intrinsic<[llvm_anyint_ty],
                                  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                                 [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
+                                 [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+                                  Commutative, ImmArg<ArgIndex<2>>]>;
 def int_umul_fix_sat : Intrinsic<[llvm_anyint_ty],
                                  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                                 [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
+                                 [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+                                  Commutative, ImmArg<ArgIndex<2>>]>;
 
 def int_sdiv_fix_sat : Intrinsic<[llvm_anyint_ty],
                                  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                                 [IntrNoMem, ImmArg<2>]>;
+                                 [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_udiv_fix_sat : Intrinsic<[llvm_anyint_ty],
                                  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-                                 [IntrNoMem, ImmArg<2>]>;
+                                 [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 //===------------------------- Memory Use Markers -------------------------===//
 //
 def int_lifetime_start  : Intrinsic<[],
                                     [llvm_i64_ty, llvm_anyptr_ty],
-                                    [IntrArgMemOnly, IntrWillReturn, NoCapture<1>, ImmArg<0>]>;
+                                    [IntrArgMemOnly, IntrWillReturn,
+                                     NoCapture<ArgIndex<1>>,
+                                     ImmArg<ArgIndex<0>>]>;
 def int_lifetime_end    : Intrinsic<[],
                                     [llvm_i64_ty, llvm_anyptr_ty],
-                                    [IntrArgMemOnly, IntrWillReturn, NoCapture<1>, ImmArg<0>]>;
+                                    [IntrArgMemOnly, IntrWillReturn,
+                                     NoCapture<ArgIndex<1>>,
+                                     ImmArg<ArgIndex<0>>]>;
 def int_invariant_start : Intrinsic<[llvm_descriptor_ty],
                                     [llvm_i64_ty, llvm_anyptr_ty],
-                                    [IntrArgMemOnly, IntrWillReturn, NoCapture<1>, ImmArg<0>]>;
+                                    [IntrArgMemOnly, IntrWillReturn,
+                                     NoCapture<ArgIndex<1>>,
+                                     ImmArg<ArgIndex<0>>]>;
 def int_invariant_end   : Intrinsic<[],
                                     [llvm_descriptor_ty, llvm_i64_ty,
                                      llvm_anyptr_ty],
-                                    [IntrArgMemOnly, IntrWillReturn, NoCapture<2>, ImmArg<1>]>;
+                                    [IntrArgMemOnly, IntrWillReturn,
+                                     NoCapture<ArgIndex<2>>,
+                                     ImmArg<ArgIndex<1>>]>;
 
 // launder.invariant.group can't be marked with 'readnone' (IntrNoMem),
 // because it would cause CSE of two barriers with the same argument.
@@ -1061,13 +1093,17 @@ def int_experimental_gc_statepoint : Intrinsic<[llvm_token_ty],
                                [llvm_i64_ty, llvm_i32_ty,
                                 llvm_anyptr_ty, llvm_i32_ty,
                                 llvm_i32_ty, llvm_vararg_ty],
-                                [Throws, ImmArg<0>, ImmArg<1>, ImmArg<3>, ImmArg<4>]>;
+                               [Throws, ImmArg<ArgIndex<0>>,
+                                ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>,
+                                ImmArg<ArgIndex<4>>]>;
 
 def int_experimental_gc_result   : Intrinsic<[llvm_any_ty], [llvm_token_ty],
                                              [IntrReadMem]>;
 def int_experimental_gc_relocate : Intrinsic<[llvm_any_ty],
-                                [llvm_token_ty, llvm_i32_ty, llvm_i32_ty],
-                                [IntrReadMem, ImmArg<1>, ImmArg<2>]>;
+                                             [llvm_token_ty, llvm_i32_ty,
+                                              llvm_i32_ty],
+                                             [IntrReadMem, ImmArg<ArgIndex<1>>,
+                                              ImmArg<ArgIndex<2>>]>;
 
 //===------------------------ Coroutine Intrinsics ---------------===//
 // These are documented in docs/Coroutines.rst
@@ -1077,7 +1113,8 @@ def int_experimental_gc_relocate : Intrinsic<[llvm_any_ty],
 def int_coro_id : Intrinsic<[llvm_token_ty], [llvm_i32_ty, llvm_ptr_ty,
                              llvm_ptr_ty, llvm_ptr_ty],
                             [IntrArgMemOnly, IntrReadMem,
-                             ReadNone<1>, ReadOnly<2>, NoCapture<2>]>;
+                             ReadNone<ArgIndex<1>>, ReadOnly<ArgIndex<2>>,
+                             NoCapture<ArgIndex<2>>]>;
 def int_coro_id_retcon : Intrinsic<[llvm_token_ty],
     [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty,
      llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty],
@@ -1088,11 +1125,12 @@ def int_coro_id_retcon_once : Intrinsic<[llvm_token_ty],
     []>;
 def int_coro_alloc : Intrinsic<[llvm_i1_ty], [llvm_token_ty], []>;
 def int_coro_begin : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty],
-                               [WriteOnly<1>]>;
+                               [WriteOnly<ArgIndex<1>>]>;
 
 def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty],
-                              [IntrReadMem, IntrArgMemOnly, ReadOnly<1>,
-                               NoCapture<1>]>;
+                              [IntrReadMem, IntrArgMemOnly,
+                               ReadOnly<ArgIndex<1>>,
+                               NoCapture<ArgIndex<1>>]>;
 def int_coro_end : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i1_ty], []>;
 
 def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
@@ -1110,23 +1148,26 @@ def int_coro_alloca_get : Intrinsic<[llvm_ptr_ty], [llvm_token_ty], []>;
 def int_coro_alloca_free : Intrinsic<[], [llvm_token_ty], []>;
 
 def int_coro_param : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_ptr_ty],
-                               [IntrNoMem, ReadNone<0>, ReadNone<1>]>;
+                               [IntrNoMem, ReadNone<ArgIndex<0>>,
+                                ReadNone<ArgIndex<1>>]>;
 
 // Coroutine Manipulation Intrinsics.
 
 def int_coro_resume : Intrinsic<[], [llvm_ptr_ty], [Throws]>;
 def int_coro_destroy : Intrinsic<[], [llvm_ptr_ty], [Throws]>;
 def int_coro_done : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
-                              [IntrArgMemOnly, ReadOnly<0>, NoCapture<0>]>;
+                              [IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
+                               NoCapture<ArgIndex<0>>]>;
 def int_coro_promise : Intrinsic<[llvm_ptr_ty],
                                  [llvm_ptr_ty, llvm_i32_ty, llvm_i1_ty],
-                                 [IntrNoMem, NoCapture<0>]>;
+                                 [IntrNoMem, NoCapture<ArgIndex<0>>]>;
 
 // Coroutine Lowering Intrinsics. Used internally by coroutine passes.
 
 def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty],
-                                    [IntrReadMem, IntrArgMemOnly, ReadOnly<0>,
-                                     NoCapture<0>]>;
+                                    [IntrReadMem, IntrArgMemOnly,
+                                     ReadOnly<ArgIndex<0>>,
+                                     NoCapture<ArgIndex<0>>]>;
 
 ///===-------------------------- Other Intrinsics --------------------------===//
 //
@@ -1255,24 +1296,26 @@ def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
                                       LLVMAnyPointerType<LLVMMatchType<0>>,
                                       llvm_i32_ty,
                                       LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                                 [IntrArgMemOnly, IntrWillReturn, ImmArg<2>]>;
+                                 [IntrArgMemOnly, IntrWillReturn, ImmArg<ArgIndex<2>>]>;
 
 def int_masked_load  : Intrinsic<[llvm_anyvector_ty],
                                  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty,
                                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
-                                 [IntrReadMem, IntrArgMemOnly, IntrWillReturn, ImmArg<1>]>;
+                                 [IntrReadMem, IntrArgMemOnly, IntrWillReturn,
+                                  ImmArg<ArgIndex<1>>]>;
 
 def int_masked_gather: Intrinsic<[llvm_anyvector_ty],
                                  [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
                                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                   LLVMMatchType<0>],
-                                 [IntrReadMem, IntrWillReturn, ImmArg<1>]>;
+                                 [IntrReadMem, IntrWillReturn,
+                                  ImmArg<ArgIndex<1>>]>;
 
 def int_masked_scatter: Intrinsic<[],
                                   [llvm_anyvector_ty,
                                    LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
                                    LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                                   [IntrWillReturn, ImmArg<2>]>;
+                                  [IntrWillReturn, ImmArg<ArgIndex<2>>]>;
 
 def int_masked_expandload: Intrinsic<[llvm_anyvector_ty],
                                      [LLVMPointerToElt<0>,
@@ -1303,20 +1346,24 @@ def int_load_relative: Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty],
                                  [IntrReadMem, IntrArgMemOnly]>;
 
 def int_hwasan_check_memaccess :
-  Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOnly, ImmArg<2>]>;
+  Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
+            [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
 def int_hwasan_check_memaccess_shortgranules :
-  Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOnly, ImmArg<2>]>;
+  Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
+            [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
 
 // Xray intrinsics
 //===----------------------------------------------------------------------===//
 // Custom event logging for x-ray.
 // Takes a pointer to a string and the length of the string.
 def int_xray_customevent : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
-                                     [NoCapture<0>, ReadOnly<0>, IntrWriteMem]>;
+                                     [IntrWriteMem, NoCapture<ArgIndex<0>>,
+                                      ReadOnly<ArgIndex<0>>]>;
 // Typed event logging for x-ray.
 // Takes a numeric type tag, a pointer to a string and the length of the string.
 def int_xray_typedevent : Intrinsic<[], [llvm_i16_ty, llvm_ptr_ty, llvm_i32_ty],
-                                        [NoCapture<1>, ReadOnly<1>, IntrWriteMem]>;
+                                        [IntrWriteMem, NoCapture<ArgIndex<1>>,
+                                         ReadOnly<ArgIndex<1>>]>;
 //===----------------------------------------------------------------------===//
 
 //===------ Memory intrinsics with element-wise atomicity guarantees ------===//
@@ -1325,30 +1372,25 @@ def int_xray_typedevent : Intrinsic<[], [llvm_i16_ty, llvm_ptr_ty, llvm_i32_ty],
 // @llvm.memcpy.element.unordered.atomic.*(dest, src, length, elementsize)
 def int_memcpy_element_unordered_atomic
     : Intrinsic<[],
-                [
-                  llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty
-                ],
-                [
-                  IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>, WriteOnly<0>,
-                  ReadOnly<1>, ImmArg<3>
-                ]>;
+                [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty],
+                [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+                 NoCapture<ArgIndex<1>>, WriteOnly<ArgIndex<0>>,
+                 ReadOnly<ArgIndex<1>>, ImmArg<ArgIndex<3>>]>;
 
 // @llvm.memmove.element.unordered.atomic.*(dest, src, length, elementsize)
 def int_memmove_element_unordered_atomic
     : Intrinsic<[],
-                [
-                  llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty
-                ],
-                [
-                  IntrArgMemOnly, IntrWillReturn, NoCapture<0>, NoCapture<1>, WriteOnly<0>,
-                  ReadOnly<1>, ImmArg<3>
-                ]>;
+                [llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty],
+                [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
+                 NoCapture<ArgIndex<1>>, WriteOnly<ArgIndex<0>>,
+                 ReadOnly<ArgIndex<1>>, ImmArg<ArgIndex<3>>]>;
 
 // @llvm.memset.element.unordered.atomic.*(dest, value, length, elementsize)
 def int_memset_element_unordered_atomic
-    : Intrinsic<[], [ llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i32_ty ],
-                [ IntrWriteMem, IntrArgMemOnly, IntrWillReturn, NoCapture<0>, WriteOnly<0>,
-                  ImmArg<3> ]>;
+    : Intrinsic<[], [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i32_ty],
+                [IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
+                 NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+                 ImmArg<ArgIndex<3>>]>;
 
 //===------------------------ Reduction Intrinsics ------------------------===//
 //
@@ -1390,7 +1432,8 @@ def int_matrix_transpose : Intrinsic<[llvm_anyvector_ty],
                                      llvm_i32_ty,
                                      llvm_i32_ty],
                                      [IntrNoMem, IntrSpeculatable,
-                                      IntrWillReturn, ImmArg<1>, ImmArg<2>]>;
+                                      IntrWillReturn, ImmArg<ArgIndex<1>>,
+                                      ImmArg<ArgIndex<2>>]>;
 
 def int_matrix_multiply : Intrinsic<[llvm_anyvector_ty],
                                     [llvm_anyvector_ty,
@@ -1399,8 +1442,9 @@ def int_matrix_multiply : Intrinsic<[llvm_anyvector_ty],
                                      llvm_i32_ty,
                                      llvm_i32_ty],
                                     [IntrNoMem, IntrSpeculatable,
-                                     IntrWillReturn, ImmArg<2>, ImmArg<3>,
-                                     ImmArg<4>]>;
+                                     IntrWillReturn, ImmArg<ArgIndex<2>>,
+                                     ImmArg<ArgIndex<3>>,
+                                     ImmArg<ArgIndex<4>>]>;
 
 def int_matrix_columnwise_load : Intrinsic<[llvm_anyvector_ty],
                                            [LLVMAnyPointerType<LLVMMatchType<0>>,
@@ -1408,7 +1452,9 @@ def int_matrix_columnwise_load : Intrinsic<[llvm_anyvector_ty],
                                             llvm_i32_ty,
                                             llvm_i32_ty],
                                            [IntrArgMemOnly, IntrReadMem,
-                                            IntrWillReturn, ImmArg<2>, ImmArg<3>]>;
+                                            IntrWillReturn,
+                                            ImmArg<ArgIndex<2>>,
+                                            ImmArg<ArgIndex<3>>]>;
 
 def int_matrix_columnwise_store : Intrinsic<[],
                                             [llvm_anyvector_ty,
@@ -1417,8 +1463,10 @@ def int_matrix_columnwise_store : Intrinsic<[],
                                              llvm_i32_ty,
                                              llvm_i32_ty],
                                             [IntrArgMemOnly, IntrWillReturn,
-                                             IntrWriteMem, WriteOnly<1>,
-                                             ImmArg<3>, ImmArg<4>]>;
+                                             IntrWriteMem,
+                                             WriteOnly<ArgIndex<1>>,
+                                             ImmArg<ArgIndex<3>>,
+                                             ImmArg<ArgIndex<4>>]>;
 
 //===---------- Intrinsics to control hardware supported loops ----------===//
 
@@ -1452,22 +1500,26 @@ def int_loop_decrement_reg :
 //===----- Intrinsics that are used to provide predicate information -----===//
 
 def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
-                             [IntrNoMem, Returned<0>]>;
+                             [IntrNoMem, Returned<ArgIndex<0>>]>;
 
 //===------- Intrinsics that are used to preserve debug information -------===//
 
 def int_preserve_array_access_index : Intrinsic<[llvm_anyptr_ty],
                                                 [llvm_anyptr_ty, llvm_i32_ty,
                                                  llvm_i32_ty],
-                                                [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+                                                [IntrNoMem,
+                                                 ImmArg<ArgIndex<1>>,
+                                                 ImmArg<ArgIndex<2>>]>;
 def int_preserve_union_access_index : Intrinsic<[llvm_anyptr_ty],
                                                 [llvm_anyptr_ty, llvm_i32_ty],
-                                                [IntrNoMem, ImmArg<1>]>;
+                                                [IntrNoMem,
+                                                 ImmArg<ArgIndex<1>>]>;
 def int_preserve_struct_access_index : Intrinsic<[llvm_anyptr_ty],
                                                  [llvm_anyptr_ty, llvm_i32_ty,
                                                   llvm_i32_ty],
-                                                 [IntrNoMem, ImmArg<1>,
-                                                  ImmArg<2>]>;
+                                                 [IntrNoMem,
+                                                  ImmArg<ArgIndex<1>>,
+                                                  ImmArg<ArgIndex<2>>]>;
 
 //===---------- Intrinsics to query properties of scalable vectors --------===//
 def int_vscale : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 384e3209f5f5f..d00456123f519 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -487,7 +487,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                   [IntrReadMem, IntrArgMemOnly]>;
   class AdvSIMD_1Vec_Store_Lane_Intrinsic
     : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
-                [IntrArgMemOnly, NoCapture<2>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<2>>]>;
 
   class AdvSIMD_2Vec_Load_Intrinsic
     : Intrinsic<[LLVMMatchType<0>, llvm_anyvector_ty],
@@ -501,11 +501,11 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_2Vec_Store_Intrinsic
     : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
                      LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrArgMemOnly, NoCapture<2>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<2>>]>;
   class AdvSIMD_2Vec_Store_Lane_Intrinsic
     : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
                  llvm_i64_ty, llvm_anyptr_ty],
-                [IntrArgMemOnly, NoCapture<3>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<3>>]>;
 
   class AdvSIMD_3Vec_Load_Intrinsic
     : Intrinsic<[LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty],
@@ -519,12 +519,12 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_3Vec_Store_Intrinsic
     : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
                      LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrArgMemOnly, NoCapture<3>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<3>>]>;
   class AdvSIMD_3Vec_Store_Lane_Intrinsic
     : Intrinsic<[], [llvm_anyvector_ty,
                  LLVMMatchType<0>, LLVMMatchType<0>,
                  llvm_i64_ty, llvm_anyptr_ty],
-                [IntrArgMemOnly, NoCapture<4>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<4>>]>;
 
   class AdvSIMD_4Vec_Load_Intrinsic
     : Intrinsic<[LLVMMatchType<0>, LLVMMatchType<0>,
@@ -542,12 +542,12 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>,
                  LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrArgMemOnly, NoCapture<4>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<4>>]>;
   class AdvSIMD_4Vec_Store_Lane_Intrinsic
     : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
                  LLVMMatchType<0>, LLVMMatchType<0>,
                  llvm_i64_ty, llvm_anyptr_ty],
-                [IntrArgMemOnly, NoCapture<5>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<5>>]>;
 }
 
 // Memory ops
@@ -744,20 +744,20 @@ def int_aarch64_irg_sp   : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty],
 //   ADDG ptr1, baseptr, (ptr0 - baseptr), tag_offset
 // It is intended that ptr0 is an alloca address, and baseptr is the direct output of llvm.aarch64.irg.sp.
 def int_aarch64_tagp : Intrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>, llvm_ptr_ty, llvm_i64_ty],
-    [IntrNoMem, ImmArg<2>]>;
+    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 // Update allocation tags for the memory range to match the tag in the pointer argument.
 def int_aarch64_settag  : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty],
-    [IntrWriteMem, IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>;
+    [IntrWriteMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>]>;
 
 // Update allocation tags for the memory range to match the tag in the pointer argument,
 // and set memory contents to zero.
 def int_aarch64_settag_zero  : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty],
-    [IntrWriteMem, IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>;
+    [IntrWriteMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>]>;
 
 // Update allocation tags for 16-aligned, 16-sized memory region, and store a pair 8-byte values.
 def int_aarch64_stgp  : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty, llvm_i64_ty],
-    [IntrWriteMem, IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>;
+    [IntrWriteMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>]>;
 }
 
 // Transactional Memory Extension (TME) Intrinsics
@@ -768,7 +768,7 @@ def int_aarch64_tstart  : GCCBuiltin<"__builtin_arm_tstart">,
 def int_aarch64_tcommit : GCCBuiltin<"__builtin_arm_tcommit">, Intrinsic<[]>;
 
 def int_aarch64_tcancel : GCCBuiltin<"__builtin_arm_tcancel">,
-                          Intrinsic<[], [llvm_i64_ty], [ImmArg<0>]>;
+                          Intrinsic<[], [llvm_i64_ty], [ImmArg<ArgIndex<0>>]>;
 
 def int_aarch64_ttest   : GCCBuiltin<"__builtin_arm_ttest">,
                           Intrinsic<[llvm_i64_ty], [],
@@ -800,26 +800,26 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [llvm_anyvector_ty,
                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                  LLVMPointerToElt<0>],
-                [IntrArgMemOnly, NoCapture<2>]>;
+                [IntrArgMemOnly, NoCapture<ArgIndex<2>>]>;
 
   class AdvSIMD_2Vec_PredStore_Intrinsic
       : Intrinsic<[],
                   [llvm_anyvector_ty, LLVMMatchType<0>,
                    LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerTo<0>],
-                  [IntrArgMemOnly, NoCapture<3>]>;
+                  [IntrArgMemOnly, NoCapture<ArgIndex<3>>]>;
 
   class AdvSIMD_3Vec_PredStore_Intrinsic
       : Intrinsic<[],
                   [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
                    LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerTo<0>],
-                  [IntrArgMemOnly, NoCapture<4>]>;
+                  [IntrArgMemOnly, NoCapture<ArgIndex<4>>]>;
 
   class AdvSIMD_4Vec_PredStore_Intrinsic
       : Intrinsic<[],
                   [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
                    LLVMMatchType<0>,
                    LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMPointerTo<0>],
-                  [IntrArgMemOnly, NoCapture<5>]>;
+                  [IntrArgMemOnly, NoCapture<ArgIndex<5>>]>;
 
   class AdvSIMD_SVE_Index_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -839,7 +839,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [LLVMMatchType<0>,
                  LLVMMatchType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<2>]>;
+                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   class AdvSIMD_3VectorArgIndexed_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -847,7 +847,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMMatchType<0>,
                  LLVMMatchType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<3>]>;
+                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   class AdvSIMD_Pred1VectorArg_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -895,7 +895,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [LLVMMatchType<0>,
                  llvm_i32_ty,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+                [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
   class AdvSIMD_SVE_Saturating_N_Intrinsic<LLVMType T>
     : Intrinsic<[T],
@@ -905,7 +905,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic<LLVMType T>
     : Intrinsic<[T],
                 [T, llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+                [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
   class AdvSIMD_SVE_CNT_Intrinsic
     : Intrinsic<[LLVMVectorOfBitcastsToInt<0>],
@@ -926,7 +926,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                  LLVMMatchType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<2>]>;
+                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   class AdvSIMD_SVE_ShiftWide_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -946,7 +946,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMMatchType<0>,
                  LLVMMatchType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<3>]>;
+                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   class AdvSIMD_SVE_CMLA_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -955,7 +955,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMMatchType<0>,
                  LLVMMatchType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<4>]>;
+                [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 
   class AdvSIMD_SVE_CMLA_LANE_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -964,7 +964,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMMatchType<0>,
                  llvm_i32_ty,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<3>, ImmArg<4>]>;
+                [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
   class AdvSIMD_SVE_DUP_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -1011,7 +1011,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_SVE_PTRUE_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [llvm_i32_ty],
-                [IntrNoMem, ImmArg<0>]>;
+                [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
   class AdvSIMD_SVE_PUNPKHI_Intrinsic
     : Intrinsic<[LLVMHalfElementsVectorType<0>],
@@ -1041,7 +1041,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class AdvSIMD_SVE_CNTB_Intrinsic
     : Intrinsic<[llvm_i64_ty],
                 [llvm_i32_ty],
-                [IntrNoMem, ImmArg<0>]>;
+                [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
   class AdvSIMD_SVE_CNTP_Intrinsic
     : Intrinsic<[llvm_i64_ty],
@@ -1061,7 +1061,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMSubdivide4VectorType<0>,
                  LLVMSubdivide4VectorType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<3>]>;
+                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   class AdvSIMD_SVE_PTEST_Intrinsic
     : Intrinsic<[llvm_i1_ty],
@@ -1086,7 +1086,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMSubdivide2VectorType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<1>]>;
+                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   class SVE2_2VectorArg_Long_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -1099,7 +1099,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
               [LLVMSubdivide2VectorType<0>,
                LLVMSubdivide2VectorType<0>,
                llvm_i32_ty],
-              [IntrNoMem, ImmArg<2>]>;
+              [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   class SVE2_2VectorArg_Wide_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -1127,7 +1127,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMSubdivide2VectorType<0>,
                  LLVMSubdivide2VectorType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<3>]>;
+                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   class SVE2_1VectorArg_Narrowing_Intrinsic
     : Intrinsic<[LLVMSubdivide2VectorType<0>],
@@ -1154,13 +1154,13 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
   class SVE2_1VectorArg_Imm_Narrowing_Intrinsic
       : Intrinsic<[LLVMSubdivide2VectorType<0>],
                   [llvm_anyvector_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   class SVE2_2VectorArg_Imm_Narrowing_Intrinsic
       : Intrinsic<[LLVMSubdivide2VectorType<0>],
                   [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty,
                    llvm_i32_ty],
-                  [IntrNoMem, ImmArg<2>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   class SVE2_CONFLICT_DETECT_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -1173,7 +1173,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMSubdivide2VectorType<0>,
                  LLVMSubdivide2VectorType<0>,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<3>]>;
+                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   class AdvSIMD_SVE_CDOT_LANE_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -1182,7 +1182,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMSubdivide4VectorType<0>,
                  llvm_i32_ty,
                  llvm_i32_ty],
-                [IntrNoMem, ImmArg<3>, ImmArg<4>]>;
+                [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
   // NOTE: There is no relationship between these intrinsics beyond an attempt
   // to reuse currently identical class definitions.
@@ -1283,7 +1283,7 @@ class SVE_gather_prf_SV
                   llvm_anyvector_ty, // Offsets
                   llvm_i32_ty // Prfop
                 ],
-                [IntrInaccessibleMemOrArgMemOnly, NoCapture<1>, ImmArg<3>]>;
+                [IntrInaccessibleMemOrArgMemOnly, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<3>>]>;
 
 class SVE_gather_prf_VS
     : Intrinsic<[],
@@ -1293,7 +1293,7 @@ class SVE_gather_prf_VS
                   llvm_i64_ty, // Scalar offset
                   llvm_i32_ty // Prfop
                 ],
-                [IntrInaccessibleMemOrArgMemOnly, ImmArg<3>]>;
+                [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<3>>]>;
 
 class SVE_MatMul_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -1329,7 +1329,7 @@ def int_aarch64_sve_stnt1 : AdvSIMD_1Vec_PredStore_Intrinsic;
 
 def int_aarch64_sve_prf
   : Intrinsic<[], [llvm_anyvector_ty, llvm_ptr_ty, llvm_i32_ty],
-                  [IntrArgMemOnly, ImmArg<2>]>;
+                  [IntrArgMemOnly, ImmArg<ArgIndex<2>>]>;
 
 // Scalar + 32-bit scaled offset vector, zero extend, packed and
 // unpacked.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 221d723c3e4aa..132d6b7360f74 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -177,7 +177,7 @@ def int_amdgcn_implicit_buffer_ptr :
 // FIXME: Should be mangled for wave size.
 def int_amdgcn_init_exec : Intrinsic<[],
   [llvm_i64_ty],      // 64-bit literal constant
-  [IntrConvergent, ImmArg<0>]>;
+  [IntrConvergent, ImmArg<ArgIndex<0>>]>;
 
 // Set EXEC according to a thread count packed in an SGPR input:
 //    thread_count = (input >> bitoffset) & 0x7f;
@@ -185,7 +185,7 @@ def int_amdgcn_init_exec : Intrinsic<[],
 def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [llvm_i32_ty,       // 32-bit SGPR input
    llvm_i32_ty],      // bit offset of the thread count
-  [IntrConvergent, ImmArg<1>]>;
+  [IntrConvergent, ImmArg<ArgIndex<1>>]>;
 
 def int_amdgcn_wavefrontsize :
   GCCBuiltin<"__builtin_amdgcn_wavefrontsize">,
@@ -200,10 +200,10 @@ def int_amdgcn_wavefrontsize :
 // the second one is copied to m0
 def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">,
   Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
-  [ImmArg<0>, IntrNoMem, IntrHasSideEffects]>;
+  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
 def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
   Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
-  [ImmArg<0>, IntrNoMem, IntrHasSideEffects]>;
+  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
 
 def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent]>;
@@ -212,7 +212,7 @@ def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrConvergent]>;
 
 def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
-  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
 def int_amdgcn_div_scale : Intrinsic<
   // 1st parameter: Numerator
@@ -221,7 +221,7 @@ def int_amdgcn_div_scale : Intrinsic<
   //                (0 = Denominator, 1 = Numerator).
   [llvm_anyfloat_ty, llvm_i1_ty],
   [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
-  [IntrNoMem, IntrSpeculatable, ImmArg<2>]
+  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
 >;
 
 def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
@@ -384,7 +384,7 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty], // isVolatile
-  [IntrArgMemOnly, NoCapture<0>, ImmArg<2>, ImmArg<3>, ImmArg<4>], "",
+  [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "",
   [SDNPMemOperand]
 >;
 
@@ -399,7 +399,7 @@ class AMDGPULDSF32Intrin<string clang_builtin> :
     llvm_i32_ty, // ordering
     llvm_i32_ty, // scope
     llvm_i1_ty], // isVolatile
-    [IntrArgMemOnly, NoCapture<0>, ImmArg<2>, ImmArg<3>, ImmArg<4>]
+    [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]
 >;
 
 // FIXME: The m0 argument should be moved after the normal arguments
@@ -416,9 +416,9 @@ class AMDGPUDSOrderedIntrinsic : Intrinsic<
                 // gfx10: bits 24-27 indicate the number of active threads/dwords
    llvm_i1_ty,  // wave release, usually set to 1
    llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
-  [NoCapture<0>,
-   ImmArg<2>, ImmArg<3>, ImmArg<4>,
-   ImmArg<5>, ImmArg<6>, ImmArg<7>
+  [NoCapture<ArgIndex<0>>,
+   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
+   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>
   ]
 >;
 
@@ -426,7 +426,7 @@ class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
   [llvm_i32_ty],
   [llvm_anyptr_ty, // LDS or GDS ptr
    llvm_i1_ty], // isVolatile
-   [IntrConvergent, IntrArgMemOnly, NoCapture<0>, ImmArg<1>],
+   [IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>],
    "",
    [SDNPMemOperand]
 >;
@@ -698,10 +698,10 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
       [llvm_i32_ty,                              // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
        llvm_i32_ty]),                            // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
      !listconcat(props,
-          !if(P_.IsAtomic, [], [ImmArg<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>]),
-          !if(P_.IsSample, [ImmArg<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>], []),
-          [ImmArg<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>,
-           ImmArg<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>]),
+          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
+          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
+          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
+           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),
       "", sdnodeprops>,
   AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                            !if(P_.IsAtomic, 0, 1)), 1> {
@@ -861,7 +861,7 @@ class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
-  [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>,
+  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
@@ -871,7 +871,7 @@ def int_amdgcn_s_buffer_load : Intrinsic <
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // byte offset(SGPR/imm)
    llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
-  [IntrNoMem, ImmArg<2>]>,
+  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
   AMDGPURsrcIntrinsic<0>;
 
 class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
@@ -882,7 +882,7 @@ class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
-  [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
+  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
 def int_amdgcn_buffer_store : AMDGPUBufferStore;
@@ -903,7 +903,7 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
                       //                                       bit 1 = slc,
                       //                                       bit 2 = dlc on gfx10+),
                       //                      swizzled buffer (bit 3 = swz))
-  [IntrReadMem, ImmArg<3>], "", [SDNPMemOperand]>,
+  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
@@ -918,7 +918,7 @@ class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
                       //                                       bit 1 = slc,
                       //                                       bit 2 = dlc on gfx10+),
                       //                      swizzled buffer (bit 3 = swz))
-  [IntrReadMem, ImmArg<4>], "", [SDNPMemOperand]>,
+  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
 def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
@@ -933,7 +933,7 @@ class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
                       //                                       bit 1 = slc,
                       //                                       bit 2 = dlc on gfx10+),
                       //                      swizzled buffer (bit 3 = swz))
-  [IntrWriteMem, ImmArg<4>], "", [SDNPMemOperand]>,
+  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
 def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
@@ -949,7 +949,7 @@ class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
                       //                                       bit 1 = slc,
                       //                                       bit 2 = dlc on gfx10+),
                       //                      swizzled buffer (bit 3 = swz))
-  [IntrWriteMem, ImmArg<5>], "", [SDNPMemOperand]>,
+  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
 def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
@@ -961,7 +961,7 @@ class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
-  [ImmArg<4>], "", [SDNPMemOperand]>,
+  [ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
 def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
@@ -983,7 +983,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
-  [ImmArg<5>], "", [SDNPMemOperand]>,
+  [ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
 class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
@@ -994,7 +994,7 @@ class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
-  [ImmArg<5>], "", [SDNPMemOperand]>,
+  [ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
 def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
@@ -1017,7 +1017,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
-  [ImmArg<6>], "", [SDNPMemOperand]>,
+  [ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
 // Obsolescent tbuffer intrinsics.
@@ -1032,8 +1032,8 @@ def int_amdgcn_tbuffer_load : Intrinsic <
      llvm_i32_ty,     // nfmt(imm)
      llvm_i1_ty,     // glc(imm)
      llvm_i1_ty],    // slc(imm)
-    [IntrReadMem, ImmArg<4>, ImmArg<5>, ImmArg<6>,
-     ImmArg<7>, ImmArg<8>], "", [SDNPMemOperand]>,
+    [IntrReadMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
+     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 
 def int_amdgcn_tbuffer_store : Intrinsic <
@@ -1048,8 +1048,8 @@ def int_amdgcn_tbuffer_store : Intrinsic <
      llvm_i32_ty,    // nfmt(imm)
      llvm_i1_ty,     // glc(imm)
      llvm_i1_ty],    // slc(imm)
-    [IntrWriteMem, ImmArg<5>, ImmArg<6>, ImmArg<7>,
-     ImmArg<8>, ImmArg<9>], "", [SDNPMemOperand]>,
+    [IntrWriteMem, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
+     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 
 // New tbuffer intrinsics, with:
@@ -1066,7 +1066,7 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic <
                       //                                       bit 1 = slc,
                       //                                       bit 2 = dlc on gfx10+),
                       //                      swizzled buffer (bit 3 = swz))
-    [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>,
+    [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 
 def int_amdgcn_raw_tbuffer_store : Intrinsic <
@@ -1080,7 +1080,7 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic <
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
-    [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
+    [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 
 def int_amdgcn_struct_tbuffer_load : Intrinsic <
@@ -1094,7 +1094,7 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic <
                       //                                       bit 1 = slc,
                       //                                       bit 2 = dlc on gfx10+),
                       //                      swizzled buffer (bit 3 = swz))
-    [IntrReadMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
+    [IntrReadMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 
 def int_amdgcn_struct_tbuffer_store : Intrinsic <
@@ -1109,7 +1109,7 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic <
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
-    [IntrWriteMem, ImmArg<5>, ImmArg<6>], "", [SDNPMemOperand]>,
+    [IntrWriteMem, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 
 class AMDGPUBufferAtomic : Intrinsic <
@@ -1119,7 +1119,7 @@ class AMDGPUBufferAtomic : Intrinsic <
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty],       // slc(imm)
-  [ImmArg<4>], "", [SDNPMemOperand]>,
+  [ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
 def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
@@ -1139,7 +1139,7 @@ def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty],       // slc(imm)
-  [ImmArg<5>], "", [SDNPMemOperand]>,
+  [ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
 } // defset AMDGPUBufferIntrinsics
@@ -1156,7 +1156,7 @@ def int_amdgcn_exp : Intrinsic <[], [
   llvm_i1_ty,        // done
   llvm_i1_ty         // vm
   ],
-  [ImmArg<0>, ImmArg<1>, ImmArg<6>, ImmArg<7>, IntrWriteMem, IntrInaccessibleMemOnly]
+  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly]
 >;
 
 // exp with compr bit set.
@@ -1167,7 +1167,7 @@ def int_amdgcn_exp_compr : Intrinsic <[], [
   LLVMMatchType<0>,  // src1
   llvm_i1_ty,        // done
   llvm_i1_ty],       // vm
-  [ImmArg<0>, ImmArg<1>, ImmArg<4>, ImmArg<5>, IntrWriteMem, IntrInaccessibleMemOnly]
+  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly]
 >;
 
 def int_amdgcn_buffer_wbinvl1_sc :
@@ -1188,23 +1188,23 @@ def int_amdgcn_s_memtime :
 
 def int_amdgcn_s_sleep :
   GCCBuiltin<"__builtin_amdgcn_s_sleep">,
-  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]> {
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]> {
 }
 
 def int_amdgcn_s_incperflevel :
   GCCBuiltin<"__builtin_amdgcn_s_incperflevel">,
-  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]> {
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]> {
 }
 
 def int_amdgcn_s_decperflevel :
   GCCBuiltin<"__builtin_amdgcn_s_decperflevel">,
-  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]> {
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]> {
 }
 
 def int_amdgcn_s_getreg :
   GCCBuiltin<"__builtin_amdgcn_s_getreg">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
-  [IntrInaccessibleMemOnly, IntrReadMem, IntrSpeculatable, ImmArg<0>]
+  [IntrInaccessibleMemOnly, IntrReadMem, IntrSpeculatable, ImmArg<ArgIndex<0>>]
 >;
 
 // int_amdgcn_s_getpc is provided to allow a specific style of position
@@ -1223,7 +1223,7 @@ def int_amdgcn_interp_mov :
   GCCBuiltin<"__builtin_amdgcn_interp_mov">,
   Intrinsic<[llvm_float_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrSpeculatable, ImmArg<0>, ImmArg<1>, ImmArg<2>]>;
+            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 // __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
 // This intrinsic reads from lds, but the memory values are constant,
@@ -1232,14 +1232,14 @@ def int_amdgcn_interp_p1 :
   GCCBuiltin<"__builtin_amdgcn_interp_p1">,
   Intrinsic<[llvm_float_ty],
             [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrSpeculatable, ImmArg<1>, ImmArg<2>]>;
+            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 // __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
 def int_amdgcn_interp_p2 :
   GCCBuiltin<"__builtin_amdgcn_interp_p2">,
   Intrinsic<[llvm_float_ty],
             [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrSpeculatable, ImmArg<2>, ImmArg<3>]>;
+            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
           // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.
 
 // __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
@@ -1247,14 +1247,14 @@ def int_amdgcn_interp_p1_f16 :
   GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">,
   Intrinsic<[llvm_float_ty],
             [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
-            [IntrNoMem, IntrSpeculatable, ImmArg<1>, ImmArg<2>, ImmArg<3>]>;
+            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 // __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
 def int_amdgcn_interp_p2_f16 :
   GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">,
   Intrinsic<[llvm_half_ty],
             [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
-            [IntrNoMem, IntrSpeculatable, ImmArg<2>, ImmArg<3>, ImmArg<4>]>;
+            [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
 // invocation for derivative computation).
@@ -1275,7 +1275,7 @@ def int_amdgcn_mbcnt_hi :
 def int_amdgcn_ds_swizzle :
   GCCBuiltin<"__builtin_amdgcn_ds_swizzle">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent, ImmArg<1>]>;
+            [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<1>>]>;
 
 def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
     [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
@@ -1343,11 +1343,11 @@ def int_amdgcn_cvt_pk_u8_f32 :
 
 def int_amdgcn_icmp :
   Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent, ImmArg<2>]>;
+            [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>]>;
 
 def int_amdgcn_fcmp :
   Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent, ImmArg<2>]>;
+            [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>]>;
 
 def int_amdgcn_ballot :
   Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem, IntrConvergent]>;
@@ -1500,13 +1500,13 @@ def int_amdgcn_set_inactive :
 // Return if the given flat pointer points to a local memory address.
 def int_amdgcn_is_shared : GCCBuiltin<"__builtin_amdgcn_is_shared">,
   Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
-  [IntrNoMem, IntrSpeculatable, NoCapture<0>]
+  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
 >;
 
 // Return if the given flat pointer points to a prvate memory address.
 def int_amdgcn_is_private : GCCBuiltin<"__builtin_amdgcn_is_private">,
   Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
-  [IntrNoMem, IntrSpeculatable, NoCapture<0>]
+  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
 >;
 
 //===----------------------------------------------------------------------===//
@@ -1529,8 +1529,8 @@ def int_amdgcn_buffer_wbinvl1_vol :
 def int_amdgcn_mov_dpp :
   Intrinsic<[llvm_anyint_ty],
             [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-             llvm_i1_ty], [IntrNoMem, IntrConvergent, ImmArg<1>,
-                           ImmArg<2>, ImmArg<3>, ImmArg<4>]>;
+             llvm_i1_ty], [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<1>>,
+                           ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
 // llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
 // Should be equivalent to:
@@ -1541,7 +1541,7 @@ def int_amdgcn_update_dpp :
             [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
              [IntrNoMem, IntrConvergent,
-              ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+              ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_s_dcache_wb :
   GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">,
@@ -1573,13 +1573,13 @@ def int_amdgcn_ds_bpermute :
 def int_amdgcn_permlane16 : GCCBuiltin<"__builtin_amdgcn_permlane16">,
   Intrinsic<[llvm_i32_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
-            [IntrNoMem, IntrConvergent, ImmArg<4>, ImmArg<5>]>;
+            [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 // llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
 def int_amdgcn_permlanex16 : GCCBuiltin<"__builtin_amdgcn_permlanex16">,
   Intrinsic<[llvm_i32_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
-            [IntrNoMem, IntrConvergent, ImmArg<4>, ImmArg<5>]>;
+            [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 // llvm.amdgcn.mov.dpp8.i32 <src> <sel>
 // <sel> is a 32-bit constant whose high 8 bits must be zero which selects
@@ -1587,7 +1587,7 @@ def int_amdgcn_permlanex16 : GCCBuiltin<"__builtin_amdgcn_permlanex16">,
 def int_amdgcn_mov_dpp8 :
   Intrinsic<[llvm_anyint_ty],
             [LLVMMatchType<0>, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent, ImmArg<1>]>;
+            [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<1>>]>;
 
 def int_amdgcn_s_get_waveid_in_workgroup :
   GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
@@ -1609,7 +1609,7 @@ def int_amdgcn_fdot2 :
       llvm_float_ty, // %c
       llvm_i1_ty     // %clamp
     ],
-    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
+    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
   >;
 
 // i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
@@ -1624,7 +1624,7 @@ def int_amdgcn_sdot2 :
       llvm_i32_ty,   // %c
       llvm_i1_ty     // %clamp
     ],
-    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
+    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
   >;
 
 // u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
@@ -1639,7 +1639,7 @@ def int_amdgcn_udot2 :
       llvm_i32_ty,   // %c
       llvm_i1_ty     // %clamp
     ],
-    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
+    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
   >;
 
 // i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
@@ -1654,7 +1654,7 @@ def int_amdgcn_sdot4 :
       llvm_i32_ty, // %c
       llvm_i1_ty   // %clamp
     ],
-    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
+    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
   >;
 
 // u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
@@ -1669,7 +1669,7 @@ def int_amdgcn_udot4 :
       llvm_i32_ty, // %c
       llvm_i1_ty   // %clamp
     ],
-    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
+    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
   >;
 
 // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
@@ -1685,7 +1685,7 @@ def int_amdgcn_sdot8 :
       llvm_i32_ty, // %c
       llvm_i1_ty   // %clamp
     ],
-    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
+    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
   >;
 
 // u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
@@ -1701,7 +1701,7 @@ def int_amdgcn_udot8 :
       llvm_i32_ty, // %c
       llvm_i1_ty   // %clamp
     ],
-    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
+    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
   >;
 
 //===----------------------------------------------------------------------===//
@@ -1722,7 +1722,7 @@ class AMDGPUGlobalAtomicNoRtn : Intrinsic <
   [],
   [llvm_anyptr_ty,    // vaddr
    llvm_anyfloat_ty],               // vdata(VGPR)
-  [IntrArgMemOnly, NoCapture<0>], "", [SDNPMemOperand]>;
+  [IntrArgMemOnly, NoCapture<ArgIndex<0>>], "", [SDNPMemOperand]>;
 
 def int_amdgcn_buffer_atomic_fadd    : AMDGPUBufferAtomicNoRtn;
 def int_amdgcn_global_atomic_fadd    : AMDGPUGlobalAtomicNoRtn;
@@ -1732,121 +1732,121 @@ def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32
   Intrinsic<[llvm_v32f32_ty],
             [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_16x16x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x1f32">,
   Intrinsic<[llvm_v16f32_ty],
             [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_4x4x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x1f32">,
   Intrinsic<[llvm_v4f32_ty],
             [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_32x32x2f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x2f32">,
   Intrinsic<[llvm_v16f32_ty],
             [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_16x16x4f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4f32">,
   Intrinsic<[llvm_v4f32_ty],
             [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_32x32x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4f16">,
   Intrinsic<[llvm_v32f32_ty],
             [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_16x16x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4f16">,
   Intrinsic<[llvm_v16f32_ty],
             [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_4x4x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x4f16">,
   Intrinsic<[llvm_v4f32_ty],
             [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_32x32x8f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x8f16">,
   Intrinsic<[llvm_v16f32_ty],
             [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_16x16x16f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x16f16">,
   Intrinsic<[llvm_v4f32_ty],
             [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_i32_32x32x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_32x32x4i8">,
   Intrinsic<[llvm_v32i32_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_v32i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_i32_16x16x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_16x16x4i8">,
   Intrinsic<[llvm_v16i32_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_i32_4x4x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_4x4x4i8">,
   Intrinsic<[llvm_v4i32_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_i32_32x32x8i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_32x32x8i8">,
   Intrinsic<[llvm_v16i32_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_i32_16x16x16i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_16x16x16i8">,
   Intrinsic<[llvm_v4i32_ty],
             [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_32x32x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x2bf16">,
   Intrinsic<[llvm_v32f32_ty],
             [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_16x16x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x2bf16">,
   Intrinsic<[llvm_v16f32_ty],
             [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_4x4x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x2bf16">,
   Intrinsic<[llvm_v4f32_ty],
             [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_32x32x4bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4bf16">,
   Intrinsic<[llvm_v16f32_ty],
             [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 def int_amdgcn_mfma_f32_16x16x8bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x8bf16">,
   Intrinsic<[llvm_v4f32_ty],
             [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+            [IntrConvergent, IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 1606d666fa6a9..adeafbb267b2b 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -19,7 +19,7 @@ let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
 // A space-consuming intrinsic primarily for testing ARMConstantIslands. The
 // first argument is the number of bytes this "instruction" takes up, the second
 // and return value are essentially chains, used to force ordering during ISel.
-def int_arm_space : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>]>;
+def int_arm_space : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
 // 16-bit multiplications
 def int_arm_smulbb : GCCBuiltin<"__builtin_arm_smulbb">,
@@ -262,59 +262,59 @@ def int_arm_vcvtru    : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
 // Coprocessor
 
 def int_arm_ldc : GCCBuiltin<"__builtin_arm_ldc">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 def int_arm_ldcl : GCCBuiltin<"__builtin_arm_ldcl">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 def int_arm_ldc2 : GCCBuiltin<"__builtin_arm_ldc2">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 def int_arm_ldc2l : GCCBuiltin<"__builtin_arm_ldc2l">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 
 def int_arm_stc : GCCBuiltin<"__builtin_arm_stc">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 def int_arm_stcl : GCCBuiltin<"__builtin_arm_stcl">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 def int_arm_stc2 : GCCBuiltin<"__builtin_arm_stc2">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 def int_arm_stc2l : GCCBuiltin<"__builtin_arm_stc2l">,
-   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<0>, ImmArg<1>]>;
+   Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 
 // Move to coprocessor
 def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
    Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
    Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 // Move from coprocessor
 def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
                   MSBuiltin<"_MoveFromCoprocessor">,
    Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                             llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>]>;
+                             llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
                    MSBuiltin<"_MoveFromCoprocessor2">,
    Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                             llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>]>;
+                             llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
 // Coprocessor data processing
 def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">,
    Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">,
    Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
+                  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
 // Move from two registers to coprocessor
 def int_arm_mcrr : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                                  llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<4>]>;
+                                  llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
 def int_arm_mcrr2 : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
-                                   llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<4>]>;
+                                   llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
 
 def int_arm_mrrc : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty,
-                              llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>]>;
+                              llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 def int_arm_mrrc2 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty,
-                               llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, ImmArg<1>, ImmArg<2>]>;
+                               llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 //===----------------------------------------------------------------------===//
 // CRC32
@@ -695,16 +695,16 @@ def int_arm_neon_vst4 : Intrinsic<[],
 def int_arm_neon_vst1x2 : Intrinsic<[],
                                     [llvm_anyptr_ty, llvm_anyvector_ty,
                                      LLVMMatchType<1>],
-                                    [IntrArgMemOnly, NoCapture<0>]>;
+                                    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 def int_arm_neon_vst1x3 : Intrinsic<[],
                                     [llvm_anyptr_ty, llvm_anyvector_ty,
                                      LLVMMatchType<1>, LLVMMatchType<1>],
-                                    [IntrArgMemOnly, NoCapture<0>]>;
+                                    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 def int_arm_neon_vst1x4 : Intrinsic<[],
                                     [llvm_anyptr_ty, llvm_anyvector_ty,
                                      LLVMMatchType<1>, LLVMMatchType<1>,
                                      LLVMMatchType<1>],
-                                    [IntrArgMemOnly, NoCapture<0>]>;
+                                    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
 // Vector store N-element structure from one lane.
 // Source operands are: the address, the N vectors, the lane number, and
@@ -1297,22 +1297,22 @@ multiclass CDEGPRIntrinsics<list<LLVMType> args> {
   def "" : Intrinsic<
     [llvm_i32_ty],
     !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 1)>>]>;
   def a : Intrinsic<
     [llvm_i32_ty],
     !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc */], args,
                 [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 2)>>]>;
 
   def d: Intrinsic<
     [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */],
     !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 1)>>]>;
   def da: Intrinsic<
     [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */],
     !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc_lo */,
                  llvm_i32_ty /* acc_hi */], args, [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 3)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 3)>>]>;
 }
 
 defm int_arm_cde_cx1: CDEGPRIntrinsics<[]>;
@@ -1323,12 +1323,12 @@ multiclass CDEVCXIntrinsics<list<LLVMType> args> {
   def "" : Intrinsic<
     [llvm_anyfloat_ty],
     !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 1)>>]>;
   def a : Intrinsic<
     [llvm_anyfloat_ty],
     !listconcat([llvm_i32_ty /* coproc */,  LLVMMatchType<0> /* acc */],
                 args, [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 2)>>]>;
 }
 
 defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>;
@@ -1339,23 +1339,23 @@ multiclass CDEVCXVecIntrinsics<list<LLVMType> args> {
   def "" : Intrinsic<
     [llvm_v16i8_ty],
     !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 1)>>]>;
   def a : Intrinsic<
     [llvm_v16i8_ty],
     !listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */],
                 args, [llvm_i32_ty /* imm */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 2)>>]>;
 
   def _predicated : Intrinsic<
     [llvm_anyvector_ty],
     !listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* inactive */],
                 args, [llvm_i32_ty /* imm */, llvm_anyvector_ty /* mask */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 2)>>]>;
   def a_predicated : Intrinsic<
     [llvm_anyvector_ty],
     !listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* acc */],
                 args, [llvm_i32_ty /* imm */, llvm_anyvector_ty /* mask */]),
-    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+    [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<!add(!size(args), 2)>>]>;
 }
 
 defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsBPF.td b/llvm/include/llvm/IR/IntrinsicsBPF.td
index a43ca0e2569bc..c4d35b2a0a88c 100644
--- a/llvm/include/llvm/IR/IntrinsicsBPF.td
+++ b/llvm/include/llvm/IR/IntrinsicsBPF.td
@@ -22,7 +22,7 @@ let TargetPrefix = "bpf" in {  // All intrinsics start with "llvm.bpf."
               Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty]>;
   def int_bpf_preserve_field_info : GCCBuiltin<"__builtin_bpf_preserve_field_info">,
               Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty, llvm_i64_ty],
-              [IntrNoMem, ImmArg<1>]>;
+              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_bpf_btf_type_id : GCCBuiltin<"__builtin_bpf_btf_type_id">,
               Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_any_ty, llvm_i64_ty],
               [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td
index 163396365e750..fe16a361ba3d6 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagon.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td
@@ -51,19 +51,19 @@ class Hexagon_mem_memmemsisi_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
                            llvm_i32_ty, llvm_i32_ty],
-                          [IntrArgMemOnly, ImmArg<3>]>;
+                          [IntrArgMemOnly, ImmArg<ArgIndex<3>>]>;
 
 class Hexagon_mem_memsisisi_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty,
                            llvm_i32_ty, llvm_i32_ty],
-                          [IntrWriteMem, ImmArg<3>]>;
+                          [IntrWriteMem, ImmArg<ArgIndex<3>>]>;
 
 class Hexagon_mem_memdisisi_Intrinsic<string GCCIntSuffix>
   : Hexagon_Intrinsic<GCCIntSuffix,
                           [llvm_ptr_ty], [llvm_ptr_ty, llvm_i64_ty,
                            llvm_i32_ty, llvm_i32_ty],
-                          [IntrWriteMem, ImmArg<3>]>;
+                          [IntrWriteMem, ImmArg<ArgIndex<3>>]>;
 
 //
 // BUILTIN_INFO_NONCONST(circ_ldd,PTR_ftype_PTRPTRSISI,4)
@@ -131,34 +131,34 @@ def llvm_ptr64_ty : LLVMPointerType<llvm_i64_ty>;
 // Mark locked loads as read/write to prevent any accidental reordering.
 def int_hexagon_L2_loadw_locked :
 Hexagon_Intrinsic<"HEXAGON_L2_loadw_locked", [llvm_i32_ty], [llvm_ptr32_ty],
-      [IntrArgMemOnly, NoCapture<0>]>;
+      [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 def int_hexagon_L4_loadd_locked :
 Hexagon_Intrinsic<"HEXAGON_L4_loadd_locked", [llvm_i64_ty], [llvm_ptr64_ty],
-      [IntrArgMemOnly, NoCapture<0>]>;
+      [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
 def int_hexagon_S2_storew_locked :
 Hexagon_Intrinsic<"HEXAGON_S2_storew_locked", [llvm_i32_ty],
-      [llvm_ptr32_ty, llvm_i32_ty], [IntrArgMemOnly, NoCapture<0>]>;
+      [llvm_ptr32_ty, llvm_i32_ty], [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 def int_hexagon_S4_stored_locked :
 Hexagon_Intrinsic<"HEXAGON_S4_stored_locked", [llvm_i32_ty],
-      [llvm_ptr64_ty, llvm_i64_ty], [IntrArgMemOnly, NoCapture<0>]>;
+      [llvm_ptr64_ty, llvm_i64_ty], [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
 def int_hexagon_vmemcpy : Hexagon_Intrinsic<"hexagon_vmemcpy",
     [], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty],
-    [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;
 
 def int_hexagon_vmemset : Hexagon_Intrinsic<"hexagon_vmemset",
     [], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
-    [IntrArgMemOnly, NoCapture<0>, WriteOnly<0>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>]>;
 
 multiclass Hexagon_custom_circ_ld_Intrinsic<LLVMType ElTy> {
   def NAME#_pci : Hexagon_NonGCC_Intrinsic<
     [ElTy, llvm_ptr_ty],
     [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<3>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<3>>]>;
   def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
     [ElTy, llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<2>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<2>>]>;
 }
 
 defm int_hexagon_L2_loadrub : Hexagon_custom_circ_ld_Intrinsic<llvm_i32_ty>;
@@ -172,10 +172,10 @@ multiclass Hexagon_custom_circ_st_Intrinsic<LLVMType ElTy> {
   def NAME#_pci : Hexagon_NonGCC_Intrinsic<
     [llvm_ptr_ty],
     [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<4>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<4>>]>;
   def NAME#_pcr : Hexagon_NonGCC_Intrinsic<
     [llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty, ElTy, llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<3>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<3>>]>;
 }
 
 defm int_hexagon_S2_storerb : Hexagon_custom_circ_st_Intrinsic<llvm_i32_ty>;
diff --git a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
index 67a06f5c06f4e..198b6a7ab0d1e 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td
@@ -1100,10 +1100,10 @@ def int_hexagon_C2_cmpgtup :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_C2_cmpgtup">;
 
 def int_hexagon_A4_rcmpeqi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpeqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpeqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_rcmpneqi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpneqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpneqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_rcmpeq :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_rcmpeq">;
@@ -1124,19 +1124,19 @@ def int_hexagon_C4_nbitsclr :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsclr">;
 
 def int_hexagon_C2_cmpeqi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpeqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpeqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C2_cmpgti :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgti", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgti", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C2_cmpgtui :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgtui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgtui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C2_cmpgei :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgei", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgei", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C2_cmpgeui :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgeui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpgeui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C2_cmplt :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmplt">;
@@ -1145,19 +1145,19 @@ def int_hexagon_C2_cmpltu :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_cmpltu">;
 
 def int_hexagon_C2_bitsclri :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsclri", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C2_bitsclri", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C4_nbitsclri :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsclri", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_nbitsclri", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C4_cmpneqi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpneqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpneqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C4_cmpltei :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpltei", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpltei", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C4_cmplteui :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplteui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmplteui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C4_cmpneq :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_C4_cmpneq">;
@@ -1226,13 +1226,13 @@ def int_hexagon_C2_mux :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_mux">;
 
 def int_hexagon_C2_muxii :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxii", [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxii", [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_C2_muxir :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxir", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxir", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_C2_muxri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxri", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_C2_muxri", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_C2_vmux :
 Hexagon_i64_i32i64i64_Intrinsic<"HEXAGON_C2_vmux">;
@@ -1244,7 +1244,7 @@ def int_hexagon_A2_vcmpbeq :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpbeq">;
 
 def int_hexagon_A4_vcmpbeqi :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbeqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbeqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_vcmpbeq_any :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A4_vcmpbeq_any">;
@@ -1253,31 +1253,31 @@ def int_hexagon_A2_vcmpbgtu :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpbgtu">;
 
 def int_hexagon_A4_vcmpbgtui :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgtui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgtui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_vcmpbgt :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A4_vcmpbgt">;
 
 def int_hexagon_A4_vcmpbgti :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgti", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpbgti", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_cmpbeq :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbeq">;
 
 def int_hexagon_A4_cmpbeqi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbeqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbeqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_cmpbgtu :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgtu">;
 
 def int_hexagon_A4_cmpbgtui :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgtui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgtui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_cmpbgt :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgt">;
 
 def int_hexagon_A4_cmpbgti :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgti", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpbgti", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_vcmpheq :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpheq">;
@@ -1289,13 +1289,13 @@ def int_hexagon_A2_vcmphgtu :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmphgtu">;
 
 def int_hexagon_A4_vcmpheqi :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpheqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpheqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_vcmphgti :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgti", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgti", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_vcmphgtui :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgtui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmphgtui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_cmpheq :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpheq">;
@@ -1307,13 +1307,13 @@ def int_hexagon_A4_cmphgtu :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgtu">;
 
 def int_hexagon_A4_cmpheqi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpheqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmpheqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_cmphgti :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgti", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgti", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_cmphgtui :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgtui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cmphgtui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_vcmpweq :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpweq">;
@@ -1325,13 +1325,13 @@ def int_hexagon_A2_vcmpwgtu :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_A2_vcmpwgtu">;
 
 def int_hexagon_A4_vcmpweqi :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpweqi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpweqi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_vcmpwgti :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgti", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgti", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_vcmpwgtui :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgtui", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_A4_vcmpwgtui", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_boundscheck :
 Hexagon_i32_i32i64_Intrinsic<"HEXAGON_A4_boundscheck">;
@@ -1784,13 +1784,13 @@ def int_hexagon_M2_mpyud_ll_s1 :
 Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_mpyud_ll_s1">;
 
 def int_hexagon_M2_mpysmi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysmi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_M2_mpysmi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_M2_macsip :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsip", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsip", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_M2_macsin :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsin", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_macsin", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_M2_dpmpyss_s0 :
 Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_dpmpyss_s0">;
@@ -1847,13 +1847,13 @@ def int_hexagon_M2_acci :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_acci">;
 
 def int_hexagon_M2_accii :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_accii", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_accii", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_M2_nacci :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_nacci">;
 
 def int_hexagon_M2_naccii :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_naccii", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_naccii", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_M2_subacc :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M2_subacc">;
@@ -1862,16 +1862,16 @@ def int_hexagon_M4_mpyrr_addr :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyrr_addr">;
 
 def int_hexagon_M4_mpyri_addr_u2 :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr_u2", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr_u2", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_M4_mpyri_addr :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addr", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_M4_mpyri_addi :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addi", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyri_addi", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_M4_mpyrr_addi :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyrr_addi", [IntrNoMem, ImmArg<0>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_mpyrr_addi", [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_M2_vmpy2s_s0 :
 Hexagon_i64_i32i32_Intrinsic<"HEXAGON_M2_vmpy2s_s0">;
@@ -2234,10 +2234,10 @@ def int_hexagon_S2_vcrotate :
 Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_vcrotate">;
 
 def int_hexagon_S4_vrcrotate_acc :
-Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate_acc", [IntrNoMem, ImmArg<3>]>;
+Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate_acc", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_S4_vrcrotate :
-Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_vrcrotate", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_vcnegh :
 Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_vcnegh">;
@@ -2270,7 +2270,7 @@ def int_hexagon_A2_subsat :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subsat">;
 
 def int_hexagon_A2_addi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_addh_l16_ll :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_addh_l16_ll">;
@@ -2411,13 +2411,13 @@ def int_hexagon_A2_tfr :
 Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfr">;
 
 def int_hexagon_A2_tfrsi :
-Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrsi", [IntrNoMem, ImmArg<0>]>;
+Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_tfrsi", [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_A2_tfrp :
 Hexagon_i64_i64_Intrinsic<"HEXAGON_A2_tfrp">;
 
 def int_hexagon_A2_tfrpi :
-Hexagon_i64_i32_Intrinsic<"HEXAGON_A2_tfrpi", [IntrNoMem, ImmArg<0>]>;
+Hexagon_i64_i32_Intrinsic<"HEXAGON_A2_tfrpi", [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_A2_zxtb :
 Hexagon_i32_i32_Intrinsic<"HEXAGON_A2_zxtb">;
@@ -2435,13 +2435,13 @@ def int_hexagon_A2_combinew :
 Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A2_combinew">;
 
 def int_hexagon_A4_combineri :
-Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineri", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineri", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_combineir :
-Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineir", [IntrNoMem, ImmArg<0>]>;
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_combineir", [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_A2_combineii :
-Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A2_combineii", [IntrNoMem, ImmArg<0>, ImmArg<1>]>;
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A2_combineii", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_combine_hh :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_hh">;
@@ -2456,10 +2456,10 @@ def int_hexagon_A2_combine_ll :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_combine_ll">;
 
 def int_hexagon_A2_tfril :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfril", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfril", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_tfrih :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfrih", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_tfrih", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_and :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_and">;
@@ -2492,10 +2492,10 @@ def int_hexagon_A4_ornp :
 Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A4_ornp">;
 
 def int_hexagon_S4_addaddi :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addaddi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addaddi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_subaddi :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subaddi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subaddi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_M4_and_and :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_and_and">;
@@ -2522,13 +2522,13 @@ def int_hexagon_M4_or_xor :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_or_xor">;
 
 def int_hexagon_S4_or_andix :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andix", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andix", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_or_andi :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_andi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_or_ori :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_ori", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_or_ori", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_M4_xor_and :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_and">;
@@ -2540,13 +2540,13 @@ def int_hexagon_M4_xor_andn :
 Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_M4_xor_andn">;
 
 def int_hexagon_A2_subri :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subri", [IntrNoMem, ImmArg<0>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_subri", [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_A2_andir :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_andir", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_andir", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_orir :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_orir", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A2_orir", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A2_andp :
 Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_andp">;
@@ -2768,19 +2768,19 @@ def int_hexagon_A2_vnavghr :
 Hexagon_i64_i64i64_Intrinsic<"HEXAGON_A2_vnavghr">;
 
 def int_hexagon_A4_round_ri :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_round_rr :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_rr">;
 
 def int_hexagon_A4_round_ri_sat :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri_sat", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_ri_sat", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_round_rr_sat :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_round_rr_sat">;
 
 def int_hexagon_A4_cround_ri :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cround_ri", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cround_ri", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_cround_rr :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A4_cround_rr">;
@@ -2891,13 +2891,13 @@ def int_hexagon_F2_sfmin :
 Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sfmin", [IntrNoMem, Throws]>;
 
 def int_hexagon_F2_sfclass :
-Hexagon_i32_floati32_Intrinsic<"HEXAGON_F2_sfclass", [IntrNoMem, Throws, ImmArg<1>]>;
+Hexagon_i32_floati32_Intrinsic<"HEXAGON_F2_sfclass", [IntrNoMem, Throws, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_F2_sfimm_p :
-Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_p", [IntrNoMem, Throws, ImmArg<0>]>;
+Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_p", [IntrNoMem, Throws, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_F2_sfimm_n :
-Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_n", [IntrNoMem, Throws, ImmArg<0>]>;
+Hexagon_float_i32_Intrinsic<"HEXAGON_F2_sfimm_n", [IntrNoMem, Throws, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_F2_sffixupn :
 Hexagon_float_floatfloat_Intrinsic<"HEXAGON_F2_sffixupn", [IntrNoMem, Throws]>;
@@ -2921,13 +2921,13 @@ def int_hexagon_F2_dfcmpuo :
 Hexagon_i32_doubledouble_Intrinsic<"HEXAGON_F2_dfcmpuo", [IntrNoMem, Throws]>;
 
 def int_hexagon_F2_dfclass :
-Hexagon_i32_doublei32_Intrinsic<"HEXAGON_F2_dfclass", [IntrNoMem, Throws, ImmArg<1>]>;
+Hexagon_i32_doublei32_Intrinsic<"HEXAGON_F2_dfclass", [IntrNoMem, Throws, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_F2_dfimm_p :
-Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_p", [IntrNoMem, Throws, ImmArg<0>]>;
+Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_p", [IntrNoMem, Throws, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_F2_dfimm_n :
-Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_n", [IntrNoMem, Throws, ImmArg<0>]>;
+Hexagon_double_i32_Intrinsic<"HEXAGON_F2_dfimm_n", [IntrNoMem, Throws, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_F2_conv_sf2df :
 Hexagon_double_float_Intrinsic<"HEXAGON_F2_conv_sf2df">;
@@ -3146,160 +3146,160 @@ def int_hexagon_S2_asl_r_r_sat :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_r_r_sat">;
 
 def int_hexagon_S2_asr_i_r :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_lsr_i_r :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asl_i_r :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_i_p :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_lsr_i_p :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asl_i_p :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_p", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_p", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_i_r_acc :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_r_acc :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_r_acc :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asr_i_p_acc :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_p_acc :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_p_acc :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asr_i_r_nac :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_r_nac :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_r_nac :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asr_i_p_nac :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_p_nac :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_p_nac :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_r_xacc :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_xacc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_xacc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_r_xacc :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_xacc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_xacc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_p_xacc :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_xacc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_xacc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_p_xacc :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_xacc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_xacc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asr_i_r_and :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_r_and :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_r_and :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asr_i_r_or :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_r_or :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_lsr_i_r_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_r_or :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asr_i_p_and :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_p_and :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_p_and :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asr_i_p_or :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_lsr_i_p_or :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_lsr_i_p_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_p_or :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_asl_i_p_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_asl_i_r_sat :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_sat", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asl_i_r_sat", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_i_r_rnd :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_i_r_rnd_goodsyntax :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd_goodsyntax", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_asr_i_r_rnd_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_i_p_rnd :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_i_p_rnd_goodsyntax :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd_goodsyntax", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_p_rnd_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S4_lsli :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_lsli", [IntrNoMem, ImmArg<0>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_lsli", [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
 def int_hexagon_S2_addasl_rrri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_addasl_rrri", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_addasl_rrri", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_andi_asl_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_asl_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_asl_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_ori_asl_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_asl_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_asl_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_addi_asl_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_asl_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_asl_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_subi_asl_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_asl_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_asl_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_andi_lsr_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_lsr_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_andi_lsr_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_ori_lsr_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_lsr_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_ori_lsr_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_addi_lsr_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_lsr_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_addi_lsr_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S4_subi_lsr_ri :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_lsr_ri", [IntrNoMem, ImmArg<0>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_subi_lsr_ri", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_valignib :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_valignib", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_valignib", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_valignrb :
 Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_valignrb">;
 
 def int_hexagon_S2_vspliceib :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vspliceib", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vspliceib", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_vsplicerb :
 Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S2_vsplicerb">;
@@ -3311,40 +3311,40 @@ def int_hexagon_S2_vsplatrb :
 Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_vsplatrb">;
 
 def int_hexagon_S2_insert :
-Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_insert", [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_insert", [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_S2_tableidxb_goodsyntax :
-Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxb_goodsyntax", [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxb_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_S2_tableidxh_goodsyntax :
-Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxh_goodsyntax", [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxh_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_S2_tableidxw_goodsyntax :
-Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxw_goodsyntax", [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxw_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_S2_tableidxd_goodsyntax :
-Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxd_goodsyntax", [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+Hexagon_i32_i32i32i32i32_Intrinsic<"HEXAGON_S2_tableidxd_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_A4_bitspliti :
-Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_bitspliti", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_bitspliti", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A4_bitsplit :
 Hexagon_i64_i32i32_Intrinsic<"HEXAGON_A4_bitsplit">;
 
 def int_hexagon_S4_extract :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_extract", [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S4_extract", [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_extractu :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_extractu", [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S2_extractu", [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_insertp :
-Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S2_insertp", [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+Hexagon_i64_i64i64i32i32_Intrinsic<"HEXAGON_S2_insertp", [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_S4_extractp :
-Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_extractp", [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S4_extractp", [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_extractup :
-Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S2_extractup", [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+Hexagon_i64_i64i32i32_Intrinsic<"HEXAGON_S2_extractup", [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S2_insert_rp :
 Hexagon_i32_i32i32i64_Intrinsic<"HEXAGON_S2_insert_rp">;
@@ -3365,19 +3365,19 @@ def int_hexagon_S2_extractup_rp :
 Hexagon_i64_i64i64_Intrinsic<"HEXAGON_S2_extractup_rp">;
 
 def int_hexagon_S2_tstbit_i :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_tstbit_i", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_tstbit_i", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S4_ntstbit_i :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_ntstbit_i", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_ntstbit_i", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_setbit_i :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_setbit_i", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_setbit_i", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_togglebit_i :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_togglebit_i", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_togglebit_i", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_clrbit_i :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_clrbit_i", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_clrbit_i", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_tstbit_r :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_tstbit_r">;
@@ -3395,25 +3395,25 @@ def int_hexagon_S2_clrbit_r :
 Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_clrbit_r">;
 
 def int_hexagon_S2_asr_i_vh :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vh", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vh", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_lsr_i_vh :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vh", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vh", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asl_i_vh :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vh", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vh", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_r_vh :
 Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_vh">;
 
 def int_hexagon_S5_asrhub_rnd_sat_goodsyntax :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_rnd_sat_goodsyntax", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_rnd_sat_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S5_asrhub_sat :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_sat", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S5_asrhub_sat", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S5_vasrhrnd_goodsyntax :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S5_vasrhrnd_goodsyntax", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S5_vasrhrnd_goodsyntax", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asl_r_vh :
 Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_r_vh">;
@@ -3425,19 +3425,19 @@ def int_hexagon_S2_lsl_r_vh :
 Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsl_r_vh">;
 
 def int_hexagon_S2_asr_i_vw :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vw", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_i_vw", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_i_svw_trun :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S2_asr_i_svw_trun", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S2_asr_i_svw_trun", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_r_svw_trun :
 Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S2_asr_r_svw_trun">;
 
 def int_hexagon_S2_lsr_i_vw :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vw", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_lsr_i_vw", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asl_i_vw :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vw", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asl_i_vw", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_asr_r_vw :
 Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S2_asr_r_vw">;
@@ -3545,13 +3545,13 @@ def int_hexagon_S2_clbnorm :
 Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_clbnorm">;
 
 def int_hexagon_S4_clbaddi :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_clbaddi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S4_clbaddi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S4_clbpnorm :
 Hexagon_i32_i64_Intrinsic<"HEXAGON_S4_clbpnorm">;
 
 def int_hexagon_S4_clbpaddi :
-Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S4_clbpaddi", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i64i32_Intrinsic<"HEXAGON_S4_clbpaddi", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S2_clb :
 Hexagon_i32_i32_Intrinsic<"HEXAGON_S2_clb">;
@@ -3619,40 +3619,40 @@ Hexagon__ptri64_Intrinsic<"HEXAGON_Y5_l2fetch", []>;
 // V60 Scalar Instructions.
 
 def int_hexagon_S6_rol_i_r :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S6_rol_i_r", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S6_rol_i_r", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S6_rol_i_p :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S6_rol_i_p", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_S6_rol_i_p", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_S6_rol_i_r_acc :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_p_acc :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_acc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_acc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_r_nac :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_p_nac :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_nac", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_nac", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_r_xacc :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_xacc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_xacc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_p_xacc :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_xacc", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_xacc", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_r_and :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_r_or :
-Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i32_i32i32i32_Intrinsic<"HEXAGON_S6_rol_i_r_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_p_and :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_and", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_and", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_S6_rol_i_p_or :
-Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_or", [IntrNoMem, ImmArg<2>]>;
+Hexagon_i64_i64i64i32_Intrinsic<"HEXAGON_S6_rol_i_p_or", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 // V62 Scalar Instructions.
 
@@ -3688,7 +3688,7 @@ def int_hexagon_F2_dfsub :
 Hexagon_double_doubledouble_Intrinsic<"HEXAGON_F2_dfsub", [IntrNoMem, Throws]>;
 
 def int_hexagon_S2_mask :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_mask", [IntrNoMem, ImmArg<0>, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_S2_mask", [IntrNoMem, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 
 // V67 Scalar Instructions.
 
@@ -3747,16 +3747,16 @@ def int_hexagon_M7_wcmpyiwc_rnd :
 Hexagon_i32_i64i64_Intrinsic<"HEXAGON_M7_wcmpyiwc_rnd">;
 
 def int_hexagon_A7_croundd_ri :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_A7_croundd_ri", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_A7_croundd_ri", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A7_croundd_rr :
 Hexagon_i64_i64i32_Intrinsic<"HEXAGON_A7_croundd_rr">;
 
 def int_hexagon_A7_clip :
-Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A7_clip", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i32_i32i32_Intrinsic<"HEXAGON_A7_clip", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_A7_vclip :
-Hexagon_i64_i64i32_Intrinsic<"HEXAGON_A7_vclip", [IntrNoMem, ImmArg<1>]>;
+Hexagon_i64_i64i32_Intrinsic<"HEXAGON_A7_vclip", [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_hexagon_F2_dfmax :
 Hexagon_double_doubledouble_Intrinsic<"HEXAGON_F2_dfmax", [IntrNoMem, Throws]>;
@@ -3815,16 +3815,16 @@ def int_hexagon_V6_vlalignb_128B :
 Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlalignb_128B">;
 
 def int_hexagon_V6_valignbi :
-Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valignbi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_valignbi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_valignbi_128B :
-Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valignbi_128B", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_valignbi_128B", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vlalignbi :
-Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlalignbi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlalignbi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vlalignbi_128B :
-Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlalignbi_128B", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlalignbi_128B", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vror :
 Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vror">;
@@ -4121,16 +4121,16 @@ def int_hexagon_V6_vrmpybv_acc_128B :
 Hexagon_v32i32_v32i32v32i32v32i32_Intrinsic<"HEXAGON_V6_vrmpybv_acc_128B">;
 
 def int_hexagon_V6_vrmpyubi :
-Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vrmpyubi_128B :
-Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_128B", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_128B", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vrmpyubi_acc :
-Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vrmpyubi_acc_128B :
-Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc_128B", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpyubi_acc_128B", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vrmpybus :
 Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vrmpybus">;
@@ -4145,16 +4145,16 @@ def int_hexagon_V6_vrmpybus_acc_128B :
 Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vrmpybus_acc_128B">;
 
 def int_hexagon_V6_vrmpybusi :
-Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vrmpybusi_128B :
-Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_128B", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_128B", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vrmpybusi_acc :
-Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vrmpybusi_acc_128B :
-Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc_128B", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrmpybusi_acc_128B", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vrmpybusv :
 Hexagon_v16i32_v16i32v16i32_Intrinsic<"HEXAGON_V6_vrmpybusv">;
@@ -4181,16 +4181,16 @@ def int_hexagon_V6_vdsaduh_acc_128B :
 Hexagon_v64i32_v64i32v64i32i32_Intrinsic<"HEXAGON_V6_vdsaduh_acc_128B">;
 
 def int_hexagon_V6_vrsadubi :
-Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v32i32_v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vrsadubi_128B :
-Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_128B", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v64i32_v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_128B", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vrsadubi_acc :
-Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v32i32_v32i32v32i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vrsadubi_acc_128B :
-Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc_128B", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v64i32_v64i32v64i32i32i32_Intrinsic<"HEXAGON_V6_vrsadubi_acc_128B", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vasrw :
 Hexagon_v16i32_v16i32i32_Intrinsic<"HEXAGON_V6_vasrw">;
@@ -5839,28 +5839,28 @@ def int_hexagon_V6_vaddclbh_128B :
 Hexagon_v32i32_v32i32v32i32_Intrinsic<"HEXAGON_V6_vaddclbh_128B">;
 
 def int_hexagon_V6_vlutvvbi :
-Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vlutvvbi_128B :
-Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi_128B", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v32i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvbi_128B", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vlutvvb_oracci :
-Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v16i32_v16i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vlutvvb_oracci_128B :
-Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci_128B", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v32i32_v32i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_oracci_128B", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vlutvwhi :
-Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v32i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vlutvwhi_128B :
-Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi_128B", [IntrNoMem, ImmArg<2>]>;
+Hexagon_v64i32_v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwhi_128B", [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_hexagon_V6_vlutvwh_oracci :
-Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v32i32_v32i32v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vlutvwh_oracci_128B :
-Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci_128B", [IntrNoMem, ImmArg<3>]>;
+Hexagon_v64i32_v64i32v32i32v32i32i32_Intrinsic<"HEXAGON_V6_vlutvwh_oracci_128B", [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 def int_hexagon_V6_vlutvvb_nm :
 Hexagon_v16i32_v16i32v16i32i32_Intrinsic<"HEXAGON_V6_vlutvvb_nm">;
diff --git a/llvm/include/llvm/IR/IntrinsicsMips.td b/llvm/include/llvm/IR/IntrinsicsMips.td
index 9f9d6d78abea8..271142ca7788f 100644
--- a/llvm/include/llvm/IR/IntrinsicsMips.td
+++ b/llvm/include/llvm/IR/IntrinsicsMips.td
@@ -234,9 +234,9 @@ def int_mips_extpdp: GCCBuiltin<"__builtin_mips_extpdp">,
 // Misc
 
 def int_mips_wrdsp: GCCBuiltin<"__builtin_mips_wrdsp">,
-  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<1>]>;
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<1>>]>;
 def int_mips_rddsp: GCCBuiltin<"__builtin_mips_rddsp">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrReadMem, ImmArg<0>]>;
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrReadMem, ImmArg<ArgIndex<0>>]>;
 
 def int_mips_insv: GCCBuiltin<"__builtin_mips_insv">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
@@ -302,10 +302,10 @@ def int_mips_adduh_r_qb: GCCBuiltin<"__builtin_mips_adduh_r_qb">,
 
 def int_mips_append: GCCBuiltin<"__builtin_mips_append">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-  [IntrNoMem, ImmArg<2>]>;
+  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_balign: GCCBuiltin<"__builtin_mips_balign">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-  [IntrNoMem, ImmArg<2>]>;
+  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_cmpgdu_eq_qb: GCCBuiltin<"__builtin_mips_cmpgdu_eq_qb">,
   Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
@@ -355,14 +355,14 @@ def int_mips_precr_qb_ph: GCCBuiltin<"__builtin_mips_precr_qb_ph">,
   Intrinsic<[llvm_v4i8_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>;
 def int_mips_precr_sra_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_ph_w">,
   Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_precr_sra_r_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_r_ph_w">,
   Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_prepend: GCCBuiltin<"__builtin_mips_prepend">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-  [IntrNoMem, ImmArg<2>]>;
+  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_shra_qb: GCCBuiltin<"__builtin_mips_shra_qb">,
   Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -463,22 +463,22 @@ def int_mips_addv_d : GCCBuiltin<"__builtin_msa_addv_d">,
 
 def int_mips_addvi_b : GCCBuiltin<"__builtin_msa_addvi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty],
-  [Commutative, IntrNoMem, ImmArg<1>]>;
+  [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_addvi_h : GCCBuiltin<"__builtin_msa_addvi_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty],
-  [Commutative, IntrNoMem, ImmArg<1>]>;
+  [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_addvi_w : GCCBuiltin<"__builtin_msa_addvi_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty],
-  [Commutative, IntrNoMem, ImmArg<1>]>;
+  [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_addvi_d : GCCBuiltin<"__builtin_msa_addvi_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty],
-  [Commutative, IntrNoMem, ImmArg<1>]>;
+  [Commutative, IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_and_v : GCCBuiltin<"__builtin_msa_and_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
 
 def int_mips_andi_b : GCCBuiltin<"__builtin_msa_andi_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_asub_s_b : GCCBuiltin<"__builtin_msa_asub_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -560,13 +560,13 @@ def int_mips_bclr_d : GCCBuiltin<"__builtin_msa_bclr_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_bclri_b : GCCBuiltin<"__builtin_msa_bclri_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bclri_h : GCCBuiltin<"__builtin_msa_bclri_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bclri_w : GCCBuiltin<"__builtin_msa_bclri_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bclri_d : GCCBuiltin<"__builtin_msa_bclri_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_binsl_b : GCCBuiltin<"__builtin_msa_binsl_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
@@ -583,16 +583,16 @@ def int_mips_binsl_d : GCCBuiltin<"__builtin_msa_binsl_d">,
 
 def int_mips_binsli_b : GCCBuiltin<"__builtin_msa_binsli_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_binsli_h : GCCBuiltin<"__builtin_msa_binsli_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_binsli_w : GCCBuiltin<"__builtin_msa_binsli_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_binsli_d : GCCBuiltin<"__builtin_msa_binsli_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_binsr_b : GCCBuiltin<"__builtin_msa_binsr_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
@@ -609,16 +609,16 @@ def int_mips_binsr_d : GCCBuiltin<"__builtin_msa_binsr_d">,
 
 def int_mips_binsri_b : GCCBuiltin<"__builtin_msa_binsri_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_binsri_h : GCCBuiltin<"__builtin_msa_binsri_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_binsri_w : GCCBuiltin<"__builtin_msa_binsri_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_binsri_d : GCCBuiltin<"__builtin_msa_binsri_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_bmnz_v : GCCBuiltin<"__builtin_msa_bmnz_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
@@ -626,7 +626,7 @@ def int_mips_bmnz_v : GCCBuiltin<"__builtin_msa_bmnz_v">,
 
 def int_mips_bmnzi_b : GCCBuiltin<"__builtin_msa_bmnzi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_bmz_v : GCCBuiltin<"__builtin_msa_bmz_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
@@ -634,7 +634,7 @@ def int_mips_bmz_v : GCCBuiltin<"__builtin_msa_bmz_v">,
 
 def int_mips_bmzi_b : GCCBuiltin<"__builtin_msa_bmzi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_bneg_b : GCCBuiltin<"__builtin_msa_bneg_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -646,13 +646,13 @@ def int_mips_bneg_d : GCCBuiltin<"__builtin_msa_bneg_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_bnegi_b : GCCBuiltin<"__builtin_msa_bnegi_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bnegi_h : GCCBuiltin<"__builtin_msa_bnegi_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bnegi_w : GCCBuiltin<"__builtin_msa_bnegi_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bnegi_d : GCCBuiltin<"__builtin_msa_bnegi_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_bnz_b : GCCBuiltin<"__builtin_msa_bnz_b">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
@@ -672,7 +672,7 @@ def int_mips_bsel_v : GCCBuiltin<"__builtin_msa_bsel_v">,
 
 def int_mips_bseli_b : GCCBuiltin<"__builtin_msa_bseli_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_bset_b : GCCBuiltin<"__builtin_msa_bset_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -684,13 +684,13 @@ def int_mips_bset_d : GCCBuiltin<"__builtin_msa_bset_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_bseti_b : GCCBuiltin<"__builtin_msa_bseti_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bseti_h : GCCBuiltin<"__builtin_msa_bseti_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bseti_w : GCCBuiltin<"__builtin_msa_bseti_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_bseti_d : GCCBuiltin<"__builtin_msa_bseti_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_bz_b : GCCBuiltin<"__builtin_msa_bz_b">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
@@ -714,16 +714,16 @@ def int_mips_ceq_d : GCCBuiltin<"__builtin_msa_ceq_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_ceqi_b : GCCBuiltin<"__builtin_msa_ceqi_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_ceqi_h : GCCBuiltin<"__builtin_msa_ceqi_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_ceqi_w : GCCBuiltin<"__builtin_msa_ceqi_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_ceqi_d : GCCBuiltin<"__builtin_msa_ceqi_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_cfcmsa : GCCBuiltin<"__builtin_msa_cfcmsa">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<0>]>;
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
 def int_mips_cle_s_b : GCCBuiltin<"__builtin_msa_cle_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -744,22 +744,22 @@ def int_mips_cle_u_d : GCCBuiltin<"__builtin_msa_cle_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_clei_s_b : GCCBuiltin<"__builtin_msa_clei_s_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clei_s_h : GCCBuiltin<"__builtin_msa_clei_s_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clei_s_w : GCCBuiltin<"__builtin_msa_clei_s_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clei_s_d : GCCBuiltin<"__builtin_msa_clei_s_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_clei_u_b : GCCBuiltin<"__builtin_msa_clei_u_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clei_u_h : GCCBuiltin<"__builtin_msa_clei_u_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clei_u_w : GCCBuiltin<"__builtin_msa_clei_u_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clei_u_d : GCCBuiltin<"__builtin_msa_clei_u_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_clt_s_b : GCCBuiltin<"__builtin_msa_clt_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -780,22 +780,22 @@ def int_mips_clt_u_d : GCCBuiltin<"__builtin_msa_clt_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_clti_s_b : GCCBuiltin<"__builtin_msa_clti_s_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clti_s_h : GCCBuiltin<"__builtin_msa_clti_s_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clti_s_w : GCCBuiltin<"__builtin_msa_clti_s_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clti_s_d : GCCBuiltin<"__builtin_msa_clti_s_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_clti_u_b : GCCBuiltin<"__builtin_msa_clti_u_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clti_u_h : GCCBuiltin<"__builtin_msa_clti_u_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clti_u_w : GCCBuiltin<"__builtin_msa_clti_u_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_clti_u_d : GCCBuiltin<"__builtin_msa_clti_u_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_copy_s_b : GCCBuiltin<"__builtin_msa_copy_s_b">,
   Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -816,7 +816,7 @@ def int_mips_copy_u_d : GCCBuiltin<"__builtin_msa_copy_u_d">,
   Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
 
 def int_mips_ctcmsa : GCCBuiltin<"__builtin_msa_ctcmsa">,
-  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>]>;
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
 def int_mips_div_s_b : GCCBuiltin<"__builtin_msa_div_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1244,19 +1244,19 @@ def int_mips_insert_d : GCCBuiltin<"__builtin_msa_insert_d">,
 def int_mips_insve_b : GCCBuiltin<"__builtin_msa_insve_b">,
   Intrinsic<[llvm_v16i8_ty],
             [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty],
-            [IntrNoMem, ImmArg<1>]>;
+            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_insve_h : GCCBuiltin<"__builtin_msa_insve_h">,
   Intrinsic<[llvm_v8i16_ty],
             [llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty],
-            [IntrNoMem, ImmArg<1>]>;
+            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_insve_w : GCCBuiltin<"__builtin_msa_insve_w">,
   Intrinsic<[llvm_v4i32_ty],
             [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty],
-            [IntrNoMem, ImmArg<1>]>;
+            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_insve_d : GCCBuiltin<"__builtin_msa_insve_d">,
   Intrinsic<[llvm_v2i64_ty],
             [llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty],
-            [IntrNoMem, ImmArg<1>]>;
+            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_ld_b : GCCBuiltin<"__builtin_msa_ld_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty],
@@ -1279,13 +1279,13 @@ def int_mips_ldr_w : GCCBuiltin<"__builtin_msa_ldr_w">,
   [IntrReadMem, IntrArgMemOnly]>;
 
 def int_mips_ldi_b : GCCBuiltin<"__builtin_msa_ldi_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 def int_mips_ldi_h : GCCBuiltin<"__builtin_msa_ldi_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 def int_mips_ldi_w : GCCBuiltin<"__builtin_msa_ldi_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 def int_mips_ldi_d : GCCBuiltin<"__builtin_msa_ldi_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<0>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
 // This instruction is part of the MSA spec but it does not share the
 // __builtin_msa prefix because it operates on the GPR registers.
@@ -1348,22 +1348,22 @@ def int_mips_max_u_d : GCCBuiltin<"__builtin_msa_max_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_maxi_s_b : GCCBuiltin<"__builtin_msa_maxi_s_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_maxi_s_h : GCCBuiltin<"__builtin_msa_maxi_s_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_maxi_s_w : GCCBuiltin<"__builtin_msa_maxi_s_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_maxi_s_d : GCCBuiltin<"__builtin_msa_maxi_s_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_maxi_u_b : GCCBuiltin<"__builtin_msa_maxi_u_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_maxi_u_h : GCCBuiltin<"__builtin_msa_maxi_u_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_maxi_u_w : GCCBuiltin<"__builtin_msa_maxi_u_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_maxi_u_d : GCCBuiltin<"__builtin_msa_maxi_u_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_min_a_b : GCCBuiltin<"__builtin_msa_min_a_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1393,22 +1393,22 @@ def int_mips_min_u_d : GCCBuiltin<"__builtin_msa_min_u_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_mini_s_b : GCCBuiltin<"__builtin_msa_mini_s_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_mini_s_h : GCCBuiltin<"__builtin_msa_mini_s_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_mini_s_w : GCCBuiltin<"__builtin_msa_mini_s_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_mini_s_d : GCCBuiltin<"__builtin_msa_mini_s_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_mini_u_b : GCCBuiltin<"__builtin_msa_mini_u_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_mini_u_h : GCCBuiltin<"__builtin_msa_mini_u_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_mini_u_w : GCCBuiltin<"__builtin_msa_mini_u_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_mini_u_d : GCCBuiltin<"__builtin_msa_mini_u_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_mod_s_b : GCCBuiltin<"__builtin_msa_mod_s_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1499,13 +1499,13 @@ def int_mips_nor_v : GCCBuiltin<"__builtin_msa_nor_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
 
 def int_mips_nori_b : GCCBuiltin<"__builtin_msa_nori_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_or_v : GCCBuiltin<"__builtin_msa_or_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
 
 def int_mips_ori_b : GCCBuiltin<"__builtin_msa_ori_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_pckev_b : GCCBuiltin<"__builtin_msa_pckev_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1535,29 +1535,29 @@ def int_mips_pcnt_d : GCCBuiltin<"__builtin_msa_pcnt_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_sat_s_b : GCCBuiltin<"__builtin_msa_sat_s_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_sat_s_h : GCCBuiltin<"__builtin_msa_sat_s_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_sat_s_w : GCCBuiltin<"__builtin_msa_sat_s_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_sat_s_d : GCCBuiltin<"__builtin_msa_sat_s_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_sat_u_b : GCCBuiltin<"__builtin_msa_sat_u_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_sat_u_h : GCCBuiltin<"__builtin_msa_sat_u_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_sat_u_w : GCCBuiltin<"__builtin_msa_sat_u_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_sat_u_d : GCCBuiltin<"__builtin_msa_sat_u_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_shf_b : GCCBuiltin<"__builtin_msa_shf_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_shf_h : GCCBuiltin<"__builtin_msa_shf_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_shf_w : GCCBuiltin<"__builtin_msa_shf_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_sld_b : GCCBuiltin<"__builtin_msa_sld_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -1570,16 +1570,16 @@ def int_mips_sld_d : GCCBuiltin<"__builtin_msa_sld_d">,
 
 def int_mips_sldi_b : GCCBuiltin<"__builtin_msa_sldi_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_sldi_h : GCCBuiltin<"__builtin_msa_sldi_h">,
   Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_sldi_w : GCCBuiltin<"__builtin_msa_sldi_w">,
   Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_mips_sldi_d : GCCBuiltin<"__builtin_msa_sldi_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
-            [IntrNoMem, ImmArg<2>]>;
+            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 def int_mips_sll_b : GCCBuiltin<"__builtin_msa_sll_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1591,13 +1591,13 @@ def int_mips_sll_d : GCCBuiltin<"__builtin_msa_sll_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_slli_b : GCCBuiltin<"__builtin_msa_slli_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_slli_h : GCCBuiltin<"__builtin_msa_slli_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_slli_w : GCCBuiltin<"__builtin_msa_slli_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_slli_d : GCCBuiltin<"__builtin_msa_slli_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_splat_b : GCCBuiltin<"__builtin_msa_splat_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -1609,13 +1609,13 @@ def int_mips_splat_d : GCCBuiltin<"__builtin_msa_splat_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
 
 def int_mips_splati_b : GCCBuiltin<"__builtin_msa_splati_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_splati_h : GCCBuiltin<"__builtin_msa_splati_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_splati_w : GCCBuiltin<"__builtin_msa_splati_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_splati_d : GCCBuiltin<"__builtin_msa_splati_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_sra_b : GCCBuiltin<"__builtin_msa_sra_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1627,13 +1627,13 @@ def int_mips_sra_d : GCCBuiltin<"__builtin_msa_sra_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_srai_b : GCCBuiltin<"__builtin_msa_srai_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srai_h : GCCBuiltin<"__builtin_msa_srai_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srai_w : GCCBuiltin<"__builtin_msa_srai_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srai_d : GCCBuiltin<"__builtin_msa_srai_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_srar_b : GCCBuiltin<"__builtin_msa_srar_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1645,13 +1645,13 @@ def int_mips_srar_d : GCCBuiltin<"__builtin_msa_srar_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_srari_b : GCCBuiltin<"__builtin_msa_srari_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srari_h : GCCBuiltin<"__builtin_msa_srari_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srari_w : GCCBuiltin<"__builtin_msa_srari_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srari_d : GCCBuiltin<"__builtin_msa_srari_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_srl_b : GCCBuiltin<"__builtin_msa_srl_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1663,13 +1663,13 @@ def int_mips_srl_d : GCCBuiltin<"__builtin_msa_srl_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_srli_b : GCCBuiltin<"__builtin_msa_srli_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srli_h : GCCBuiltin<"__builtin_msa_srli_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srli_w : GCCBuiltin<"__builtin_msa_srli_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srli_d : GCCBuiltin<"__builtin_msa_srli_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_srlr_b : GCCBuiltin<"__builtin_msa_srlr_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
@@ -1681,13 +1681,13 @@ def int_mips_srlr_d : GCCBuiltin<"__builtin_msa_srlr_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_srlri_b : GCCBuiltin<"__builtin_msa_srlri_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srlri_h : GCCBuiltin<"__builtin_msa_srlri_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srlri_w : GCCBuiltin<"__builtin_msa_srlri_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_srlri_d : GCCBuiltin<"__builtin_msa_srlri_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_st_b : GCCBuiltin<"__builtin_msa_st_b">,
   Intrinsic<[], [llvm_v16i8_ty, llvm_ptr_ty, llvm_i32_ty],
@@ -1755,13 +1755,13 @@ def int_mips_subv_d : GCCBuiltin<"__builtin_msa_subv_d">,
   Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 
 def int_mips_subvi_b : GCCBuiltin<"__builtin_msa_subvi_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_subvi_h : GCCBuiltin<"__builtin_msa_subvi_h">,
-  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_subvi_w : GCCBuiltin<"__builtin_msa_subvi_w">,
-  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_mips_subvi_d : GCCBuiltin<"__builtin_msa_subvi_d">,
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 def int_mips_vshf_b : GCCBuiltin<"__builtin_msa_vshf_b">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
@@ -1780,5 +1780,5 @@ def int_mips_xor_v : GCCBuiltin<"__builtin_msa_xor_v">,
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
 
 def int_mips_xori_b : GCCBuiltin<"__builtin_msa_xori_b">,
-  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index ec328d69a8dd9..61293418ec41d 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -978,20 +978,20 @@ let TargetPrefix = "nvvm" in {
 // Atomics not available as llvm intrinsics.
   def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
           [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
-                                      [IntrArgMemOnly, NoCapture<0>]>;
+                                      [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
   def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty],
           [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
-                                      [IntrArgMemOnly, NoCapture<0>]>;
+                                      [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
   class SCOPED_ATOMIC2_impl<LLVMType elty>
         : Intrinsic<[elty],
           [LLVMAnyPointerType<LLVMMatchType<0>>, LLVMMatchType<0>],
-          [IntrArgMemOnly, NoCapture<0>]>;
+          [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
   class SCOPED_ATOMIC3_impl<LLVMType elty>
         : Intrinsic<[elty],
           [LLVMAnyPointerType<LLVMMatchType<0>>, LLVMMatchType<0>,
            LLVMMatchType<0>],
-          [IntrArgMemOnly, NoCapture<0>]>;
+          [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
 
   multiclass PTXAtomicWithScope2<LLVMType elty> {
     def _cta : SCOPED_ATOMIC2_impl<elty>;
@@ -1063,30 +1063,30 @@ let TargetPrefix = "nvvm" in {
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
+  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.i">;
 def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
+  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.f">;
 def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
+  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;
 
 // Generated within nvvm. Use for ldg on sm_35 or later.  Second arg is the
 // pointer's alignment.
 def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
+  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.i">;
 def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
+  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.f">;
 def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
+  [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldg.global.p">;
 
 // Use for generic pointers
@@ -1143,7 +1143,7 @@ def int_nvvm_move_float : Intrinsic<[llvm_float_ty], [llvm_float_ty],
 def int_nvvm_move_double : Intrinsic<[llvm_double_ty], [llvm_double_ty],
   [IntrNoMem], "llvm.nvvm.move.double">;
 def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
-  [IntrNoMem, NoCapture<0>], "llvm.nvvm.move.ptr">;
+  [IntrNoMem, NoCapture<ArgIndex<0>>], "llvm.nvvm.move.ptr">;
 
 
 // For getting the handle from a texture or surface variable
@@ -4110,7 +4110,7 @@ def int_nvvm_match_all_sync_i64p :
 class NVVM_WMMA_LD<WMMA_REGS Frag, string Layout, int WithStride>
   : Intrinsic<Frag.regs,
               !if(WithStride, [llvm_anyptr_ty, llvm_i32_ty], [llvm_anyptr_ty]),
-              [IntrReadMem, IntrArgMemOnly, ReadOnly<0>, NoCapture<0>],
+              [IntrReadMem, IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>],
               WMMA_NAME_LDST<"load", Frag, Layout, WithStride>.intr>;
 
 // WMMA.STORE.D
@@ -4120,7 +4120,7 @@ class NVVM_WMMA_ST<WMMA_REGS Frag, string Layout, int WithStride>
                 [llvm_anyptr_ty],
                 Frag.regs,
                 !if(WithStride, [llvm_i32_ty], [])),
-              [IntrWriteMem, IntrArgMemOnly, WriteOnly<0>, NoCapture<0>],
+              [IntrWriteMem, IntrArgMemOnly, WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>],
               WMMA_NAME_LDST<"store", Frag, Layout, WithStride>.intr>;
 
 // Create all load/store variants
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 2b3fcd7f23416..c23f04f710595 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -25,9 +25,9 @@ let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
   def int_ppc_dcbi  : Intrinsic<[], [llvm_ptr_ty], []>;
   def int_ppc_dcbst : Intrinsic<[], [llvm_ptr_ty], []>;
   def int_ppc_dcbt  : Intrinsic<[], [llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<0>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
   def int_ppc_dcbtst: Intrinsic<[], [llvm_ptr_ty],
-    [IntrArgMemOnly, NoCapture<0>]>;
+    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
   def int_ppc_dcbz  : Intrinsic<[], [llvm_ptr_ty], []>;
   def int_ppc_dcbzl : Intrinsic<[], [llvm_ptr_ty], []>;
 
@@ -620,16 +620,16 @@ let TargetPrefix = "ppc" in {  // All PPC intrinsics start with "llvm.ppc.".
   // FP <-> integer conversion.
   def int_ppc_altivec_vcfsx : GCCBuiltin<"__builtin_altivec_vcfsx">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_ppc_altivec_vcfux : GCCBuiltin<"__builtin_altivec_vcfux">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_ppc_altivec_vctsxs : GCCBuiltin<"__builtin_altivec_vctsxs">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_ppc_altivec_vctuxs : GCCBuiltin<"__builtin_altivec_vctuxs">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_ppc_altivec_vrfim : GCCBuiltin<"__builtin_altivec_vrfim">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
@@ -726,11 +726,11 @@ let TargetPrefix = "ppc" in {  // All PPC intrinsics start with "llvm.ppc.".
 def int_ppc_altivec_crypto_vshasigmad :
             GCCBuiltin<"__builtin_altivec_crypto_vshasigmad">,
             Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
-                       llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+                       llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 def int_ppc_altivec_crypto_vshasigmaw :
             GCCBuiltin<"__builtin_altivec_crypto_vshasigmaw">,
             Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
-                       llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+                       llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 }
 def int_ppc_altivec_crypto_vcipher :
             PowerPC_Vec_DDD_Intrinsic<"crypto_vcipher">;
@@ -925,10 +925,10 @@ def int_ppc_vsx_xvxsigsp :
                             [llvm_v4f32_ty], [IntrNoMem]>;
 def int_ppc_vsx_xvtstdcdp :
       PowerPC_VSX_Intrinsic<"xvtstdcdp", [llvm_v2i64_ty],
-                            [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+                            [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_ppc_vsx_xvtstdcsp :
       PowerPC_VSX_Intrinsic<"xvtstdcsp", [llvm_v4i32_ty],
-                            [llvm_v4f32_ty,llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+                            [llvm_v4f32_ty,llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 def int_ppc_vsx_xvcvhpsp :
       PowerPC_VSX_Intrinsic<"xvcvhpsp", [llvm_v4f32_ty],
                             [llvm_v8i16_ty],[IntrNoMem]>;
@@ -1123,9 +1123,9 @@ let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
 let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
 
 def int_ppc_tbegin : GCCBuiltin<"__builtin_tbegin">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<0>]>;
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 def int_ppc_tend : GCCBuiltin<"__builtin_tend">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<0>]>;
+      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
 def int_ppc_tabort : GCCBuiltin<"__builtin_tabort">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 2039ad1a26b88..7590b568c367b 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -28,11 +28,11 @@ let TargetPrefix = "riscv" in {
   // T @llvm.<name>.T.<p>(any*, T, T, T imm);
   class MaskedAtomicRMWFourArg<LLVMType itype>
       : Intrinsic<[itype], [llvm_anyptr_ty, itype, itype, itype],
-                  [IntrArgMemOnly, NoCapture<0>, ImmArg<3>]>;
+                  [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>]>;
   // T @llvm.<name>.T.<p>(any*, T, T, T, T imm);
   class MaskedAtomicRMWFiveArg<LLVMType itype>
       : Intrinsic<[itype], [llvm_anyptr_ty, itype, itype, itype, itype],
-                  [IntrArgMemOnly, NoCapture<0>, ImmArg<4>]>;
+                  [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<4>>]>;
 
   // We define 32-bit and 64-bit variants of the above, where T stands for i32
   // or i64 respectively:
diff --git a/llvm/include/llvm/IR/IntrinsicsSystemZ.td b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
index dd156a3dc3b60..b0c5cf0148fe5 100644
--- a/llvm/include/llvm/IR/IntrinsicsSystemZ.td
+++ b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
@@ -39,7 +39,7 @@ class SystemZBinaryConvCC<LLVMType result, LLVMType arg>
 
 class SystemZBinaryConvIntCC<LLVMType result, LLVMType arg>
   : Intrinsic<[result, llvm_i32_ty], [arg, llvm_i32_ty],
-              [IntrNoMem, ImmArg<1>]>;
+              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 class SystemZBinaryCC<LLVMType type>
   : SystemZBinaryConvCC<type, type>;
@@ -56,20 +56,20 @@ class SystemZTernary<string name, LLVMType type>
 
 class SystemZTernaryInt<string name, LLVMType type>
   : GCCBuiltin<"__builtin_s390_" # name>,
-    Intrinsic<[type], [type, type, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+    Intrinsic<[type], [type, type, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 class SystemZTernaryIntCC<LLVMType type>
   : Intrinsic<[type, llvm_i32_ty], [type, type, llvm_i32_ty],
-              [IntrNoMem, ImmArg<2>]>;
+              [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 class SystemZQuaternaryInt<string name, LLVMType type>
   : GCCBuiltin<"__builtin_s390_" # name>,
     Intrinsic<[type], [type, type, type, llvm_i32_ty],
-    [IntrNoMem, ImmArg<3>]>;
+    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 class SystemZQuaternaryIntCC<LLVMType type>
   : Intrinsic<[type, llvm_i32_ty], [type, type, type, llvm_i32_ty],
-              [IntrNoMem, ImmArg<3>]>;
+              [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 multiclass SystemZUnaryExtBHF<string name> {
   def b : SystemZUnaryConv<name#"b", llvm_v8i16_ty, llvm_v16i8_ty>;
@@ -238,11 +238,11 @@ let TargetPrefix = "s390" in {
 let TargetPrefix = "s390" in {
   def int_s390_lcbb : GCCBuiltin<"__builtin_s390_lcbb">,
                       Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                [IntrNoMem, ImmArg<1>]>;
+                                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_s390_vlbb : GCCBuiltin<"__builtin_s390_vlbb">,
                       Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>;
+                                [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;
 
   def int_s390_vll : GCCBuiltin<"__builtin_s390_vll">,
                      Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty, llvm_ptr_ty],
@@ -251,7 +251,7 @@ let TargetPrefix = "s390" in {
   def int_s390_vpdi : GCCBuiltin<"__builtin_s390_vpdi">,
                       Intrinsic<[llvm_v2i64_ty],
                                 [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
-                                [IntrNoMem, ImmArg<2>]>;
+                                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_s390_vperm : GCCBuiltin<"__builtin_s390_vperm">,
                        Intrinsic<[llvm_v16i8_ty],
@@ -317,7 +317,7 @@ let TargetPrefix = "s390" in {
   def int_s390_vsldb : GCCBuiltin<"__builtin_s390_vsldb">,
                        Intrinsic<[llvm_v16i8_ty],
                                  [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-                                 [IntrNoMem, ImmArg<2>]>;
+                                 [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   defm int_s390_vscbi : SystemZBinaryBHFG<"vscbi">;
 
@@ -376,7 +376,7 @@ let TargetPrefix = "s390" in {
 
   def int_s390_vfidb : Intrinsic<[llvm_v2f64_ty],
                                  [llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty],
-                                 [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+                                 [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
   // Instructions from the Vector Enhancements Facility 1
   def int_s390_vbperm : SystemZBinaryConv<"vbperm", llvm_v2i64_ty,
@@ -385,20 +385,20 @@ let TargetPrefix = "s390" in {
   def int_s390_vmslg  : GCCBuiltin<"__builtin_s390_vmslg">,
                         Intrinsic<[llvm_v16i8_ty],
                                   [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v16i8_ty,
-                                   llvm_i32_ty], [IntrNoMem, ImmArg<3>]>;
+                                   llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_s390_vfmaxdb : Intrinsic<[llvm_v2f64_ty],
                                    [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
-                                   [IntrNoMem, ImmArg<2>]>;
+                                   [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_s390_vfmindb : Intrinsic<[llvm_v2f64_ty],
                                    [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
-                                   [IntrNoMem, ImmArg<2>]>;
+                                   [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_s390_vfmaxsb : Intrinsic<[llvm_v4f32_ty],
                                    [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
-                                   [IntrNoMem, ImmArg<2>]>;
+                                   [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_s390_vfminsb : Intrinsic<[llvm_v4f32_ty],
                                    [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
-                                   [IntrNoMem, ImmArg<2>]>;
+                                   [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_s390_vfcesbs  : SystemZBinaryConvCC<llvm_v4i32_ty, llvm_v4f32_ty>;
   def int_s390_vfchsbs  : SystemZBinaryConvCC<llvm_v4i32_ty, llvm_v4f32_ty>;
@@ -408,7 +408,7 @@ let TargetPrefix = "s390" in {
 
   def int_s390_vfisb : Intrinsic<[llvm_v4f32_ty],
                                  [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty],
-                                 [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+                                 [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
   // Instructions from the Vector Packed Decimal Facility
   def int_s390_vlrl : GCCBuiltin<"__builtin_s390_vlrl">,
@@ -423,12 +423,12 @@ let TargetPrefix = "s390" in {
   def int_s390_vsld : GCCBuiltin<"__builtin_s390_vsld">,
                       Intrinsic<[llvm_v16i8_ty],
                                 [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-                                [IntrNoMem, ImmArg<2>]>;
+                                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_s390_vsrd : GCCBuiltin<"__builtin_s390_vsrd">,
                       Intrinsic<[llvm_v16i8_ty],
                                 [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-                                [IntrNoMem, ImmArg<2>]>;
+                                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_s390_vstrsb : SystemZTernaryConvCC<llvm_v16i8_ty, llvm_v16i8_ty>;
   def int_s390_vstrsh : SystemZTernaryConvCC<llvm_v16i8_ty, llvm_v8i16_ty>;
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index babe42f863191..97bd76e49f6d8 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -51,7 +51,7 @@ def int_wasm_trunc_saturate_unsigned : Intrinsic<[llvm_anyint_ty],
 
 // throw / rethrow
 def int_wasm_throw : Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty],
-                               [Throws, IntrNoReturn, ImmArg<0>]>;
+                               [Throws, IntrNoReturn, ImmArg<ArgIndex<0>>]>;
 def int_wasm_rethrow_in_catch : Intrinsic<[], [], [Throws, IntrNoReturn]>;
 
 // Since wasm does not use landingpad instructions, these instructions return
@@ -69,7 +69,7 @@ def int_wasm_extract_exception : Intrinsic<[llvm_ptr_ty], [],
 // by WasmEHPrepare pass to generate landingpad table in EHStreamer. This is
 // used in order to give them the indices in WasmEHPrepare.
 def int_wasm_landingpad_index: Intrinsic<[], [llvm_token_ty, llvm_i32_ty],
-                                         [IntrNoMem, ImmArg<1>]>;
+                                         [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
 // Returns LSDA address of the current function.
 def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
@@ -82,18 +82,18 @@ def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_wasm_atomic_wait_i32 :
   Intrinsic<[llvm_i32_ty],
             [LLVMPointerType<llvm_i32_ty>, llvm_i32_ty, llvm_i64_ty],
-            [IntrInaccessibleMemOrArgMemOnly, ReadOnly<0>, NoCapture<0>,
+            [IntrInaccessibleMemOrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
              IntrHasSideEffects],
              "", [SDNPMemOperand]>;
 def int_wasm_atomic_wait_i64 :
   Intrinsic<[llvm_i32_ty],
             [LLVMPointerType<llvm_i64_ty>, llvm_i64_ty, llvm_i64_ty],
-            [IntrInaccessibleMemOrArgMemOnly, ReadOnly<0>, NoCapture<0>,
+            [IntrInaccessibleMemOrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
              IntrHasSideEffects],
              "", [SDNPMemOperand]>;
 def int_wasm_atomic_notify:
   Intrinsic<[llvm_i32_ty], [LLVMPointerType<llvm_i32_ty>, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, NoCapture<0>, IntrHasSideEffects], "",
+            [IntrInaccessibleMemOnly, NoCapture<ArgIndex<0>>, IntrHasSideEffects], "",
             [SDNPMemOperand]>;
 
 //===----------------------------------------------------------------------===//
@@ -194,12 +194,12 @@ def int_wasm_pmax :
 def int_wasm_memory_init :
   Intrinsic<[],
             [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrWriteMem, IntrInaccessibleMemOrArgMemOnly, WriteOnly<2>,
-             IntrHasSideEffects, ImmArg<0>, ImmArg<1>]>;
+            [IntrWriteMem, IntrInaccessibleMemOrArgMemOnly, WriteOnly<ArgIndex<2>>,
+             IntrHasSideEffects, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 def int_wasm_data_drop :
   Intrinsic<[],
             [llvm_i32_ty],
-            [IntrNoDuplicate, IntrHasSideEffects, ImmArg<0>]>;
+            [IntrNoDuplicate, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
 
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 1bd2b88ae8c5b..b3bf187205958 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -13,7 +13,7 @@
 //===----------------------------------------------------------------------===//
 // Interrupt traps
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_int : Intrinsic<[], [llvm_i8_ty], [ImmArg<0>]>;
+  def int_x86_int : Intrinsic<[], [llvm_i8_ty], [ImmArg<ArgIndex<0>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -203,12 +203,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse_cmp_ss : GCCBuiltin<"__builtin_ia32_cmpss">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
-                         llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   // NOTE: This comparison intrinsic is not used by clang as long as the
   //       distinction in signaling behaviour is not implemented.
   def int_x86_sse_cmp_ps :
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
-                         llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse_comieq_ss : GCCBuiltin<"__builtin_ia32_comieq">,
               Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
                          llvm_v4f32_ty], [IntrNoMem]>;
@@ -319,12 +319,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_cmp_sd : GCCBuiltin<"__builtin_ia32_cmpsd">,
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
-                         llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   // NOTE: This comparison intrinsic is not used by clang as long as the
   //       distinction in signaling behaviour is not implemented.
   def int_x86_sse2_cmp_pd :
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
-                         llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse2_comieq_sd : GCCBuiltin<"__builtin_ia32_comisdeq">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
                          llvm_v2f64_ty], [IntrNoMem]>;
@@ -618,7 +618,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                          llvm_v16i8_ty], [IntrNoMem]>;
   def int_x86_sse_pshuf_w           : GCCBuiltin<"__builtin_ia32_pshufw">,
               Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty],
-                         [IntrNoMem, ImmArg<1>]>;
+                         [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 // Sign ops
@@ -664,16 +664,16 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_round_ss        : GCCBuiltin<"__builtin_ia32_roundss">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                         llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse41_round_ps        : GCCBuiltin<"__builtin_ia32_roundps">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
-                         llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+                         llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_sse41_round_sd        : GCCBuiltin<"__builtin_ia32_roundsd">,
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                         llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse41_round_pd        : GCCBuiltin<"__builtin_ia32_roundpd">,
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
-                         llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+                         llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 // Vector min element
@@ -736,20 +736,20 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_aesni_aeskeygenassist :
               GCCBuiltin<"__builtin_ia32_aeskeygenassist128">,
               Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 // PCLMUL instructions
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_pclmulqdq : GCCBuiltin<"__builtin_ia32_pclmulqdq128">,
           Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
-                    [IntrNoMem, ImmArg<2>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_pclmulqdq_256 : GCCBuiltin<"__builtin_ia32_pclmulqdq256">,
           Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
-                    [IntrNoMem, ImmArg<2>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_pclmulqdq_512 : GCCBuiltin<"__builtin_ia32_pclmulqdq512">,
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
-                    [IntrNoMem, ImmArg<2>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 // Vector pack
@@ -763,7 +763,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_insertps       : GCCBuiltin<"__builtin_ia32_insertps128">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                    [IntrNoMem, ImmArg<2>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 // Vector blend
@@ -783,17 +783,17 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_dppd            : GCCBuiltin<"__builtin_ia32_dppd">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                    [IntrNoMem, Commutative, ImmArg<2>]>;
+                    [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse41_dpps            : GCCBuiltin<"__builtin_ia32_dpps">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                    [IntrNoMem, Commutative, ImmArg<2>]>;
+                    [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
 }
 
 // Vector sum of absolute differences
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_mpsadbw         : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
           Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty],
-                    [IntrNoMem, Commutative, ImmArg<2>]>;
+                    [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
 }
 
 // Test instruction with bitwise comparison.
@@ -834,66 +834,66 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse42_pcmpistrm128  : GCCBuiltin<"__builtin_ia32_pcmpistrm128">,
     Intrinsic<[llvm_v16i8_ty],
         [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-        [IntrNoMem, ImmArg<2>]>;
+        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse42_pcmpistri128  : GCCBuiltin<"__builtin_ia32_pcmpistri128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-        [IntrNoMem, ImmArg<2>]>;
+        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse42_pcmpistria128 : GCCBuiltin<"__builtin_ia32_pcmpistria128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-        [IntrNoMem, ImmArg<2>]>;
+        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse42_pcmpistric128 : GCCBuiltin<"__builtin_ia32_pcmpistric128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-        [IntrNoMem, ImmArg<2>]>;
+        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse42_pcmpistrio128 : GCCBuiltin<"__builtin_ia32_pcmpistrio128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-        [IntrNoMem, ImmArg<2>]>;
+        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse42_pcmpistris128 : GCCBuiltin<"__builtin_ia32_pcmpistris128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-        [IntrNoMem, ImmArg<2>]>;
+        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse42_pcmpistriz128 : GCCBuiltin<"__builtin_ia32_pcmpistriz128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-        [IntrNoMem, ImmArg<2>]>;
+        [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse42_pcmpestrm128  : GCCBuiltin<"__builtin_ia32_pcmpestrm128">,
     Intrinsic<[llvm_v16i8_ty],
         [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
          llvm_i8_ty],
-        [IntrNoMem, ImmArg<4>]>;
+        [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_sse42_pcmpestri128  : GCCBuiltin<"__builtin_ia32_pcmpestri128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
          llvm_i8_ty],
-        [IntrNoMem, ImmArg<4>]>;
+        [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_sse42_pcmpestria128 : GCCBuiltin<"__builtin_ia32_pcmpestria128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
          llvm_i8_ty],
-        [IntrNoMem, ImmArg<4>]>;
+        [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_sse42_pcmpestric128 : GCCBuiltin<"__builtin_ia32_pcmpestric128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
          llvm_i8_ty],
-        [IntrNoMem, ImmArg<4>]>;
+        [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_sse42_pcmpestrio128 : GCCBuiltin<"__builtin_ia32_pcmpestrio128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
          llvm_i8_ty],
-        [IntrNoMem, ImmArg<4>]>;
+        [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_sse42_pcmpestris128 : GCCBuiltin<"__builtin_ia32_pcmpestris128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
          llvm_i8_ty],
-        [IntrNoMem, ImmArg<4>]>;
+        [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_sse42_pcmpestriz128 : GCCBuiltin<"__builtin_ia32_pcmpestriz128">,
     Intrinsic<[llvm_i32_ty],
         [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
          llvm_i8_ty],
-        [IntrNoMem, ImmArg<4>]>;
+        [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -902,14 +902,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse4a_extrqi : GCCBuiltin<"__builtin_ia32_extrqi">,
     Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty, llvm_i8_ty],
-              [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
+              [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
   def int_x86_sse4a_extrq  : GCCBuiltin<"__builtin_ia32_extrq">,
     Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;
 
   def int_x86_sse4a_insertqi : GCCBuiltin<"__builtin_ia32_insertqi">,
     Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
                                 llvm_i8_ty, llvm_i8_ty],
-              [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+              [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
   def int_x86_sse4a_insertq  : GCCBuiltin<"__builtin_ia32_insertq">,
     Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
 }
@@ -946,10 +946,10 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
   def int_x86_avx_round_pd_256 : GCCBuiltin<"__builtin_ia32_roundpd256">,
         Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                  llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+                  llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx_round_ps_256 : GCCBuiltin<"__builtin_ia32_roundps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                  llvm_i32_ty], [IntrNoMem, ImmArg<1>]>;
+                  llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 // Horizontal ops
@@ -1101,33 +1101,33 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
          GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v16qi">,
           Intrinsic<[llvm_v16i8_ty],
           [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<2>]>;
+          [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_vgf2p8affineinvqb_256 :
          GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v32qi">,
           Intrinsic<[llvm_v32i8_ty],
           [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<2>]>;
+          [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_vgf2p8affineinvqb_512 :
          GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v64qi">,
           Intrinsic<[llvm_v64i8_ty],
           [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<2>]>;
+          [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_vgf2p8affineqb_128 :
          GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v16qi">,
           Intrinsic<[llvm_v16i8_ty],
           [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<2>]>;
+          [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_vgf2p8affineqb_256 :
          GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v32qi">,
           Intrinsic<[llvm_v32i8_ty],
           [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<2>]>;
+          [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_vgf2p8affineqb_512 :
          GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v64qi">,
           Intrinsic<[llvm_v64i8_ty],
           [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<2>]>;
+          [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_vgf2p8mulb_128     :
          GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v16qi">,
@@ -1161,17 +1161,17 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_dp_ps_256 : GCCBuiltin<"__builtin_ia32_dpps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
                   llvm_v8f32_ty, llvm_i8_ty],
-                  [IntrNoMem, Commutative, ImmArg<2>]>;
+                  [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
 }
 
 // Vector compare
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_cmp_pd_256 :
         Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                  llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg<2>]>;
+                  llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx_cmp_ps_256 :
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                  llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg<2>]>;
+                  llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 // Vector convert
@@ -1238,30 +1238,30 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
   def int_x86_avx512_fpclass_pd_128 :
           Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_fpclass_pd_256 :
           Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_fpclass_pd_512 :
           Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_fpclass_ps_128 :
           Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_fpclass_ps_256 :
           Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_fpclass_ps_512 :
           Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_fpclass_sd :
          GCCBuiltin<"__builtin_ia32_fpclasssd_mask">,
           Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_fpclass_ss :
          GCCBuiltin<"__builtin_ia32_fpclassss_mask">,
           Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 // Vector extract sign mask
@@ -1707,68 +1707,68 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_gather_d_pd : GCCBuiltin<"__builtin_ia32_gatherd_pd">,
       Intrinsic<[llvm_v2f64_ty],
         [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_d_pd_256 : GCCBuiltin<"__builtin_ia32_gatherd_pd256">,
       Intrinsic<[llvm_v4f64_ty],
         [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_pd : GCCBuiltin<"__builtin_ia32_gatherq_pd">,
       Intrinsic<[llvm_v2f64_ty],
         [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_pd_256 : GCCBuiltin<"__builtin_ia32_gatherq_pd256">,
       Intrinsic<[llvm_v4f64_ty],
         [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_d_ps : GCCBuiltin<"__builtin_ia32_gatherd_ps">,
       Intrinsic<[llvm_v4f32_ty],
         [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_d_ps_256 : GCCBuiltin<"__builtin_ia32_gatherd_ps256">,
       Intrinsic<[llvm_v8f32_ty],
         [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_ps : GCCBuiltin<"__builtin_ia32_gatherq_ps">,
       Intrinsic<[llvm_v4f32_ty],
         [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_ps_256 : GCCBuiltin<"__builtin_ia32_gatherq_ps256">,
       Intrinsic<[llvm_v4f32_ty],
         [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx2_gather_d_q : GCCBuiltin<"__builtin_ia32_gatherd_q">,
       Intrinsic<[llvm_v2i64_ty],
         [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_d_q_256 : GCCBuiltin<"__builtin_ia32_gatherd_q256">,
       Intrinsic<[llvm_v4i64_ty],
         [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_q : GCCBuiltin<"__builtin_ia32_gatherq_q">,
       Intrinsic<[llvm_v2i64_ty],
         [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_q_256 : GCCBuiltin<"__builtin_ia32_gatherq_q256">,
       Intrinsic<[llvm_v4i64_ty],
         [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_d_d : GCCBuiltin<"__builtin_ia32_gatherd_d">,
       Intrinsic<[llvm_v4i32_ty],
         [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
       Intrinsic<[llvm_v8i32_ty],
         [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_d : GCCBuiltin<"__builtin_ia32_gatherq_d">,
       Intrinsic<[llvm_v4i32_ty],
         [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx2_gather_q_d_256 : GCCBuiltin<"__builtin_ia32_gatherq_d256">,
       Intrinsic<[llvm_v4i32_ty],
         [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
-        [IntrReadMem, ImmArg<4>]>;
+        [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 }
 
 // Misc.
@@ -1780,7 +1780,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                          llvm_v32i8_ty], [IntrNoMem]>;
   def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
               Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
-                         llvm_i8_ty], [IntrNoMem, Commutative, ImmArg<2>]>;
+                         llvm_i8_ty], [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1809,31 +1809,31 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_vfmadd_pd_512 :
           Intrinsic<[llvm_v8f64_ty],
           [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_vfmadd_ps_512 :
           Intrinsic<[llvm_v16f32_ty],
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_vfmaddsub_pd_512 :
           Intrinsic<[llvm_v8f64_ty],
           [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_vfmaddsub_ps_512 :
           Intrinsic<[llvm_v16f32_ty],
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_vfmadd_f64 :
           Intrinsic<[llvm_double_ty],
                     [llvm_double_ty, llvm_double_ty, llvm_double_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<3>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_vfmadd_f32 :
           Intrinsic<[llvm_float_ty],
                     [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<3>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_vpmadd52h_uq_128 :
               GCCBuiltin<"__builtin_ia32_vpmadd52huq128">,
@@ -1923,23 +1923,23 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_xop_vpermil2pd : GCCBuiltin<"__builtin_ia32_vpermil2pd">,
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                                           llvm_v2i64_ty, llvm_i8_ty],
-                        [IntrNoMem, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_xop_vpermil2pd_256 :
               GCCBuiltin<"__builtin_ia32_vpermil2pd256">,
               Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
                                           llvm_v4i64_ty, llvm_i8_ty],
-                        [IntrNoMem, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_xop_vpermil2ps : GCCBuiltin<"__builtin_ia32_vpermil2ps">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                                           llvm_v4i32_ty, llvm_i8_ty],
-                        [IntrNoMem, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_xop_vpermil2ps_256 :
               GCCBuiltin<"__builtin_ia32_vpermil2ps256">,
               Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
                                           llvm_v8i32_ty, llvm_i8_ty],
-                        [IntrNoMem, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">,
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
@@ -2110,19 +2110,19 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_lwpins32 :
               GCCBuiltin<"__builtin_ia32_lwpins32">,
               Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-                        [ImmArg<2>]>;
+                        [ImmArg<ArgIndex<2>>]>;
   def int_x86_lwpins64 :
               GCCBuiltin<"__builtin_ia32_lwpins64">,
               Intrinsic<[llvm_i8_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
-                        [ImmArg<2>]>;
+                        [ImmArg<ArgIndex<2>>]>;
   def int_x86_lwpval32 :
               GCCBuiltin<"__builtin_ia32_lwpval32">,
               Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-                        [ImmArg<2>]>;
+                        [ImmArg<ArgIndex<2>>]>;
   def int_x86_lwpval64 :
               GCCBuiltin<"__builtin_ia32_lwpval64">,
               Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
-                        [ImmArg<2>]>;
+                        [ImmArg<ArgIndex<2>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2423,15 +2423,15 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
   def int_x86_mmx_palignr_b : GCCBuiltin<"__builtin_ia32_palignr">,
               Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
-                        llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg<2>]>;
+                        llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">,
               Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">,
               Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
-                        llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                        llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2548,26 +2548,26 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph">,
               Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256">,
               Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_vcvtph2ps_512 :
               Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty, llvm_v16f32_ty,
                                            llvm_i16_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512_mask">,
               Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty,
                                            llvm_v16i16_ty, llvm_i16_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256_mask">,
               Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty,
                                            llvm_v8i16_ty, llvm_i8_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph_mask">,
               Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty,
                                            llvm_v8i16_ty, llvm_i8_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2576,10 +2576,10 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_tbm_bextri_u32 : GCCBuiltin<"__builtin_ia32_bextri_u32">,
         Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_tbm_bextri_u64 : GCCBuiltin<"__builtin_ia32_bextri_u64">,
         Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2625,7 +2625,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_xend : GCCBuiltin<"__builtin_ia32_xend">,
               Intrinsic<[], [], []>;
   def int_x86_xabort : GCCBuiltin<"__builtin_ia32_xabort">,
-              Intrinsic<[], [llvm_i8_ty], [ImmArg<0>]>;
+              Intrinsic<[], [llvm_i8_ty], [ImmArg<ArgIndex<0>>]>;
   def int_x86_xtest : GCCBuiltin<"__builtin_ia32_xtest">,
               Intrinsic<[llvm_i32_ty], [], []>;
 }
@@ -2667,70 +2667,70 @@ let TargetPrefix = "x86" in {
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_cvttss2si : GCCBuiltin<"__builtin_ia32_vcvttss2si32">,
               Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvttss2si64 : GCCBuiltin<"__builtin_ia32_vcvttss2si64">,
               Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvttss2usi : GCCBuiltin<"__builtin_ia32_vcvttss2usi32">,
               Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvttss2usi64 : GCCBuiltin<"__builtin_ia32_vcvttss2usi64">,
               Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvtusi2ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss32">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
-                         llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_cvtusi642ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss64">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
-                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_cvttsd2si : GCCBuiltin<"__builtin_ia32_vcvttsd2si32">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_vcvttsd2si64">,
               Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvttsd2usi : GCCBuiltin<"__builtin_ia32_vcvttsd2usi32">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvttsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvttsd2usi64">,
               Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvtusi642sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd64">,
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
-                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_vcvtss2usi32 : GCCBuiltin<"__builtin_ia32_vcvtss2usi32">,
               Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_vcvtss2usi64 : GCCBuiltin<"__builtin_ia32_vcvtss2usi64">,
               Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_vcvtss2si32 : GCCBuiltin<"__builtin_ia32_vcvtss2si32">,
               Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_vcvtss2si64 : GCCBuiltin<"__builtin_ia32_vcvtss2si64">,
               Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_vcvtsd2usi32 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi32">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_vcvtsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi64">,
               Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_vcvtsd2si32 : GCCBuiltin<"__builtin_ia32_vcvtsd2si32">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_vcvtsd2si64 : GCCBuiltin<"__builtin_ia32_vcvtsd2si64">,
               Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<1>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_cvtsi2ss32 : GCCBuiltin<"__builtin_ia32_cvtsi2ss32">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
-                         llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_cvtsi2ss64 : GCCBuiltin<"__builtin_ia32_cvtsi2ss64">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
-                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_cvtsi2sd64 : GCCBuiltin<"__builtin_ia32_cvtsi2sd64">,
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
-                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 // Pack ops.
@@ -2753,11 +2753,11 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_sitofp_round :
           Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<1>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_x86_avx512_uitofp_round :
           Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<1>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_x86_avx512_mask_cvtpd2dq_128 :
         GCCBuiltin<"__builtin_ia32_cvtpd2dq128_mask">,
@@ -2769,25 +2769,25 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtpd2dq512_mask">,
           Intrinsic<[llvm_v8i32_ty],
           [llvm_v8f64_ty, llvm_v8i32_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtpd2ps_512 :
         GCCBuiltin<"__builtin_ia32_cvtpd2ps512_mask">,
           Intrinsic<[llvm_v8f32_ty],
           [llvm_v8f64_ty, llvm_v8f32_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtsd2ss_round :
         GCCBuiltin<"__builtin_ia32_cvtsd2ss_round_mask">,
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_v2f64_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<4>]>;
+          [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_cvtss2sd_round :
         GCCBuiltin<"__builtin_ia32_cvtss2sd_round_mask">,
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_v4f32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrNoMem, ImmArg<4>]>;
+          [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_cvtpd2ps :
         GCCBuiltin<"__builtin_ia32_cvtpd2ps_mask">,
@@ -2811,7 +2811,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtpd2qq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f64_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtpd2udq_128 :
         GCCBuiltin<"__builtin_ia32_cvtpd2udq128_mask">,
@@ -2829,7 +2829,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtpd2udq512_mask">,
           Intrinsic<[llvm_v8i32_ty],
           [llvm_v8f64_ty, llvm_v8i32_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtpd2uqq_128 :
         GCCBuiltin<"__builtin_ia32_cvtpd2uqq128_mask">,
@@ -2847,7 +2847,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtpd2uqq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f64_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtps2dq_128 :
         GCCBuiltin<"__builtin_ia32_cvtps2dq128_mask">,
@@ -2865,13 +2865,13 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtps2dq512_mask">,
           Intrinsic<[llvm_v16i32_ty],
           [llvm_v16f32_ty, llvm_v16i32_ty,  llvm_i16_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtps2pd_512 :
         GCCBuiltin<"__builtin_ia32_cvtps2pd512_mask">,
           Intrinsic<[llvm_v8f64_ty],
           [llvm_v8f32_ty, llvm_v8f64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtps2qq_128 :
         GCCBuiltin<"__builtin_ia32_cvtps2qq128_mask">,
@@ -2889,7 +2889,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtps2qq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f32_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtps2udq_128 :
         GCCBuiltin<"__builtin_ia32_cvtps2udq128_mask">,
@@ -2907,7 +2907,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtps2udq512_mask">,
           Intrinsic<[llvm_v16i32_ty],
           [llvm_v16f32_ty, llvm_v16i32_ty,  llvm_i16_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtps2uqq_128 :
         GCCBuiltin<"__builtin_ia32_cvtps2uqq128_mask">,
@@ -2925,7 +2925,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvtps2uqq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f32_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtqq2ps_128 :
         GCCBuiltin<"__builtin_ia32_cvtqq2ps128_mask">,
@@ -2943,7 +2943,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvttpd2dq512_mask">,
           Intrinsic<[llvm_v8i32_ty],
           [llvm_v8f64_ty, llvm_v8i32_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvttpd2qq_128 :
         GCCBuiltin<"__builtin_ia32_cvttpd2qq128_mask">,
@@ -2961,7 +2961,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvttpd2qq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f64_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvttpd2udq_128 :
         GCCBuiltin<"__builtin_ia32_cvttpd2udq128_mask">,
@@ -2979,7 +2979,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvttpd2udq512_mask">,
           Intrinsic<[llvm_v8i32_ty],
           [llvm_v8f64_ty, llvm_v8i32_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvttpd2uqq_128 :
         GCCBuiltin<"__builtin_ia32_cvttpd2uqq128_mask">,
@@ -2997,13 +2997,13 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvttpd2uqq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f64_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvttps2dq_512 :
         GCCBuiltin<"__builtin_ia32_cvttps2dq512_mask">,
           Intrinsic<[llvm_v16i32_ty],
           [llvm_v16f32_ty, llvm_v16i32_ty,  llvm_i16_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvttps2qq_128 :
         GCCBuiltin<"__builtin_ia32_cvttps2qq128_mask">,
@@ -3021,7 +3021,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvttps2qq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f32_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvttps2udq_128 :
         GCCBuiltin<"__builtin_ia32_cvttps2udq128_mask">,
@@ -3039,7 +3039,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvttps2udq512_mask">,
           Intrinsic<[llvm_v16i32_ty],
           [llvm_v16f32_ty, llvm_v16i32_ty,  llvm_i16_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvttps2uqq_128 :
         GCCBuiltin<"__builtin_ia32_cvttps2uqq128_mask">,
@@ -3057,7 +3057,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_cvttps2uqq512_mask">,
           Intrinsic<[llvm_v8i64_ty],
           [llvm_v8f32_ty, llvm_v8i64_ty,  llvm_i8_ty,  llvm_i32_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_cvtuqq2ps_128 :
         GCCBuiltin<"__builtin_ia32_cvtuqq2ps128_mask">,
@@ -3068,75 +3068,75 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_rndscale_pd_128 : GCCBuiltin<"__builtin_ia32_rndscalepd_128_mask">,
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty,
                                      llvm_v2f64_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_rndscale_pd_256 : GCCBuiltin<"__builtin_ia32_rndscalepd_256_mask">,
         Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty,
                                      llvm_v4f64_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_rndscale_pd_512 : GCCBuiltin<"__builtin_ia32_rndscalepd_mask">,
         Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty,
                                      llvm_i8_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_rndscale_ps_128 : GCCBuiltin<"__builtin_ia32_rndscaleps_128_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty,
                                      llvm_v4f32_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_rndscale_ps_256 : GCCBuiltin<"__builtin_ia32_rndscaleps_256_mask">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty,
                                      llvm_v8f32_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_rndscale_ps_512 : GCCBuiltin<"__builtin_ia32_rndscaleps_mask">,
         Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty,
                                      llvm_i16_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_reduce_pd_128 : GCCBuiltin<"__builtin_ia32_reducepd128_mask">,
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty,
                                      llvm_v2f64_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_reduce_pd_256 : GCCBuiltin<"__builtin_ia32_reducepd256_mask">,
         Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty,
                                      llvm_v4f64_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_reduce_pd_512 : GCCBuiltin<"__builtin_ia32_reducepd512_mask">,
         Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty,
                                      llvm_i8_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_reduce_ps_128 : GCCBuiltin<"__builtin_ia32_reduceps128_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty,
                                      llvm_v4f32_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_reduce_ps_256 : GCCBuiltin<"__builtin_ia32_reduceps256_mask">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty,
                                      llvm_v8f32_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_reduce_ps_512 : GCCBuiltin<"__builtin_ia32_reduceps512_mask">,
         Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty,
                                      llvm_i16_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
 def int_x86_avx512_mask_range_pd_128 : GCCBuiltin<"__builtin_ia32_rangepd128_mask">,
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty,
                                     llvm_v2f64_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<2>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_x86_avx512_mask_range_pd_256 : GCCBuiltin<"__builtin_ia32_rangepd256_mask">,
         Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty,
                                     llvm_v4f64_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<2>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_x86_avx512_mask_range_pd_512 : GCCBuiltin<"__builtin_ia32_rangepd512_mask">,
         Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty,
                                     llvm_v8f64_ty,  llvm_i8_ty,  llvm_i32_ty],
-                  [IntrNoMem, ImmArg<2>, ImmArg<5>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
 def int_x86_avx512_mask_range_ps_128 : GCCBuiltin<"__builtin_ia32_rangeps128_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty,
                                     llvm_v4f32_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<2>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_x86_avx512_mask_range_ps_256 : GCCBuiltin<"__builtin_ia32_rangeps256_mask">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i32_ty,
                                     llvm_v8f32_ty,  llvm_i8_ty],
-                  [IntrNoMem, ImmArg<2>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 def int_x86_avx512_mask_range_ps_512 : GCCBuiltin<"__builtin_ia32_rangeps512_mask">,
         Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty,
                                      llvm_v16f32_ty,  llvm_i16_ty,  llvm_i32_ty],
-                  [IntrNoMem, ImmArg<2>, ImmArg<5>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
 }
 
 // Vector load with broadcast
@@ -3166,111 +3166,111 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
   def int_x86_avx512_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_avx512_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<4>]>;
+                     llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                                      llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>, ImmArg<5>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                                       llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>, ImmArg<5>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess128_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                                      llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>, ImmArg<5>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd128_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                                       llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>, ImmArg<5>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                                      llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>, ImmArg<5>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                                       llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>, ImmArg<5>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_scalef_sd : GCCBuiltin<"__builtin_ia32_scalefsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                                       llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scalef_ss : GCCBuiltin<"__builtin_ia32_scalefss_round_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                                       llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
-                                     [IntrNoMem, ImmArg<4>]>;
+                                     [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scalef_pd_128 : GCCBuiltin<"__builtin_ia32_scalefpd128_mask">,
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                     llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
@@ -3280,7 +3280,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_scalef_pd_512 : GCCBuiltin<"__builtin_ia32_scalefpd512_mask">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                                       llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<4>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scalef_ps_128 : GCCBuiltin<"__builtin_ia32_scalefps128_mask">,
           Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                     llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
@@ -3290,103 +3290,103 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_scalef_ps_512 : GCCBuiltin<"__builtin_ia32_scalefps512_mask">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                                        llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<4>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_sqrt_ss :
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                                     llvm_i8_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_sqrt_sd :
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                                     llvm_i8_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_sqrt_pd_512 :
         Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_sqrt_ps_512 :
         Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<1>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<1>>]>;
   def int_x86_avx512_mask_fixupimm_pd_128 :
          GCCBuiltin<"__builtin_ia32_fixupimmpd128_mask">,
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_maskz_fixupimm_pd_128 :
          GCCBuiltin<"__builtin_ia32_fixupimmpd128_maskz">,
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_fixupimm_pd_256 :
          GCCBuiltin<"__builtin_ia32_fixupimmpd256_mask">,
           Intrinsic<[llvm_v4f64_ty],
           [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_maskz_fixupimm_pd_256 :
          GCCBuiltin<"__builtin_ia32_fixupimmpd256_maskz">,
           Intrinsic<[llvm_v4f64_ty],
           [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_fixupimm_pd_512 :
          GCCBuiltin<"__builtin_ia32_fixupimmpd512_mask">,
           Intrinsic<[llvm_v8f64_ty],
           [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty,
-          llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_maskz_fixupimm_pd_512 :
          GCCBuiltin<"__builtin_ia32_fixupimmpd512_maskz">,
           Intrinsic<[llvm_v8f64_ty],
           [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty,
-          llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_fixupimm_ps_128 :
          GCCBuiltin<"__builtin_ia32_fixupimmps128_mask">,
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_maskz_fixupimm_ps_128 :
          GCCBuiltin<"__builtin_ia32_fixupimmps128_maskz">,
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_fixupimm_ps_256 :
          GCCBuiltin<"__builtin_ia32_fixupimmps256_mask">,
           Intrinsic<[llvm_v8f32_ty],
           [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_maskz_fixupimm_ps_256 :
          GCCBuiltin<"__builtin_ia32_fixupimmps256_maskz">,
           Intrinsic<[llvm_v8f32_ty],
           [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty],
-          [IntrNoMem, ImmArg<3>]>;
+          [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_fixupimm_ps_512 :
          GCCBuiltin<"__builtin_ia32_fixupimmps512_mask">,
           Intrinsic<[llvm_v16f32_ty],
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty,
-          llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_maskz_fixupimm_ps_512 :
          GCCBuiltin<"__builtin_ia32_fixupimmps512_maskz">,
           Intrinsic<[llvm_v16f32_ty],
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty,
-          llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_fixupimm_sd :
          GCCBuiltin<"__builtin_ia32_fixupimmsd_mask">,
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty,
-          llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_maskz_fixupimm_sd :
          GCCBuiltin<"__builtin_ia32_fixupimmsd_maskz">,
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty,
-          llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_fixupimm_ss :
          GCCBuiltin<"__builtin_ia32_fixupimmss_mask">,
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty,
-          llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_maskz_fixupimm_ss :
          GCCBuiltin<"__builtin_ia32_fixupimmss_maskz">,
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty,
-          llvm_i32_ty], [IntrNoMem, ImmArg<3>, ImmArg<5>]>;
+          llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<5>>]>;
   def int_x86_avx512_mask_getexp_pd_128 : GCCBuiltin<"__builtin_ia32_getexppd128_mask">,
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                                     llvm_i8_ty], [IntrNoMem]>;
@@ -3396,7 +3396,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_getexp_pd_512 : GCCBuiltin<"__builtin_ia32_getexppd512_mask">,
         Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                                     llvm_i8_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<3>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_mask_getexp_ps_128 : GCCBuiltin<"__builtin_ia32_getexpps128_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                                      llvm_i8_ty], [IntrNoMem]>;
@@ -3406,64 +3406,64 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_getexp_ps_512 : GCCBuiltin<"__builtin_ia32_getexpps512_mask">,
         Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                                      llvm_i16_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<3>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_mask_getexp_ss : GCCBuiltin<"__builtin_ia32_getexpss128_round_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                                     llvm_i8_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_getexp_sd : GCCBuiltin<"__builtin_ia32_getexpsd128_round_mask">,
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                                     llvm_i8_ty, llvm_i32_ty],
-                  [IntrNoMem, ImmArg<4>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_getmant_pd_128 :
          GCCBuiltin<"__builtin_ia32_getmantpd128_mask">,
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty,llvm_i32_ty, llvm_v2f64_ty,  llvm_i8_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_x86_avx512_mask_getmant_pd_256 :
          GCCBuiltin<"__builtin_ia32_getmantpd256_mask">,
           Intrinsic<[llvm_v4f64_ty],
           [llvm_v4f64_ty,llvm_i32_ty, llvm_v4f64_ty,  llvm_i8_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_x86_avx512_mask_getmant_pd_512 :
          GCCBuiltin<"__builtin_ia32_getmantpd512_mask">,
           Intrinsic<[llvm_v8f64_ty],
           [llvm_v8f64_ty,llvm_i32_ty, llvm_v8f64_ty,  llvm_i8_ty,llvm_i32_ty ],
-          [IntrNoMem, ImmArg<1>, ImmArg<4>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_getmant_ps_128 :
          GCCBuiltin<"__builtin_ia32_getmantps128_mask">,
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty,  llvm_i8_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_x86_avx512_mask_getmant_ps_256 :
          GCCBuiltin<"__builtin_ia32_getmantps256_mask">,
           Intrinsic<[llvm_v8f32_ty],
           [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty,  llvm_i8_ty],
-          [IntrNoMem, ImmArg<1>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_x86_avx512_mask_getmant_ps_512 :
          GCCBuiltin<"__builtin_ia32_getmantps512_mask">,
           Intrinsic<[llvm_v16f32_ty],
           [llvm_v16f32_ty,llvm_i32_ty, llvm_v16f32_ty,llvm_i16_ty,llvm_i32_ty],
-          [IntrNoMem, ImmArg<1>, ImmArg<4>]>;
+          [IntrNoMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_getmant_ss :
          GCCBuiltin<"__builtin_ia32_getmantss_round_mask">,
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty,
-           llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>, ImmArg<5>]>;
+           llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
 
   def int_x86_avx512_mask_getmant_sd :
          GCCBuiltin<"__builtin_ia32_getmantsd_round_mask">,
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty,
-           llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<2>, ImmArg<5>]>;
+           llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]>;
 
   def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
@@ -3518,41 +3518,41 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
   def int_x86_avx512_rcp28_ps : GCCBuiltin<"__builtin_ia32_rcp28ps_mask">,
             Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                                         llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>]>;
+                                         llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_rcp28_pd : GCCBuiltin<"__builtin_ia32_rcp28pd_mask">,
             Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                                        llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>]>;
+                                        llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_exp2_ps : GCCBuiltin<"__builtin_ia32_exp2ps_mask">,
             Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                                         llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>]>;
+                                         llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_exp2_pd : GCCBuiltin<"__builtin_ia32_exp2pd_mask">,
             Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                                        llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>]>;
+                                        llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_round_mask">,
             Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                                         llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
-                      [IntrNoMem, ImmArg<4>]>;
+                      [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_round_mask">,
             Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                                         llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
-                      [IntrNoMem, ImmArg<4>]>;
+                      [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_rsqrt28_ps : GCCBuiltin<"__builtin_ia32_rsqrt28ps_mask">,
             Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                                          llvm_i16_ty, llvm_i32_ty],
-                      [IntrNoMem, ImmArg<3>]>;
+                      [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_rsqrt28_pd : GCCBuiltin<"__builtin_ia32_rsqrt28pd_mask">,
             Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                                         llvm_i8_ty, llvm_i32_ty],
-                      [IntrNoMem, ImmArg<3>]>;
+                      [IntrNoMem, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">,
             Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                                         llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
-                      [IntrNoMem, ImmArg<4>]>;
+                      [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">,
             Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                                         llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
-                      [IntrNoMem, ImmArg<4>]>;
+                      [IntrNoMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">,
               Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
                         [IntrNoMem, Commutative]>;
@@ -3582,19 +3582,19 @@ let TargetPrefix = "x86" in {
          GCCBuiltin<"__builtin_ia32_dbpsadbw128">,
           Intrinsic<[llvm_v8i16_ty],
                     [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<2>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_avx512_dbpsadbw_256 :
          GCCBuiltin<"__builtin_ia32_dbpsadbw256">,
           Intrinsic<[llvm_v16i16_ty],
                     [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<2>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_avx512_dbpsadbw_512 :
          GCCBuiltin<"__builtin_ia32_dbpsadbw512">,
           Intrinsic<[llvm_v32i16_ty],
                     [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<2>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 // Gather and Scatter ops
@@ -3605,117 +3605,117 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_gather_dpd_512  :
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
                      llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gather_dps_512  :
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty,
                      llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gather_qpd_512  :
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gather_qps_512  :
           Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
 
   def int_x86_avx512_gather_dpq_512  :
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
                      llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gather_dpi_512  :
           Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
                      llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gather_qpq_512  :
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gather_qpi_512  :
           Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div2_df :
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div2_di :
           Intrinsic<[llvm_v2i64_ty],
           [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div4_df :
           Intrinsic<[llvm_v4f64_ty],
           [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div4_di :
           Intrinsic<[llvm_v4i64_ty],
           [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div4_sf :
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div4_si :
           Intrinsic<[llvm_v4i32_ty],
           [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div8_sf :
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3div8_si :
           Intrinsic<[llvm_v4i32_ty],
           [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv2_df :
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv2_di :
           Intrinsic<[llvm_v2i64_ty],
           [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv4_df :
           Intrinsic<[llvm_v4f64_ty],
           [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv4_di :
           Intrinsic<[llvm_v4i64_ty],
           [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv4_sf :
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv4_si :
           Intrinsic<[llvm_v4i32_ty],
           [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv8_sf :
           Intrinsic<[llvm_v8f32_ty],
           [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_gather3siv8_si :
           Intrinsic<[llvm_v8i32_ty],
           [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
 // scatter
   // NOTE: These are deprecated in favor of the versions that take a vXi1 mask.
@@ -3724,149 +3724,149 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_scatter_dpd_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
                         llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatter_dps_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
                        llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatter_qpd_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
                      llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatter_qps_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
                      llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
 
 
   def int_x86_avx512_scatter_dpq_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
                          llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatter_dpi_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
                      llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatter_qpq_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,llvm_v8i64_ty, llvm_v8i64_ty,
                          llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatter_qpi_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8i32_ty,
                          llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv2_df :
         Intrinsic<[],
         [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty],
-        [ImmArg<4>]>;
+        [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv2_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv4_df :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv4_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv4_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv4_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv8_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scatterdiv8_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv2_df :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv2_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv4_df :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv4_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv4_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv4_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv8_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_scattersiv8_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   // gather prefetch
   // NOTE: These can't be ArgMemOnly because you can put the address completely
   // in the index register.
   def int_x86_avx512_gatherpf_dpd_512  : GCCBuiltin<"__builtin_ia32_gatherpfdpd">,
           Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gatherpf_dps_512  : GCCBuiltin<"__builtin_ia32_gatherpfdps">,
           Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gatherpf_qpd_512  : GCCBuiltin<"__builtin_ia32_gatherpfqpd">,
           Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_gatherpf_qps_512  : GCCBuiltin<"__builtin_ia32_gatherpfqps">,
           Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
   // scatter prefetch
   // NOTE: These can't be ArgMemOnly because you can put the address completely
   // in the index register.
   def int_x86_avx512_scatterpf_dpd_512  : GCCBuiltin<"__builtin_ia32_scatterpfdpd">,
           Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatterpf_dps_512  : GCCBuiltin<"__builtin_ia32_scatterpfdps">,
           Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatterpf_qpd_512  : GCCBuiltin<"__builtin_ia32_scatterpfqpd">,
           Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_scatterpf_qps_512  : GCCBuiltin<"__builtin_ia32_scatterpfqps">,
           Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
-                     llvm_i32_ty, llvm_i32_ty], [ImmArg<3>, ImmArg<4>]>;
+                     llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 }
 
 // AVX512 gather/scatter intrinsics that use vXi1 masks.
@@ -3876,134 +3876,134 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_gather_dpd_512  :
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
                      llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_gather_dps_512  :
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty,
                      llvm_v16i32_ty, llvm_v16i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_gather_qpd_512  :
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_gather_qps_512  :
           Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
 
   def int_x86_avx512_mask_gather_dpq_512  :
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
                      llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_gather_dpi_512  :
           Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
                      llvm_v16i32_ty, llvm_v16i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_gather_qpq_512  :
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_gather_qpi_512  :
           Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty,
                      llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty],
-                    [IntrReadMem, ImmArg<4>]>;
+                    [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div2_df :
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div2_di :
           Intrinsic<[llvm_v2i64_ty],
           [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div4_df :
           Intrinsic<[llvm_v4f64_ty],
           [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div4_di :
           Intrinsic<[llvm_v4i64_ty],
           [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div4_sf :
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div4_si :
           Intrinsic<[llvm_v4i32_ty],
           [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div8_sf :
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3div8_si :
           Intrinsic<[llvm_v4i32_ty],
           [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv2_df :
           Intrinsic<[llvm_v2f64_ty],
           [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv2_di :
           Intrinsic<[llvm_v2i64_ty],
           [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv4_df :
           Intrinsic<[llvm_v4f64_ty],
           [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv4_di :
           Intrinsic<[llvm_v4i64_ty],
           [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv4_sf :
           Intrinsic<[llvm_v4f32_ty],
           [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv4_si :
           Intrinsic<[llvm_v4i32_ty],
           [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv8_sf :
           Intrinsic<[llvm_v8f32_ty],
           [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_gather3siv8_si :
           Intrinsic<[llvm_v8i32_ty],
           [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
-          [IntrReadMem, ImmArg<4>]>;
+          [IntrReadMem, ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatter_dpd_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
                         llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scatter_dps_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
                        llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scatter_qpd_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
                      llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scatter_qps_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
                      llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
 
 
   // NOTE: These can't be ArgMemOnly because you can put the address completely
@@ -4011,99 +4011,99 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_scatter_dpq_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
                          llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scatter_dpi_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
                      llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scatter_qpq_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,llvm_v8i64_ty, llvm_v8i64_ty,
                          llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_scatter_qpi_512  :
           Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i32_ty,
                          llvm_i32_ty],
-                    [ImmArg<4>]>;
+                    [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv2_df :
         Intrinsic<[],
         [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty],
-        [ImmArg<4>]>;
+        [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv2_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv4_df :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv4_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv4_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv4_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv8_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scatterdiv8_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv2_df :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv2_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv4_df :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv4_di :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv4_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv4_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv8_sf :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 
   def int_x86_avx512_mask_scattersiv8_si :
           Intrinsic<[],
           [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
-          [ImmArg<4>]>;
+          [ImmArg<ArgIndex<4>>]>;
 }
 
 // AVX-512 conflict detection instruction
@@ -4136,11 +4136,11 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_vcomi_sd : GCCBuiltin<"__builtin_ia32_vcomisd">,
               Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
                          llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_vcomi_ss : GCCBuiltin<"__builtin_ia32_vcomiss">,
               Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
                          llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
 }
 
 // Compress, Expand
@@ -4684,37 +4684,37 @@ let TargetPrefix = "x86" in {
           GCCBuiltin<"__builtin_ia32_pternlogd128">,
           Intrinsic<[llvm_v4i32_ty],
                     [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<3>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_pternlog_d_256 :
           GCCBuiltin<"__builtin_ia32_pternlogd256">,
           Intrinsic<[llvm_v8i32_ty],
                     [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<3>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_pternlog_d_512 :
           GCCBuiltin<"__builtin_ia32_pternlogd512">,
           Intrinsic<[llvm_v16i32_ty],
                     [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty,
-                     llvm_i32_ty], [IntrNoMem, ImmArg<3>]>;
+                     llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_pternlog_q_128 :
           GCCBuiltin<"__builtin_ia32_pternlogq128">,
           Intrinsic<[llvm_v2i64_ty],
                     [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<3>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_pternlog_q_256 :
           GCCBuiltin<"__builtin_ia32_pternlogq256">,
           Intrinsic<[llvm_v4i64_ty],
                     [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<3>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_x86_avx512_pternlog_q_512 :
           GCCBuiltin<"__builtin_ia32_pternlogq512">,
           Intrinsic<[llvm_v8i64_ty],
                     [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty],
-                    [IntrNoMem, ImmArg<3>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 }
 
 // vp2intersect
@@ -4752,34 +4752,34 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_cmp_ps_512 :
               Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                          llvm_i32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_cmp_pd_512 :
               Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                          llvm_i32_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<2>, ImmArg<3>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
   def int_x86_avx512_cmp_ps_256 :
               Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                         llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_cmp_pd_256 :
               Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                         llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                         llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_cmp_ps_128 :
             Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                       llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                       llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_avx512_cmp_pd_128 :
             Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                       llvm_i32_ty], [IntrNoMem, ImmArg<2>]>;
+                       llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   def int_x86_avx512_mask_cmp_ss :
         GCCBuiltin<"__builtin_ia32_cmpss_mask">,
               Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                          llvm_i32_ty, llvm_i8_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<2>, ImmArg<4>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
   def int_x86_avx512_mask_cmp_sd :
         GCCBuiltin<"__builtin_ia32_cmpsd_mask">,
               Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                          llvm_i32_ty, llvm_i8_ty, llvm_i32_ty],
-                        [IntrNoMem, ImmArg<2>, ImmArg<4>]>;
+                        [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4787,7 +4787,7 @@ let TargetPrefix = "x86" in {
 let TargetPrefix = "x86" in {
   def int_x86_sha1rnds4 : GCCBuiltin<"__builtin_ia32_sha1rnds4">,
         Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
-                  [IntrNoMem, ImmArg<2>]>;
+                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
   def int_x86_sha1nexte : GCCBuiltin<"__builtin_ia32_sha1nexte">,
       Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
   def int_x86_sha1msg1 : GCCBuiltin<"__builtin_ia32_sha1msg1">,
diff --git a/llvm/include/llvm/IR/IntrinsicsXCore.td b/llvm/include/llvm/IR/IntrinsicsXCore.td
index 7fe8bdfd3bd01..89dbc65fea445 100644
--- a/llvm/include/llvm/IR/IntrinsicsXCore.td
+++ b/llvm/include/llvm/IR/IntrinsicsXCore.td
@@ -38,58 +38,58 @@ let TargetPrefix = "xcore" in {  // All intrinsics start with "llvm.xcore.".
   // Resource instructions.
   def int_xcore_getr : Intrinsic<[llvm_anyptr_ty],[llvm_i32_ty]>;
   def int_xcore_freer : Intrinsic<[],[llvm_anyptr_ty],
-                                   [NoCapture<0>]>;
-  def int_xcore_in : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],[NoCapture<0>]>;
+                                   [NoCapture<ArgIndex<0>>]>;
+  def int_xcore_in : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],[NoCapture<ArgIndex<0>>]>;
   def int_xcore_int : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],
-                                [NoCapture<0>]>;
+                                [NoCapture<ArgIndex<0>>]>;
   def int_xcore_inct : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],
-                                 [NoCapture<0>]>;
+                                 [NoCapture<ArgIndex<0>>]>;
   def int_xcore_out : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                [NoCapture<0>]>;
+                                [NoCapture<ArgIndex<0>>]>;
   def int_xcore_outt : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                 [NoCapture<0>]>;
+                                 [NoCapture<ArgIndex<0>>]>;
   def int_xcore_outct : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_chkct : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_testct : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],
-                                   [NoCapture<0>]>;
+                                   [NoCapture<ArgIndex<0>>]>;
   def int_xcore_testwct : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],
-                                    [NoCapture<0>]>;
+                                    [NoCapture<ArgIndex<0>>]>;
   def int_xcore_setd : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_setc : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_inshr : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_outshr : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_setpt : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_clrpt : Intrinsic<[],[llvm_anyptr_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_getts : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_syncr : Intrinsic<[],[llvm_anyptr_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_settw : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                  [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
   def int_xcore_setv : Intrinsic<[],[llvm_anyptr_ty, llvm_ptr_ty],
-                                 [NoCapture<0>]>;
+                                 [NoCapture<ArgIndex<0>>]>;
   def int_xcore_setev : Intrinsic<[],[llvm_anyptr_ty, llvm_ptr_ty],
-                                  [NoCapture<0>]>;
-  def int_xcore_eeu : Intrinsic<[],[llvm_anyptr_ty], [NoCapture<0>]>;
-  def int_xcore_edu : Intrinsic<[],[llvm_anyptr_ty], [NoCapture<0>]>;
+                                  [NoCapture<ArgIndex<0>>]>;
+  def int_xcore_eeu : Intrinsic<[],[llvm_anyptr_ty], [NoCapture<ArgIndex<0>>]>;
+  def int_xcore_edu : Intrinsic<[],[llvm_anyptr_ty], [NoCapture<ArgIndex<0>>]>;
   def int_xcore_setclk : Intrinsic<[],[llvm_anyptr_ty, llvm_anyptr_ty],
-                                   [NoCapture<0>, NoCapture<1>]>;
+                                   [NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
   def int_xcore_setrdy : Intrinsic<[],[llvm_anyptr_ty, llvm_anyptr_ty],
-                                   [NoCapture<0>, NoCapture<1>]>;
+                                   [NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
   def int_xcore_setpsc : Intrinsic<[],[llvm_anyptr_ty, llvm_i32_ty],
-                                   [NoCapture<0>]>;
+                                   [NoCapture<ArgIndex<0>>]>;
   def int_xcore_peek : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],
-                                 [NoCapture<0>]>;
+                                 [NoCapture<ArgIndex<0>>]>;
   def int_xcore_endin : Intrinsic<[llvm_i32_ty],[llvm_anyptr_ty],
-                                 [NoCapture<0>]>;
+                                 [NoCapture<ArgIndex<0>>]>;
 
   // Intrinsics for events.
   def int_xcore_waitevent : Intrinsic<[llvm_ptr_ty],[], [IntrReadMem]>;
@@ -103,18 +103,18 @@ let TargetPrefix = "xcore" in {  // All intrinsics start with "llvm.xcore.".
 
   // Intrinsics for threads.
   def int_xcore_getst : Intrinsic <[llvm_anyptr_ty],[llvm_anyptr_ty],
-                                   [NoCapture<0>]>;
-  def int_xcore_msync : Intrinsic <[],[llvm_anyptr_ty], [NoCapture<0>]>;
+                                   [NoCapture<ArgIndex<0>>]>;
+  def int_xcore_msync : Intrinsic <[],[llvm_anyptr_ty], [NoCapture<ArgIndex<0>>]>;
   def int_xcore_ssync : Intrinsic <[],[]>;
-  def int_xcore_mjoin : Intrinsic <[],[llvm_anyptr_ty], [NoCapture<0>]>;
+  def int_xcore_mjoin : Intrinsic <[],[llvm_anyptr_ty], [NoCapture<ArgIndex<0>>]>;
   def int_xcore_initsp : Intrinsic <[],[llvm_anyptr_ty, llvm_ptr_ty],
-                                    [NoCapture<0>]>;
+                                    [NoCapture<ArgIndex<0>>]>;
   def int_xcore_initpc : Intrinsic <[],[llvm_anyptr_ty, llvm_ptr_ty],
-                                    [NoCapture<0>]>;
+                                    [NoCapture<ArgIndex<0>>]>;
   def int_xcore_initlr : Intrinsic <[],[llvm_anyptr_ty, llvm_ptr_ty],
-                                    [NoCapture<0>]>;
+                                    [NoCapture<ArgIndex<0>>]>;
   def int_xcore_initcp : Intrinsic <[],[llvm_anyptr_ty, llvm_ptr_ty],
-                                    [NoCapture<0>]>;
+                                    [NoCapture<ArgIndex<0>>]>;
   def int_xcore_initdp : Intrinsic <[],[llvm_anyptr_ty, llvm_ptr_ty],
-                                    [NoCapture<0>]>;
+                                    [NoCapture<ArgIndex<0>>]>;
 }
diff --git a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td b/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td
index afc967f3f78dc..0a12cc0bf8560 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td
+++ b/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td
@@ -11,8 +11,8 @@ def gi_shiftl_1 : GICustomOperandRenderer<"renderShiftImml1">,
   GISDNodeXFormEquiv<shiftl_1>;
 
 
-def int_mytarget_sleep : Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;
-def int_mytarget_foo : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg<1>, IntrNoMem]>;
+def int_mytarget_sleep : Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+def int_mytarget_foo : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<1>>, IntrNoMem]>;
 
 
 def SLEEP : I<(outs), (ins i32imm:$src0), []>;
diff --git a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td b/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td
index a87e46a837347..2f39bf49af4d5 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td
+++ b/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td
@@ -3,7 +3,7 @@
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
 
-def int_mytarget_sleep : Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;
+def int_mytarget_sleep : Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
 def G_TGT_CAT : MyTargetGenericInstruction {
   let OutOperandList = (outs type0:$dst);
diff --git a/llvm/test/TableGen/immarg.td b/llvm/test/TableGen/immarg.td
index 407f06c3a40ec..c6f03cad137fa 100644
--- a/llvm/test/TableGen/immarg.td
+++ b/llvm/test/TableGen/immarg.td
@@ -4,8 +4,8 @@ include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
 
 let TargetPrefix = "mytarget" in {
-def int_mytarget_sleep0 : Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;
-def int_mytarget_sleep1 : Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;
+def int_mytarget_sleep0 : Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+def int_mytarget_sleep1 : Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 }
 
 // GISEL: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS,
diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h
index 824bb944753bf..5ebdbf995ebf6 100644
--- a/llvm/utils/TableGen/CodeGenIntrinsics.h
+++ b/llvm/utils/TableGen/CodeGenIntrinsics.h
@@ -142,7 +142,7 @@ struct CodeGenIntrinsic {
   // True if the intrinsic is marked as speculatable.
   bool isSpeculatable;
 
-  enum ArgAttribute {
+  enum ArgAttrKind {
     NoCapture,
     NoAlias,
     Returned,
@@ -152,7 +152,19 @@ struct CodeGenIntrinsic {
     ImmArg
   };
 
-  std::vector<std::pair<unsigned, ArgAttribute>> ArgumentAttributes;
+  struct ArgAttribute {
+    unsigned Index;
+    ArgAttrKind Kind;
+
+    ArgAttribute(unsigned Idx, ArgAttrKind K)
+        : Index(Idx), Kind(K) {}
+
+    bool operator<(const ArgAttribute &Other) const {
+      return std::tie(Index, Kind) < std::tie(Other.Index, Other.Kind);
+    }
+  };
+
+  std::vector<ArgAttribute> ArgumentAttributes;
 
   bool hasProperty(enum SDNP Prop) const {
     return Properties & (1 << Prop);
diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp
index 282e62cf838e0..35d5deecf32cb 100644
--- a/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -795,25 +795,25 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
       hasSideEffects = true;
     else if (Property->isSubClassOf("NoCapture")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.push_back(std::make_pair(ArgNo, NoCapture));
+      ArgumentAttributes.emplace_back(ArgNo, NoCapture);
     } else if (Property->isSubClassOf("NoAlias")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.push_back(std::make_pair(ArgNo, NoAlias));
+      ArgumentAttributes.emplace_back(ArgNo, NoAlias);
     } else if (Property->isSubClassOf("Returned")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.push_back(std::make_pair(ArgNo, Returned));
+      ArgumentAttributes.emplace_back(ArgNo, Returned);
     } else if (Property->isSubClassOf("ReadOnly")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.push_back(std::make_pair(ArgNo, ReadOnly));
+      ArgumentAttributes.emplace_back(ArgNo, ReadOnly);
     } else if (Property->isSubClassOf("WriteOnly")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.push_back(std::make_pair(ArgNo, WriteOnly));
+      ArgumentAttributes.emplace_back(ArgNo, WriteOnly);
     } else if (Property->isSubClassOf("ReadNone")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.push_back(std::make_pair(ArgNo, ReadNone));
+      ArgumentAttributes.emplace_back(ArgNo, ReadNone);
     } else if (Property->isSubClassOf("ImmArg")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.push_back(std::make_pair(ArgNo, ImmArg));
+      ArgumentAttributes.emplace_back(ArgNo, ImmArg);
     } else
       llvm_unreachable("Unknown property!");
   }
@@ -833,7 +833,8 @@ bool CodeGenIntrinsic::isParamAPointer(unsigned ParamIdx) const {
 }
 
 bool CodeGenIntrinsic::isParamImmArg(unsigned ParamIdx) const {
-  std::pair<unsigned, ArgAttribute> Val = {ParamIdx, ImmArg};
+  // Convert argument index to attribute index starting from `FirstArgIndex`.
+  ArgAttribute Val{ParamIdx + 1, ImmArg};
   return std::binary_search(ArgumentAttributes.begin(),
                             ArgumentAttributes.end(), Val);
 }
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index f05fd9fd39fe2..0480a838ea6cd 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -663,14 +663,13 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
     unsigned ai = 0, ae = intrinsic.ArgumentAttributes.size();
     if (ae) {
       while (ai != ae) {
-        unsigned argNo = intrinsic.ArgumentAttributes[ai].first;
-        unsigned attrIdx = argNo + 1; // Must match AttributeList::FirstArgIndex
+        unsigned attrIdx = intrinsic.ArgumentAttributes[ai].Index;
 
         OS << "      const Attribute::AttrKind AttrParam" << attrIdx << "[]= {";
         bool addComma = false;
 
         do {
-          switch (intrinsic.ArgumentAttributes[ai].second) {
+          switch (intrinsic.ArgumentAttributes[ai].Kind) {
           case CodeGenIntrinsic::NoCapture:
             if (addComma)
               OS << ",";
@@ -716,7 +715,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
           }
 
           ++ai;
-        } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo);
+        } while (ai != ae && intrinsic.ArgumentAttributes[ai].Index == attrIdx);
         OS << "};\n";
         OS << "      AS[" << numAttrs++ << "] = AttributeList::get(C, "
            << attrIdx << ", AttrParam" << attrIdx << ");\n";

From fa342b5c8054dad4cfd1032ac580d71f0f4943d3 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Wed, 27 May 2020 16:02:15 -0400
Subject: [PATCH 278/770] Enable `align <n>` to be used in the intrinsic
 definition.

- This allows us to specify the (minimal) alignment on an intrinsic's
  arguments and, more importantly, the return value.

Differential Revision: https://reviews.llvm.org/D80422
---
 llvm/include/llvm/IR/Attributes.h             |  3 ++
 llvm/include/llvm/IR/Intrinsics.td            |  5 ++++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 10 +++----
 llvm/lib/IR/Attributes.cpp                    | 11 +++++++
 .../CodeGen/AMDGPU/reqd-work-group-size.ll    | 15 +++++++++-
 llvm/utils/TableGen/CodeGenIntrinsics.h       | 11 ++++---
 llvm/utils/TableGen/CodeGenTarget.cpp         | 20 ++++++++-----
 llvm/utils/TableGen/IntrinsicEmitter.cpp      | 30 ++++++++++++++++++-
 8 files changed, 86 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
index a76e29132accc..58365aa2b7645 100644
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -396,6 +396,9 @@ class AttributeList {
   static AttributeList get(LLVMContext &C, ArrayRef<AttributeList> Attrs);
   static AttributeList get(LLVMContext &C, unsigned Index,
                            ArrayRef<Attribute::AttrKind> Kinds);
+  static AttributeList get(LLVMContext &C, unsigned Index,
+                           ArrayRef<Attribute::AttrKind> Kinds,
+                           ArrayRef<uint64_t> Values);
   static AttributeList get(LLVMContext &C, unsigned Index,
                            ArrayRef<StringRef> Kind);
   static AttributeList get(LLVMContext &C, unsigned Index,
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index f6df3faba83f5..78409df8f816a 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -81,6 +81,11 @@ class NoAlias<AttrIndex idx> : IntrinsicProperty {
   int ArgNo = idx.Value;
 }
 
+class Align<AttrIndex idx, int align> : IntrinsicProperty {
+  int ArgNo = idx.Value;
+  int Align = align;
+}
+
 // Returned - The specified argument is always the return value of the
 // intrinsic.
 class Returned<AttrIndex idx> : IntrinsicProperty {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 132d6b7360f74..e2d8f3cb1bd60 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -142,22 +142,22 @@ defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
 
 def int_amdgcn_dispatch_ptr :
   Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
-  [IntrNoMem, IntrSpeculatable]>;
+  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_queue_ptr :
   GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
   Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
-  [IntrNoMem, IntrSpeculatable]>;
+  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_kernarg_segment_ptr :
   GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
   Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
-  [IntrNoMem, IntrSpeculatable]>;
+  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_implicitarg_ptr :
   GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
   Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
-  [IntrNoMem, IntrSpeculatable]>;
+  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_groupstaticsize :
   GCCBuiltin<"__builtin_amdgcn_groupstaticsize">,
@@ -170,7 +170,7 @@ def int_amdgcn_dispatch_id :
 def int_amdgcn_implicit_buffer_ptr :
   GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
   Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
-  [IntrNoMem, IntrSpeculatable]>;
+  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
 
 // Set EXEC to the 64-bit value given.
 // This is always moved to the beginning of the basic block.
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 122cfe5d5fcab..191668dacc188 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -1174,6 +1174,17 @@ AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
   return get(C, Attrs);
 }
 
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+                                 ArrayRef<Attribute::AttrKind> Kinds,
+                                 ArrayRef<uint64_t> Values) {
+  assert(Kinds.size() == Values.size() && "Mismatched attribute values.");
+  SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
+  auto VI = Values.begin();
+  for (const auto K : Kinds)
+    Attrs.emplace_back(Index, Attribute::get(C, K, *VI++));
+  return get(C, Attrs);
+}
+
 AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
                                  ArrayRef<StringRef> Kinds) {
   SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
index 1903f89789b16..1e69d4551359c 100644
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -390,7 +390,7 @@ define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture readnone %
 ; CHECK-LABEL: @partial_load_group_size_x(
 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
+; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 4
 ; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
 define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
@@ -400,6 +400,19 @@ define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !
   ret void
 }
 
+; CHECK-LABEL: @partial_load_group_size_x_explicit_callsite_align(
+; CHECK-NEXT: %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
+; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 2
+; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
+define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
+  %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
+  store i8 %group.size.x.lo, i8 addrspace(1)* %out
+  ret void
+}
+
 ; TODO: Should be able to handle this
 ; CHECK-LABEL: @load_group_size_xy_i32(
 ; CHECK: %group.size.xy = load i32,
diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h
index 5ebdbf995ebf6..6503f39cfd8ea 100644
--- a/llvm/utils/TableGen/CodeGenIntrinsics.h
+++ b/llvm/utils/TableGen/CodeGenIntrinsics.h
@@ -149,18 +149,21 @@ struct CodeGenIntrinsic {
     ReadOnly,
     WriteOnly,
     ReadNone,
-    ImmArg
+    ImmArg,
+    Alignment
   };
 
   struct ArgAttribute {
     unsigned Index;
     ArgAttrKind Kind;
+    uint64_t Value;
 
-    ArgAttribute(unsigned Idx, ArgAttrKind K)
-        : Index(Idx), Kind(K) {}
+    ArgAttribute(unsigned Idx, ArgAttrKind K, uint64_t V)
+        : Index(Idx), Kind(K), Value(V) {}
 
     bool operator<(const ArgAttribute &Other) const {
-      return std::tie(Index, Kind) < std::tie(Other.Index, Other.Kind);
+      return std::tie(Index, Kind, Value) <
+             std::tie(Other.Index, Other.Kind, Other.Value);
     }
   };
 
diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp
index 35d5deecf32cb..78fb732877013 100644
--- a/llvm/utils/TableGen/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/CodeGenTarget.cpp
@@ -795,25 +795,29 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
       hasSideEffects = true;
     else if (Property->isSubClassOf("NoCapture")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.emplace_back(ArgNo, NoCapture);
+      ArgumentAttributes.emplace_back(ArgNo, NoCapture, 0);
     } else if (Property->isSubClassOf("NoAlias")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.emplace_back(ArgNo, NoAlias);
+      ArgumentAttributes.emplace_back(ArgNo, NoAlias, 0);
     } else if (Property->isSubClassOf("Returned")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.emplace_back(ArgNo, Returned);
+      ArgumentAttributes.emplace_back(ArgNo, Returned, 0);
     } else if (Property->isSubClassOf("ReadOnly")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.emplace_back(ArgNo, ReadOnly);
+      ArgumentAttributes.emplace_back(ArgNo, ReadOnly, 0);
     } else if (Property->isSubClassOf("WriteOnly")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.emplace_back(ArgNo, WriteOnly);
+      ArgumentAttributes.emplace_back(ArgNo, WriteOnly, 0);
     } else if (Property->isSubClassOf("ReadNone")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.emplace_back(ArgNo, ReadNone);
+      ArgumentAttributes.emplace_back(ArgNo, ReadNone, 0);
     } else if (Property->isSubClassOf("ImmArg")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
-      ArgumentAttributes.emplace_back(ArgNo, ImmArg);
+      ArgumentAttributes.emplace_back(ArgNo, ImmArg, 0);
+    } else if (Property->isSubClassOf("Align")) {
+      unsigned ArgNo = Property->getValueAsInt("ArgNo");
+      uint64_t Align = Property->getValueAsInt("Align");
+      ArgumentAttributes.emplace_back(ArgNo, Alignment, Align);
     } else
       llvm_unreachable("Unknown property!");
   }
@@ -834,7 +838,7 @@ bool CodeGenIntrinsic::isParamAPointer(unsigned ParamIdx) const {
 
 bool CodeGenIntrinsic::isParamImmArg(unsigned ParamIdx) const {
   // Convert argument index to attribute index starting from `FirstArgIndex`.
-  ArgAttribute Val{ParamIdx + 1, ImmArg};
+  ArgAttribute Val{ParamIdx + 1, ImmArg, 0};
   return std::binary_search(ArgumentAttributes.begin(),
                             ArgumentAttributes.end(), Val);
 }
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 0480a838ea6cd..ab42f33cf23fd 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -668,6 +668,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
         OS << "      const Attribute::AttrKind AttrParam" << attrIdx << "[]= {";
         bool addComma = false;
 
+        bool AllValuesAreZero = true;
+        SmallVector<uint64_t, 8> Values;
         do {
           switch (intrinsic.ArgumentAttributes[ai].Kind) {
           case CodeGenIntrinsic::NoCapture:
@@ -712,13 +714,39 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
             OS << "Attribute::ImmArg";
             addComma = true;
             break;
+          case CodeGenIntrinsic::Alignment:
+            if (addComma)
+              OS << ',';
+            OS << "Attribute::Alignment";
+            addComma = true;
+            break;
           }
+          uint64_t V = intrinsic.ArgumentAttributes[ai].Value;
+          Values.push_back(V);
+          AllValuesAreZero &= (V == 0);
 
           ++ai;
         } while (ai != ae && intrinsic.ArgumentAttributes[ai].Index == attrIdx);
         OS << "};\n";
+
+        // Generate attribute value array if not all attribute values are zero.
+        if (!AllValuesAreZero) {
+          OS << "      const uint64_t AttrValParam" << attrIdx << "[]= {";
+          addComma = false;
+          for (const auto V : Values) {
+            if (addComma)
+              OS << ',';
+            OS << V;
+            addComma = true;
+          }
+          OS << "};\n";
+        }
+
         OS << "      AS[" << numAttrs++ << "] = AttributeList::get(C, "
-           << attrIdx << ", AttrParam" << attrIdx << ");\n";
+           << attrIdx << ", AttrParam" << attrIdx;
+        if (!AllValuesAreZero)
+          OS << ", AttrValParam" << attrIdx;
+        OS << ");\n";
       }
     }
 

From 98ef93eabd768e51aa58c7623a9fe220ab471715 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Tue, 26 May 2020 12:07:08 -0700
Subject: [PATCH 279/770] [llvm] Add function feature extraction analysis

Summary:
This patch introduces an analysis pass to extract function features,
which will be needed by the ML InlineAdvisor.

RFC: http://lists.llvm.org/pipermail/llvm-dev/2020-April/140763.html

Reviewers: davidxl, dblaikie, jdoerfert

Subscribers: mgorny, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80579
---
 .../llvm/Analysis/ML/InlineFeaturesAnalysis.h | 37 +++++++++
 llvm/lib/Analysis/CMakeLists.txt              |  7 ++
 llvm/lib/Analysis/ML/CMakeLists.txt           |  9 +++
 .../Analysis/ML/InlineFeaturesAnalysis.cpp    | 28 +++++++
 llvm/lib/Passes/PassBuilder.cpp               |  1 +
 llvm/lib/Passes/PassRegistry.def              |  1 +
 llvm/unittests/Analysis/CMakeLists.txt        |  2 +
 llvm/unittests/Analysis/ML/CMakeLists.txt     | 12 +++
 .../ML/InlineFeaturesAnalysisTest.cpp         | 77 +++++++++++++++++++
 9 files changed, 174 insertions(+)
 create mode 100644 llvm/include/llvm/Analysis/ML/InlineFeaturesAnalysis.h
 create mode 100644 llvm/lib/Analysis/ML/CMakeLists.txt
 create mode 100644 llvm/lib/Analysis/ML/InlineFeaturesAnalysis.cpp
 create mode 100644 llvm/unittests/Analysis/ML/CMakeLists.txt
 create mode 100644 llvm/unittests/Analysis/ML/InlineFeaturesAnalysisTest.cpp

diff --git a/llvm/include/llvm/Analysis/ML/InlineFeaturesAnalysis.h b/llvm/include/llvm/Analysis/ML/InlineFeaturesAnalysis.h
new file mode 100644
index 0000000000000..694cae34bc75e
--- /dev/null
+++ b/llvm/include/llvm/Analysis/ML/InlineFeaturesAnalysis.h
@@ -0,0 +1,37 @@
+#ifndef LLVM_INLINEFEATURESANALYSIS_H_
+#define LLVM_INLINEFEATURESANALYSIS_H_
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Function;
+
+class InlineFeaturesAnalysis
+    : public AnalysisInfoMixin<InlineFeaturesAnalysis> {
+public:
+  static AnalysisKey Key;
+  struct Result {
+    /// Number of basic blocks
+    int64_t BasicBlockCount = 0;
+
+    /// Number of blocks reached from a conditional instruction, or that are
+    /// 'cases' of a SwitchInst.
+    // FIXME: We may want to replace this with a more meaningful metric, like
+    // number of conditionally executed blocks:
+    // 'if (a) s();' would be counted here as 2 blocks, just like
+    // 'if (a) s(); else s2(); s3();' would.
+    int64_t BlocksReachedFromConditionalInstruction = 0;
+
+    /// Number of uses of this function, plus 1 if the function is callable
+    /// outside the module.
+    int64_t Uses = 0;
+
+    /// Number of direct calls made from this function to other functions
+    /// defined in this module.
+    int64_t DirectCallsToDefinedFunctions = 0;
+  };
+  Result run(const Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+#endif // LLVM_INLINEFEATURESANALYSIS_H_
\ No newline at end of file
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 7ae053f59d1a0..8fa832faec4ce 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(ML)
+
 add_llvm_component_library(LLVMAnalysis
   AliasAnalysis.cpp
   AliasAnalysisEvaluator.cpp
@@ -106,4 +108,9 @@ add_llvm_component_library(LLVMAnalysis
 
   DEPENDS
   intrinsics_gen
+  LLVMMLPolicies
+
+
+  LINK_LIBS
+  LLVMMLPolicies
   )
diff --git a/llvm/lib/Analysis/ML/CMakeLists.txt b/llvm/lib/Analysis/ML/CMakeLists.txt
new file mode 100644
index 0000000000000..05039a95ba73b
--- /dev/null
+++ b/llvm/lib/Analysis/ML/CMakeLists.txt
@@ -0,0 +1,9 @@
+set (SOURCES InlineFeaturesAnalysis.cpp)
+
+add_llvm_library(LLVMMLPolicies STATIC
+  ${SOURCES}
+
+  DEPENDS
+  intrinsics_gen
+
+  )
diff --git a/llvm/lib/Analysis/ML/InlineFeaturesAnalysis.cpp b/llvm/lib/Analysis/ML/InlineFeaturesAnalysis.cpp
new file mode 100644
index 0000000000000..d81e9b3aaf62d
--- /dev/null
+++ b/llvm/lib/Analysis/ML/InlineFeaturesAnalysis.cpp
@@ -0,0 +1,28 @@
+#include "llvm/Analysis/ML/InlineFeaturesAnalysis.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+AnalysisKey InlineFeaturesAnalysis::Key;
+
+InlineFeaturesAnalysis::Result
+InlineFeaturesAnalysis::run(const Function &F, FunctionAnalysisManager &FAM) {
+  Result Ret;
+  Ret.Uses = ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses();
+  for (const auto &BB : F) {
+    ++Ret.BasicBlockCount;
+    if (const auto *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+      if (BI->isConditional())
+        Ret.BlocksReachedFromConditionalInstruction += BI->getNumSuccessors();
+    } else if (const auto *SI = dyn_cast<SwitchInst>(BB.getTerminator()))
+      Ret.BlocksReachedFromConditionalInstruction +=
+          (SI->getNumCases() + (nullptr != SI->getDefaultDest()));
+    for (const auto &I : BB)
+      if (auto *CS = dyn_cast<CallBase>(&I)) {
+        const auto *Callee = CS->getCalledFunction();
+        if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration())
+          ++Ret.DirectCallsToDefinedFunctions;
+      }
+  }
+  return Ret;
+}
\ No newline at end of file
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 014ef836e2c30..0999f7872d12c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopNestAnalysis.h"
+#include "llvm/Analysis/ML/InlineFeaturesAnalysis.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 03c4379f2468b..dd75a418925bc 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -134,6 +134,7 @@ FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis())
 FUNCTION_ANALYSIS("loops", LoopAnalysis())
 FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis())
 FUNCTION_ANALYSIS("da", DependenceAnalysis())
+FUNCTION_ANALYSIS("inliner-features", InlineFeaturesAnalysis())
 FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis())
 FUNCTION_ANALYSIS("memoryssa", MemorySSAAnalysis())
 FUNCTION_ANALYSIS("phi-values", PhiValuesAnalysis())
diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt
index f344d6c7bc25d..6cc14d124b153 100644
--- a/llvm/unittests/Analysis/CMakeLists.txt
+++ b/llvm/unittests/Analysis/CMakeLists.txt
@@ -41,3 +41,5 @@ add_llvm_unittest(AnalysisTests
   ValueTrackingTest.cpp
   VectorUtilsTest.cpp
   )
+
+add_subdirectory(ML)
\ No newline at end of file
diff --git a/llvm/unittests/Analysis/ML/CMakeLists.txt b/llvm/unittests/Analysis/ML/CMakeLists.txt
new file mode 100644
index 0000000000000..8d1c90312ad0b
--- /dev/null
+++ b/llvm/unittests/Analysis/ML/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+  Analysis  
+  AsmParser
+  Core
+  MLPolicies
+  Support
+  TransformUtils
+  )
+
+add_llvm_unittest(MLAnalysisTests
+  InlineFeaturesAnalysisTest.cpp
+  )
diff --git a/llvm/unittests/Analysis/ML/InlineFeaturesAnalysisTest.cpp b/llvm/unittests/Analysis/ML/InlineFeaturesAnalysisTest.cpp
new file mode 100644
index 0000000000000..4dfc0bd153f71
--- /dev/null
+++ b/llvm/unittests/Analysis/ML/InlineFeaturesAnalysisTest.cpp
@@ -0,0 +1,77 @@
+//===- InlineFeaturesAnalysisTest.cpp - inline features unit tests --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ML/InlineFeaturesAnalysis.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
+  SMDiagnostic Err;
+  std::unique_ptr<Module> Mod = parseAssemblyString(IR, Err, C);
+  if (!Mod)
+    Err.print("MLAnalysisTests", errs());
+  return Mod;
+}
+
+TEST(InlineFeaturesTest, BasicTest) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C,
+                                      R"IR(
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare i32 @f1(i32)
+declare i32 @f2(i32)
+
+define i32 @branches(i32) {
+  %cond = icmp slt i32 %0, 3
+  br i1 %cond, label %then, label %else
+
+then:
+  %ret.1 = call i32 @f1(i32 %0)
+  br label %last.block
+
+else:
+  %ret.2 = call i32 @f2(i32 %0)
+  br label %last.block
+
+last.block:
+  %ret = phi i32 [%ret.1, %then], [%ret.2, %else]
+  ret i32 %ret
+}
+
+define internal i32 @top() {
+  %1 = call i32 @branches(i32 2)
+  %2 = call i32 @f1(i32 %1)
+  ret i32 %2
+}
+)IR");
+
+  FunctionAnalysisManager FAM;
+  InlineFeaturesAnalysis FA;
+
+  auto BranchesFeatures = FA.run(*M->getFunction("branches"), FAM);
+  EXPECT_EQ(BranchesFeatures.BasicBlockCount, 4);
+  EXPECT_EQ(BranchesFeatures.BlocksReachedFromConditionalInstruction, 2);
+  EXPECT_EQ(BranchesFeatures.DirectCallsToDefinedFunctions, 0);
+  // 2 Users: top is one. The other is added because @branches is not internal,
+  // so it may have external callers.
+  EXPECT_EQ(BranchesFeatures.Uses, 2);
+
+  auto TopFeatures = FA.run(*M->getFunction("top"), FAM);
+  EXPECT_EQ(TopFeatures.BasicBlockCount, 1);
+  EXPECT_EQ(TopFeatures.BlocksReachedFromConditionalInstruction, 0);
+  EXPECT_EQ(TopFeatures.DirectCallsToDefinedFunctions, 1);
+  EXPECT_EQ(TopFeatures.Uses, 0);
+}

From 9546d8b108dce03e03e0448cebbca5fa0fe4be21 Mon Sep 17 00:00:00 2001
From: Sean Silva <silvasean@google.com>
Date: Tue, 26 May 2020 16:44:20 -0700
Subject: [PATCH 280/770] [mlir][core] Add IndexElementsAttr helpers.

Summary:
In a follow-up, I'll update the Shape dialect to use this instead of
I64ElementsAttr.

Differential Revision: https://reviews.llvm.org/D80601
---
 mlir/include/mlir/IR/Builders.h       |  1 +
 mlir/include/mlir/IR/OpBase.td        |  7 +++++++
 mlir/lib/IR/Attributes.cpp            |  2 ++
 mlir/lib/IR/Builders.cpp              |  7 +++++++
 mlir/test/lib/Dialect/Test/TestOps.td |  4 ++++
 mlir/test/mlir-tblgen/types.mlir      | 15 +++++++++++++++
 6 files changed, 36 insertions(+)

diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 4ade6bb1e4390..424eb980cd33a 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -128,6 +128,7 @@ class Builder {
   /// as attributes.
   DenseIntElementsAttr getI32TensorAttr(ArrayRef<int32_t> values);
   DenseIntElementsAttr getI64TensorAttr(ArrayRef<int64_t> values);
+  DenseIntElementsAttr getIndexTensorAttr(ArrayRef<int64_t> values);
 
   ArrayAttr getAffineMapArrayAttr(ArrayRef<AffineMap> values);
   ArrayAttr getBoolArrayAttr(ArrayRef<bool> values);
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index 6a7542c7127c0..5ffb1727ee353 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -1218,6 +1218,13 @@ class IntElementsAttrBase<Pred condition, string description> :
   let convertFromStorage = "$_self";
 }
 
+def IndexElementsAttr
+    : IntElementsAttrBase<CPred<[{$_self.cast<DenseIntElementsAttr>()
+                                      .getType()
+                                      .getElementType()
+                                      .isIndex()}]>,
+                          "index elements attribute">;
+
 class AnyIntElementsAttr<int width> : IntElementsAttrBase<
   CPred<"$_self.cast<DenseIntElementsAttr>().getType()."
         "getElementType().isInteger(" # width # ")">,
diff --git a/mlir/lib/IR/Attributes.cpp b/mlir/lib/IR/Attributes.cpp
index 540c3c6258e29..12fd08787fa75 100644
--- a/mlir/lib/IR/Attributes.cpp
+++ b/mlir/lib/IR/Attributes.cpp
@@ -624,6 +624,8 @@ Attribute DenseElementsAttr::AttributeElementIterator::operator*() const {
                            owner.getContext());
     return IntegerAttr::get(eltTy, *IntElementIterator(owner, index));
   }
+  if (eltTy.isa<IndexType>())
+    return IntegerAttr::get(eltTy, *IntElementIterator(owner, index));
   if (auto floatEltTy = eltTy.dyn_cast<FloatType>()) {
     IntElementIterator intIt(owner, index);
     FloatElementIterator floatIt(floatEltTy.getFloatSemantics(), intIt);
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index a72e03c739e3b..064889724f092 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -130,6 +130,13 @@ DenseIntElementsAttr Builder::getI64TensorAttr(ArrayRef<int64_t> values) {
       values);
 }
 
+DenseIntElementsAttr Builder::getIndexTensorAttr(ArrayRef<int64_t> values) {
+  return DenseIntElementsAttr::get(
+      RankedTensorType::get(static_cast<int64_t>(values.size()),
+                            getIndexType()),
+      values);
+}
+
 IntegerAttr Builder::getI32IntegerAttr(int32_t value) {
   return IntegerAttr::get(getIntegerType(32), APInt(32, value));
 }
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 997d8eb44ae59..8e5b380dff452 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -454,6 +454,10 @@ def I32ElementsAttrOp : TEST_Op<"i32ElementsAttr"> {
   let arguments = (ins I32ElementsAttr:$attr);
 }
 
+def IndexElementsAttrOp : TEST_Op<"indexElementsAttr"> {
+  let arguments = (ins IndexElementsAttr:$attr);
+}
+
 def OpWithInferTypeInterfaceOp : TEST_Op<"op_with_infer_type_if", [
     DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
   let arguments = (ins AnyTensor, AnyTensor);
diff --git a/mlir/test/mlir-tblgen/types.mlir b/mlir/test/mlir-tblgen/types.mlir
index 6a0a80ca5e5fc..5e4dac33012b9 100644
--- a/mlir/test/mlir-tblgen/types.mlir
+++ b/mlir/test/mlir-tblgen/types.mlir
@@ -489,3 +489,18 @@ func @elements_attr_i32(%arg0: tensor<1x2xi32>) {
   "test.i32ElementsAttr"() {attr = dense<[1, 2]>:tensor<2xi32>} : () -> ()
   return
 }
+
+// -----
+
+func @elements_attr_index() {
+  "test.indexElementsAttr"() {attr = dense<[1, 2]>:tensor<2xindex>} : () -> ()
+  return
+}
+
+// -----
+
+func @elements_attr_not_index() {
+  // expected-error@+1 {{index elements attribute}}
+  "test.indexElementsAttr"() {attr = dense<[1, 2]>:tensor<2xi32>} : () -> ()
+  return
+}

From 25132b36a8b39e7c2b0b28aa73772e57191b6df4 Mon Sep 17 00:00:00 2001
From: Sean Silva <silvasean@google.com>
Date: Tue, 26 May 2020 16:45:32 -0700
Subject: [PATCH 281/770] [mlir][shape] Use IndexElementsAttr in Shape dialect.

Summary:
Index is the proper type for storing shapes when constant folding, so
this fixes the previous code (which was using i64).

Differential Revision: https://reviews.llvm.org/D80600
---
 .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 11 +++------
 mlir/lib/Dialect/Shape/IR/Shape.cpp           | 23 +++++++------------
 mlir/test/Dialect/Shape/canonicalize.mlir     |  6 ++---
 3 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index a9759fc6a7343..406aac2db99a2 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -102,7 +102,7 @@ def Shape_ConstShapeOp : Shape_Op<"const_shape", [ConstantLike, NoSideEffect]> {
     %1 = shape.const_shape [1, 2, 3]
     ```
   }];
-  let arguments = (ins I64ElementsAttr:$shape);
+  let arguments = (ins IndexElementsAttr:$shape);
   let results = (outs Shape_ShapeType:$result);
 
   // TODO: Move this to main so that all shape ops implement these.
@@ -206,13 +206,8 @@ def Shape_GetExtentOp : Shape_Op<"get_extent",
   let builders = [
     // Builder that allows passing a simple integer instead of an IntegerAttr.
     OpBuilder<
-      [{
-        OpBuilder &builder, OperationState &result,
-        Value shape, int64_t dim
-      }],
-      [{
-        build(builder, result, shape, builder.getI64IntegerAttr(dim));
-      }]
+      [{OpBuilder &builder, OperationState &result, Value shape, int64_t dim}],
+      [{build(builder, result, shape, builder.getI64IntegerAttr(dim));}]
     >
   ];
 
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index fa9552fc86945..c4a8b15298171 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -177,7 +177,7 @@ OpFoldResult BroadcastOp::fold(ArrayRef<Attribute> operands) {
   if (!OpTrait::util::getBroadcastedShape(lhsShape, rhsShape, resultShape))
     return nullptr;
   Builder builder(getContext());
-  return builder.getI64TensorAttr(resultShape);
+  return builder.getIndexTensorAttr(resultShape);
 }
 
 //===----------------------------------------------------------------------===//
@@ -215,7 +215,7 @@ static ParseResult parseConstShapeOp(OpAsmParser &parser,
     ints.push_back(attr.getInt());
   }
   Builder &builder = parser.getBuilder();
-  result.addAttribute("shape", builder.getI64TensorAttr(ints));
+  result.addAttribute("shape", builder.getIndexTensorAttr(ints));
 
   result.types.push_back(ShapeType::get(builder.getContext()));
   return success();
@@ -257,7 +257,7 @@ OpFoldResult FromExtentsOp::fold(ArrayRef<Attribute> operands) {
   for (auto attr : operands)
     extents.push_back(attr.cast<IntegerAttr>().getInt());
   Builder builder(getContext());
-  return builder.getI64TensorAttr(extents);
+  return builder.getIndexTensorAttr(extents);
 }
 
 //===----------------------------------------------------------------------===//
@@ -281,14 +281,7 @@ OpFoldResult GetExtentOp::fold(ArrayRef<Attribute> operands) {
   // TODO: Constant fold this to some kind of constant error.
   if (dimToGet >= (uint64_t)elements.getNumElements())
     return nullptr;
-  // This is a little inconvenient because getValue returns an IntegerAttr
-  // that is not of IndexType, but the result here needs to be of
-  // IndexType.
-  // TODO: Make ConstShapeOp hold an tensor of index instead of i64.
-  Builder builder(getContext());
-  return builder.getIntegerAttr(
-      builder.getIndexType(),
-      elements.getValue<IntegerAttr>({dimToGet}).getInt());
+  return elements.getValue({dimToGet});
 }
 
 //===----------------------------------------------------------------------===//
@@ -309,7 +302,7 @@ OpFoldResult ShapeOfOp::fold(ArrayRef<Attribute>) {
   if (!type || !type.hasStaticShape())
     return nullptr;
   Builder builder(getContext());
-  return builder.getI64TensorAttr(type.getShape());
+  return builder.getIndexTensorAttr(type.getShape());
 }
 
 //===----------------------------------------------------------------------===//
@@ -343,8 +336,8 @@ LogicalResult SplitAtOp::fold(ArrayRef<Attribute> operands,
   if (splitPoint < 0)
     splitPoint += shape.size();
   Builder builder(operands[0].getContext());
-  results.push_back(builder.getI64TensorAttr(shape.take_front(splitPoint)));
-  results.push_back(builder.getI64TensorAttr(shape.drop_front(splitPoint)));
+  results.push_back(builder.getIndexTensorAttr(shape.take_front(splitPoint)));
+  results.push_back(builder.getIndexTensorAttr(shape.drop_front(splitPoint)));
   return success();
 }
 
@@ -373,7 +366,7 @@ OpFoldResult ConcatOp::fold(ArrayRef<Attribute> operands) {
   resultShape.append(lhsShape.begin(), lhsShape.end());
   resultShape.append(rhsShape.begin(), rhsShape.end());
   Builder builder(getContext());
-  return builder.getI64TensorAttr(resultShape);
+  return builder.getIndexTensorAttr(resultShape);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir
index 018f5b212b4e4..23147e557a151 100644
--- a/mlir/test/Dialect/Shape/canonicalize.mlir
+++ b/mlir/test/Dialect/Shape/canonicalize.mlir
@@ -15,7 +15,7 @@ func @f() -> (!shape.shape, !shape.shape) {
   // CHECK: shape.const_shape [2, 3]
   // CHECK: shape.const_shape [4, 5]
   %c2 = constant 2 : i32
-  %0 = "shape.const_shape"() {shape = dense<[2, 3, 4, 5]> : tensor<4xi64>} : () -> !shape.shape
+  %0 = shape.const_shape [2, 3, 4, 5]
   %head, %tail = "shape.split_at"(%0, %c2) : (!shape.shape, i32) -> (!shape.shape, !shape.shape)
   return %head, %tail : !shape.shape, !shape.shape
 
@@ -28,7 +28,7 @@ func @f() -> (!shape.shape, !shape.shape) {
   // CHECK: shape.const_shape [2, 3, 4]
   // CHECK: shape.const_shape [5]
   %c-1 = constant -1 : i32
-  %0 = "shape.const_shape"() {shape = dense<[2, 3, 4, 5]> : tensor<4xi64>} : () -> !shape.shape
+  %0 = shape.const_shape [2, 3, 4, 5]
   %head, %tail = "shape.split_at"(%0, %c-1) : (!shape.shape, i32) -> (!shape.shape, !shape.shape)
   return %head, %tail : !shape.shape, !shape.shape
 }
@@ -39,7 +39,7 @@ func @f() -> (!shape.shape, !shape.shape) {
 func @f() -> (!shape.shape, !shape.shape) {
   // CHECK: shape.split_at
   %c5 = constant 5 : i32
-  %0 = "shape.const_shape"() {shape = dense<[2, 3, 4, 5]> : tensor<4xi64>} : () -> !shape.shape
+  %0 = shape.const_shape [2, 3, 4, 5]
   %head, %tail = "shape.split_at"(%0, %c5) : (!shape.shape, i32) -> (!shape.shape, !shape.shape)
   return %head, %tail : !shape.shape, !shape.shape
 }

From 5f97a540ad8dd4baac47873fa4bdfba2f37e0f82 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 27 May 2020 13:10:19 -0700
Subject: [PATCH 282/770] [lldb/Reproducers] Differentiate "unexpected packet"
 failures during active and passive replay.

---
 .../test/API/commands/command/script/TestCommandScript.py | 2 +-
 .../test/API/commands/expression/issue_11588/Test11588.py | 2 +-
 .../commands/process/attach-resume/TestAttachResume.py    | 2 +-
 .../test/API/commands/process/attach/TestProcessAttach.py | 2 +-
 .../breakpoint/scripted_bkpt/TestScriptedResolver.py      | 2 +-
 .../conditional_break/TestConditionalBreak.py             | 2 +-
 .../gdb_remote_client/TestGDBRemoteClient.py              | 2 +-
 lldb/test/API/functionalities/signal/TestSendSignal.py    | 2 +-
 .../API/functionalities/step_scripted/TestStepScripted.py | 8 ++++----
 lldb/test/API/lang/objc/foundation/TestRuntimeTypes.py    | 2 +-
 lldb/test/API/lang/objc/modules/TestObjCModules.py        | 2 +-
 lldb/test/API/lang/objc/print-obj/TestPrintObj.py         | 2 +-
 lldb/test/API/python_api/hello_world/TestHelloWorld.py    | 2 +-
 13 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/lldb/test/API/commands/command/script/TestCommandScript.py b/lldb/test/API/commands/command/script/TestCommandScript.py
index 6663c36414526..caf97ea8db979 100644
--- a/lldb/test/API/commands/command/script/TestCommandScript.py
+++ b/lldb/test/API/commands/command/script/TestCommandScript.py
@@ -14,7 +14,7 @@ class CmdPythonTestCase(TestBase):
     mydir = TestBase.compute_mydir(__file__)
     NO_DEBUG_INFO_TESTCASE = True
 
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test(self):
         self.build()
         self.pycmd_tests()
diff --git a/lldb/test/API/commands/expression/issue_11588/Test11588.py b/lldb/test/API/commands/expression/issue_11588/Test11588.py
index 8ed7797d5fffe..eb5b86e96363d 100644
--- a/lldb/test/API/commands/expression/issue_11588/Test11588.py
+++ b/lldb/test/API/commands/expression/issue_11588/Test11588.py
@@ -17,7 +17,7 @@ class Issue11581TestCase(TestBase):
     mydir = TestBase.compute_mydir(__file__)
 
     @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24778")
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_11581_commands(self):
         # This is the function to remove the custom commands in order to have a
         # clean slate for the next test case.
diff --git a/lldb/test/API/commands/process/attach-resume/TestAttachResume.py b/lldb/test/API/commands/process/attach-resume/TestAttachResume.py
index ebb4345aca911..48a281e096a93 100644
--- a/lldb/test/API/commands/process/attach-resume/TestAttachResume.py
+++ b/lldb/test/API/commands/process/attach-resume/TestAttachResume.py
@@ -21,7 +21,7 @@ class AttachResumeTestCase(TestBase):
     @expectedFailureAll(oslist=['freebsd'], bugnumber='llvm.org/pr19310')
     @expectedFailureNetBSD
     @skipIfWindows # llvm.org/pr24778, llvm.org/pr21753
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_attach_continue_interrupt_detach(self):
         """Test attach/continue/interrupt/detach"""
         self.build()
diff --git a/lldb/test/API/commands/process/attach/TestProcessAttach.py b/lldb/test/API/commands/process/attach/TestProcessAttach.py
index 792a8cee61f99..f9b273309956c 100644
--- a/lldb/test/API/commands/process/attach/TestProcessAttach.py
+++ b/lldb/test/API/commands/process/attach/TestProcessAttach.py
@@ -39,7 +39,7 @@ def test_attach_to_process_by_id(self):
         self.assertTrue(process, PROCESS_IS_VALID)
 
     @expectedFailureNetBSD
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_attach_to_process_from_different_dir_by_id(self):
         """Test attach by process id"""
         newdir = self.getBuildArtifact("newdir")
diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py
index b08dfc78cea3a..f4bbde755e690 100644
--- a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py
+++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py
@@ -16,7 +16,7 @@ class TestScriptedResolver(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
     @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528")
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_scripted_resolver(self):
         """Use a scripted resolver to set a by symbol name breakpoint"""
         self.build()
diff --git a/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py b/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py
index c1184d22cf153..619d8c9f23940 100644
--- a/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py
+++ b/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py
@@ -26,7 +26,7 @@ def test_with_python(self):
         self.build()
         self.do_conditional_break()
 
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_with_command(self):
         """Simulate a user using lldb commands to break on c() if called from a()."""
         self.build()
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
index 053183b5b5b74..54f1e8a220abf 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
@@ -18,7 +18,7 @@ def test_connect(self):
         process = self.connect(target)
         self.assertPacketLogContains(["qProcessInfo", "qfThreadInfo"])
 
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_attach_fail(self):
         error_msg = "mock-error-msg"
 
diff --git a/lldb/test/API/functionalities/signal/TestSendSignal.py b/lldb/test/API/functionalities/signal/TestSendSignal.py
index 84c41d7def649..d06322794a636 100644
--- a/lldb/test/API/functionalities/signal/TestSendSignal.py
+++ b/lldb/test/API/functionalities/signal/TestSendSignal.py
@@ -23,7 +23,7 @@ def setUp(self):
         bugnumber="llvm.org/pr23318: does not report running state")
     @expectedFailureNetBSD(bugnumber='llvm.org/pr43959')
     @skipIfWindows  # Windows does not support signals
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_with_run_command(self):
         """Test that lldb command 'process signal SIGUSR1' sends a signal to the inferior process."""
         self.build()
diff --git a/lldb/test/API/functionalities/step_scripted/TestStepScripted.py b/lldb/test/API/functionalities/step_scripted/TestStepScripted.py
index 9cc63f46e8069..1e87541960c80 100644
--- a/lldb/test/API/functionalities/step_scripted/TestStepScripted.py
+++ b/lldb/test/API/functionalities/step_scripted/TestStepScripted.py
@@ -18,14 +18,14 @@ def setUp(self):
         self.main_source_file = lldb.SBFileSpec("main.c")
         self.runCmd("command script import Steps.py")
 
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_standard_step_out(self):
         """Tests stepping with the scripted thread plan laying over a standard 
         thread plan for stepping out."""
         self.build()
         self.step_out_with_scripted_plan("Steps.StepOut")
 
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_scripted_step_out(self):
         """Tests stepping with the scripted thread plan laying over an another 
         scripted thread plan for stepping out."""
@@ -65,12 +65,12 @@ def test_misspelled_plan_name(self):
         # Make sure we didn't let the process run:
         self.assertEqual(stop_id, process.GetStopID(), "Process didn't run")
         
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_checking_variable(self):
         """Test that we can call SBValue API's from a scripted thread plan - using SBAPI's to step"""
         self.do_test_checking_variable(False)
         
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_checking_variable_cli(self):
         """Test that we can call SBValue API's from a scripted thread plan - using cli to step"""
         self.do_test_checking_variable(True)
diff --git a/lldb/test/API/lang/objc/foundation/TestRuntimeTypes.py b/lldb/test/API/lang/objc/foundation/TestRuntimeTypes.py
index 7ddaf63f34505..7254f8ec3c6b8 100644
--- a/lldb/test/API/lang/objc/foundation/TestRuntimeTypes.py
+++ b/lldb/test/API/lang/objc/foundation/TestRuntimeTypes.py
@@ -19,7 +19,7 @@ class RuntimeTypesTestCase(TestBase):
         oslist=["macosx"],
         debug_info="gmodules",
         bugnumber="llvm.org/pr27862")
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_break(self):
         """Test setting objc breakpoints using '_regexp-break' and 'breakpoint set'."""
         if self.getArchitecture() != 'x86_64':
diff --git a/lldb/test/API/lang/objc/modules/TestObjCModules.py b/lldb/test/API/lang/objc/modules/TestObjCModules.py
index 30535409a30f0..f6f9111f7641b 100644
--- a/lldb/test/API/lang/objc/modules/TestObjCModules.py
+++ b/lldb/test/API/lang/objc/modules/TestObjCModules.py
@@ -22,7 +22,7 @@ def setUp(self):
 
     @skipUnlessDarwin
     @skipIf(macos_version=["<", "10.12"])
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_expr(self):
         self.build()
         exe = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py
index b908079eefcd8..dc66e788990df 100644
--- a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py
+++ b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py
@@ -24,7 +24,7 @@ def setUp(self):
         # Find the line numbers to break at.
         self.line = line_number(self.source, '// Set a breakpoint here.')
 
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_print_obj(self):
         """
         Test "print object" where another thread blocks the print object from making progress.
diff --git a/lldb/test/API/python_api/hello_world/TestHelloWorld.py b/lldb/test/API/python_api/hello_world/TestHelloWorld.py
index 2d38043bb4504..75a55ab1f44dc 100644
--- a/lldb/test/API/python_api/hello_world/TestHelloWorld.py
+++ b/lldb/test/API/python_api/hello_world/TestHelloWorld.py
@@ -110,7 +110,7 @@ def test_with_attach_to_process_with_id_api(self):
     @skipIfiOSSimulator
     @skipIfAsan # FIXME: Hangs indefinitely.
     @expectedFailureNetBSD
-    @skipIfReproducer # Unexpected packet during replay
+    @skipIfReproducer # FIXME: Unexpected packet during (active) replay
     def test_with_attach_to_process_with_name_api(self):
         """Create target, spawn a process, and attach to it with process name."""
         exe = '%s_%d'%(self.testMethodName, os.getpid())

From f9bea9bc4acf4c412eab4767c31674d0caa60322 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 27 May 2020 13:49:15 -0700
Subject: [PATCH 283/770] [lldb/Reproducers] Skip & add FIXME to tests failing
 with unexpected packet.

Add skip decorator to tests failing with an unexpected packet during
passive replay.
---
 .../expression/unwind_expression/TestUnwindExpression.py        | 1 +
 .../API/functionalities/gdb_remote_client/TestRestartBug.py     | 1 +
 lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py         | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py b/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py
index de883f47f935d..3839f7d89235a 100644
--- a/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py
+++ b/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py
@@ -53,6 +53,7 @@ def test_conditional_bktp(self):
 
     @add_test_categories(['pyapi'])
     @expectedFlakeyNetBSD
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def test_unwind_expression(self):
         """Test unwinding from an expression."""
         self.build_and_run_to_bkpt()
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py b/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py
index 142861a37dff2..f66f58379890d 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py
@@ -8,6 +8,7 @@
 class TestRestartBug(GDBRemoteTestBase):
 
     @expectedFailureAll(bugnumber="llvm.org/pr24530")
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def test(self):
         """
         Test auto-continue behavior when a process is interrupted to deliver
diff --git a/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py b/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py
index 5930ffdc958aa..cc3922ccf9f49 100644
--- a/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py
+++ b/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py
@@ -30,6 +30,7 @@ def setUp(self):
     @skipIf(
         debug_info=no_match("dsym"),
         bugnumber="This test requires a stripped binary and a dSYM")
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def test_expr_stripped(self):
         if self.getArchitecture() == 'i386':
             self.skipTest("requires modern objc runtime")
@@ -38,6 +39,7 @@ def test_expr_stripped(self):
             self.expr(True)
 
     @skipUnlessDarwin
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def test_expr(self):
         if self.getArchitecture() == 'i386':
             self.skipTest("requires modern objc runtime")

From f46bb9dd5ca0b5b553590da5ff177767be0b75b5 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne@apple.com>
Date: Wed, 27 May 2020 16:52:22 -0400
Subject: [PATCH 284/770] [NFC] Reformat TEST_FOO macros in test_macros.h

To make them easier to read and to make it easier to add new ones.
---
 libcxx/test/support/test_macros.h | 66 +++++++++++++++----------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index 6cbb7f0ccc166..79ee5ddace8c8 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -118,41 +118,41 @@
 #endif
 
 #if TEST_STD_VER >= 11
-#define TEST_ALIGNOF(...) alignof(__VA_ARGS__)
-#define TEST_ALIGNAS(...) alignas(__VA_ARGS__)
-#define TEST_CONSTEXPR constexpr
-#define TEST_NOEXCEPT noexcept
-#define TEST_NOEXCEPT_FALSE noexcept(false)
-#define TEST_NOEXCEPT_COND(...) noexcept(__VA_ARGS__)
-# if TEST_STD_VER >= 14
-#   define TEST_CONSTEXPR_CXX14 constexpr
-# else
-#   define TEST_CONSTEXPR_CXX14
-# endif
-# if TEST_STD_VER > 14
-#   define TEST_THROW_SPEC(...)
-# else
-#   define TEST_THROW_SPEC(...) throw(__VA_ARGS__)
-# endif
-# if TEST_STD_VER > 17
-#   define TEST_CONSTEXPR_CXX20 constexpr
-# else
-#   define TEST_CONSTEXPR_CXX20
-# endif
+# define TEST_ALIGNOF(...) alignof(__VA_ARGS__)
+# define TEST_ALIGNAS(...) alignas(__VA_ARGS__)
+# define TEST_CONSTEXPR constexpr
+# define TEST_NOEXCEPT noexcept
+# define TEST_NOEXCEPT_FALSE noexcept(false)
+# define TEST_NOEXCEPT_COND(...) noexcept(__VA_ARGS__)
+#else
+#   if defined(TEST_COMPILER_CLANG)
+#    define TEST_ALIGNOF(...) _Alignof(__VA_ARGS__)
+#   else
+#    define TEST_ALIGNOF(...) __alignof(__VA_ARGS__)
+#   endif
+# define TEST_ALIGNAS(...) __attribute__((__aligned__(__VA_ARGS__)))
+# define TEST_CONSTEXPR
+# define TEST_NOEXCEPT throw()
+# define TEST_NOEXCEPT_FALSE
+# define TEST_NOEXCEPT_COND(...)
+#endif
+
+#if TEST_STD_VER >= 17
+# define TEST_THROW_SPEC(...)
 #else
-#if defined(TEST_COMPILER_CLANG)
-# define TEST_ALIGNOF(...) _Alignof(__VA_ARGS__)
+# define TEST_THROW_SPEC(...) throw(__VA_ARGS__)
+#endif
+
+#if TEST_STD_VER >= 14
+# define TEST_CONSTEXPR_CXX14 constexpr
+#else
+# define TEST_CONSTEXPR_CXX14
+#endif
+
+#if TEST_STD_VER >= 20
+# define TEST_CONSTEXPR_CXX20 constexpr
 #else
-# define TEST_ALIGNOF(...) __alignof(__VA_ARGS__)
-#endif
-#define TEST_ALIGNAS(...) __attribute__((__aligned__(__VA_ARGS__)))
-#define TEST_CONSTEXPR
-#define TEST_CONSTEXPR_CXX14
-#define TEST_CONSTEXPR_CXX20
-#define TEST_NOEXCEPT throw()
-#define TEST_NOEXCEPT_FALSE
-#define TEST_NOEXCEPT_COND(...)
-#define TEST_THROW_SPEC(...) throw(__VA_ARGS__)
+# define TEST_CONSTEXPR_CXX20
 #endif
 
 // Sniff out to see if the underlying C library has C11 features

From a57a67c59b3f7529f4aa30009b214248772b544b Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Wed, 27 May 2020 14:26:15 -0700
Subject: [PATCH 285/770] Fix a use-after-free in GetXcodeSDKPath

Introduced in https://reviews.llvm.org/D80595. Thanks Jonas for noticing!

Differential Revision: https://reviews.llvm.org/D80666
---
 lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
index cb6f03465ef70..615f77b2dbcc3 100644
--- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
@@ -370,7 +370,6 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
   auto it = g_sdk_path.find(sdk.GetString());
   if (it != g_sdk_path.end())
     return it->second;
-  std::string path = GetXcodeSDK(sdk);
-  g_sdk_path.insert({sdk.GetString(), path});
-  return path;
+  auto it_new = g_sdk_path.insert({sdk.GetString(), GetXcodeSDK(sdk)});
+  return it_new.first->second;
 }

From 0a072b8a0da7399eeeb670330b7baeddf1bb407a Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <ravishankarm@google.com>
Date: Wed, 27 May 2020 14:12:35 -0700
Subject: [PATCH 286/770] [mlir][Linalg] Add missing library linkage for shared
 library builds.

Differential Revision: https://reviews.llvm.org/D80664
---
 mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt
index 61b6b61597f9d..8b3e89768c55d 100644
--- a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt
@@ -8,6 +8,7 @@ add_mlir_dialect_library(MLIRLinalgUtils
   MLIRAffineOps
   MLIREDSC
   MLIRIR
+  MLIRLinalgEDSC
   MLIRLinalgOps
   MLIRSCF
   MLIRPass

From 2d068e534f1671459e1b135852c1b3c10502e929 Mon Sep 17 00:00:00 2001
From: Adrian McCarthy <amccarth@google.com>
Date: Wed, 27 May 2020 13:53:47 -0700
Subject: [PATCH 287/770] Fix Windows command line bug when last token in
 response file is ""

Patch by Neil Dhar <dhar@alumni.duke.edu>

The current state machine for parsing tokens from response files on
Windows does not correctly handle the case where the last token is "".
The current implementation handles the last token by adding it only if it
is not empty; however, this does not cover the case where the last token
is meant to be the empty string. We can cover this case by checking whether
the state machine was last in the UNQUOTED state, which indicates that the
last character of the input was a non-whitespace character.

Differential Revision: https://reviews.llvm.org/D78346
---
 llvm/lib/Support/CommandLine.cpp           |  2 +-
 llvm/unittests/Support/CommandLineTest.cpp | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index aa7e79652af95..25612b7e8f232 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -1009,7 +1009,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
     }
   }
 
-  if (!Token.empty())
+  if (State == UNQUOTED)
     AddToken(Saver.save(Token.str()));
 }
 
diff --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp
index a6b26b310b977..3e7ec8d5baf18 100644
--- a/llvm/unittests/Support/CommandLineTest.cpp
+++ b/llvm/unittests/Support/CommandLineTest.cpp
@@ -253,8 +253,8 @@ TEST(CommandLineTest, TokenizeGNUCommandLine) {
 }
 
 TEST(CommandLineTest, TokenizeWindowsCommandLine1) {
-  const char Input[] = "a\\b c\\\\d e\\\\\"f g\" h\\\"i j\\\\\\\"k \"lmn\" o pqr "
-                      "\"st \\\"u\" \\v";
+  const char Input[] =
+      R"(a\b c\\d e\\"f g" h\"i j\\\"k "lmn" o pqr "st \"u" \v)";
   const char *const Output[] = { "a\\b", "c\\\\d", "e\\f g", "h\"i", "j\\\"k",
                                  "lmn", "o", "pqr", "st \"u", "\\v" };
   testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input, Output,
@@ -268,6 +268,17 @@ TEST(CommandLineTest, TokenizeWindowsCommandLine2) {
                            array_lengthof(Output));
 }
 
+TEST(CommandLineTest, TokenizeWindowsCommandLineQuotedLastArgument) {
+  const char Input1[] = R"(a b c d "")";
+  const char *const Output1[] = {"a", "b", "c", "d", ""};
+  testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input1, Output1,
+                           array_lengthof(Output1));
+  const char Input2[] = R"(a b c d ")";
+  const char *const Output2[] = {"a", "b", "c", "d"};
+  testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input2, Output2,
+                           array_lengthof(Output2));
+}
+
 TEST(CommandLineTest, TokenizeConfigFile1) {
   const char *Input = "\\";
   const char *const Output[] = { "\\" };

From cf86a234ba86acf0bb875e21d27833be36e08be4 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 27 May 2020 15:11:43 -0700
Subject: [PATCH 288/770] Fix shared libs build break introduced in
 rG98ef93eabd76

---
 llvm/lib/Analysis/ML/CMakeLists.txt | 4 +++-
 llvm/lib/Passes/CMakeLists.txt      | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/ML/CMakeLists.txt b/llvm/lib/Analysis/ML/CMakeLists.txt
index 05039a95ba73b..ab4332fcf00f8 100644
--- a/llvm/lib/Analysis/ML/CMakeLists.txt
+++ b/llvm/lib/Analysis/ML/CMakeLists.txt
@@ -1,9 +1,11 @@
 set (SOURCES InlineFeaturesAnalysis.cpp)
 
-add_llvm_library(LLVMMLPolicies STATIC
+add_llvm_library(LLVMMLPolicies
   ${SOURCES}
 
   DEPENDS
   intrinsics_gen
 
+  LINK_LIBS
+  LLVMCore
   )
diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt
index 371a21c113dcb..5df9ceac728b5 100644
--- a/llvm/lib/Passes/CMakeLists.txt
+++ b/llvm/lib/Passes/CMakeLists.txt
@@ -13,4 +13,7 @@ add_llvm_component_library(LLVMPasses
 
   DEPENDS
   intrinsics_gen
+
+  LINK_LIBS
+  LLVMMLPolicies
   )

From 993bbaf6a35baed4ad3d8422a76c4311140641a8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 15:21:08 -0700
Subject: [PATCH 289/770] [MLPolicies] Fix dependency and
 -DBUILD_SHARED_LIBS=on builds after D80579

---
 llvm/lib/Analysis/CMakeLists.txt    |  5 -----
 llvm/lib/Analysis/LLVMBuild.txt     |  3 +++
 llvm/lib/Analysis/ML/CMakeLists.txt |  6 ++----
 llvm/lib/Analysis/ML/LLVMBuild.txt  | 21 +++++++++++++++++++++
 llvm/lib/Passes/LLVMBuild.txt       |  2 +-
 5 files changed, 27 insertions(+), 10 deletions(-)
 create mode 100644 llvm/lib/Analysis/ML/LLVMBuild.txt

diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 8fa832faec4ce..faf0a3186fd61 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -108,9 +108,4 @@ add_llvm_component_library(LLVMAnalysis
 
   DEPENDS
   intrinsics_gen
-  LLVMMLPolicies
-
-
-  LINK_LIBS
-  LLVMMLPolicies
   )
diff --git a/llvm/lib/Analysis/LLVMBuild.txt b/llvm/lib/Analysis/LLVMBuild.txt
index d73b55f037fa3..ef52c41da8a60 100644
--- a/llvm/lib/Analysis/LLVMBuild.txt
+++ b/llvm/lib/Analysis/LLVMBuild.txt
@@ -14,6 +14,9 @@
 ;
 ;===------------------------------------------------------------------------===;
 
+[common]
+subdirectories = ML
+
 [component_0]
 type = Library
 name = Analysis
diff --git a/llvm/lib/Analysis/ML/CMakeLists.txt b/llvm/lib/Analysis/ML/CMakeLists.txt
index ab4332fcf00f8..80430baa9595b 100644
--- a/llvm/lib/Analysis/ML/CMakeLists.txt
+++ b/llvm/lib/Analysis/ML/CMakeLists.txt
@@ -1,7 +1,5 @@
-set (SOURCES InlineFeaturesAnalysis.cpp)
-
-add_llvm_library(LLVMMLPolicies
-  ${SOURCES}
+add_llvm_component_library(LLVMMLPolicies
+  InlineFeaturesAnalysis.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/llvm/lib/Analysis/ML/LLVMBuild.txt b/llvm/lib/Analysis/ML/LLVMBuild.txt
new file mode 100644
index 0000000000000..a0bb919bb4117
--- /dev/null
+++ b/llvm/lib/Analysis/ML/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Analysis/ML/LLVMBuild.txt --------------------------*- Conf -*--===;
+;
+; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = MLPolicies
+parent = Analysis
+required_libraries = Core Support
diff --git a/llvm/lib/Passes/LLVMBuild.txt b/llvm/lib/Passes/LLVMBuild.txt
index 438fc5c7c2d46..14586b640849e 100644
--- a/llvm/lib/Passes/LLVMBuild.txt
+++ b/llvm/lib/Passes/LLVMBuild.txt
@@ -18,4 +18,4 @@
 type = Library
 name = Passes
 parent = Libraries
-required_libraries = AggressiveInstCombine Analysis CodeGen Core Coroutines IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation
+required_libraries = AggressiveInstCombine Analysis MLPolicies CodeGen Core Coroutines IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation

From be6bffe7293c63ec874aaf21b4f768dd3f77380a Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 15:29:10 -0700
Subject: [PATCH 290/770] [CMake] Revert
 cf86a234ba86acf0bb875e21d27833be36e08be4

It is unnecessary after 993bbaf6a35baed4ad3d8422a76c4311140641a8
---
 llvm/lib/Analysis/ML/CMakeLists.txt | 3 ---
 llvm/lib/Passes/CMakeLists.txt      | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/llvm/lib/Analysis/ML/CMakeLists.txt b/llvm/lib/Analysis/ML/CMakeLists.txt
index 80430baa9595b..28a5f98b793b8 100644
--- a/llvm/lib/Analysis/ML/CMakeLists.txt
+++ b/llvm/lib/Analysis/ML/CMakeLists.txt
@@ -3,7 +3,4 @@ add_llvm_component_library(LLVMMLPolicies
 
   DEPENDS
   intrinsics_gen
-
-  LINK_LIBS
-  LLVMCore
   )
diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt
index 5df9ceac728b5..371a21c113dcb 100644
--- a/llvm/lib/Passes/CMakeLists.txt
+++ b/llvm/lib/Passes/CMakeLists.txt
@@ -13,7 +13,4 @@ add_llvm_component_library(LLVMPasses
 
   DEPENDS
   intrinsics_gen
-
-  LINK_LIBS
-  LLVMMLPolicies
   )

From 8aa81aaebe533d0721f1c00deeb0fc452b0147a5 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Thu, 21 May 2020 12:41:29 -0700
Subject: [PATCH 291/770] AMDGPU/GlobalISel: Fixed handling of non-standard
 vectors

We do not have register classes for all possible vector
sizes, so round the size up when extracting a vector element.

Also fixes selection of G_MERGE_VALUES when vectors are
not a power of two.

This required refactoring getRegSplitParts() so that it
can handle more than just power-of-two vectors.

Ideally we would like RegSplitParts to be generated by
tablegen.

Differential Revision: https://reviews.llvm.org/D80457
---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     | 170 ++---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   7 +
 .../AMDGPU/GlobalISel/extractelement.ll       | 636 ++++++++++++++++++
 .../GlobalISel/inst-select-concat-vectors.mir |  14 +-
 4 files changed, 718 insertions(+), 109 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7657a2d0ea2ce..f2c4fa2d60e79 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
+#include <vector>
 
 using namespace llvm;
 
@@ -38,6 +39,8 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
   cl::ReallyHidden,
   cl::init(true));
 
+std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
+
 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
@@ -53,6 +56,30 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
   RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this));
   for (auto Reg : AMDGPU::VGPR_HI16RegClass)
     RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
+
+  // HACK: Until this is fully tablegen'd.
+  static llvm::once_flag InitializeRegSplitPartsFlag;
+
+  static auto InitializeRegSplitPartsOnce = [this]() {
+    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
+      unsigned Size = getSubRegIdxSize(Idx);
+      if (Size & 31)
+        continue;
+      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
+      unsigned Pos = getSubRegIdxOffset(Idx);
+      if (Pos % Size)
+        continue;
+      Pos /= Size;
+      if (Vec.empty()) {
+        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
+        Vec.resize(MaxNumParts);
+      }
+      Vec[Pos] = Idx;
+    }
+  };
+
+
+  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
 }
 
 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
@@ -1313,88 +1340,82 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
 
 const TargetRegisterClass *
 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
-  switch (BitWidth) {
-  case 1:
+  if (BitWidth == 1)
     return &AMDGPU::VReg_1RegClass;
-  case 16:
+  if (BitWidth <= 16)
     return &AMDGPU::VGPR_LO16RegClass;
-  case 32:
+  if (BitWidth <= 32)
     return &AMDGPU::VGPR_32RegClass;
-  case 64:
+  if (BitWidth <= 64)
     return &AMDGPU::VReg_64RegClass;
-  case 96:
+  if (BitWidth <= 96)
     return &AMDGPU::VReg_96RegClass;
-  case 128:
+  if (BitWidth <= 128)
     return &AMDGPU::VReg_128RegClass;
-  case 160:
+  if (BitWidth <= 160)
     return &AMDGPU::VReg_160RegClass;
-  case 192:
+  if (BitWidth <= 192)
     return &AMDGPU::VReg_192RegClass;
-  case 256:
+  if (BitWidth <= 256)
     return &AMDGPU::VReg_256RegClass;
-  case 512:
+  if (BitWidth <= 512)
     return &AMDGPU::VReg_512RegClass;
-  case 1024:
+  if (BitWidth <= 1024)
     return &AMDGPU::VReg_1024RegClass;
-  default:
-    return nullptr;
-  }
+
+  return nullptr;
 }
 
 const TargetRegisterClass *
 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
-  switch (BitWidth) {
-  case 16:
+  if (BitWidth <= 16)
     return &AMDGPU::AGPR_LO16RegClass;
-  case 32:
+  if (BitWidth <= 32)
     return &AMDGPU::AGPR_32RegClass;
-  case 64:
+  if (BitWidth <= 64)
     return &AMDGPU::AReg_64RegClass;
-  case 96:
+  if (BitWidth <= 96)
     return &AMDGPU::AReg_96RegClass;
-  case 128:
+  if (BitWidth <= 128)
     return &AMDGPU::AReg_128RegClass;
-  case 160:
+  if (BitWidth <= 160)
     return &AMDGPU::AReg_160RegClass;
-  case 192:
+  if (BitWidth <= 192)
     return &AMDGPU::AReg_192RegClass;
-  case 256:
+  if (BitWidth <= 256)
     return &AMDGPU::AReg_256RegClass;
-  case 512:
+  if (BitWidth <= 512)
     return &AMDGPU::AReg_512RegClass;
-  case 1024:
+  if (BitWidth <= 1024)
     return &AMDGPU::AReg_1024RegClass;
-  default:
-    return nullptr;
-  }
+
+  return nullptr;
 }
 
 const TargetRegisterClass *
 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
-  switch (BitWidth) {
-  case 16:
+  if (BitWidth <= 16)
     return &AMDGPU::SGPR_LO16RegClass;
-  case 32:
+  if (BitWidth <= 32)
     return &AMDGPU::SReg_32RegClass;
-  case 64:
+  if (BitWidth <= 64)
     return &AMDGPU::SReg_64RegClass;
-  case 96:
+  if (BitWidth <= 96)
     return &AMDGPU::SGPR_96RegClass;
-  case 128:
+  if (BitWidth <= 128)
     return &AMDGPU::SGPR_128RegClass;
-  case 160:
+  if (BitWidth <= 160)
     return &AMDGPU::SGPR_160RegClass;
-  case 192:
+  if (BitWidth <= 192)
     return &AMDGPU::SGPR_192RegClass;
-  case 256:
+  if (BitWidth <= 256)
     return &AMDGPU::SGPR_256RegClass;
-  case 512:
+  if (BitWidth <= 512)
     return &AMDGPU::SGPR_512RegClass;
-  case 1024:
+  if (BitWidth <= 1024)
     return &AMDGPU::SGPR_1024RegClass;
-  default:
-    return nullptr;
-  }
+
+  return nullptr;
 }
 
 // FIXME: This is very slow. It might be worth creating a map from physreg to
@@ -1579,65 +1600,14 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
 
-  const unsigned EltBitWidth = EltSize * 8;
-  assert(EltBitWidth >= 32 && EltBitWidth < 1024 && isPowerOf2_32(EltBitWidth));
-  const unsigned LogEltBitWidth = Log2_32(EltBitWidth);
-
-  assert(RegBitWidth % EltBitWidth == 0);
+  const unsigned RegDWORDs = RegBitWidth / 32;
+  const unsigned EltDWORDs = EltSize / 4;
+  assert(RegSplitParts.size() + 1 >= EltDWORDs);
 
-  if (RegBitWidth == EltBitWidth)
-    return {};
-
-  static const int16_t Sub_32[] = {
-    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
-    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
-    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
-    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
-    AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
-    AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
-    AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
-    AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31
-  };
-
-  static const int16_t Sub_64[] = {
-    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
-    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
-    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
-    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
-    AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
-    AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
-    AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
-    AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
-  };
-
-  static const int16_t Sub_128[] = {
-    AMDGPU::sub0_sub1_sub2_sub3,
-    AMDGPU::sub4_sub5_sub6_sub7,
-    AMDGPU::sub8_sub9_sub10_sub11,
-    AMDGPU::sub12_sub13_sub14_sub15,
-    AMDGPU::sub16_sub17_sub18_sub19,
-    AMDGPU::sub20_sub21_sub22_sub23,
-    AMDGPU::sub24_sub25_sub26_sub27,
-    AMDGPU::sub28_sub29_sub30_sub31
-  };
-
-  static const int16_t Sub_256[] = {
-    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
-    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
-    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
-    AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
-  };
-
-  static const int16_t Sub_512[] = {
-    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
-    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
-  };
-
-  static const int16_t *const Subs[] = {
-    Sub_32, Sub_64, Sub_128, Sub_256, Sub_512
-  };
+  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
+  const unsigned NumParts = RegDWORDs / EltDWORDs;
 
-  return makeArrayRef(Subs[LogEltBitWidth - 5], RegBitWidth >> LogEltBitWidth);
+  return makeArrayRef(Parts.data(), NumParts);
 }
 
 const TargetRegisterClass*
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 0b2920b3777e8..8a8ac8169453c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -33,6 +33,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   bool isWave32;
   BitVector RegPressureIgnoredUnits;
 
+  /// Subregister indexes used by getRegSplitParts.
+  /// The outer index is the subreg size in DWORDs minus one (1 to 16 DWORDs).
+  /// The inner vector is sorted by bit offset.
+  /// Provided a register can be fully split with given subregs,
+  /// all elements of the inner vector combined give a full lane mask.
+  static std::array<std::vector<int16_t>, 16> RegSplitParts;
+
   void reserveRegisterTuples(BitVector &, MCRegister Reg) const;
 
 public:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 5a7b4b390b5dd..443944408f339 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -1517,3 +1517,639 @@ entry:
   %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
   ret double %ext
 }
+
+define amdgpu_ps float @dyn_extract_v6f32_s_v(<6 x float> inreg %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f32_s_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b64 s[6:7], exec
+; GPRIDX-NEXT:  BB33_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v0
+; GPRIDX-NEXT:    s_mov_b32 m0, s8
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
+; GPRIDX-NEXT:    s_movrels_b32 s8, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s8
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB33_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[6:7]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v6f32_s_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b64 s[6:7], exec
+; MOVREL-NEXT:  BB33_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v0
+; MOVREL-NEXT:    s_mov_b32 m0, s8
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
+; MOVREL-NEXT:    s_movrels_b32 s8, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s8
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB33_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[6:7]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <6 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f32_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
+; GPRIDX-NEXT:  BB34_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v6
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
+; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB34_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v7
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: dyn_extract_v6f32_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
+; MOVREL-NEXT:  BB34_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v6
+; MOVREL-NEXT:    s_mov_b32 m0, s6
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
+; MOVREL-NEXT:    v_movrels_b32_e32 v7, v0
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB34_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v7
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ext = extractelement <6 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f32_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v6f32_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 m0, s2
+; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <6 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v6f32_s_s(<6 x float> inreg %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f32_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 m0, s8
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_movrels_b32 s0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v6f32_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 m0, s8
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_movrels_b32 s0, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <6 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v7f32_s_v(<7 x float> inreg %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f32_s_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b64 s[8:9], exec
+; GPRIDX-NEXT:  BB37_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v0
+; GPRIDX-NEXT:    s_mov_b32 m0, s7
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v0
+; GPRIDX-NEXT:    s_movrels_b32 s7, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s7
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB37_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[8:9]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v7f32_s_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b64 s[8:9], exec
+; MOVREL-NEXT:  BB37_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v0
+; MOVREL-NEXT:    s_mov_b32 m0, s7
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v0
+; MOVREL-NEXT:    s_movrels_b32 s7, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s7
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB37_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[8:9]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <7 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f32_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
+; GPRIDX-NEXT:  BB38_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v7
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB38_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v8
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: dyn_extract_v7f32_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
+; MOVREL-NEXT:  BB38_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v7
+; MOVREL-NEXT:    s_mov_b32 m0, s6
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; MOVREL-NEXT:    v_movrels_b32_e32 v8, v0
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB38_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v8
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ext = extractelement <7 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f32_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v7f32_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 m0, s2
+; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <7 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v7f32_s_s(<7 x float> inreg %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f32_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 m0, s9
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_movrels_b32 s0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v7f32_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 m0, s9
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_movrels_b32 s0, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <7 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f64_s_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s16, s2
+; GPRIDX-NEXT:    s_mov_b32 s17, s3
+; GPRIDX-NEXT:    s_mov_b32 s18, s4
+; GPRIDX-NEXT:    s_mov_b32 s19, s5
+; GPRIDX-NEXT:    s_mov_b32 s20, s6
+; GPRIDX-NEXT:    s_mov_b32 s21, s7
+; GPRIDX-NEXT:    s_mov_b32 s22, s8
+; GPRIDX-NEXT:    s_mov_b32 s23, s9
+; GPRIDX-NEXT:    s_mov_b32 s24, s10
+; GPRIDX-NEXT:    s_mov_b32 s25, s11
+; GPRIDX-NEXT:    s_mov_b32 s26, s12
+; GPRIDX-NEXT:    s_mov_b32 s27, s13
+; GPRIDX-NEXT:    s_mov_b64 s[2:3], exec
+; GPRIDX-NEXT:  BB41_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
+; GPRIDX-NEXT:    s_lshl_b32 m0, s0, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; GPRIDX-NEXT:    s_movrels_b32 s0, s16
+; GPRIDX-NEXT:    s_movrels_b32 s1, s17
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB41_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[2:3]
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v6f64_s_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s16, s2
+; MOVREL-NEXT:    s_mov_b32 s17, s3
+; MOVREL-NEXT:    s_mov_b32 s18, s4
+; MOVREL-NEXT:    s_mov_b32 s19, s5
+; MOVREL-NEXT:    s_mov_b32 s20, s6
+; MOVREL-NEXT:    s_mov_b32 s21, s7
+; MOVREL-NEXT:    s_mov_b32 s22, s8
+; MOVREL-NEXT:    s_mov_b32 s23, s9
+; MOVREL-NEXT:    s_mov_b32 s24, s10
+; MOVREL-NEXT:    s_mov_b32 s25, s11
+; MOVREL-NEXT:    s_mov_b32 s26, s12
+; MOVREL-NEXT:    s_mov_b32 s27, s13
+; MOVREL-NEXT:    s_mov_b64 s[2:3], exec
+; MOVREL-NEXT:  BB41_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
+; MOVREL-NEXT:    s_lshl_b32 m0, s0, 1
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; MOVREL-NEXT:    s_movrels_b32 s0, s16
+; MOVREL-NEXT:    s_movrels_b32 s1, s17
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB41_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[2:3]
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <6 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f64_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
+; GPRIDX-NEXT:  BB42_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v12
+; GPRIDX-NEXT:    s_lshl_b32 s7, s6, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v12
+; GPRIDX-NEXT:    s_set_gpr_idx_on s7, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB42_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v13
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v14
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: dyn_extract_v6f64_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
+; MOVREL-NEXT:  BB42_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v12
+; MOVREL-NEXT:    s_lshl_b32 m0, s6, 1
+; MOVREL-NEXT:    v_movrels_b32_e32 v13, v0
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v12
+; MOVREL-NEXT:    v_movrels_b32_e32 v14, v1
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB42_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v13
+; MOVREL-NEXT:    v_mov_b32_e32 v1, v14
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ext = extractelement <6 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v6f64_v_s(<6 x double> %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f64_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_lshl_b32 s0, s2, 1
+; GPRIDX-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v12
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v0
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v6f64_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_lshl_b32 m0, s2, 1
+; MOVREL-NEXT:    v_movrels_b32_e32 v12, v0
+; MOVREL-NEXT:    v_movrels_b32_e32 v0, v1
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v12
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v0
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <6 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v6f64_s_s(<6 x double> inreg %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v6f64_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 m0, s14
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    s_mov_b32 s11, s13
+; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v6f64_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 m0, s14
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    s_mov_b32 s10, s12
+; MOVREL-NEXT:    s_mov_b32 s11, s13
+; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <6 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f64_s_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s16, s2
+; GPRIDX-NEXT:    s_mov_b32 s17, s3
+; GPRIDX-NEXT:    s_mov_b32 s18, s4
+; GPRIDX-NEXT:    s_mov_b32 s19, s5
+; GPRIDX-NEXT:    s_mov_b32 s20, s6
+; GPRIDX-NEXT:    s_mov_b32 s21, s7
+; GPRIDX-NEXT:    s_mov_b32 s22, s8
+; GPRIDX-NEXT:    s_mov_b32 s23, s9
+; GPRIDX-NEXT:    s_mov_b32 s24, s10
+; GPRIDX-NEXT:    s_mov_b32 s25, s11
+; GPRIDX-NEXT:    s_mov_b32 s26, s12
+; GPRIDX-NEXT:    s_mov_b32 s27, s13
+; GPRIDX-NEXT:    s_mov_b32 s28, s14
+; GPRIDX-NEXT:    s_mov_b32 s29, s15
+; GPRIDX-NEXT:    s_mov_b64 s[2:3], exec
+; GPRIDX-NEXT:  BB45_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
+; GPRIDX-NEXT:    s_lshl_b32 m0, s0, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; GPRIDX-NEXT:    s_movrels_b32 s0, s16
+; GPRIDX-NEXT:    s_movrels_b32 s1, s17
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB45_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[2:3]
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v7f64_s_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s16, s2
+; MOVREL-NEXT:    s_mov_b32 s17, s3
+; MOVREL-NEXT:    s_mov_b32 s18, s4
+; MOVREL-NEXT:    s_mov_b32 s19, s5
+; MOVREL-NEXT:    s_mov_b32 s20, s6
+; MOVREL-NEXT:    s_mov_b32 s21, s7
+; MOVREL-NEXT:    s_mov_b32 s22, s8
+; MOVREL-NEXT:    s_mov_b32 s23, s9
+; MOVREL-NEXT:    s_mov_b32 s24, s10
+; MOVREL-NEXT:    s_mov_b32 s25, s11
+; MOVREL-NEXT:    s_mov_b32 s26, s12
+; MOVREL-NEXT:    s_mov_b32 s27, s13
+; MOVREL-NEXT:    s_mov_b32 s28, s14
+; MOVREL-NEXT:    s_mov_b32 s29, s15
+; MOVREL-NEXT:    s_mov_b64 s[2:3], exec
+; MOVREL-NEXT:  BB45_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
+; MOVREL-NEXT:    s_lshl_b32 m0, s0, 1
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; MOVREL-NEXT:    s_movrels_b32 s0, s16
+; MOVREL-NEXT:    s_movrels_b32 s1, s17
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB45_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[2:3]
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <7 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f64_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
+; GPRIDX-NEXT:  BB46_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v14
+; GPRIDX-NEXT:    s_lshl_b32 s7, s6, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v14
+; GPRIDX-NEXT:    s_set_gpr_idx_on s7, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB46_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v15
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v16
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: dyn_extract_v7f64_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
+; MOVREL-NEXT:  BB46_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v14
+; MOVREL-NEXT:    s_lshl_b32 m0, s6, 1
+; MOVREL-NEXT:    v_movrels_b32_e32 v15, v0
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v14
+; MOVREL-NEXT:    v_movrels_b32_e32 v16, v1
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB46_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v15
+; MOVREL-NEXT:    v_mov_b32_e32 v1, v16
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ext = extractelement <7 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v7f64_v_s(<7 x double> %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f64_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_lshl_b32 s0, s2, 1
+; GPRIDX-NEXT:    s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v14
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v0
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v7f64_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_lshl_b32 m0, s2, 1
+; MOVREL-NEXT:    v_movrels_b32_e32 v14, v0
+; MOVREL-NEXT:    v_movrels_b32_e32 v0, v1
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v14
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v0
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <7 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v7f64_s_s(<7 x double> inreg %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v7f64_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 m0, s16
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    s_mov_b32 s11, s13
+; GPRIDX-NEXT:    s_mov_b32 s12, s14
+; GPRIDX-NEXT:    s_mov_b32 s13, s15
+; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_extract_v7f64_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 m0, s16
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    s_mov_b32 s10, s12
+; MOVREL-NEXT:    s_mov_b32 s11, s13
+; MOVREL-NEXT:    s_mov_b32 s12, s14
+; MOVREL-NEXT:    s_mov_b32 s13, s15
+; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <7 x double> %vec, i32 %sel
+  ret double %ext
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
index d8f2fad8f9381..54cb2a0ab0e05 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
@@ -1,10 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*'  -o - %s 2> %t | FileCheck -check-prefix=GCN  %s
-# RUN: FileCheck -check-prefix=ERR %s < %t
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*'  -o - %s | FileCheck -check-prefix=GCN  %s
 
-# ERR-NOT: remark:
-# ERR: remark: <unknown>:0:0: cannot select: %2:sgpr(<6 x s64>) = G_CONCAT_VECTORS %0:sgpr(<3 x s64>), %1:sgpr(<3 x s64>) (in function: test_concat_vectors_s_v6s64_s_v3s64_s_v3s64)
-# ERR-NOT: remark:
 
 ---
 name: test_concat_vectors_v_v4s16_v_v2s16_v_v2s16
@@ -634,10 +630,10 @@ body: |
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
 
     ; GCN-LABEL: name: test_concat_vectors_s_v6s64_s_v3s64_s_v3s64
-    ; GCN: [[DEF:%[0-9]+]]:sgpr(<3 x s64>) = G_IMPLICIT_DEF
-    ; GCN: [[DEF1:%[0-9]+]]:sgpr(<3 x s64>) = G_IMPLICIT_DEF
-    ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s64>) = G_CONCAT_VECTORS [[DEF]](<3 x s64>), [[DEF1]](<3 x s64>)
-    ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s64>)
+    ; GCN: [[DEF:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
+    ; GCN: [[DEF1:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF
+    ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11
+    ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
     %0:sgpr(<3 x s64>) = G_IMPLICIT_DEF
     %1:sgpr(<3 x s64>) = G_IMPLICIT_DEF
     %2:sgpr(<6 x s64>) = G_CONCAT_VECTORS %0, %1

From dda82986f97747350dce4e8ebd65c27a64a37c9d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 27 May 2020 12:33:55 -0400
Subject: [PATCH 292/770] DAG: Fix expansion of DYNAMIC_STACKALLOC for
 StackGrowsUp targets

No test is included: AMDGPU cannot use the default expansion directly,
because it needs to scale the allocation amount by the wave size rather
than use the raw byte size value.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2ffcc859f8051..9969786d8d43a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1600,9 +1600,13 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
   SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   Chain = SP.getValue(1);
   unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
-  unsigned StackAlign =
-      DAG.getSubtarget().getFrameLowering()->getStackAlignment();
-  Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size);       // Value
+  const TargetFrameLowering *TFL = DAG.getSubtarget().getFrameLowering();
+  unsigned Opc =
+    TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
+    ISD::ADD : ISD::SUB;
+
+  unsigned StackAlign = TFL->getStackAlignment();
+  Tmp1 = DAG.getNode(Opc, dl, VT, SP, Size);       // Value
   if (Align > StackAlign)
     Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
                        DAG.getConstant(-(uint64_t)Align, dl, VT));

From 5e007fe9980cc44e9c4a14c9baf3bdfb012d2c18 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 27 May 2020 10:59:14 -0400
Subject: [PATCH 293/770] AMDGPU: Support non-entry block static sized allocas

OpenMP emits these for some reason, so handle them. Assume they use
4096 bytes by default, overridable with
-amdgpu-assume-dynamic-stack-object-size. Also make the assumed stack
size for external calls (16384 bytes by default) configurable with
-amdgpu-assume-external-call-stack-size.
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp  |  27 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |  63 +++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h      |   3 +
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 271 +++++++++++++++++++
 4 files changed, 362 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 81676d63643df..fe0462a31064d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -49,6 +49,22 @@ using namespace llvm;
 using namespace llvm::AMDGPU;
 using namespace llvm::AMDGPU::HSAMD;
 
+// We need to report some stack size to the runtime ahead of time even when
+// the true size is unknown. Assume a smaller number if this is only due to
+// dynamic / non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+  "amdgpu-assume-external-call-stack-size",
+  cl::desc("Assumed stack use of any external call (in bytes)"),
+  cl::Hidden,
+  cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+  "amdgpu-assume-dynamic-stack-object-size",
+  cl::desc("Assumed extra stack use if there are any "
+           "variable sized objects (in bytes)"),
+  cl::Hidden,
+  cl::init(4096));
+
 // This should get the default rounding mode from the kernel. We just set the
 // default here, but this could change if the OpenCL rounding mode pragmas are
 // used.
@@ -637,8 +653,13 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
     Info.UsesFlatScratch = false;
   }
 
-  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
   Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+  // Add an assumed extra amount if there are any unknown sized objects.
+  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+  if (Info.HasDynamicallySizedStack)
+    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
   if (MFI->isStackRealigned())
     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
 
@@ -907,7 +928,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
           MaxVGPR = std::max(MaxVGPR, 23);
           MaxAGPR = std::max(MaxAGPR, 23);
 
-          CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
+          CalleeFrameSize = std::max(CalleeFrameSize,
+            static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
           Info.UsesVCC = true;
           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
           Info.HasDynamicallySizedStack = true;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 689aece39dee5..042087ec5a4de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3089,6 +3089,67 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                          IsThisReturn ? OutVals[0] : SDValue());
 }
 
+// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for applying the wave size scale to the increment amount.
+SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
+    SDValue Op, SelectionDAG &DAG) const {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  SDValue Tmp1 = Op;
+  SDValue Tmp2 = Op.getValue(1);
+  SDValue Tmp3 = Op.getOperand(2);
+  SDValue Chain = Tmp1.getOperand(0);
+
+  Register SPReg = Info->getStackPtrOffsetReg();
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+  SDValue Size  = Tmp2.getOperand(1);
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  Chain = SP.getValue(1);
+  unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const TargetFrameLowering *TFL = ST.getFrameLowering();
+  unsigned Opc =
+    TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
+    ISD::ADD : ISD::SUB;
+
+  SDValue ScaledSize = DAG.getNode(
+      ISD::SHL, dl, VT, Size,
+      DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
+
+  unsigned StackAlign = TFL->getStackAlignment();
+  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
+  if (Align > StackAlign)
+    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+                       DAG.getConstant(-(uint64_t)Align, dl, VT));
+  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);    // Output chain
+  Tmp2 = DAG.getCALLSEQ_END(
+      Chain, DAG.getIntPtrConstant(0, dl, true),
+      DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+}
+
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // We only handle constant sizes here to allow non-entry block, static sized
+  // allocas. A truly dynamic value is more difficult to support because we
+  // don't know if the size value is uniform or not. If the size isn't uniform,
+  // we would need to do a wave reduction to get the maximum size to know how
+  // much to increment the uniform stack pointer.
+  SDValue Size = Op.getOperand(1);
+  if (isa<ConstantSDNode>(Size))
+      return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
+
+  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -4305,6 +4366,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
     return splitBinaryVectorOp(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 7ef11eba4f9ce..da0260f4ed2d1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -337,6 +337,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
+  SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
 
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
new file mode 100644
index 0000000000000..060d66ae84282
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s
+
+; FIXME: Generated test checks do not check metadata at the end of the
+; function, so this also includes manually added checks.
+
+; Test that we can select a statically sized alloca outside of the
+; entry block.
+
+; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
+; alignment less than the stack alignment.
+define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
+; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s8, 0
+; GCN-NEXT:    s_cbranch_scc1 BB0_3
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    s_cmp_lg_u32 s9, 0
+; GCN-NEXT:    s_cbranch_scc1 BB0_3
+; GCN-NEXT:  ; %bb.2: ; %bb.1
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    s_lshl_b32 s7, s10, 2
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_add_i32 s6, s6, s7
+; GCN-NEXT:    v_mov_b32_e32 v3, 1
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB0_3: ; %bb.2
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_endpgm
+
+entry:
+  %cond0 = icmp eq i32 %arg.cond0, 0
+  br i1 %cond0, label %bb.0, label %bb.2
+
+bb.0:
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  %cond1 = icmp eq i32 %arg.cond1, 0
+  br i1 %cond1, label %bb.1, label %bb.2
+
+bb.1:
+  ; Use the alloca outside of the defining block.
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.2
+
+bb.2:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
+; DEFAULTSIZE: ; ScratchSize: 4112
+
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: ; ScratchSize: 1040
+
+define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
+; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_cbranch_scc1 BB1_2
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    s_andn2_b32 s6, s6, 63
+; GCN-NEXT:    s_lshl_b32 s7, s7, 2
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_add_i32 s6, s6, s7
+; GCN-NEXT:    v_mov_b32_e32 v3, 1
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB1_2: ; %bb.1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_endpgm
+entry:
+  %cond = icmp eq i32 %arg.cond, 0
+  br i1 %cond, label %bb.0, label %bb.1
+
+bb.0:
+  %alloca = alloca [16 x i32], align 64, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.1
+
+bb.1:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
+; DEFAULTSIZE: ; ScratchSize: 4160
+
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: ; ScratchSize: 1088
+
+
+define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
+; GCN-LABEL: func_non_entry_block_static_alloca_align4:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, s33
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_cbranch_execz BB2_3
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN-NEXT:    s_and_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execz BB2_3
+; GCN-NEXT:  ; %bb.2: ; %bb.1
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NEXT:    v_mov_b32_e32 v6, 1
+; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
+; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB2_3: ; %bb.2
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_mov_b32 s33, s7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+
+entry:
+  %cond0 = icmp eq i32 %arg.cond0, 0
+  br i1 %cond0, label %bb.0, label %bb.2
+
+bb.0:
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  %cond1 = icmp eq i32 %arg.cond1, 0
+  br i1 %cond1, label %bb.1, label %bb.2
+
+bb.1:
+  ; Use the alloca outside of the defining block.
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.2
+
+bb.2:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+
+define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
+; GCN-LABEL: func_non_entry_block_static_alloca_align64:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
+; GCN-NEXT:    s_mov_b32 s7, s33
+; GCN-NEXT:    s_and_b32 s33, s4, 0xfffff000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GCN-NEXT:    s_add_u32 s32, s32, 0x2000
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_cbranch_execz BB3_2
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    s_andn2_b32 s6, s6, 63
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_mov_b32_e32 v6, 1
+; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
+; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB3_2: ; %bb.1
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x2000
+; GCN-NEXT:    s_mov_b32 s33, s7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %cond = icmp eq i32 %arg.cond, 0
+  br i1 %cond, label %bb.0, label %bb.1
+
+bb.0:
+  %alloca = alloca [16 x i32], align 64, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.1
+
+bb.1:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }

From ef37444058550b0f49441b994c9e9368d8e42da8 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leonardchan@google.com>
Date: Wed, 27 May 2020 16:16:56 -0700
Subject: [PATCH 294/770] [Lexer] Fix invalid suffix diagnostic for fixed-point
 literals

Committing on behalf of nagart, who authored this patch.

Differential Revision: https://reviews.llvm.org/D80412
---
 clang/include/clang/Basic/DiagnosticLexKinds.td |  2 +-
 clang/include/clang/Lex/LiteralSupport.h        |  4 +++-
 clang/lib/Lex/LiteralSupport.cpp                |  4 +++-
 clang/test/Frontend/fixed_point_errors.c        | 14 +++++++-------
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index ef90bdf84c8ab..fa07e9ae76c85 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -175,7 +175,7 @@ def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">,
 def err_invalid_digit : Error<
   "invalid digit '%0' in %select{decimal|octal|binary}1 constant">;
 def err_invalid_suffix_constant : Error<
-  "invalid suffix '%0' on %select{integer|floating}1 constant">;
+  "invalid suffix '%0' on %select{integer|floating|fixed-point}1 constant">;
 def warn_cxx11_compat_digit_separator : Warning<
   "digit separators are incompatible with C++ standards before C++14">,
   InGroup<CXXPre14Compat>, DefaultIgnore;
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index b9d64c24a00bd..6829771b28308 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -71,7 +71,9 @@ class NumericLiteralParser {
   bool isFract : 1;         // 1.0hr/r/lr/uhr/ur/ulr
   bool isAccum : 1;         // 1.0hk/k/lk/uhk/uk/ulk
 
-  bool isFixedPointLiteral() const { return saw_fixed_point_suffix; }
+  bool isFixedPointLiteral() const {
+    return (saw_period || saw_exponent) && saw_fixed_point_suffix;
+  }
 
   bool isIntegerLiteral() const {
     return !saw_period && !saw_exponent && !isFixedPointLiteral();
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 2b1add4d9b987..f44614b4bec46 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -583,6 +583,7 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
 
   // Parse the suffix.  At this point we can classify whether we have an FP or
   // integer constant.
+  bool isFixedPointConstant = isFixedPointLiteral();
   bool isFPConstant = isFloatingLiteral();
 
   // Loop over all of the characters of the suffix.  If we see something bad,
@@ -737,7 +738,8 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
       // Report an error if there are any.
       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
               diag::err_invalid_suffix_constant)
-          << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin) << isFPConstant;
+          << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
+          << (isFixedPointConstant ? 2 : isFPConstant);
       hadError = true;
     }
   }
diff --git a/clang/test/Frontend/fixed_point_errors.c b/clang/test/Frontend/fixed_point_errors.c
index db15bd874b316..9b600fbc2642b 100644
--- a/clang/test/Frontend/fixed_point_errors.c
+++ b/clang/test/Frontend/fixed_point_errors.c
@@ -137,15 +137,15 @@ _Sat longfract_t td_sat_long_fract;         // expected-error{{'_Sat' specifier
 _Sat longaccum_t td_sat_long_accum;         // expected-error{{'_Sat' specifier is only valid on '_Fract' or '_Accum', not 'type-name'}}
 
 /* Bad suffixes  */
-_Accum fk = 1.0fk;    // expected-error{{invalid suffix 'fk' on integer constant}}
-_Accum kk = 1.0kk;    // expected-error{{invalid suffix 'kk' on integer constant}}
-_Accum rk = 1.0rk;    // expected-error{{invalid suffix 'rk' on integer constant}}
-_Accum rk = 1.0rr;    // expected-error{{invalid suffix 'rr' on integer constant}}
-_Accum qk = 1.0qr;    // expected-error{{invalid suffix 'qr' on integer constant}}
+_Accum fk = 1.0fk; // expected-error{{invalid suffix 'fk' on fixed-point constant}}
+_Accum kk = 1.0kk; // expected-error{{invalid suffix 'kk' on fixed-point constant}}
+_Accum rk = 1.0rk; // expected-error{{invalid suffix 'rk' on fixed-point constant}}
+_Accum rk = 1.0rr; // expected-error{{invalid suffix 'rr' on fixed-point constant}}
+_Accum qk = 1.0qr; // expected-error{{invalid suffix 'qr' on fixed-point constant}}
 
 /* Using wrong exponent notation */
-_Accum dec_with_hex_exp1 = 0.1p10k;    // expected-error{{invalid suffix 'p10k' on integer constant}}
-_Accum dec_with_hex_exp2 = 0.1P10k;    // expected-error{{invalid suffix 'P10k' on integer constant}}
+_Accum dec_with_hex_exp1 = 0.1p10k;    // expected-error{{invalid suffix 'p10k' on fixed-point constant}}
+_Accum dec_with_hex_exp2 = 0.1P10k;    // expected-error{{invalid suffix 'P10k' on fixed-point constant}}
 _Accum hex_with_dex_exp1 = 0x0.1e10k;  // expected-error{{hexadecimal floating constant requires an exponent}}
 _Accum hex_with_dex_exp2 = 0x0.1E10k;  // expected-error{{hexadecimal floating constant requires an exponent}}
 

From 7392bbc3014cd1b54852aa71ac971c6c92cd1914 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Wed, 27 May 2020 12:21:26 -0700
Subject: [PATCH 295/770] AMDGPU/GlobalISel: Fixed insert element for
 non-standard vectors

Differential Revision: https://reviews.llvm.org/D80653
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   60 +-
 .../AMDGPU/GlobalISel/insertelement.ll        | 1185 +++++++++++++++++
 2 files changed, 1212 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5392abfa8f6e5..e68f8a95efed5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1069,66 +1069,60 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
 }
 
 static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
-  switch (VecSize) {
-  case 32: // 4 bytes
+  if (VecSize <= 32) // 4 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
-  case 64: // 8 bytes
+  if (VecSize <= 64) // 8 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
-  case 96: // 12 bytes
+  if (VecSize <= 96) // 12 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
-  case 128: // 16 bytes
+  if (VecSize <= 128) // 16 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
-  case 160: // 20 bytes
+  if (VecSize <= 160) // 20 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
-  case 256: // 32 bytes
+  if (VecSize <= 256) // 32 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
-  case 512: // 64 bytes
+  if (VecSize <= 512) // 64 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
-  case 1024: // 128 bytes
+  if (VecSize <= 1024) // 128 bytes
     return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;
-  default:
-    llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
-  }
+
+  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
 }
 
 static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
-  switch (VecSize) {
-  case 32: // 4 bytes
+  if (VecSize <= 32) // 4 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
-  case 64: // 8 bytes
+  if (VecSize <= 64) // 8 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
-  case 96: // 12 bytes
+  if (VecSize <= 96) // 12 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
-  case 128: // 16 bytes
+  if (VecSize <= 128) // 16 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
-  case 160: // 20 bytes
+  if (VecSize <= 160) // 20 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
-  case 256: // 32 bytes
+  if (VecSize <= 256) // 32 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
-  case 512: // 64 bytes
+  if (VecSize <= 512) // 64 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
-  case 1024: // 128 bytes
+  if (VecSize <= 1024) // 128 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;
-  default:
-    llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
-  }
+
+  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
 }
 
 static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
-  switch (VecSize) {
-  case 64: // 8 bytes
+  if (VecSize <= 64) // 8 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
-  case 128: // 16 bytes
+  if (VecSize <= 128) // 16 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
-  case 256: // 32 bytes
+  if (VecSize <= 256) // 32 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
-  case 512: // 64 bytes
+  if (VecSize <= 512) // 64 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
-  case 1024: // 128 bytes
+  if (VecSize <= 1024) // 128 bytes
     return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;
-  default:
-    llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
-  }
+
+  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
 }
 
 const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 5fb0ef97932f3..0d4e4a9a06895 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -3397,3 +3397,1188 @@ entry:
   %insert = insertelement <16 x double> %vec, double %val, i32 %idx
   ret <16 x double> %insert
 }
+
+define amdgpu_ps <7 x i32> @dyn_insertelement_v7i32_s_s_s(<7 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v7i32_s_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 m0, s10
+; GPRIDX-NEXT:    s_nop 0
+; GPRIDX-NEXT:    s_movreld_b32 s0, s9
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7i32_s_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 m0, s10
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_movreld_b32 s0, s9
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x i32> %vec, i32 %val, i32 %idx
+  ret <7 x i32> %insert
+}
+
+define amdgpu_ps <7 x i8 addrspace(3)*> @dyn_insertelement_v7p3i8_s_s_s(<7 x i8 addrspace(3)*> inreg %vec, i8 addrspace(3)* inreg %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v7p3i8_s_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 m0, s10
+; GPRIDX-NEXT:    s_nop 0
+; GPRIDX-NEXT:    s_movreld_b32 s0, s9
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7p3i8_s_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 m0, s10
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_movreld_b32 s0, s9
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 %idx
+  ret <7 x i8 addrspace(3)*> %insert
+}
+
+define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT:    s_set_gpr_idx_on s9, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v8
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v8, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
+; MOVREL-NEXT:    s_mov_b32 m0, s9
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v4, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v5, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v6, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v7, s7
+; MOVREL-NEXT:    v_movreld_b32_e32 v0, v8
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x float> %vec, float %val, i32 %idx
+  ret <7 x float> %insert
+}
+
+define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %vec, float %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s0
+; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT:  BB46_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v9
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v9
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v11
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, v12
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v13
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, v14
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, v15
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, v16
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, v17
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v8
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB46_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v17, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v13, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v15, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v16, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v11, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v10, s0
+; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:  BB46_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
+; MOVREL-NEXT:    v_mov_b32_e32 v2, v10
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v11
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v12
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v13
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
+; MOVREL-NEXT:    s_mov_b32 m0, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v14
+; MOVREL-NEXT:    v_mov_b32_e32 v7, v15
+; MOVREL-NEXT:    v_mov_b32_e32 v8, v16
+; MOVREL-NEXT:    v_mov_b32_e32 v9, v17
+; MOVREL-NEXT:    v_movreld_b32_e32 v2, v0
+; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT:    s_cbranch_execnz BB46_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v1, v3
+; MOVREL-NEXT:    v_mov_b32_e32 v2, v4
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v5
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v6
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v7
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v8
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x float> %vec, float %val, i32 %idx
+  ret <7 x float> %insert
+}
+
+define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_v_v_s(<7 x float> %vec, float %val, i32 inreg %idx) { ; <7 x float> insert: vector and value are plain args, only the index is inreg (s2 in the expected asm)
+; GPRIDX-LABEL: dyn_insertelement_v7f32_v_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v7
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f32_v_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 m0, s2
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    v_movreld_b32_e32 v0, v7
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x float> %vec, float %val, i32 %idx ; expected asm: a single indexed write of v7 based at v0 (s_set_gpr_idx_on or m0 + v_movreld), no loop
+  ret <7 x float> %insert
+}
+
+define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_v_v_v(<7 x float> %vec, float %val, i32 %idx) { ; <7 x float> insert with no inreg operands; the index is read from v8 in the expected asm
+; GPRIDX-LABEL: dyn_insertelement_v7f32_v_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT:  BB48_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v8
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v8
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v7
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, v6
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, v5
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, v4
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, v3
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, v2
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, v7
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB48_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v10
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, v11
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v12
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, v13
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, v14
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, v15
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f32_v_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:  BB48_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v8
+; MOVREL-NEXT:    v_mov_b32_e32 v16, v7
+; MOVREL-NEXT:    v_mov_b32_e32 v9, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v15, v6
+; MOVREL-NEXT:    v_mov_b32_e32 v14, v5
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v8
+; MOVREL-NEXT:    s_mov_b32 m0, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v13, v4
+; MOVREL-NEXT:    v_mov_b32_e32 v12, v3
+; MOVREL-NEXT:    v_mov_b32_e32 v11, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v10, v1
+; MOVREL-NEXT:    v_movreld_b32_e32 v9, v7
+; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT:    s_cbranch_execnz BB48_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
+; MOVREL-NEXT:    v_mov_b32_e32 v1, v10
+; MOVREL-NEXT:    v_mov_b32_e32 v2, v11
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v12
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v13
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v14
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v15
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x float> %vec, float %val, i32 %idx ; expected asm: loop BB48_1 readfirstlanes the index, writes v7 indexed into a copy at v9.., masks exec and repeats until exec is empty, then copies v9-v15 back to v0-v6
+  ret <7 x float> %insert
+}
+
+define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_s_s(<7 x double> inreg %vec, double inreg %val, i32 inreg %idx) { ; <7 x double> insert with vector, value and index all inreg (s2-s15, s[16:17], s18 in the expected asm)
+; GPRIDX-LABEL: dyn_insertelement_v7f64_s_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    s_mov_b32 s11, s13
+; GPRIDX-NEXT:    s_mov_b32 s12, s14
+; GPRIDX-NEXT:    s_mov_b32 s13, s15
+; GPRIDX-NEXT:    s_mov_b32 m0, s18
+; GPRIDX-NEXT:    s_nop 0
+; GPRIDX-NEXT:    s_movreld_b64 s[0:1], s[16:17]
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f64_s_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 m0, s18
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    s_mov_b32 s10, s12
+; MOVREL-NEXT:    s_mov_b32 s11, s13
+; MOVREL-NEXT:    s_mov_b32 s12, s14
+; MOVREL-NEXT:    s_mov_b32 s13, s15
+; MOVREL-NEXT:    s_movreld_b64 s[0:1], s[16:17]
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x double> %vec, double %val, i32 %idx ; expected asm for both check prefixes: vector moved down to s0-s13, m0 set from s18, then one s_movreld_b64 based at s[0:1]
+  ret <7 x double> %insert
+}
+
+define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg %vec, double %val, i32 inreg %idx) { ; <7 x double> insert: vector and index are inreg, the value is a plain arg (v0/v1 in the expected asm)
+; GPRIDX-LABEL: dyn_insertelement_v7f64_s_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    s_mov_b32 s11, s13
+; GPRIDX-NEXT:    s_mov_b32 s12, s14
+; GPRIDX-NEXT:    s_mov_b32 s13, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s0
+; GPRIDX-NEXT:    s_lshl_b32 s0, s16, 1
+; GPRIDX-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v2
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v3
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v4
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v5
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v6
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v7
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v9
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v10
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v11
+; GPRIDX-NEXT:    v_readfirstlane_b32 s10, v12
+; GPRIDX-NEXT:    v_readfirstlane_b32 s11, v13
+; GPRIDX-NEXT:    v_readfirstlane_b32 s12, v14
+; GPRIDX-NEXT:    v_readfirstlane_b32 s13, v15
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f64_s_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    s_mov_b32 s10, s12
+; MOVREL-NEXT:    s_mov_b32 s11, s13
+; MOVREL-NEXT:    s_mov_b32 s12, s14
+; MOVREL-NEXT:    s_mov_b32 s13, s15
+; MOVREL-NEXT:    v_mov_b32_e32 v17, s15
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
+; MOVREL-NEXT:    s_lshl_b32 m0, s16, 1
+; MOVREL-NEXT:    v_mov_b32_e32 v16, s14
+; MOVREL-NEXT:    v_mov_b32_e32 v15, s13
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s12
+; MOVREL-NEXT:    v_mov_b32_e32 v13, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s10
+; MOVREL-NEXT:    v_mov_b32_e32 v11, s9
+; MOVREL-NEXT:    v_mov_b32_e32 v10, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v9, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v7, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v6, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v5, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v4, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    v_movreld_b32_e32 v2, v0
+; MOVREL-NEXT:    v_movreld_b32_e32 v3, v1
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v2
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v3
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v4
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v5
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v6
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v7
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v9
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v10
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v11
+; MOVREL-NEXT:    v_readfirstlane_b32 s10, v12
+; MOVREL-NEXT:    v_readfirstlane_b32 s11, v13
+; MOVREL-NEXT:    v_readfirstlane_b32 s12, v14
+; MOVREL-NEXT:    v_readfirstlane_b32 s13, v15
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x double> %vec, double %val, i32 %idx ; expected asm: vector copied into v2-v17, index s16 shifted left by 1, the two 32-bit halves v0/v1 written indexed at v2/v3, result read back into s0-s13 with v_readfirstlane; no loop
+  ret <7 x double> %insert
+}
+
+define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg %vec, double %val, i32 %idx) { ; <7 x double> insert: only the vector is inreg; value is v0/v1 and index is v2 in the expected asm
+; GPRIDX-LABEL: dyn_insertelement_v7f64_s_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 s10, s12
+; GPRIDX-NEXT:    s_mov_b32 s11, s13
+; GPRIDX-NEXT:    s_mov_b32 s12, s14
+; GPRIDX-NEXT:    s_mov_b32 s13, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v34, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v33, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v32, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v31, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v30, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v29, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v28, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v27, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v26, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v25, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v24, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v23, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v22, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v21, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v20, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, s0
+; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT:  BB51_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT:    s_lshl_b32 s3, s2, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v19
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, v20
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, v21
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, v22
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, v23
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, v24
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, v25
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, v26
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, v27
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, v28
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, v29
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, v30
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, v31
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v32
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, v33
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, v34
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB51_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v3
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v4
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v5
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v6
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v7
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v8
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v9
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v10
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v11
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v12
+; GPRIDX-NEXT:    v_readfirstlane_b32 s10, v13
+; GPRIDX-NEXT:    v_readfirstlane_b32 s11, v14
+; GPRIDX-NEXT:    v_readfirstlane_b32 s12, v15
+; GPRIDX-NEXT:    v_readfirstlane_b32 s13, v16
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f64_s_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    s_mov_b32 s10, s12
+; MOVREL-NEXT:    s_mov_b32 s11, s13
+; MOVREL-NEXT:    s_mov_b32 s12, s14
+; MOVREL-NEXT:    s_mov_b32 s13, s15
+; MOVREL-NEXT:    v_mov_b32_e32 v34, s15
+; MOVREL-NEXT:    v_mov_b32_e32 v33, s14
+; MOVREL-NEXT:    v_mov_b32_e32 v30, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v31, s12
+; MOVREL-NEXT:    v_mov_b32_e32 v32, s13
+; MOVREL-NEXT:    v_mov_b32_e32 v29, s10
+; MOVREL-NEXT:    v_mov_b32_e32 v28, s9
+; MOVREL-NEXT:    v_mov_b32_e32 v27, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v26, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v25, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v24, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v23, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v22, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v21, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v20, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v19, s0
+; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:  BB51_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v19
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v20
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v21
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v22
+; MOVREL-NEXT:    s_lshl_b32 m0, s1, 1
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v7, v23
+; MOVREL-NEXT:    v_mov_b32_e32 v8, v24
+; MOVREL-NEXT:    v_mov_b32_e32 v9, v25
+; MOVREL-NEXT:    v_mov_b32_e32 v10, v26
+; MOVREL-NEXT:    v_mov_b32_e32 v11, v27
+; MOVREL-NEXT:    v_mov_b32_e32 v12, v28
+; MOVREL-NEXT:    v_mov_b32_e32 v13, v29
+; MOVREL-NEXT:    v_mov_b32_e32 v14, v30
+; MOVREL-NEXT:    v_mov_b32_e32 v15, v31
+; MOVREL-NEXT:    v_mov_b32_e32 v16, v32
+; MOVREL-NEXT:    v_mov_b32_e32 v17, v33
+; MOVREL-NEXT:    v_mov_b32_e32 v18, v34
+; MOVREL-NEXT:    v_movreld_b32_e32 v3, v0
+; MOVREL-NEXT:    v_movreld_b32_e32 v4, v1
+; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT:    s_cbranch_execnz BB51_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v3
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v4
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v5
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v6
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v7
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v8
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v9
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v10
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v11
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v12
+; MOVREL-NEXT:    v_readfirstlane_b32 s10, v13
+; MOVREL-NEXT:    v_readfirstlane_b32 s11, v14
+; MOVREL-NEXT:    v_readfirstlane_b32 s12, v15
+; MOVREL-NEXT:    v_readfirstlane_b32 s13, v16
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x double> %vec, double %val, i32 %idx ; expected asm: vector staged in v19-v34, loop BB51_1 readfirstlanes the index, shifts it left by 1 and writes v0/v1 indexed at v3/v4 until exec is empty, then v3-v16 are read back into s0-s13
+  ret <7 x double> %insert
+}
+
+define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_s(<7 x double> %vec, double %val, i32 inreg %idx) { ; <7 x double> insert: vector and value are plain args, only the index is inreg (s2 in the expected asm)
+; GPRIDX-LABEL: dyn_insertelement_v7f64_v_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_lshl_b32 s0, s2, 1
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v15
+; GPRIDX-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v14
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v16
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v3
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v4
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v5
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v6
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v7
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v8
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v9
+; GPRIDX-NEXT:    v_readfirstlane_b32 s10, v10
+; GPRIDX-NEXT:    v_readfirstlane_b32 s11, v11
+; GPRIDX-NEXT:    v_readfirstlane_b32 s12, v12
+; GPRIDX-NEXT:    v_readfirstlane_b32 s13, v13
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f64_v_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_lshl_b32 m0, s2, 1
+; MOVREL-NEXT:    v_mov_b32_e32 v16, v15
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    v_movreld_b32_e32 v0, v14
+; MOVREL-NEXT:    v_movreld_b32_e32 v1, v16
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v2
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v3
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v4
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v5
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v6
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v7
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v8
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v9
+; MOVREL-NEXT:    v_readfirstlane_b32 s10, v10
+; MOVREL-NEXT:    v_readfirstlane_b32 s11, v11
+; MOVREL-NEXT:    v_readfirstlane_b32 s12, v12
+; MOVREL-NEXT:    v_readfirstlane_b32 s13, v13
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x double> %vec, double %val, i32 %idx ; expected asm: index s2 shifted left by 1, value halves v14 and v16 (a copy of v15) written indexed at v0/v1 in place, then v0-v13 read back into s0-s13; no loop
+  ret <7 x double> %insert
+}
+
+define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec, double %val, i32 %idx) { ; <7 x double> insert with no inreg operands; value is v14/v15 and index is v16 in the expected asm
+; GPRIDX-LABEL: dyn_insertelement_v7f64_v_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT:  BB53_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v16
+; GPRIDX-NEXT:    s_lshl_b32 s3, s2, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v16
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v32, v15
+; GPRIDX-NEXT:    v_mov_b32_e32 v31, v14
+; GPRIDX-NEXT:    v_mov_b32_e32 v30, v13
+; GPRIDX-NEXT:    v_mov_b32_e32 v29, v12
+; GPRIDX-NEXT:    v_mov_b32_e32 v28, v11
+; GPRIDX-NEXT:    v_mov_b32_e32 v27, v10
+; GPRIDX-NEXT:    v_mov_b32_e32 v26, v9
+; GPRIDX-NEXT:    v_mov_b32_e32 v25, v8
+; GPRIDX-NEXT:    v_mov_b32_e32 v24, v7
+; GPRIDX-NEXT:    v_mov_b32_e32 v23, v6
+; GPRIDX-NEXT:    v_mov_b32_e32 v22, v5
+; GPRIDX-NEXT:    v_mov_b32_e32 v21, v4
+; GPRIDX-NEXT:    v_mov_b32_e32 v20, v3
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, v2
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, v14
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, v15
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB53_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v17
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v18
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v19
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v20
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v21
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v22
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v23
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v24
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v25
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v26
+; GPRIDX-NEXT:    v_readfirstlane_b32 s10, v27
+; GPRIDX-NEXT:    v_readfirstlane_b32 s11, v28
+; GPRIDX-NEXT:    v_readfirstlane_b32 s12, v29
+; GPRIDX-NEXT:    v_readfirstlane_b32 s13, v30
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v7f64_v_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:  BB53_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v16
+; MOVREL-NEXT:    v_mov_b32_e32 v32, v15
+; MOVREL-NEXT:    v_mov_b32_e32 v17, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v31, v14
+; MOVREL-NEXT:    v_mov_b32_e32 v30, v13
+; MOVREL-NEXT:    s_lshl_b32 m0, s1, 1
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v16
+; MOVREL-NEXT:    v_mov_b32_e32 v29, v12
+; MOVREL-NEXT:    v_mov_b32_e32 v28, v11
+; MOVREL-NEXT:    v_mov_b32_e32 v27, v10
+; MOVREL-NEXT:    v_mov_b32_e32 v26, v9
+; MOVREL-NEXT:    v_mov_b32_e32 v25, v8
+; MOVREL-NEXT:    v_mov_b32_e32 v24, v7
+; MOVREL-NEXT:    v_mov_b32_e32 v23, v6
+; MOVREL-NEXT:    v_mov_b32_e32 v22, v5
+; MOVREL-NEXT:    v_mov_b32_e32 v21, v4
+; MOVREL-NEXT:    v_mov_b32_e32 v20, v3
+; MOVREL-NEXT:    v_mov_b32_e32 v19, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v18, v1
+; MOVREL-NEXT:    v_movreld_b32_e32 v17, v14
+; MOVREL-NEXT:    v_movreld_b32_e32 v18, v15
+; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT:    s_cbranch_execnz BB53_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v17
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v18
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v19
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v20
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v21
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v22
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v23
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v24
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v25
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v26
+; MOVREL-NEXT:    v_readfirstlane_b32 s10, v27
+; MOVREL-NEXT:    v_readfirstlane_b32 s11, v28
+; MOVREL-NEXT:    v_readfirstlane_b32 s12, v29
+; MOVREL-NEXT:    v_readfirstlane_b32 s13, v30
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <7 x double> %vec, double %val, i32 %idx ; expected asm: loop BB53_1 copies v0-v15 to v17-v32, readfirstlanes the index, shifts it left by 1 and writes v14/v15 indexed at v17/v18 until exec is empty, then v17-v30 are read back into s0-s13
+  ret <7 x double> %insert
+}
+
+define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_s_s(<5 x double> inreg %vec, double inreg %val, i32 inreg %idx) { ; <5 x double> insert with vector, value and index all inreg (s2-s11, s[12:13], s14 in the expected asm)
+; GPRIDX-LABEL: dyn_insertelement_v5f64_s_s_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    s_mov_b32 m0, s14
+; GPRIDX-NEXT:    s_nop 0
+; GPRIDX-NEXT:    s_movreld_b64 s[0:1], s[12:13]
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v5f64_s_s_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 m0, s14
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    s_movreld_b64 s[0:1], s[12:13]
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <5 x double> %vec, double %val, i32 %idx ; expected asm for both check prefixes: vector moved down to s0-s9, m0 set from s14, then one s_movreld_b64 based at s[0:1]
+  ret <5 x double> %insert
+}
+
+define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg %vec, double %val, i32 inreg %idx) { ; <5 x double> insert: vector and index are inreg, the value is a plain arg (v0/v1 in the expected asm)
+; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s0
+; GPRIDX-NEXT:    s_lshl_b32 s0, s12, 1
+; GPRIDX-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v2
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v3
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v4
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v5
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v6
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v7
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v9
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v10
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v11
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v5f64_s_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v17, s15
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
+; MOVREL-NEXT:    s_lshl_b32 m0, s12, 1
+; MOVREL-NEXT:    v_mov_b32_e32 v16, s14
+; MOVREL-NEXT:    v_mov_b32_e32 v15, s13
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s12
+; MOVREL-NEXT:    v_mov_b32_e32 v13, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s10
+; MOVREL-NEXT:    v_mov_b32_e32 v11, s9
+; MOVREL-NEXT:    v_mov_b32_e32 v10, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v9, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v7, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v6, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v5, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v4, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    v_movreld_b32_e32 v2, v0
+; MOVREL-NEXT:    v_movreld_b32_e32 v3, v1
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v2
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v3
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v4
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v5
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v6
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v7
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v9
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v10
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v11
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <5 x double> %vec, double %val, i32 %idx ; expected asm: s0-s15 copied into v2-v17 (the prologue only sets s0-s9), index s12 shifted left by 1, v0/v1 written indexed at v2/v3, then v2-v11 read back into s0-s9; no loop
+  ret <5 x double> %insert
+}
+
+define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg %vec, double %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b32 s0, s2
+; GPRIDX-NEXT:    s_mov_b32 s1, s3
+; GPRIDX-NEXT:    s_mov_b32 s2, s4
+; GPRIDX-NEXT:    s_mov_b32 s3, s5
+; GPRIDX-NEXT:    s_mov_b32 s4, s6
+; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    s_mov_b32 s6, s8
+; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    s_mov_b32 s8, s10
+; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v34, s15
+; GPRIDX-NEXT:    v_mov_b32_e32 v33, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v32, s13
+; GPRIDX-NEXT:    v_mov_b32_e32 v31, s12
+; GPRIDX-NEXT:    v_mov_b32_e32 v30, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v29, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v28, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v27, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v26, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v25, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v24, s5
+; GPRIDX-NEXT:    v_mov_b32_e32 v23, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v22, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v21, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v20, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, s0
+; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT:  BB56_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT:    s_lshl_b32 s3, s2, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v19
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, v20
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, v21
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, v22
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, v23
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, v24
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, v25
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, v26
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, v27
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, v28
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, v29
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, v30
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, v31
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v32
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, v33
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, v34
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, v1
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB56_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v3
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v4
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v5
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v6
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v7
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v8
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v9
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v10
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v11
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v12
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v5f64_s_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, s2
+; MOVREL-NEXT:    s_mov_b32 s1, s3
+; MOVREL-NEXT:    s_mov_b32 s2, s4
+; MOVREL-NEXT:    s_mov_b32 s3, s5
+; MOVREL-NEXT:    s_mov_b32 s4, s6
+; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    s_mov_b32 s6, s8
+; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    s_mov_b32 s8, s10
+; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v34, s15
+; MOVREL-NEXT:    v_mov_b32_e32 v33, s14
+; MOVREL-NEXT:    v_mov_b32_e32 v32, s13
+; MOVREL-NEXT:    v_mov_b32_e32 v31, s12
+; MOVREL-NEXT:    v_mov_b32_e32 v30, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v29, s10
+; MOVREL-NEXT:    v_mov_b32_e32 v28, s9
+; MOVREL-NEXT:    v_mov_b32_e32 v27, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v26, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v25, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v24, s5
+; MOVREL-NEXT:    v_mov_b32_e32 v23, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v22, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v21, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v20, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v19, s0
+; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:  BB56_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v3, v19
+; MOVREL-NEXT:    v_mov_b32_e32 v4, v20
+; MOVREL-NEXT:    v_mov_b32_e32 v5, v21
+; MOVREL-NEXT:    v_mov_b32_e32 v6, v22
+; MOVREL-NEXT:    s_lshl_b32 m0, s1, 1
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v7, v23
+; MOVREL-NEXT:    v_mov_b32_e32 v8, v24
+; MOVREL-NEXT:    v_mov_b32_e32 v9, v25
+; MOVREL-NEXT:    v_mov_b32_e32 v10, v26
+; MOVREL-NEXT:    v_mov_b32_e32 v11, v27
+; MOVREL-NEXT:    v_mov_b32_e32 v12, v28
+; MOVREL-NEXT:    v_mov_b32_e32 v13, v29
+; MOVREL-NEXT:    v_mov_b32_e32 v14, v30
+; MOVREL-NEXT:    v_mov_b32_e32 v15, v31
+; MOVREL-NEXT:    v_mov_b32_e32 v16, v32
+; MOVREL-NEXT:    v_mov_b32_e32 v17, v33
+; MOVREL-NEXT:    v_mov_b32_e32 v18, v34
+; MOVREL-NEXT:    v_movreld_b32_e32 v3, v0
+; MOVREL-NEXT:    v_movreld_b32_e32 v4, v1
+; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT:    s_cbranch_execnz BB56_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v3
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v4
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v5
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v6
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v7
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v8
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v9
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v10
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v11
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v12
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <5 x double> %vec, double %val, i32 %idx
+  ret <5 x double> %insert
+}
+
+define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec, double %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v5f64_v_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_lshl_b32 s0, s2, 1
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v11
+; GPRIDX-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v10
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, v16
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v3
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v4
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v5
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v6
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v7
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v8
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v9
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_lshl_b32 m0, s2, 1
+; MOVREL-NEXT:    v_mov_b32_e32 v16, v11
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:    v_movreld_b32_e32 v0, v10
+; MOVREL-NEXT:    v_movreld_b32_e32 v1, v16
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v1
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v2
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v3
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v4
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v5
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v6
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v7
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v8
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v9
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <5 x double> %vec, double %val, i32 %idx
+  ret <5 x double> %insert
+}
+
+define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec, double %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v5f64_v_v_v:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT:  BB58_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v12
+; GPRIDX-NEXT:    s_lshl_b32 s3, s2, 1
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v12
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v28, v15
+; GPRIDX-NEXT:    v_mov_b32_e32 v27, v14
+; GPRIDX-NEXT:    v_mov_b32_e32 v26, v13
+; GPRIDX-NEXT:    v_mov_b32_e32 v25, v12
+; GPRIDX-NEXT:    v_mov_b32_e32 v24, v11
+; GPRIDX-NEXT:    v_mov_b32_e32 v23, v10
+; GPRIDX-NEXT:    v_mov_b32_e32 v22, v9
+; GPRIDX-NEXT:    v_mov_b32_e32 v21, v8
+; GPRIDX-NEXT:    v_mov_b32_e32 v20, v7
+; GPRIDX-NEXT:    v_mov_b32_e32 v19, v6
+; GPRIDX-NEXT:    v_mov_b32_e32 v18, v5
+; GPRIDX-NEXT:    v_mov_b32_e32 v17, v4
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v3
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, v2
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, v1
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, v10
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, v11
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB58_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v13
+; GPRIDX-NEXT:    v_readfirstlane_b32 s1, v14
+; GPRIDX-NEXT:    v_readfirstlane_b32 s2, v15
+; GPRIDX-NEXT:    v_readfirstlane_b32 s3, v16
+; GPRIDX-NEXT:    v_readfirstlane_b32 s4, v17
+; GPRIDX-NEXT:    v_readfirstlane_b32 s5, v18
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v19
+; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v20
+; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v21
+; GPRIDX-NEXT:    v_readfirstlane_b32 s9, v22
+; GPRIDX-NEXT:    ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_mov_b32 s0, exec_lo
+; MOVREL-NEXT:    ; implicit-def: $vcc_hi
+; MOVREL-NEXT:  BB58_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v12
+; MOVREL-NEXT:    v_mov_b32_e32 v28, v15
+; MOVREL-NEXT:    v_mov_b32_e32 v27, v14
+; MOVREL-NEXT:    v_mov_b32_e32 v26, v13
+; MOVREL-NEXT:    v_mov_b32_e32 v25, v12
+; MOVREL-NEXT:    v_mov_b32_e32 v24, v11
+; MOVREL-NEXT:    v_mov_b32_e32 v23, v10
+; MOVREL-NEXT:    v_mov_b32_e32 v22, v9
+; MOVREL-NEXT:    v_mov_b32_e32 v21, v8
+; MOVREL-NEXT:    v_mov_b32_e32 v20, v7
+; MOVREL-NEXT:    v_mov_b32_e32 v19, v6
+; MOVREL-NEXT:    v_mov_b32_e32 v18, v5
+; MOVREL-NEXT:    v_mov_b32_e32 v17, v4
+; MOVREL-NEXT:    v_mov_b32_e32 v16, v3
+; MOVREL-NEXT:    v_mov_b32_e32 v15, v2
+; MOVREL-NEXT:    v_mov_b32_e32 v14, v1
+; MOVREL-NEXT:    v_mov_b32_e32 v13, v0
+; MOVREL-NEXT:    s_lshl_b32 m0, s1, 1
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v12
+; MOVREL-NEXT:    v_movreld_b32_e32 v13, v10
+; MOVREL-NEXT:    v_movreld_b32_e32 v14, v11
+; MOVREL-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT:    s_cbranch_execnz BB58_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b32 exec_lo, s0
+; MOVREL-NEXT:    v_readfirstlane_b32 s0, v13
+; MOVREL-NEXT:    v_readfirstlane_b32 s1, v14
+; MOVREL-NEXT:    v_readfirstlane_b32 s2, v15
+; MOVREL-NEXT:    v_readfirstlane_b32 s3, v16
+; MOVREL-NEXT:    v_readfirstlane_b32 s4, v17
+; MOVREL-NEXT:    v_readfirstlane_b32 s5, v18
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v19
+; MOVREL-NEXT:    v_readfirstlane_b32 s7, v20
+; MOVREL-NEXT:    v_readfirstlane_b32 s8, v21
+; MOVREL-NEXT:    v_readfirstlane_b32 s9, v22
+; MOVREL-NEXT:    ; return to shader part epilog
+entry:
+  %insert = insertelement <5 x double> %vec, double %val, i32 %idx
+  ret <5 x double> %insert
+}

From 2bf3fe9b6dedf727990e68244a3d637518ea8bc3 Mon Sep 17 00:00:00 2001
From: Layton Kifer <laytonkifer@gmail.com>
Date: Wed, 27 May 2020 16:54:20 -0700
Subject: [PATCH 296/770] [TRE] Allow elimination when the returned value is
 non-constant

Currently we can only eliminate call/return pairs that either return the
result of the call or a dynamic constant. This patch removes that
limitation.

Differential Revision: https://reviews.llvm.org/D79660
---
 .../Scalar/TailRecursionElimination.cpp       | 117 ++++++++++++------
 .../2010-06-26-MultipleReturnValues.ll        | 104 +++++++++++++++-
 llvm/test/Transforms/TailCallElim/basic.ll    |  10 +-
 3 files changed, 185 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 4fd63fa1838bf..a752e356b7273 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -460,6 +460,16 @@ class TailRecursionEliminator {
   SmallVector<PHINode *, 8> ArgumentPHIs;
   bool RemovableCallsMustBeMarkedTail = false;
 
+  // PHI node to store our return value.
+  PHINode *RetPN = nullptr;
+
+  // i1 PHI node to track if we have a valid return value stored in RetPN.
+  PHINode *RetKnownPN = nullptr;
+
+  // Vector of select instructions we inserted. These selects use RetKnownPN
+  // to either propagate RetPN or select a new return value.
+  SmallVector<SelectInst *, 8> RetSelects;
+
   TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
                           AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                           DomTreeUpdater &DTU)
@@ -577,6 +587,21 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
     PN->addIncoming(&*I, NewEntry);
     ArgumentPHIs.push_back(PN);
   }
+
+  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
+  // nodes to track our return value. We initialize RetPN with undef and
+  // RetKnownPN with false since we can't know our return value at function
+  // entry.
+  Type *RetType = F.getReturnType();
+  if (!RetType->isVoidTy()) {
+    Type *BoolType = Type::getInt1Ty(F.getContext());
+    RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
+    RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);
+
+    RetPN->addIncoming(UndefValue::get(RetType), NewEntry);
+    RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
+  }
+
   // The entry block was changed from HeaderBB to NewEntry.
   // The forward DominatorTree needs to be recalculated when the EntryBB is
   // changed. In this corner-case we recalculate the entire tree.
@@ -616,11 +641,7 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
   // value for the accumulator is placed in this variable.  If this value is set
   // then we actually perform accumulator recursion elimination instead of
   // simple tail recursion elimination.  If the operation is an LLVM instruction
-  // (eg: "add") then it is recorded in AccumulatorRecursionInstr.  If not, then
-  // we are handling the case when the return instruction returns a constant C
-  // which is different to the constant returned by other return instructions
-  // (which is recorded in AccumulatorRecursionEliminationInitVal).  This is a
-  // special case of accumulator recursion, the operation being "return C".
+  // (eg: "add") then it is recorded in AccumulatorRecursionInstr.
   Value *AccumulatorRecursionEliminationInitVal = nullptr;
   Instruction *AccumulatorRecursionInstr = nullptr;
 
@@ -647,26 +668,6 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
     }
   }
 
-  // We can only transform call/return pairs that either ignore the return value
-  // of the call and return void, ignore the value of the call and return a
-  // constant, return the value returned by the tail call, or that are being
-  // accumulator recursion variable eliminated.
-  if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
-      !isa<UndefValue>(Ret->getReturnValue()) &&
-      AccumulatorRecursionEliminationInitVal == nullptr &&
-      !getCommonReturnValue(nullptr, CI)) {
-    // One case remains that we are able to handle: the current return
-    // instruction returns a constant, and all other return instructions
-    // return a different constant.
-    if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
-      return false; // Current return instruction does not return a constant.
-    // Check that all other return instructions return a common constant.  If
-    // so, record it in AccumulatorRecursionEliminationInitVal.
-    AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
-    if (!AccumulatorRecursionEliminationInitVal)
-      return false;
-  }
-
   BasicBlock *BB = Ret->getParent();
 
   using namespace ore;
@@ -698,20 +699,15 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
     PHINode *AccPN = insertAccumulator(AccumulatorRecursionEliminationInitVal);
 
     Instruction *AccRecInstr = AccumulatorRecursionInstr;
-    if (AccRecInstr) {
-      // Add an incoming argument for the current block, which is computed by
-      // our associative and commutative accumulator instruction.
-      AccPN->addIncoming(AccRecInstr, BB);
-
-      // Next, rewrite the accumulator recursion instruction so that it does not
-      // use the result of the call anymore, instead, use the PHI node we just
-      // inserted.
-      AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
-    } else {
-      // Add an incoming argument for the current block, which is just the
-      // constant returned by the current return instruction.
-      AccPN->addIncoming(Ret->getReturnValue(), BB);
-    }
+
+    // Add an incoming argument for the current block, which is computed by
+    // our associative and commutative accumulator instruction.
+    AccPN->addIncoming(AccRecInstr, BB);
+
+    // Next, rewrite the accumulator recursion instruction so that it does not
+    // use the result of the call anymore, instead, use the PHI node we just
+    // inserted.
+    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
 
     // Finally, rewrite any return instructions in the program to return the PHI
     // node instead of the "initval" that they do currently.  This loop will
@@ -722,6 +718,25 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
     ++NumAccumAdded;
   }
 
+  // Update our return value tracking
+  if (RetPN) {
+    if (Ret->getReturnValue() == CI || AccumulatorRecursionEliminationInitVal) {
+      // Defer selecting a return value
+      RetPN->addIncoming(RetPN, BB);
+      RetKnownPN->addIncoming(RetKnownPN, BB);
+    } else {
+      // We found a return value we want to use, insert a select instruction to
+      // select it if we don't already know what our return value will be and
+      // store the result in our return value PHI node.
+      SelectInst *SI = SelectInst::Create(
+          RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
+      RetSelects.push_back(SI);
+
+      RetPN->addIncoming(SI, BB);
+      RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
+    }
+  }
+
   // Now that all of the PHI nodes are in place, remove the call and
   // ret instructions, replacing them with an unconditional branch.
   BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
@@ -804,6 +819,30 @@ void TailRecursionEliminator::cleanupAndFinalize() {
       PN->eraseFromParent();
     }
   }
+
+  if (RetPN) {
+    if (RetSelects.empty()) {
+      // If we didn't insert any select instructions, then we know we didn't
+      // store a return value and we can remove the PHI nodes we inserted.
+      RetPN->dropAllReferences();
+      RetPN->eraseFromParent();
+
+      RetKnownPN->dropAllReferences();
+      RetKnownPN->eraseFromParent();
+    } else {
+      // We need to insert a select instruction before any return left in the
+      // function to select our stored return value if we have one.
+      for (BasicBlock &BB : F) {
+        ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+        if (!RI)
+          continue;
+
+        SelectInst *SI = SelectInst::Create(
+            RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
+        RI->setOperand(0, SI);
+      }
+    }
+  }
 }
 
 bool TailRecursionEliminator::eliminate(Function &F,
diff --git a/llvm/test/Transforms/TailCallElim/2010-06-26-MultipleReturnValues.ll b/llvm/test/Transforms/TailCallElim/2010-06-26-MultipleReturnValues.ll
index 48110e3283cfb..4e0346c14c345 100644
--- a/llvm/test/Transforms/TailCallElim/2010-06-26-MultipleReturnValues.ll
+++ b/llvm/test/Transforms/TailCallElim/2010-06-26-MultipleReturnValues.ll
@@ -1,20 +1,112 @@
 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
 ; PR7328
 ; PR7506
-define i32 @foo(i32 %x) {
-; CHECK-LABEL: define i32 @foo(
-; CHECK: %accumulator.tr = phi i32 [ 1, %entry ], [ 0, %body ]
+define i32 @test1_constants(i32 %x) {
 entry:
   %cond = icmp ugt i32 %x, 0                      ; <i1> [#uses=1]
   br i1 %cond, label %return, label %body
 
 body:                                             ; preds = %entry
   %y = add i32 %x, 1                              ; <i32> [#uses=1]
-  %tmp = call i32 @foo(i32 %y)                    ; <i32> [#uses=0]
-; CHECK-NOT: call
+  %recurse = call i32 @test1_constants(i32 %y)        ; <i32> [#uses=0]
   ret i32 0
-; CHECK: ret i32 %accumulator.tr
 
 return:                                           ; preds = %entry
   ret i32 1
 }
+
+; CHECK-LABEL: define i32 @test1_constants(
+; CHECK: tailrecurse:
+; CHECK: %ret.tr = phi i32 [ undef, %entry ], [ %current.ret.tr, %body ]
+; CHECK: %ret.known.tr = phi i1 [ false, %entry ], [ true, %body ]
+; CHECK: body:
+; CHECK-NOT: %recurse
+; CHECK: %current.ret.tr = select i1 %ret.known.tr, i32 %ret.tr, i32 0
+; CHECK-NOT: ret
+; CHECK: return:
+; CHECK: %current.ret.tr1 = select i1 %ret.known.tr, i32 %ret.tr, i32 1
+; CHECK: ret i32 %current.ret.tr1
+
+define i32 @test2_non_constants(i32 %x) {
+entry:
+  %cond = icmp ugt i32 %x, 0
+  br i1 %cond, label %return, label %body
+
+body:
+  %y = add i32 %x, 1
+  %helper1 = call i32 @test2_helper()
+  %recurse = call i32 @test2_non_constants(i32 %y)
+  ret i32 %helper1
+
+return:
+  %helper2 = call i32 @test2_helper()
+  ret i32 %helper2
+}
+
+declare i32 @test2_helper()
+
+; CHECK-LABEL: define i32 @test2_non_constants(
+; CHECK: tailrecurse:
+; CHECK: %ret.tr = phi i32 [ undef, %entry ], [ %current.ret.tr, %body ]
+; CHECK: %ret.known.tr = phi i1 [ false, %entry ], [ true, %body ]
+; CHECK: body:
+; CHECK-NOT: %recurse
+; CHECK: %current.ret.tr = select i1 %ret.known.tr, i32 %ret.tr, i32 %helper1
+; CHECK-NOT: ret
+; CHECK: return:
+; CHECK: %current.ret.tr1 = select i1 %ret.known.tr, i32 %ret.tr, i32 %helper2
+; CHECK: ret i32 %current.ret.tr1
+
+define i32 @test3_mixed(i32 %x) {
+entry:
+  switch i32 %x, label %default [
+    i32 0, label %case0
+    i32 1, label %case1
+    i32 2, label %case2
+  ]
+
+case0:
+  %helper1 = call i32 @test3_helper()
+  br label %return
+
+case1:
+  %y1 = add i32 %x, -1
+  %recurse1 = call i32 @test3_mixed(i32 %y1)
+  br label %return
+
+case2:
+  %y2 = add i32 %x, -1
+  %helper2 = call i32 @test3_helper()
+  %recurse2 = call i32 @test3_mixed(i32 %y2)
+  br label %return
+
+default:
+  %y3 = urem i32 %x, 3
+  %recurse3 = call i32 @test3_mixed(i32 %y3)
+  br label %return
+
+return:
+  %retval = phi i32 [ %recurse3, %default ], [ %helper2, %case2 ], [ 9, %case1 ], [ %helper1, %case0 ]
+  ret i32 %retval
+}
+
+declare i32 @test3_helper()
+
+; CHECK-LABEL: define i32 @test3_mixed(
+; CHECK: tailrecurse:
+; CHECK: %ret.tr = phi i32 [ undef, %entry ], [ %current.ret.tr, %case1 ], [ %current.ret.tr1, %case2 ], [ %ret.tr, %default ]
+; CHECK: %ret.known.tr = phi i1 [ false, %entry ], [ true, %case1 ], [ true, %case2 ], [ %ret.known.tr, %default ]
+; CHECK: case1:
+; CHECK-NOT: %recurse
+; CHECK: %current.ret.tr = select i1 %ret.known.tr, i32 %ret.tr, i32 9
+; CHECK: br label %tailrecurse
+; CHECK: case2:
+; CHECK-NOT: %recurse
+; CHECK: %current.ret.tr1 = select i1 %ret.known.tr, i32 %ret.tr, i32 %helper2
+; CHECK: br label %tailrecurse
+; CHECK: default:
+; CHECK-NOT: %recurse
+; CHECK: br label %tailrecurse
+; CHECK: return:
+; CHECK: %current.ret.tr2 = select i1 %ret.known.tr, i32 %ret.tr, i32 %helper1
+; CHECK: ret i32 %current.ret.tr2
diff --git a/llvm/test/Transforms/TailCallElim/basic.ll b/llvm/test/Transforms/TailCallElim/basic.ll
index 576f2fec1244f..6116014a024b1 100644
--- a/llvm/test/Transforms/TailCallElim/basic.ll
+++ b/llvm/test/Transforms/TailCallElim/basic.ll
@@ -46,8 +46,16 @@ endif.0:		; preds = %entry
 ; plunked it into the demo script, so maybe they care about it.
 define i32 @test3(i32 %c) {
 ; CHECK: i32 @test3
+; CHECK: tailrecurse:
+; CHECK: %ret.tr = phi i32 [ undef, %entry ], [ %current.ret.tr, %else ]
+; CHECK: %ret.known.tr = phi i1 [ false, %entry ], [ true, %else ]
+; CHECK: else:
 ; CHECK-NOT: call
-; CHECK: ret i32 0
+; CHECK: %current.ret.tr = select i1 %ret.known.tr, i32 %ret.tr, i32 0
+; CHECK-NOT: ret
+; CHECK: return:
+; CHECK: %current.ret.tr1 = select i1 %ret.known.tr, i32 %ret.tr, i32 0
+; CHECK: ret i32 %current.ret.tr1
 entry:
 	%tmp.1 = icmp eq i32 %c, 0		; <i1> [#uses=1]
 	br i1 %tmp.1, label %return, label %else

From 54d289685260da85fc43c59db2550b18df7c33a5 Mon Sep 17 00:00:00 2001
From: Rui Ueyama <ruiu@google.com>
Date: Wed, 27 May 2020 16:18:55 -0700
Subject: [PATCH 297/770] [ELF] --wrap: Drop __real_ symbol from the symbol
 table

In D34993, we discussed and concluded that we should drop the `__real_`
symbol from the symbol table, but I did the opposite in D50569.
This patch drops the `__real_` symbol.

MaskRay's note: omitting `__real_` is important if it is undefined;
otherwise a subsequent link may fail due to the undefined `__real_` in .dynsym.

Differential Revision: https://reviews.llvm.org/D51283
---
 lld/ELF/SymbolTable.cpp     | 14 ++++++++------
 lld/test/ELF/lto/wrap-2.ll  |  4 ----
 lld/test/ELF/wrap-no-real.s | 35 +++++++++++++++++++----------------
 lld/test/ELF/wrap.s         |  7 +------
 4 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp
index ccec90727730c..2097234f1adb2 100644
--- a/lld/ELF/SymbolTable.cpp
+++ b/lld/ELF/SymbolTable.cpp
@@ -40,12 +40,14 @@ void SymbolTable::wrap(Symbol *sym, Symbol *real, Symbol *wrap) {
   idx2 = idx1;
   idx1 = idx3;
 
-  // Now renaming is complete. No one refers Real symbol. We could leave
-  // Real as-is, but if Real is written to the symbol table, that may
-  // contain irrelevant values. So, we copy all values from Sym to Real.
-  StringRef s = real->getName();
-  memcpy(real, sym, sizeof(SymbolUnion));
-  real->setName(s);
+  // Now renaming is complete, and no one refers to real. We drop real from
+  // .symtab and .dynsym. If real is undefined, it is important that we don't
+  // leave it in .dynsym, because otherwise it might lead to an undefined symbol
+  // error in a subsequent link. If real is defined, we could emit real as an
+  // alias for sym, but that could degrade the user experience of some tools
+  // that can print out only one symbol for each location: sym is the preferred
+  // name over real, but they might print out real instead.
+  real->isUsedInRegularObj = false;
 }
 
 // Find an existing symbol or create a new one.
diff --git a/lld/test/ELF/lto/wrap-2.ll b/lld/test/ELF/lto/wrap-2.ll
index 4c8993aa0424c..a441c9837bdaa 100644
--- a/lld/test/ELF/lto/wrap-2.ll
+++ b/lld/test/ELF/lto/wrap-2.ll
@@ -27,10 +27,6 @@
 ; BIND-NEXT: Value:
 ; BIND-NEXT: Size:
 ; BIND-NEXT: Binding: Local
-; BIND:      Name: __real_bar
-; BIND-NEXT: Value:
-; BIND-NEXT: Size:
-; BIND-NEXT: Binding: Local
 ; BIND:      Name: __wrap_bar
 ; BIND-NEXT: Value:
 ; BIND-NEXT: Size:
diff --git a/lld/test/ELF/wrap-no-real.s b/lld/test/ELF/wrap-no-real.s
index 41a23185093be..0ee95edd8e305 100644
--- a/lld/test/ELF/wrap-no-real.s
+++ b/lld/test/ELF/wrap-no-real.s
@@ -5,27 +5,30 @@
 // RUN: ld.lld -o %t3.so -shared %t3.o
 
 // RUN: ld.lld -o %t %t1.o %t2.o -wrap foo
-// RUN: llvm-objdump -d --print-imm-hex %t | FileCheck %s
-
-// RUN: ld.lld -o %t %t1.o %t2.o %t3.so -wrap foo
-// RUN: llvm-objdump -d --print-imm-hex %t | FileCheck %s
+// RUN: llvm-objdump -d %t | FileCheck %s
+// RUN: llvm-readelf -s -x .got %t | FileCheck --check-prefix=READELF --implicit-check-not=__real_ %s
 
 // CHECK: <_start>:
-// CHECK-NEXT: movl $0x11010, %edx
-// CHECK-NEXT: movl $0x11010, %edx
-// CHECK-NEXT: movl $0x11000, %edx
+// CHECK-NEXT: movq {{.*}}(%rip), %rax  # 2021a8
+// CHECK-NEXT: movq {{.*}}(%rip), %rbx  # 2021a8
+// CHECK-NEXT: movq {{.*}}(%rip), %rcx  # 2021b0
 
-// RUN: llvm-objdump -t %t | FileCheck --check-prefix=SYM %s
+// READELF:      0000000000011010  0 NOTYPE GLOBAL DEFAULT ABS __wrap_foo
+// READELF:      0000000000011000  0 NOTYPE GLOBAL DEFAULT ABS foo
+// READELF:      Hex dump of section '.got':
+// READELF-NEXT: 0x[[#%x,ADDR:]] 10100100 00000000 00100100 00000000
 
+// RUN: ld.lld -o %t2 %t1.o %t2.o %t3.so --wrap foo
+// RUN: llvm-objdump -d %t2 | FileCheck --check-prefix=CHECK2 %s
+// RUN: llvm-readelf -s -x .got %t2 | FileCheck --check-prefix=READELF --implicit-check-not=__real_ %s
 
-// SYM:      {{.*}}           l .dynamic 0000000000000000 .hidden _DYNAMIC
-// SYM-NEXT: 0000000000011000 g *ABS*    0000000000000000 __real_foo
-// SYM-NEXT: 0000000000011010 g *ABS*    0000000000000000 __wrap_foo
-// SYM-NEXT: {{.*}}           g .text    0000000000000000 _start
-// SYM-NEXT: 0000000000011000 g *ABS*    0000000000000000 foo
+// CHECK2: <_start>:
+// CHECK2-NEXT: movq {{.*}}(%rip), %rax  # 2022f8
+// CHECK2-NEXT: movq {{.*}}(%rip), %rbx  # 2022f8
+// CHECK2-NEXT: movq {{.*}}(%rip), %rcx  # 202300
 
 .global _start
 _start:
-  movl $foo, %edx
-  movl $__wrap_foo, %edx
-  movl $__real_foo, %edx
+  mov foo@gotpcrel(%rip), %rax
+  mov __wrap_foo@gotpcrel(%rip), %rbx
+  mov __real_foo@gotpcrel(%rip), %rcx
diff --git a/lld/test/ELF/wrap.s b/lld/test/ELF/wrap.s
index 5718ea45f669f..2a3e56cb0af57 100644
--- a/lld/test/ELF/wrap.s
+++ b/lld/test/ELF/wrap.s
@@ -33,12 +33,7 @@
 // SYM2-NEXT: Other [
 // SYM2-NEXT:   STV_PROTECTED
 // SYM2-NEXT: ]
-// SYM3:      Name: __real_foo
-// SYM3-NEXT: Value: 0x11000
-// SYM3-NEXT: Size:
-// SYM3-NEXT: Binding: Global
-// SYM3-NEXT: Type:    None
-// SYM3-NEXT: Other:   0
+// SYM3-NOT:  Name: __real_foo
 
 .global _start
 _start:

From dee2bb58107fc3ce438d2a12c778bb0ab485b592 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 17:12:02 -0700
Subject: [PATCH 298/770] [gn build] Port D80579

---
 llvm/utils/gn/secondary/llvm/lib/Analysis/ML/BUILD.gn | 10 ++++++++++
 llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn      |  1 +
 2 files changed, 11 insertions(+)
 create mode 100644 llvm/utils/gn/secondary/llvm/lib/Analysis/ML/BUILD.gn

diff --git a/llvm/utils/gn/secondary/llvm/lib/Analysis/ML/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Analysis/ML/BUILD.gn
new file mode 100644
index 0000000000000..86b86252eed05
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/lib/Analysis/ML/BUILD.gn
@@ -0,0 +1,10 @@
+static_library("MLPolicies") {
+  output_name = "LLVMMLPolicies"
+  deps = [
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    "InlineFeaturesAnalysis.cpp",
+  ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
index 5044fc16a7aac..e1779112abe42 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn
@@ -2,6 +2,7 @@ static_library("Passes") {
   output_name = "LLVMPasses"
   deps = [
     "//llvm/lib/Analysis",
+    "//llvm/lib/Analysis/ML:MLPolicies",
     "//llvm/lib/CodeGen",
     "//llvm/lib/IR",
     "//llvm/lib/Support",

From d14ee1553e46634ef6b7eb0d7c0b45fd3c30567f Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 27 May 2020 17:04:28 -0700
Subject: [PATCH 299/770] [llvm][NFC] ProfileSummaryInfo - const-ify APIs

Follow-up from https://reviews.llvm.org/D79920
---
 .../llvm/Analysis/ProfileSummaryInfo.h        | 82 ++++++++++---------
 llvm/lib/Analysis/ProfileSummaryInfo.cpp      | 80 +++++++++---------
 2 files changed, 82 insertions(+), 80 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
index e650e1c9d6890..9fcceb93dbc40 100644
--- a/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/llvm/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -52,7 +52,7 @@ class ProfileSummaryInfo {
   // percentile is above a large threshold.
   Optional<bool> HasLargeWorkingSetSize;
   // Compute the threshold for a given cutoff.
-  Optional<uint64_t> computeThreshold(int PercentileCutoff);
+  Optional<uint64_t> computeThreshold(int PercentileCutoff) const;
   // The map that caches the threshold values. The keys are the percentile
   // cutoff values and the values are the corresponding threshold values.
   mutable DenseMap<int, uint64_t> ThresholdCache;
@@ -68,19 +68,19 @@ class ProfileSummaryInfo {
   bool hasProfileSummary() const { return Summary != nullptr; }
 
   /// Returns true if module \c M has sample profile.
-  bool hasSampleProfile() {
+  bool hasSampleProfile() const {
     return hasProfileSummary() &&
            Summary->getKind() == ProfileSummary::PSK_Sample;
   }
 
   /// Returns true if module \c M has instrumentation profile.
-  bool hasInstrumentationProfile() {
+  bool hasInstrumentationProfile() const {
     return hasProfileSummary() &&
            Summary->getKind() == ProfileSummary::PSK_Instr;
   }
 
   /// Returns true if module \c M has context sensitive instrumentation profile.
-  bool hasCSInstrumentationProfile() {
+  bool hasCSInstrumentationProfile() const {
     return hasProfileSummary() &&
            Summary->getKind() == ProfileSummary::PSK_CSInstr;
   }
@@ -99,84 +99,86 @@ class ProfileSummaryInfo {
   /// Returns the profile count for \p CallInst.
   Optional<uint64_t> getProfileCount(const CallBase &CallInst,
                                      BlockFrequencyInfo *BFI,
-                                     bool AllowSynthetic = false);
+                                     bool AllowSynthetic = false) const;
   /// Returns true if module \c M has partial-profile sample profile.
-  bool hasPartialSampleProfile();
+  bool hasPartialSampleProfile() const;
   /// Returns true if the working set size of the code is considered huge.
-  bool hasHugeWorkingSetSize();
+  bool hasHugeWorkingSetSize() const;
   /// Returns true if the working set size of the code is considered large.
-  bool hasLargeWorkingSetSize();
+  bool hasLargeWorkingSetSize() const;
   /// Returns true if \p F has hot function entry.
-  bool isFunctionEntryHot(const Function *F);
+  bool isFunctionEntryHot(const Function *F) const;
   /// Returns true if \p F contains hot code.
-  bool isFunctionHotInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
+  bool isFunctionHotInCallGraph(const Function *F,
+                                BlockFrequencyInfo &BFI) const;
   /// Returns true if \p F has cold function entry.
-  bool isFunctionEntryCold(const Function *F);
+  bool isFunctionEntryCold(const Function *F) const;
   /// Returns true if \p F contains only cold code.
-  bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
+  bool isFunctionColdInCallGraph(const Function *F,
+                                 BlockFrequencyInfo &BFI) const;
   /// Returns true if the hotness of \p F is unknown.
-  bool isFunctionHotnessUnknown(const Function &F);
+  bool isFunctionHotnessUnknown(const Function &F) const;
   /// Returns true if \p F contains hot code with regard to a given hot
   /// percentile cutoff value.
   bool isFunctionHotInCallGraphNthPercentile(int PercentileCutoff,
                                              const Function *F,
-                                             BlockFrequencyInfo &BFI);
+                                             BlockFrequencyInfo &BFI) const;
   /// Returns true if \p F contains cold code with regard to a given cold
   /// percentile cutoff value.
   bool isFunctionColdInCallGraphNthPercentile(int PercentileCutoff,
                                               const Function *F,
-                                              BlockFrequencyInfo &BFI);
+                                              BlockFrequencyInfo &BFI) const;
   /// Returns true if count \p C is considered hot.
-  bool isHotCount(uint64_t C);
+  bool isHotCount(uint64_t C) const;
   /// Returns true if count \p C is considered cold.
-  bool isColdCount(uint64_t C);
+  bool isColdCount(uint64_t C) const;
   /// Returns true if count \p C is considered hot with regard to a given
   /// hot percentile cutoff value.
-  bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C);
+  bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C) const;
   /// Returns true if count \p C is considered cold with regard to a given
   /// cold percentile cutoff value.
-  bool isColdCountNthPercentile(int PercentileCutoff, uint64_t C);
+  bool isColdCountNthPercentile(int PercentileCutoff, uint64_t C) const;
   /// Returns true if BasicBlock \p BB is considered hot.
-  bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI);
+  bool isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const;
   /// Returns true if BasicBlock \p BB is considered cold.
-  bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI);
+  bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const;
   /// Returns true if BasicBlock \p BB is considered hot with regard to a given
   /// hot percentile cutoff value.
-  bool isHotBlockNthPercentile(int PercentileCutoff,
-                               const BasicBlock *BB, BlockFrequencyInfo *BFI);
+  bool isHotBlockNthPercentile(int PercentileCutoff, const BasicBlock *BB,
+                               BlockFrequencyInfo *BFI) const;
   /// Returns true if BasicBlock \p BB is considered cold with regard to a given
   /// cold percentile cutoff value.
-  bool isColdBlockNthPercentile(int PercentileCutoff,
-                                const BasicBlock *BB, BlockFrequencyInfo *BFI);
+  bool isColdBlockNthPercentile(int PercentileCutoff, const BasicBlock *BB,
+                                BlockFrequencyInfo *BFI) const;
   /// Returns true if the call site \p CB is considered hot.
-  bool isHotCallSite(const CallBase &CB, BlockFrequencyInfo *BFI);
+  bool isHotCallSite(const CallBase &CB, BlockFrequencyInfo *BFI) const;
   /// Returns true if call site \p CB is considered cold.
-  bool isColdCallSite(const CallBase &CB, BlockFrequencyInfo *BFI);
+  bool isColdCallSite(const CallBase &CB, BlockFrequencyInfo *BFI) const;
   /// Returns HotCountThreshold if set. Recompute HotCountThreshold
   /// if not set.
-  uint64_t getOrCompHotCountThreshold();
+  uint64_t getOrCompHotCountThreshold() const;
   /// Returns ColdCountThreshold if set. Recompute HotCountThreshold
   /// if not set.
-  uint64_t getOrCompColdCountThreshold();
+  uint64_t getOrCompColdCountThreshold() const;
   /// Returns HotCountThreshold if set.
-  uint64_t getHotCountThreshold() {
+  uint64_t getHotCountThreshold() const {
     return HotCountThreshold ? HotCountThreshold.getValue() : 0;
   }
   /// Returns ColdCountThreshold if set.
-  uint64_t getColdCountThreshold() {
+  uint64_t getColdCountThreshold() const {
     return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
   }
 
  private:
-  template<bool isHot>
-  bool isFunctionHotOrColdInCallGraphNthPercentile(int PercentileCutoff,
-                                                   const Function *F,
-                                                   BlockFrequencyInfo &BFI);
-  template<bool isHot>
-  bool isHotOrColdCountNthPercentile(int PercentileCutoff, uint64_t C);
-  template<bool isHot>
-  bool isHotOrColdBlockNthPercentile(int PercentileCutoff, const BasicBlock *BB,
-                                     BlockFrequencyInfo *BFI);
+   template <bool isHot>
+   bool isFunctionHotOrColdInCallGraphNthPercentile(
+       int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) const;
+   template <bool isHot>
+   bool isHotOrColdCountNthPercentile(int PercentileCutoff, uint64_t C) const;
+   template <bool isHot>
+   bool isHotOrColdBlockNthPercentile(int PercentileCutoff,
+                                      const BasicBlock *BB,
+                                      BlockFrequencyInfo *BFI) const;
 };
 
 /// An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index ec7649c516e04..3360fd4c37c02 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -106,9 +106,8 @@ void ProfileSummaryInfo::refresh() {
   computeThresholds();
 }
 
-Optional<uint64_t> ProfileSummaryInfo::getProfileCount(const CallBase &Call,
-                                                       BlockFrequencyInfo *BFI,
-                                                       bool AllowSynthetic) {
+Optional<uint64_t> ProfileSummaryInfo::getProfileCount(
+    const CallBase &Call, BlockFrequencyInfo *BFI, bool AllowSynthetic) const {
   assert((isa<CallInst>(Call) || isa<InvokeInst>(Call)) &&
          "We can only get profile count for call/invoke instruction.");
   if (hasSampleProfile()) {
@@ -129,7 +128,7 @@ Optional<uint64_t> ProfileSummaryInfo::getProfileCount(const CallBase &Call,
 /// Returns true if the function's entry is hot. If it returns false, it
 /// either means it is not hot or it is unknown whether it is hot or not (for
 /// example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
+bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) const {
   if (!F || !hasProfileSummary())
     return false;
   auto FunctionCount = F->getEntryCount();
@@ -144,8 +143,8 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
 /// hot total call edge count.
 /// If it returns false, it either means it is not hot or it is unknown
 /// (for example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
-                                                  BlockFrequencyInfo &BFI) {
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(
+    const Function *F, BlockFrequencyInfo &BFI) const {
   if (!F || !hasProfileSummary())
     return false;
   if (auto FunctionCount = F->getEntryCount())
@@ -173,8 +172,8 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
 /// the total call edge count is cold.
 /// If it returns false, it either means it is not cold or it is unknown
 /// (for example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
-                                                   BlockFrequencyInfo &BFI) {
+bool ProfileSummaryInfo::isFunctionColdInCallGraph(
+    const Function *F, BlockFrequencyInfo &BFI) const {
   if (!F || !hasProfileSummary())
     return false;
   if (auto FunctionCount = F->getEntryCount())
@@ -197,14 +196,14 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
   return true;
 }
 
-bool ProfileSummaryInfo::isFunctionHotnessUnknown(const Function &F) {
+bool ProfileSummaryInfo::isFunctionHotnessUnknown(const Function &F) const {
   assert(hasPartialSampleProfile() && "Expect partial sample profile");
   return !F.getEntryCount().hasValue();
 }
 
-template<bool isHot>
+template <bool isHot>
 bool ProfileSummaryInfo::isFunctionHotOrColdInCallGraphNthPercentile(
-    int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) {
+    int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) const {
   if (!F || !hasProfileSummary())
     return false;
   if (auto FunctionCount = F->getEntryCount()) {
@@ -238,13 +237,13 @@ bool ProfileSummaryInfo::isFunctionHotOrColdInCallGraphNthPercentile(
 
 // Like isFunctionHotInCallGraph but for a given cutoff.
 bool ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile(
-    int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) {
+    int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) const {
   return isFunctionHotOrColdInCallGraphNthPercentile<true>(
       PercentileCutoff, F, BFI);
 }
 
 bool ProfileSummaryInfo::isFunctionColdInCallGraphNthPercentile(
-    int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) {
+    int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) const {
   return isFunctionHotOrColdInCallGraphNthPercentile<false>(
       PercentileCutoff, F, BFI);
 }
@@ -252,7 +251,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraphNthPercentile(
 /// Returns true if the function's entry is a cold. If it returns false, it
 /// either means it is not cold or it is unknown whether it is cold or not (for
 /// example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
+bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) const {
   if (!F)
     return false;
   if (F->hasFnAttribute(Attribute::Cold))
@@ -287,7 +286,8 @@ void ProfileSummaryInfo::computeThresholds() {
       HotEntry.NumCounts > ProfileSummaryLargeWorkingSetSizeThreshold;
 }
 
-Optional<uint64_t> ProfileSummaryInfo::computeThreshold(int PercentileCutoff) {
+Optional<uint64_t>
+ProfileSummaryInfo::computeThreshold(int PercentileCutoff) const {
   if (!hasProfileSummary())
     return None;
   auto iter = ThresholdCache.find(PercentileCutoff);
@@ -302,25 +302,25 @@ Optional<uint64_t> ProfileSummaryInfo::computeThreshold(int PercentileCutoff) {
   return CountThreshold;
 }
 
-bool ProfileSummaryInfo::hasHugeWorkingSetSize() {
+bool ProfileSummaryInfo::hasHugeWorkingSetSize() const {
   return HasHugeWorkingSetSize && HasHugeWorkingSetSize.getValue();
 }
 
-bool ProfileSummaryInfo::hasLargeWorkingSetSize() {
+bool ProfileSummaryInfo::hasLargeWorkingSetSize() const {
   return HasLargeWorkingSetSize && HasLargeWorkingSetSize.getValue();
 }
 
-bool ProfileSummaryInfo::isHotCount(uint64_t C) {
+bool ProfileSummaryInfo::isHotCount(uint64_t C) const {
   return HotCountThreshold && C >= HotCountThreshold.getValue();
 }
 
-bool ProfileSummaryInfo::isColdCount(uint64_t C) {
+bool ProfileSummaryInfo::isColdCount(uint64_t C) const {
   return ColdCountThreshold && C <= ColdCountThreshold.getValue();
 }
 
-template<bool isHot>
+template <bool isHot>
 bool ProfileSummaryInfo::isHotOrColdCountNthPercentile(int PercentileCutoff,
-                                                       uint64_t C) {
+                                                       uint64_t C) const {
   auto CountThreshold = computeThreshold(PercentileCutoff);
   if (isHot)
     return CountThreshold && C >= CountThreshold.getValue();
@@ -328,37 +328,39 @@ bool ProfileSummaryInfo::isHotOrColdCountNthPercentile(int PercentileCutoff,
     return CountThreshold && C <= CountThreshold.getValue();
 }
 
-bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff, uint64_t C) {
+bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff,
+                                                 uint64_t C) const {
   return isHotOrColdCountNthPercentile<true>(PercentileCutoff, C);
 }
 
-bool ProfileSummaryInfo::isColdCountNthPercentile(int PercentileCutoff, uint64_t C) {
+bool ProfileSummaryInfo::isColdCountNthPercentile(int PercentileCutoff,
+                                                  uint64_t C) const {
   return isHotOrColdCountNthPercentile<false>(PercentileCutoff, C);
 }
 
-uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() {
+uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() const {
   return HotCountThreshold ? HotCountThreshold.getValue() : UINT64_MAX;
 }
 
-uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() {
+uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() const {
   return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
 }
 
-bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) {
+bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB,
+                                    BlockFrequencyInfo *BFI) const {
   auto Count = BFI->getBlockProfileCount(BB);
   return Count && isHotCount(*Count);
 }
 
 bool ProfileSummaryInfo::isColdBlock(const BasicBlock *BB,
-                                  BlockFrequencyInfo *BFI) {
+                                     BlockFrequencyInfo *BFI) const {
   auto Count = BFI->getBlockProfileCount(BB);
   return Count && isColdCount(*Count);
 }
 
-template<bool isHot>
-bool ProfileSummaryInfo::isHotOrColdBlockNthPercentile(int PercentileCutoff,
-                                                       const BasicBlock *BB,
-                                                       BlockFrequencyInfo *BFI) {
+template <bool isHot>
+bool ProfileSummaryInfo::isHotOrColdBlockNthPercentile(
+    int PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI) const {
   auto Count = BFI->getBlockProfileCount(BB);
   if (isHot)
     return Count && isHotCountNthPercentile(PercentileCutoff, *Count);
@@ -366,26 +368,24 @@ bool ProfileSummaryInfo::isHotOrColdBlockNthPercentile(int PercentileCutoff,
     return Count && isColdCountNthPercentile(PercentileCutoff, *Count);
 }
 
-bool ProfileSummaryInfo::isHotBlockNthPercentile(int PercentileCutoff,
-                                                 const BasicBlock *BB,
-                                                 BlockFrequencyInfo *BFI) {
+bool ProfileSummaryInfo::isHotBlockNthPercentile(
+    int PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI) const {
   return isHotOrColdBlockNthPercentile<true>(PercentileCutoff, BB, BFI);
 }
 
-bool ProfileSummaryInfo::isColdBlockNthPercentile(int PercentileCutoff,
-                                                  const BasicBlock *BB,
-                                                  BlockFrequencyInfo *BFI) {
+bool ProfileSummaryInfo::isColdBlockNthPercentile(
+    int PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI) const {
   return isHotOrColdBlockNthPercentile<false>(PercentileCutoff, BB, BFI);
 }
 
 bool ProfileSummaryInfo::isHotCallSite(const CallBase &CB,
-                                       BlockFrequencyInfo *BFI) {
+                                       BlockFrequencyInfo *BFI) const {
   auto C = getProfileCount(CB, BFI);
   return C && isHotCount(*C);
 }
 
 bool ProfileSummaryInfo::isColdCallSite(const CallBase &CB,
-                                        BlockFrequencyInfo *BFI) {
+                                        BlockFrequencyInfo *BFI) const {
   auto C = getProfileCount(CB, BFI);
   if (C)
     return isColdCount(*C);
@@ -395,7 +395,7 @@ bool ProfileSummaryInfo::isColdCallSite(const CallBase &CB,
   return hasSampleProfile() && CB.getCaller()->hasProfileData();
 }
 
-bool ProfileSummaryInfo::hasPartialSampleProfile() {
+bool ProfileSummaryInfo::hasPartialSampleProfile() const {
   return hasProfileSummary() &&
          Summary->getKind() == ProfileSummary::PSK_Sample &&
          (PartialProfile || Summary->isPartialProfile());

From eca963f244c711ab51e1e645241562987c0f8fbf Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 17:20:41 -0700
Subject: [PATCH 300/770] [gn build] Add MLAnalysisTests after D80579

---
 .../secondary/llvm/unittests/Analysis/ML/BUILD.gn | 15 +++++++++++++++
 llvm/utils/gn/secondary/llvm/unittests/BUILD.gn   |  1 +
 2 files changed, 16 insertions(+)
 create mode 100644 llvm/utils/gn/secondary/llvm/unittests/Analysis/ML/BUILD.gn

diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/ML/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/ML/BUILD.gn
new file mode 100644
index 0000000000000..8c924603358ba
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/ML/BUILD.gn
@@ -0,0 +1,15 @@
+import("//llvm/utils/unittest/unittest.gni")
+
+unittest("MLAnalysisTests") {
+  deps = [
+    "//llvm/lib/Analysis",
+    "//llvm/lib/AsmParser",
+    "//llvm/lib/IR",
+    "//llvm/lib/Analysis/ML:MLPolicies",
+    "//llvm/lib/Support",
+    "//llvm/lib/Transforms/Utils",
+  ]
+  sources = [
+    "InlineFeaturesAnalysisTest.cpp",
+  ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
index 3d960d501e425..23557304b5715 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
@@ -4,6 +4,7 @@ group("unittests") {
   deps = [
     "ADT:ADTTests",
     "Analysis:AnalysisTests",
+    "Analysis/ML:MLAnalysisTests",
     "AsmParser:AsmParserTests",
     "BinaryFormat:BinaryFormatTests",
     "Bitcode:BitcodeTests",

From c94c5bf9cce8a4c7ad5e8abbc8f21bad5cf6b889 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Wed, 27 May 2020 17:16:41 -0700
Subject: [PATCH 301/770] Introduce a GCStatepointInst type analogous to
 IntrinsicInst subclasses

Back when we had CallSite, we implemented the current Statepoint/ImmutableStatepoint structure in an analogous manner.  Now that CallSite has been removed, the structure used for statepoints looks decidedly out of place.  gc.statepoint is one of the small handful of intrinsics which are invokable.  Because of this, it can't subclass IntrinsicInst as is idiomatic.

This change simply introduces the GCStatepointInst class and restructures the existing Statepoint/ImmutableStatepoint types to wrap it.  I will be landing a series of changes to sink functionality into GCStatepointInst and updating callers to be more idiomatic.
---
 llvm/include/llvm/IR/Statepoint.h | 44 ++++++++++++++++++++++---------
 llvm/lib/IR/Statepoint.cpp        |  8 ++----
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h
index 89f130bc33517..f9eeddba778a2 100644
--- a/llvm/include/llvm/IR/Statepoint.h
+++ b/llvm/include/llvm/IR/Statepoint.h
@@ -65,26 +65,46 @@ bool isGCRelocate(const Value *V);
 bool isGCResult(const CallBase *Call);
 bool isGCResult(const Value *V);
 
+/// Represents a gc.statepoint intrinsic call.  This extends directly from
+/// CallBase as the IntrinsicInst only supports calls and gc.statepoint is
+/// invokable.
+class GCStatepointInst : public CallBase {
+public:
+  GCStatepointInst() = delete;
+  GCStatepointInst(const GCStatepointInst &) = delete;
+  GCStatepointInst &operator=(const GCStatepointInst &) = delete;
+
+  static bool classof(const CallBase *I) {
+    if (const Function *CF = I->getCalledFunction())
+      return CF->getIntrinsicID() == Intrinsic::experimental_gc_statepoint;
+    return false;
+  }
+
+  static bool classof(const Value *V) {
+    return isa<CallBase>(V) && classof(cast<CallBase>(V));
+  }
+};
+
 /// A wrapper around a GC intrinsic call, this provides most of the actual
 /// functionality for Statepoint and ImmutableStatepoint.  It is
 /// templatized to allow easily specializing of const and non-const
 /// concrete subtypes.
 template <typename FunTy, typename InstructionTy, typename ValueTy,
-          typename CallBaseTy>
+          typename CallTy>
 class StatepointBase {
-  CallBaseTy *StatepointCall;
+  CallTy *StatepointCall;
 
 protected:
   explicit StatepointBase(InstructionTy *I) {
-    StatepointCall = isStatepoint(I) ? cast<CallBaseTy>(I) : nullptr;
+    StatepointCall = isStatepoint(I) ? cast<CallTy>(I) : nullptr;
   }
 
-  explicit StatepointBase(CallBaseTy *Call) {
+  explicit StatepointBase(CallTy *Call) {
     StatepointCall = isStatepoint(Call) ? Call : nullptr;
   }
 
 public:
-  using arg_iterator = typename CallBaseTy::const_op_iterator;
+  using arg_iterator = typename CallTy::const_op_iterator;
 
   enum {
     IDPos = 0,
@@ -104,7 +124,7 @@ class StatepointBase {
   }
 
   /// Return the underlying call instruction.
-  CallBaseTy *getCall() const {
+  CallTy *getCall() const {
     assert(*this && "check validity first!");
     return StatepointCall;
   }
@@ -291,9 +311,9 @@ class StatepointBase {
 /// to a gc.statepoint.
 class ImmutableStatepoint
     : public StatepointBase<const Function, const Instruction, const Value,
-                            const CallBase> {
+                            const GCStatepointInst> {
   using Base = StatepointBase<const Function, const Instruction, const Value,
-                              const CallBase>;
+                              const GCStatepointInst>;
 
 public:
   explicit ImmutableStatepoint(const Instruction *I) : Base(I) {}
@@ -303,8 +323,8 @@ class ImmutableStatepoint
 /// A specialization of it's base class for read-write access
 /// to a gc.statepoint.
 class Statepoint
-    : public StatepointBase<Function, Instruction, Value, CallBase> {
-  using Base = StatepointBase<Function, Instruction, Value, CallBase>;
+    : public StatepointBase<Function, Instruction, Value, GCStatepointInst> {
+  using Base = StatepointBase<Function, Instruction, Value, GCStatepointInst>;
 
 public:
   explicit Statepoint(Instruction *I) : Base(I) {}
@@ -402,9 +422,9 @@ class GCResultInst : public GCProjectionInst {
 };
 
 template <typename FunTy, typename InstructionTy, typename ValueTy,
-          typename CallBaseTy>
+          typename CallTy>
 std::vector<const GCRelocateInst *>
-StatepointBase<FunTy, InstructionTy, ValueTy, CallBaseTy>::getRelocates()
+StatepointBase<FunTy, InstructionTy, ValueTy, CallTy>::getRelocates()
     const {
   std::vector<const GCRelocateInst *> Result;
 
diff --git a/llvm/lib/IR/Statepoint.cpp b/llvm/lib/IR/Statepoint.cpp
index fce89b42e9bf6..53b0d1e0aa359 100644
--- a/llvm/lib/IR/Statepoint.cpp
+++ b/llvm/lib/IR/Statepoint.cpp
@@ -18,15 +18,11 @@
 using namespace llvm;
 
 bool llvm::isStatepoint(const CallBase *Call) {
-  if (auto *F = Call->getCalledFunction())
-    return F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint;
-  return false;
+  return isa<GCStatepointInst>(Call);
 }
 
 bool llvm::isStatepoint(const Value *V) {
-  if (auto *Call = dyn_cast<CallBase>(V))
-    return isStatepoint(Call);
-  return false;
+  return isa<GCStatepointInst>(V);
 }
 
 bool llvm::isStatepoint(const Value &V) {

From 00e5d38d40162d049f67b436ad42c9d05092e65c Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Wed, 27 May 2020 17:20:15 -0700
Subject: [PATCH 302/770] Do not warn that an expression of the form (void)arr;
 is unused when arr is a volatile non-local array.

This fixes a recent regression exposed by removing lvalue-to-rvalue
conversion of discarded volatile arrays. In passing, regularize the
rules we use to determine whether '(void)expr;' warns when expr is a
volatile glvalue.
---
 clang/include/clang/AST/Expr.h           |  5 ++
 clang/lib/AST/Expr.cpp                   | 93 +++++++++++++++++++++---
 clang/lib/Sema/SemaExprCXX.cpp           | 78 +++-----------------
 clang/test/SemaCXX/warn-unused-value.cpp | 30 ++++++++
 4 files changed, 126 insertions(+), 80 deletions(-)

diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 0ca4941789e75..deca0b82c4e33 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -232,6 +232,11 @@ class Expr : public ValueStmt {
   /// a problem with a generic expression.
   SourceLocation getExprLoc() const LLVM_READONLY;
 
+  /// Determine whether an lvalue-to-rvalue conversion should implicitly be
+  /// applied to this expression if it appears as a discarded-value expression
+  /// in C++11 onwards. This applies to certain forms of volatile glvalues.
+  bool isReadIfDiscardedInCPlusPlus11() const;
+
   /// isUnusedResultAWarning - Return true if this immediate expression should
   /// be warned about if the result is unused.  If so, fill in expr, location,
   /// and ranges with expr to warn on and source locations/ranges appropriate
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 4c175fff64217..feb0517204c4b 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -2267,6 +2267,64 @@ Stmt *BlockExpr::getBody() {
 // Generic Expression Routines
 //===----------------------------------------------------------------------===//
 
+bool Expr::isReadIfDiscardedInCPlusPlus11() const {
+  // In C++11, discarded-value expressions of a certain form are special,
+  // according to [expr]p10:
+  //   The lvalue-to-rvalue conversion (4.1) is applied only if the
+  //   expression is an lvalue of volatile-qualified type and it has
+  //   one of the following forms:
+  if (!isGLValue() || !getType().isVolatileQualified())
+    return false;
+
+  const Expr *E = IgnoreParens();
+
+  //   - id-expression (5.1.1),
+  if (isa<DeclRefExpr>(E))
+    return true;
+
+  //   - subscripting (5.2.1),
+  if (isa<ArraySubscriptExpr>(E))
+    return true;
+
+  //   - class member access (5.2.5),
+  if (isa<MemberExpr>(E))
+    return true;
+
+  //   - indirection (5.3.1),
+  if (auto *UO = dyn_cast<UnaryOperator>(E))
+    if (UO->getOpcode() == UO_Deref)
+      return true;
+
+  if (auto *BO = dyn_cast<BinaryOperator>(E)) {
+    //   - pointer-to-member operation (5.5),
+    if (BO->isPtrMemOp())
+      return true;
+
+    //   - comma expression (5.18) where the right operand is one of the above.
+    if (BO->getOpcode() == BO_Comma)
+      return BO->getRHS()->isReadIfDiscardedInCPlusPlus11();
+  }
+
+  //   - conditional expression (5.16) where both the second and the third
+  //     operands are one of the above, or
+  if (auto *CO = dyn_cast<ConditionalOperator>(E))
+    return CO->getTrueExpr()->isReadIfDiscardedInCPlusPlus11() &&
+           CO->getFalseExpr()->isReadIfDiscardedInCPlusPlus11();
+  // The related edge case of "*x ?: *x".
+  if (auto *BCO =
+          dyn_cast<BinaryConditionalOperator>(E)) {
+    if (auto *OVE = dyn_cast<OpaqueValueExpr>(BCO->getTrueExpr()))
+      return OVE->getSourceExpr()->isReadIfDiscardedInCPlusPlus11() &&
+             BCO->getFalseExpr()->isReadIfDiscardedInCPlusPlus11();
+  }
+
+  // Objective-C++ extensions to the rule.
+  if (isa<PseudoObjectExpr>(E) || isa<ObjCIvarRefExpr>(E))
+    return true;
+
+  return false;
+}
+
 /// isUnusedResultAWarning - Return true if this immediate expression should
 /// be warned about if the result is unused.  If so, fill in Loc and Ranges
 /// with location to warn on and the source range[s] to report with the
@@ -2555,20 +2613,31 @@ bool Expr::isUnusedResultAWarning(const Expr *&WarnE, SourceLocation &Loc,
   }
   case CXXFunctionalCastExprClass:
   case CStyleCastExprClass: {
-    // Ignore an explicit cast to void unless the operand is a non-trivial
-    // volatile lvalue.
+    // Ignore an explicit cast to void, except in C++98 if the operand is a
+    // volatile glvalue for which we would trigger an implicit read in any
+    // other language mode. (Such an implicit read always happens as part of
+    // the lvalue conversion in C, and happens in C++ for expressions of all
+    // forms where it seems likely the user intended to trigger a volatile
+    // load.)
     const CastExpr *CE = cast<CastExpr>(this);
+    const Expr *SubE = CE->getSubExpr()->IgnoreParens();
     if (CE->getCastKind() == CK_ToVoid) {
-      if (CE->getSubExpr()->isGLValue() &&
-          CE->getSubExpr()->getType().isVolatileQualified()) {
-        const DeclRefExpr *DRE =
-            dyn_cast<DeclRefExpr>(CE->getSubExpr()->IgnoreParens());
-        if (!(DRE && isa<VarDecl>(DRE->getDecl()) &&
-              cast<VarDecl>(DRE->getDecl())->hasLocalStorage()) &&
-            !isa<CallExpr>(CE->getSubExpr()->IgnoreParens())) {
-          return CE->getSubExpr()->isUnusedResultAWarning(WarnE, Loc,
-                                                          R1, R2, Ctx);
-        }
+      if (Ctx.getLangOpts().CPlusPlus && !Ctx.getLangOpts().CPlusPlus11 &&
+          SubE->isReadIfDiscardedInCPlusPlus11()) {
+        // Suppress the "unused value" warning for idiomatic usage of
+        // '(void)var;' used to suppress "unused variable" warnings.
+        if (auto *DRE = dyn_cast<DeclRefExpr>(SubE))
+          if (auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
+            if (!VD->isExternallyVisible())
+              return false;
+
+        // The lvalue-to-rvalue conversion would have no effect for an array.
+        // It's implausible that the programmer expected this to result in a
+        // volatile array load, so don't warn.
+        if (SubE->getType()->isArrayType())
+          return false;
+
+        return SubE->isUnusedResultAWarning(WarnE, Loc, R1, R2, Ctx);
       }
       return false;
     }
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 7cda60ba75984..b655c82816896 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -7660,61 +7660,6 @@ ExprResult Sema::ActOnNoexceptExpr(SourceLocation KeyLoc, SourceLocation,
   return BuildCXXNoexceptExpr(KeyLoc, Operand, RParen);
 }
 
-static bool IsSpecialDiscardedValue(Expr *E) {
-  // In C++11, discarded-value expressions of a certain form are special,
-  // according to [expr]p10:
-  //   The lvalue-to-rvalue conversion (4.1) is applied only if the
-  //   expression is an lvalue of volatile-qualified type and it has
-  //   one of the following forms:
-  E = E->IgnoreParens();
-
-  //   - id-expression (5.1.1),
-  if (isa<DeclRefExpr>(E))
-    return true;
-
-  //   - subscripting (5.2.1),
-  if (isa<ArraySubscriptExpr>(E))
-    return true;
-
-  //   - class member access (5.2.5),
-  if (isa<MemberExpr>(E))
-    return true;
-
-  //   - indirection (5.3.1),
-  if (UnaryOperator *UO = dyn_cast<UnaryOperator>(E))
-    if (UO->getOpcode() == UO_Deref)
-      return true;
-
-  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(E)) {
-    //   - pointer-to-member operation (5.5),
-    if (BO->isPtrMemOp())
-      return true;
-
-    //   - comma expression (5.18) where the right operand is one of the above.
-    if (BO->getOpcode() == BO_Comma)
-      return IsSpecialDiscardedValue(BO->getRHS());
-  }
-
-  //   - conditional expression (5.16) where both the second and the third
-  //     operands are one of the above, or
-  if (ConditionalOperator *CO = dyn_cast<ConditionalOperator>(E))
-    return IsSpecialDiscardedValue(CO->getTrueExpr()) &&
-           IsSpecialDiscardedValue(CO->getFalseExpr());
-  // The related edge case of "*x ?: *x".
-  if (BinaryConditionalOperator *BCO =
-          dyn_cast<BinaryConditionalOperator>(E)) {
-    if (OpaqueValueExpr *OVE = dyn_cast<OpaqueValueExpr>(BCO->getTrueExpr()))
-      return IsSpecialDiscardedValue(OVE->getSourceExpr()) &&
-             IsSpecialDiscardedValue(BCO->getFalseExpr());
-  }
-
-  // Objective-C++ extensions to the rule.
-  if (isa<PseudoObjectExpr>(E) || isa<ObjCIvarRefExpr>(E))
-    return true;
-
-  return false;
-}
-
 /// Perform the conversions required for an expression used in a
 /// context that ignores the result.
 ExprResult Sema::IgnoredValueConversions(Expr *E) {
@@ -7739,23 +7684,20 @@ ExprResult Sema::IgnoredValueConversions(Expr *E) {
     return E;
   }
 
-  if (getLangOpts().CPlusPlus)  {
+  if (getLangOpts().CPlusPlus) {
     // The C++11 standard defines the notion of a discarded-value expression;
     // normally, we don't need to do anything to handle it, but if it is a
     // volatile lvalue with a special form, we perform an lvalue-to-rvalue
     // conversion.
-    if (getLangOpts().CPlusPlus11 && E->isGLValue() &&
-        E->getType().isVolatileQualified()) {
-       if (IsSpecialDiscardedValue(E)) {
-        ExprResult Res = DefaultLvalueConversion(E);
-        if (Res.isInvalid())
-          return E;
-        E = Res.get();
-      } else {
-        // Per C++2a [expr.ass]p5, a volatile assignment is not deprecated if
-        // it occurs as a discarded-value expression.
-        CheckUnusedVolatileAssignment(E);
-      }
+    if (getLangOpts().CPlusPlus11 && E->isReadIfDiscardedInCPlusPlus11()) {
+      ExprResult Res = DefaultLvalueConversion(E);
+      if (Res.isInvalid())
+        return E;
+      E = Res.get();
+    } else {
+      // Per C++2a [expr.ass]p5, a volatile assignment is not deprecated if
+      // it occurs as a discarded-value expression.
+      CheckUnusedVolatileAssignment(E);
     }
 
     // C++1z:
diff --git a/clang/test/SemaCXX/warn-unused-value.cpp b/clang/test/SemaCXX/warn-unused-value.cpp
index 98e2a4e86304d..02bceeca13374 100644
--- a/clang/test/SemaCXX/warn-unused-value.cpp
+++ b/clang/test/SemaCXX/warn-unused-value.cpp
@@ -108,3 +108,33 @@ void f() {
   (void)sizeof(*x); // Ok
 }
 }
+
+static volatile char var1 = 'a';
+volatile char var2 = 'a';
+static volatile char arr1[] = "hello";
+volatile char arr2[] = "hello";
+void volatile_array() {
+  static volatile char var3 = 'a';
+  volatile char var4 = 'a';
+  static volatile char arr3[] = "hello";
+  volatile char arr4[] = "hello";
+
+  // These all result in volatile loads in C and C++11. In C++98, they don't,
+  // but we suppress the warning in the case where '(void)var;' might be
+  // idiomatically suppressing an 'unused variable' warning.
+  (void)var1;
+  (void)var2;
+#if __cplusplus < 201103L
+  // expected-warning@-2 {{expression result unused; assign into a variable to force a volatile load}}
+#endif
+  (void)var3;
+  (void)var4;
+
+  // None of these result in volatile loads in any language mode, and it's not
+  // really reasonable to assume that they would, since volatile array loads
+  // don't really exist anywhere.
+  (void)arr1;
+  (void)arr2;
+  (void)arr3;
+  (void)arr4;
+}

From 1224e619d975c7ecf8017e0ef8210188f39deec4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Wed, 27 May 2020 17:42:35 -0700
Subject: [PATCH 303/770] [ELF][test] Fix wrap-no-real.s after D51283

Give %t3.so a DT_SONAME so that the DT_NEEDED entry in a dependent executable has a fixed length.
---
 lld/test/ELF/wrap-no-real.s | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lld/test/ELF/wrap-no-real.s b/lld/test/ELF/wrap-no-real.s
index 0ee95edd8e305..43d94cf91f8a1 100644
--- a/lld/test/ELF/wrap-no-real.s
+++ b/lld/test/ELF/wrap-no-real.s
@@ -2,7 +2,7 @@
 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t1.o
 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %p/Inputs/wrap-no-real.s -o %t2.o
 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %p/Inputs/wrap-no-real2.s -o %t3.o
-// RUN: ld.lld -o %t3.so -shared %t3.o
+// RUN: ld.lld -o %t3.so -shared --soname=t3 %t3.o
 
 // RUN: ld.lld -o %t %t1.o %t2.o -wrap foo
 // RUN: llvm-objdump -d %t | FileCheck %s
@@ -23,9 +23,9 @@
 // RUN: llvm-readelf -s -x .got %t2 | FileCheck --check-prefix=READELF --implicit-check-not=__real_ %s
 
 // CHECK2: <_start>:
-// CHECK2-NEXT: movq {{.*}}(%rip), %rax  # 2022f8
-// CHECK2-NEXT: movq {{.*}}(%rip), %rbx  # 2022f8
-// CHECK2-NEXT: movq {{.*}}(%rip), %rcx  # 202300
+// CHECK2-NEXT: movq {{.*}}(%rip), %rax  # 2022b8
+// CHECK2-NEXT: movq {{.*}}(%rip), %rbx  # 2022b8
+// CHECK2-NEXT: movq {{.*}}(%rip), %rcx  # 2022c0
 
 .global _start
 _start:

From a70edc2b1613b10b65f55a0670e96f9f4e6c2926 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 17:41:17 -0700
Subject: [PATCH 304/770] [NFC,StackSafety] Cleanup alloca size calculation

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 68 +++++++++++++----------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index aead4ec9fd160..d33efa5d1d042 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -99,21 +99,40 @@ raw_ostream &operator<<(raw_ostream &OS, const UseInfo &U) {
   return OS;
 }
 
-/// Calculate the allocation size of a given alloca. Returns 0 if the
-/// size can not be statically determined.
-uint64_t getStaticAllocaAllocationSize(const AllocaInst *AI) {
-  const DataLayout &DL = AI->getModule()->getDataLayout();
-  TypeSize TS = DL.getTypeAllocSize(AI->getAllocatedType());
+// Check if we should bail out for such ranges.
+bool isUnsafe(const ConstantRange &R) {
+  return R.isEmptySet() || R.isFullSet() || R.isUpperSignWrapped();
+}
+
+/// Calculate the allocation size of a given alloca. Returns empty range
+/// in case of confusion.
+ConstantRange getStaticAllocaSizeRange(const AllocaInst &AI) {
+  const DataLayout &DL = AI.getModule()->getDataLayout();
+  TypeSize TS = DL.getTypeAllocSize(AI.getAllocatedType());
+  unsigned PointerSize = DL.getMaxPointerSizeInBits();
+  // Fallback to empty range for alloca size.
+  ConstantRange R = ConstantRange::getEmpty(PointerSize);
   if (TS.isScalable())
-    return 0;
-  uint64_t Size = TS.getFixedSize();
-  if (AI->isArrayAllocation()) {
-    auto C = dyn_cast<ConstantInt>(AI->getArraySize());
+    return R;
+  APInt APSize(PointerSize, TS.getFixedSize(), true);
+  if (APSize.isNonPositive())
+    return R;
+  if (AI.isArrayAllocation()) {
+    auto C = dyn_cast<ConstantInt>(AI.getArraySize());
     if (!C)
-      return 0;
-    Size *= C->getZExtValue();
+      return R;
+    bool Overflow = false;
+    APInt Mul = C->getValue();
+    if (Mul.isNonPositive())
+      return R;
+    Mul = Mul.sextOrTrunc(PointerSize);
+    APSize = APSize.smul_ov(Mul, Overflow);
+    if (Overflow)
+      return R;
   }
-  return Size;
+  R = ConstantRange(APInt::getNullValue(PointerSize), APSize);
+  assert(!isUnsafe(R));
+  return R;
 }
 
 /// Describes uses of allocas and parameters inside of a single function.
@@ -159,7 +178,7 @@ struct FunctionInfo {
         if (auto AI = dyn_cast<AllocaInst>(&I)) {
           auto &AS = Allocas[Pos];
           O << "      " << AI->getName() << "["
-            << getStaticAllocaAllocationSize(AI) << "]: " << AS << "\n";
+            << getStaticAllocaSizeRange(*AI).getUpper() << "]: " << AS << "\n";
           ++Pos;
         }
       }
@@ -193,11 +212,6 @@ StackSafetyInfo makeSSI(FunctionInfo Info) {
 
 namespace {
 
-// Check if we should bailout for such ranges.
-bool isUnsafe(const ConstantRange &R) {
-  return R.isEmptySet() || R.isFullSet() || R.isUpperSignWrapped();
-}
-
 class StackSafetyLocalAnalysis {
   Function &F;
   const DataLayout &DL;
@@ -215,10 +229,6 @@ class StackSafetyLocalAnalysis {
 
   bool analyzeAllUses(Value *Ptr, UseInfo &AS);
 
-  ConstantRange getRange(uint64_t Lower, uint64_t Upper) const {
-    return ConstantRange(APInt(PointerSize, Lower), APInt(PointerSize, Upper));
-  }
-
 public:
   StackSafetyLocalAnalysis(Function &F, ScalarEvolution &SE)
       : F(F), DL(F.getParent()->getDataLayout()), SE(SE),
@@ -266,7 +276,11 @@ ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
                                                        TypeSize Size) {
   if (Size.isScalable())
     return UnknownRange;
-  return getAccessRange(Addr, Base, getRange(0, Size.getFixedSize()));
+  APInt APSize(PointerSize, Size.getFixedSize(), true);
+  if (APSize.isNegative())
+    return UnknownRange;
+  return getAccessRange(
+      Addr, Base, ConstantRange(APInt::getNullValue(PointerSize), APSize));
 }
 
 ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
@@ -278,6 +292,7 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
     if (MI->getRawDest() != U)
       return ConstantRange::getEmpty(PointerSize);
   }
+
   auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize);
   if (!SE.isSCEVable(MI->getLength()->getType()))
     return UnknownRange;
@@ -285,8 +300,7 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
   const SCEV *Expr =
       SE.getTruncateOrZeroExtend(SE.getSCEV(MI->getLength()), CalculationTy);
   ConstantRange Sizes = SE.getSignedRange(Expr);
-  assert(!isUnsafe(Sizes));
-  if (Sizes.getUpper().isNegative())
+  if (Sizes.getUpper().isNegative() || isUnsafe(Sizes))
     return UnknownRange;
   Sizes = Sizes.sextOrTrunc(PointerSize);
   ConstantRange SizeRange(APInt::getNullValue(PointerSize),
@@ -581,9 +595,7 @@ bool setStackSafetyMetadata(Module &M, const StackSafetyGlobalInfo &SSGI) {
     for (auto &I : instructions(F)) {
       if (auto AI = dyn_cast<AllocaInst>(&I)) {
         auto &AS = Summary.Allocas[Pos];
-        ConstantRange AllocaRange{
-            APInt(Width, 0), APInt(Width, getStaticAllocaAllocationSize(AI))};
-        if (AllocaRange.contains(AS.Range)) {
+        if (getStaticAllocaSizeRange(*AI).contains(AS.Range)) {
           AI->setMetadata(M.getMDKindID("stack-safe"),
                           MDNode::get(M.getContext(), None));
           Changed = true;

From 74671d5c1491dc9e252a8a10c9065b2f8cc99fba Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Wed, 27 May 2020 18:02:49 -0700
Subject: [PATCH 305/770] Sink first bit of functionality from Statepoint to
 GCStatepointInst

Starting with the obvious stuff.  I initially tried to include the inline operand sequences too, but managed to get code which confused *me*.  Since several parts of those are being entirely removed in the near future, I may defer that portion until the cleanup is done.
---
 llvm/include/llvm/IR/Statepoint.h | 69 +++++++++++++++++--------------
 1 file changed, 39 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h
index f9eeddba778a2..34eb1126b373f 100644
--- a/llvm/include/llvm/IR/Statepoint.h
+++ b/llvm/include/llvm/IR/Statepoint.h
@@ -83,6 +83,38 @@ class GCStatepointInst : public CallBase {
   static bool classof(const Value *V) {
     return isa<CallBase>(V) && classof(cast<CallBase>(V));
   }
+
+  enum {
+    IDPos = 0,
+    NumPatchBytesPos = 1,
+    CalledFunctionPos = 2,
+    NumCallArgsPos = 3,
+    FlagsPos = 4,
+    CallArgsBeginPos = 5,
+  };
+
+  /// Return the ID associated with this statepoint.
+  uint64_t getID() const {
+    return cast<ConstantInt>(getArgOperand(IDPos))->getZExtValue();
+  }
+
+  /// Return the number of patchable bytes associated with this statepoint.
+  uint32_t getNumPatchBytes() const {
+    const Value *NumPatchBytesVal = getArgOperand(NumPatchBytesPos);
+    uint64_t NumPatchBytes =
+      cast<ConstantInt>(NumPatchBytesVal)->getZExtValue();
+    assert(isInt<32>(NumPatchBytes) && "should fit in 32 bits!");
+    return NumPatchBytes;
+  }
+
+  /// Number of arguments to be passed to the actual callee.
+  int getNumCallArgs() const {
+    return cast<ConstantInt>(getArgOperand(NumCallArgsPos))->getZExtValue();
+  }
+
+  uint64_t getFlags() const {
+    return cast<ConstantInt>(getArgOperand(FlagsPos))->getZExtValue();
+  }
 };
 
 /// A wrapper around a GC intrinsic call, this provides most of the actual
@@ -107,12 +139,8 @@ class StatepointBase {
   using arg_iterator = typename CallTy::const_op_iterator;
 
   enum {
-    IDPos = 0,
-    NumPatchBytesPos = 1,
-    CalledFunctionPos = 2,
-    NumCallArgsPos = 3,
-    FlagsPos = 4,
-    CallArgsBeginPos = 5,
+    CalledFunctionPos = GCStatepointInst::CalledFunctionPos,
+    CallArgsBeginPos = GCStatepointInst::CallArgsBeginPos,
   };
 
   void *operator new(size_t, unsigned) = delete;
@@ -129,25 +157,12 @@ class StatepointBase {
     return StatepointCall;
   }
 
-  uint64_t getFlags() const {
-    return cast<ConstantInt>(getCall()->getArgOperand(FlagsPos))
-        ->getZExtValue();
-  }
+  // Deprecated shims (update all callers to remove)
+  uint64_t getFlags() const { return getCall()->getFlags(); }
+  uint64_t getID() const { return getCall()->getID(); }
+  uint32_t getNumPatchBytes() const { return getCall()->getNumPatchBytes(); }
+  int getNumCallArgs() const { return getCall()->getNumCallArgs(); }
 
-  /// Return the ID associated with this statepoint.
-  uint64_t getID() const {
-    const Value *IDVal = getCall()->getArgOperand(IDPos);
-    return cast<ConstantInt>(IDVal)->getZExtValue();
-  }
-
-  /// Return the number of patchable bytes associated with this statepoint.
-  uint32_t getNumPatchBytes() const {
-    const Value *NumPatchBytesVal = getCall()->getArgOperand(NumPatchBytesPos);
-    uint64_t NumPatchBytes =
-      cast<ConstantInt>(NumPatchBytesVal)->getZExtValue();
-    assert(isInt<32>(NumPatchBytes) && "should fit in 32 bits!");
-    return NumPatchBytes;
-  }
 
   /// Return the value actually being called or invoked.
   ValueTy *getCalledValue() const {
@@ -180,12 +195,6 @@ class StatepointBase {
     return FTy->getReturnType();
   }
 
-  /// Number of arguments to be passed to the actual callee.
-  int getNumCallArgs() const {
-    const Value *NumCallArgsVal = getCall()->getArgOperand(NumCallArgsPos);
-    return cast<ConstantInt>(NumCallArgsVal)->getZExtValue();
-  }
-
   size_t arg_size() const { return getNumCallArgs(); }
   arg_iterator arg_begin() const {
     assert(CallArgsBeginPos <= (int)getCall()->arg_size());

From 87bea912c27caaa71ac9bc3d172995994b57e639 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Wed, 27 May 2020 18:31:00 -0700
Subject: [PATCH 306/770] [Statepoint] Replace uses of isX functions with
 idiomatic isa<X>

Now that all of the statepoint related routines have classes with isa support, let's clean up.

I'm leaving the (dead) utilities in tree for a few days so that I can do the same cleanup downstream without breakage.
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp                    | 2 +-
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp  | 4 ++--
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp     | 4 ++--
 llvm/lib/IR/SafepointIRVerifier.cpp                    | 2 +-
 llvm/lib/IR/Verifier.cpp                               | 4 ++--
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp   | 3 ++-
 llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp         | 3 ++-
 llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 2 +-
 llvm/lib/Transforms/Utils/StripGCRelocates.cpp         | 2 +-
 9 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index d4c471afb3b61..ee4b43446ee1c 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -576,7 +576,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     SmallVector<Instruction *, 2> Statepoints;
     for (BasicBlock &BB : F)
       for (Instruction &I : BB)
-        if (isStatepoint(I))
+        if (isa<GCStatepointInst>(I))
           Statepoints.push_back(&I);
     for (auto &I : Statepoints)
       EverMadeChange |= simplifyOffsetableRelocate(*I);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c1b4d7431ca8f..559d166e372df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1134,7 +1134,7 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
   }
 
   if (!I.isTerminator() && !HasTailCall &&
-      !isStatepoint(&I)) // statepoints handle their exports internally
+      !isa<GCStatepointInst>(I)) // statepoints handle their exports internally
     CopyToExportRegsIfNeeded(&I);
 
   CurInst = nullptr;
@@ -2827,7 +2827,7 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
   // available as a virtual register.
   // We already took care of the exported value for the statepoint instruction
   // during call to the LowerStatepoint.
-  if (!isStatepoint(I)) {
+  if (!isa<GCStatepointInst>(I)) {
     CopyToExportRegsIfNeeded(&I);
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 279e53877dc8e..a0cfd3eb729f0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1518,8 +1518,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
         // to keep track of gc-relocates for a particular gc-statepoint. This is
         // done by SelectionDAGBuilder::LowerAsSTATEPOINT, called before
         // visitGCRelocate.
-        if (isa<CallInst>(Inst) && !isStatepoint(Inst) && !isGCRelocate(Inst) &&
-            !isGCResult(Inst)) {
+        if (isa<CallInst>(Inst) && !isa<GCStatepointInst>(Inst) &&
+            !isa<GCRelocateInst>(Inst) && !isa<GCResultInst>(Inst)) {
           OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
                                      Inst->getDebugLoc(), LLVMBB);
 
diff --git a/llvm/lib/IR/SafepointIRVerifier.cpp b/llvm/lib/IR/SafepointIRVerifier.cpp
index c9aa85bb4c96e..01c0132261595 100644
--- a/llvm/lib/IR/SafepointIRVerifier.cpp
+++ b/llvm/lib/IR/SafepointIRVerifier.cpp
@@ -783,7 +783,7 @@ void GCPtrTracker::transferBlock(const BasicBlock *BB, BasicBlockState &BBS,
 
 void GCPtrTracker::transferInstruction(const Instruction &I, bool &Cleared,
                                        AvailableValueSet &Available) {
-  if (isStatepoint(I)) {
+  if (isa<GCStatepointInst>(I)) {
     Cleared = true;
     Available.clear();
   } else if (containsGCPtrType(I.getType()))
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index f4680fffa8582..c39fb0edc714b 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4702,14 +4702,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
              LandingPad->getParent());
       Assert(InvokeBB->getTerminator(), "safepoint block should be well formed",
              InvokeBB);
-      Assert(isStatepoint(InvokeBB->getTerminator()),
+      Assert(isa<GCStatepointInst>(InvokeBB->getTerminator()),
              "gc relocate should be linked to a statepoint", InvokeBB);
     } else {
       // In all other cases relocate should be tied to the statepoint directly.
       // This covers relocates on a normal return path of invoke statepoint and
       // relocates of a call statepoint.
       auto Token = Call.getArgOperand(0);
-      Assert(isa<Instruction>(Token) && isStatepoint(cast<Instruction>(Token)),
+      Assert(isa<GCStatepointInst>(Token),
              "gc relocate is incorrectly tied to the statepoint", Call, Token);
     }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 7e20d241bbab5..a3d5215fad4f8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4387,7 +4387,8 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call,
   // TODO: This is probably something which should be expanded to all
   // intrinsics since the entire point of intrinsics is that
   // they are understandable by the optimizer.
-  if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call))
+  if (isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+      isa<GCResultInst>(Call))
     return false;
 
   // The size of ByVal or InAlloca arguments is derived from the type, so we
diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index b39edca6780fc..4553b23532f21 100644
--- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -189,7 +189,8 @@ static bool needsStatepoint(CallBase *Call, const TargetLibraryInfo &TLI) {
       return false;
   }
 
-  return !(isStatepoint(Call) || isGCRelocate(Call) || isGCResult(Call));
+  return !(isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+           isa<GCResultInst>(Call));
 }
 
 /// Returns true if this loop is known to contain a call safepoint which
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 2aace0b7f8111..ab284b75ee2c2 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -2586,7 +2586,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
 
   auto NeedsRewrite = [&TLI](Instruction &I) {
     if (const auto *Call = dyn_cast<CallBase>(&I))
-      return !callsGCLeafFunction(Call, TLI) && !isStatepoint(Call);
+      return !callsGCLeafFunction(Call, TLI) && !isa<GCStatepointInst>(Call);
     return false;
   };
 
diff --git a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
index 7880ea1c6c479..b559811d120bc 100644
--- a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -48,7 +48,7 @@ bool StripGCRelocates::runOnFunction(Function &F) {
   // i.e. not bound to a single statepoint token.
   for (Instruction &I : instructions(F)) {
     if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
-      if (isStatepoint(GCR->getOperand(0)))
+      if (isa<GCStatepointInst>(GCR->getOperand(0)))
         GCRelocates.push_back(GCR);
   }
   // All gc.relocates are bound to a single statepoint token. The order of

From 3c3a6e26e7c39096b3df746faeaa743197657a8e Mon Sep 17 00:00:00 2001
From: Xing GUO <higuoxing@gmail.com>
Date: Wed, 27 May 2020 16:59:45 +0800
Subject: [PATCH 307/770] [ObjectYAML][MachO] Add error handling in
 MachOEmitter.

Currently, `yaml2macho` doesn't support error handling. This patch helps improve it.

Differential Revision: https://reviews.llvm.org/D80535
---
 llvm/lib/ObjectYAML/MachOEmitter.cpp          | 50 +++++++++------
 .../MachO/fat_macho_i386_x86_64.yaml          | 43 ++++++++++++-
 llvm/test/ObjectYAML/MachO/sections.yaml      | 61 ++++++++++++++++++-
 3 files changed, 135 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp
index 5a38fef508540..f8661e0c3c317 100644
--- a/llvm/lib/ObjectYAML/MachOEmitter.cpp
+++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp
@@ -15,6 +15,8 @@
 #include "llvm/ObjectYAML/DWARFEmitter.h"
 #include "llvm/ObjectYAML/ObjectYAML.h"
 #include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
@@ -33,12 +35,12 @@ class MachOWriter {
     memset(reinterpret_cast<void *>(&Header), 0, sizeof(MachO::mach_header_64));
   }
 
-  void writeMachO(raw_ostream &OS);
+  Error writeMachO(raw_ostream &OS);
 
 private:
   void writeHeader(raw_ostream &OS);
   void writeLoadCommands(raw_ostream &OS);
-  void writeSectionData(raw_ostream &OS);
+  Error writeSectionData(raw_ostream &OS);
   void writeRelocations(raw_ostream &OS);
   void writeLinkEditData(raw_ostream &OS);
 
@@ -66,14 +68,16 @@ class MachOWriter {
   bool FoundLinkEditSeg = false;
 };
 
-void MachOWriter::writeMachO(raw_ostream &OS) {
+Error MachOWriter::writeMachO(raw_ostream &OS) {
   fileStart = OS.tell();
   writeHeader(OS);
   writeLoadCommands(OS);
-  writeSectionData(OS);
+  if (Error Err = writeSectionData(OS))
+    return Err;
   writeRelocations(OS);
   if (!FoundLinkEditSeg)
     writeLinkEditData(OS);
+  return Error::success();
 }
 
 void MachOWriter::writeHeader(raw_ostream &OS) {
@@ -261,7 +265,7 @@ void MachOWriter::writeLoadCommands(raw_ostream &OS) {
   }
 }
 
-void MachOWriter::writeSectionData(raw_ostream &OS) {
+Error MachOWriter::writeSectionData(raw_ostream &OS) {
   for (auto &LC : Obj.LoadCommands) {
     switch (LC.Data.load_command_data.cmd) {
     case MachO::LC_SEGMENT:
@@ -277,9 +281,10 @@ void MachOWriter::writeSectionData(raw_ostream &OS) {
         ZeroToOffset(OS, Sec.offset);
         // Zero Fill any data between the end of the last thing we wrote and the
         // start of this section.
-        assert((OS.tell() - fileStart <= Sec.offset ||
-                Sec.offset == (uint32_t)0) &&
-               "Wrote too much data somewhere, section offsets don't line up.");
+        if (OS.tell() - fileStart > Sec.offset && Sec.offset != (uint32_t)0)
+          return createStringError(
+              errc::invalid_argument,
+              "wrote too much data somewhere, section offsets don't line up");
         if (0 == strncmp(&Sec.segname[0], "__DWARF", 16)) {
           if (0 == strncmp(&Sec.sectname[0], "__debug_str", 16)) {
             DWARFYAML::EmitDebugStr(OS, Obj.DWARF);
@@ -323,6 +328,8 @@ void MachOWriter::writeSectionData(raw_ostream &OS) {
       break;
     }
   }
+
+  return Error::success();
 }
 
 // The implementation of makeRelocationInfo and makeScatteredRelocationInfo is
@@ -528,7 +535,7 @@ class UniversalWriter {
   UniversalWriter(yaml::YamlObjectFile &ObjectFile)
       : ObjectFile(ObjectFile), fileStart(0) {}
 
-  void writeMachO(raw_ostream &OS);
+  Error writeMachO(raw_ostream &OS);
 
 private:
   void writeFatHeader(raw_ostream &OS);
@@ -540,28 +547,33 @@ class UniversalWriter {
   uint64_t fileStart;
 };
 
-void UniversalWriter::writeMachO(raw_ostream &OS) {
+Error UniversalWriter::writeMachO(raw_ostream &OS) {
   fileStart = OS.tell();
   if (ObjectFile.MachO) {
     MachOWriter Writer(*ObjectFile.MachO);
-    Writer.writeMachO(OS);
-    return;
+    return Writer.writeMachO(OS);
   }
 
   writeFatHeader(OS);
   writeFatArchs(OS);
 
   auto &FatFile = *ObjectFile.FatMachO;
-  assert(FatFile.FatArchs.size() >= FatFile.Slices.size() &&
-         "Cannot write Slices if not decribed in FatArches");
+  if (FatFile.FatArchs.size() < FatFile.Slices.size())
+    return createStringError(
+        errc::invalid_argument,
+        "cannot write 'Slices' if not described in 'FatArches'");
+
   for (size_t i = 0; i < FatFile.Slices.size(); i++) {
     ZeroToOffset(OS, FatFile.FatArchs[i].offset);
     MachOWriter Writer(FatFile.Slices[i]);
-    Writer.writeMachO(OS);
+    if (Error Err = Writer.writeMachO(OS))
+      return Err;
 
     auto SliceEnd = FatFile.FatArchs[i].offset + FatFile.FatArchs[i].size;
     ZeroToOffset(OS, SliceEnd);
   }
+
+  return Error::success();
 }
 
 void UniversalWriter::writeFatHeader(raw_ostream &OS) {
@@ -629,9 +641,13 @@ void UniversalWriter::ZeroToOffset(raw_ostream &OS, size_t Offset) {
 namespace llvm {
 namespace yaml {
 
-bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler /*EH*/) {
+bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler EH) {
   UniversalWriter Writer(Doc);
-  Writer.writeMachO(Out);
+  if (Error Err = Writer.writeMachO(Out)) {
+    handleAllErrors(std::move(Err),
+                    [&](const ErrorInfoBase &Err) { EH(Err.message()); });
+    return false;
+  }
   return true;
 }
 
diff --git a/llvm/test/ObjectYAML/MachO/fat_macho_i386_x86_64.yaml b/llvm/test/ObjectYAML/MachO/fat_macho_i386_x86_64.yaml
index 55a9df3636d69..b9b2f2d629e2d 100644
--- a/llvm/test/ObjectYAML/MachO/fat_macho_i386_x86_64.yaml
+++ b/llvm/test/ObjectYAML/MachO/fat_macho_i386_x86_64.yaml
@@ -1,4 +1,9 @@
-# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+## This file contains test cases for generating Fat Mach-O binaries.
+
+## a) Test that yaml2obj emits Fat Mach-O binary and obj2yaml converts it
+## back to YAML file.
+
+# RUN: yaml2obj --docnum=1 %s | obj2yaml | FileCheck %s
 
 --- !fat-mach-o
 FatHeader:       
@@ -72,3 +77,39 @@ Slices:
 #CHECK:       flags:           0x00218085
 #CHECK:       reserved:        0x00000000
 #CHECK: ...
+
+## b) Test that yaml2obj emits an error message if the number of 'FatArchs' is less than
+## the number of 'Slices'.
+
+# RUN: not yaml2obj --docnum=2 %s -o %t2.fat-macho 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR: yaml2obj: error: cannot write 'Slices' if not described in 'FatArches'
+
+--- !fat-mach-o
+FatHeader:
+  magic:     0xCAFEBABE
+  nfat_arch: 2
+FatArchs:
+  ## 2 FatArchs are expected.
+  - cputype:    0x00000007
+    cpusubtype: 0x00000003
+    offset:     0x0000000000001000
+    size:       0
+    align:      0
+Slices:
+  - FileHeader:
+      magic:      0xFEEDFACE
+      cputype:    0x00000007
+      cpusubtype: 0x00000003
+      filetype:   0x00000002
+      ncmds:      0
+      sizeofcmds: 0
+      flags:      0x00000000
+  - FileHeader:
+      magic:      0xFEEDFACE
+      cputype:    0x00000007
+      cpusubtype: 0x00000003
+      filetype:   0x00000002
+      ncmds:      0
+      sizeofcmds: 0
+      flags:      0x00000000
diff --git a/llvm/test/ObjectYAML/MachO/sections.yaml b/llvm/test/ObjectYAML/MachO/sections.yaml
index 5da789dbdef7b..f8c5370ecc37c 100644
--- a/llvm/test/ObjectYAML/MachO/sections.yaml
+++ b/llvm/test/ObjectYAML/MachO/sections.yaml
@@ -1,4 +1,8 @@
-# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+## This file contains test cases for generating sections in Mach-O object files.
+
+## a) Test that yaml2obj emits sections and obj2yaml converts them back.
+
+# RUN: yaml2obj --docnum=1 %s | obj2yaml | FileCheck %s
 
 --- !mach-o
 FileHeader:      
@@ -281,3 +285,58 @@ LoadCommands:
 #CHECK:         segname:         __DATA
 #CHECK:       - sectname:        __la_symbol_ptr
 #CHECK:         segname:         __DATA
+
+## b) Test that yaml2obj emits an error message if we specify an offset that
+## makes the current section and the previous one overlap.
+
+# RUN: not yaml2obj --docnum=2 %s -o %t2.macho 2>&1 | FileCheck %s --check-prefix=OVERLAP
+
+# OVERLAP: yaml2obj: error: wrote too much data somewhere, section offsets don't line up
+
+--- !mach-o
+FileHeader:
+  magic:      0xFEEDFACF
+  cputype:    0x01000007
+  cpusubtype: 0x80000003
+  filetype:   0x00000002
+  ncmds:      1
+  sizeofcmds: 1024
+  flags:      0x00000000
+  reserved:   0x00000000
+LoadCommands:
+  - cmd:      LC_SEGMENT_64
+    cmdsize:  0xff
+    segname:  __SEC
+    vmaddr:   0
+    vmsize:   0
+    fileoff:  0
+    filesize: 0
+    maxprot:  0
+    initprot: 0
+    nsects:   2
+    flags:    0
+    Sections:
+      - sectname:  __sec1
+        segname:   __SEC
+        addr:      0x0000000000000000
+        size:      2
+        offset:    0x00000000
+        align:     0
+        reloff:    0x00000000
+        nreloc:    0
+        flags:     0x00000000
+        reserved1: 0x00000000
+        reserved2: 0x00000000
+        reserved3: 0x00000000
+      - sectname:  __sec2
+        segname:   __SEC
+        addr:      0x0000000000000000
+        size:      2
+        offset:    0x00000001 ## Specify an offset that makes __sec1 and __sec2 overlap.
+        align:     1
+        reloff:    0x00000000
+        nreloc:    0
+        flags:     0x00000000
+        reserved1: 0x00000000
+        reserved2: 0x00000000
+        reserved3: 0x00000000

From 98a87c65a35335473cf7c233cdb312892fc771a3 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Wed, 27 May 2020 18:56:50 -0700
Subject: [PATCH 308/770] [Statepoint] Reduce scope of usage of
 ImmutableStatepoint

Can't quite fully remove it yet, as some more items need to be sunk from the wrapper into the GCStatepointInst class, but we can at least reduce its scope.
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  4 +--
 .../SelectionDAG/SelectionDAGBuilder.h        |  2 +-
 .../SelectionDAG/StatepointLowering.cpp       | 34 +++++++++----------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 559d166e372df..d40f7f92c4cb0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2796,7 +2796,7 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
       visitPatchpoint(I, EHPadBB);
       break;
     case Intrinsic::experimental_gc_statepoint:
-      LowerStatepoint(ImmutableStatepoint(&I), EHPadBB);
+      LowerStatepoint(cast<GCStatepointInst>(I), EHPadBB);
       break;
     case Intrinsic::wasm_rethrow_in_catch: {
       // This is usually done in visitTargetIntrinsic, but this intrinsic is
@@ -6637,7 +6637,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     visitPatchpoint(I);
     return;
   case Intrinsic::experimental_gc_statepoint:
-    LowerStatepoint(ImmutableStatepoint(&I));
+    LowerStatepoint(cast<GCStatepointInst>(I));
     return;
   case Intrinsic::experimental_gc_result:
     visitGCResult(cast<GCResultInst>(I));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 0924939cc3106..f0b7fb0d52299 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -625,7 +625,7 @@ class SelectionDAGBuilder {
 
   // This function is responsible for the whole statepoint lowering process.
   // It uniformly handles invoke and call statepoints.
-  void LowerStatepoint(ImmutableStatepoint ISP,
+  void LowerStatepoint(const GCStatepointInst &I,
                        const BasicBlock *EHPadBB = nullptr);
 
   void LowerCallSiteWithDeoptBundle(const CallBase *Call, SDValue Callee,
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 9a35bd41d1167..acb68405470ca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -804,9 +804,10 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
 }
 
 void
-SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
+SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
                                      const BasicBlock *EHPadBB /*= nullptr*/) {
-  assert(ISP.getCall()->getCallingConv() != CallingConv::AnyReg &&
+  ImmutableStatepoint ISP(&I);
+  assert(I.getCallingConv() != CallingConv::AnyReg &&
          "anyregcc is not supported on statepoints!");
 
 #ifndef NDEBUG
@@ -823,7 +824,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
   SDValue ActualCallee;
   SDValue Callee = getValue(ISP.getCalledValue());
 
-  if (ISP.getNumPatchBytes() > 0) {
+  if (I.getNumPatchBytes() > 0) {
     // If we've been asked to emit a nop sequence instead of a call instruction
     // for this statepoint then don't lower the call target, but use a constant
     // `undef` instead.  Not lowering the call target lets statepoint clients
@@ -835,9 +836,8 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
   }
 
   StatepointLoweringInfo SI(DAG);
-  populateCallLoweringInfo(SI.CLI, ISP.getCall(),
-                           ImmutableStatepoint::CallArgsBeginPos,
-                           ISP.getNumCallArgs(), ActualCallee,
+  populateCallLoweringInfo(SI.CLI, &I, GCStatepointInst::CallArgsBeginPos,
+                           I.getNumCallArgs(), ActualCallee,
                            ISP.getActualReturnType(), false /* IsPatchPoint */);
 
   // There may be duplication in the gc.relocate list; such as two copies of
@@ -865,10 +865,10 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
   }
 
   SI.GCArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
-  SI.StatepointInstr = ISP.getInstruction();
-  SI.ID = ISP.getID();
+  SI.StatepointInstr = &I;
+  SI.ID = I.getID();
 
-  if (auto Opt = ISP.getCall()->getOperandBundle(LLVMContext::OB_deopt)) {
+  if (auto Opt = I.getOperandBundle(LLVMContext::OB_deopt)) {
     assert(ISP.deopt_begin() == ISP.deopt_end() &&
            "can't list both deopt operands and deopt bundle");
     auto &Inputs = Opt->Inputs;
@@ -876,7 +876,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
   } else {
     SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end());
   }
-  if (auto Opt = ISP.getCall()->getOperandBundle(LLVMContext::OB_gc_transition)) {
+  if (auto Opt = I.getOperandBundle(LLVMContext::OB_gc_transition)) {
     assert(ISP.gc_transition_args_begin() == ISP.gc_transition_args_end() &&
            "can't list both gc_transition operands and bundle");
     auto &Inputs = Opt->Inputs;
@@ -886,8 +886,8 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
                                               ISP.gc_transition_args_end());
   }
 
-  SI.StatepointFlags = ISP.getFlags();
-  SI.NumPatchBytes = ISP.getNumPatchBytes();
+  SI.StatepointFlags = I.getFlags();
+  SI.NumPatchBytes = I.getNumPatchBytes();
   SI.EHPadBB = EHPadBB;
 
   SDValue ReturnValue = LowerAsSTATEPOINT(SI);
@@ -896,7 +896,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
   const GCResultInst *GCResult = ISP.getGCResult();
   Type *RetTy = ISP.getActualReturnType();
   if (!RetTy->isVoidTy() && GCResult) {
-    if (GCResult->getParent() != ISP.getCall()->getParent()) {
+    if (GCResult->getParent() != I.getParent()) {
       // Result value will be used in a different basic block so we need to
       // export it now.  Default exporting mechanism will not work here because
       // statepoint call has a different type than the actual call. It means
@@ -908,22 +908,22 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
       unsigned Reg = FuncInfo.CreateRegs(RetTy);
       RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
                        DAG.getDataLayout(), Reg, RetTy,
-                       ISP.getCall()->getCallingConv());
+                       I.getCallingConv());
       SDValue Chain = DAG.getEntryNode();
 
       RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
       PendingExports.push_back(Chain);
-      FuncInfo.ValueMap[ISP.getInstruction()] = Reg;
+      FuncInfo.ValueMap[&I] = Reg;
     } else {
       // Result value will be used in a same basic block. Don't export it or
       // perform any explicit register copies.
       // We'll replace the actuall call node shortly. gc_result will grab
       // this value.
-      setValue(ISP.getInstruction(), ReturnValue);
+      setValue(&I, ReturnValue);
     }
   } else {
     // The token value is never used from here on, just generate a poison value
-    setValue(ISP.getInstruction(), DAG.getIntPtrConstant(-1, getCurSDLoc()));
+    setValue(&I, DAG.getIntPtrConstant(-1, getCurSDLoc()));
   }
 }
 

From f3a089506fdcc4a1d658697009572c93e00c4373 Mon Sep 17 00:00:00 2001
From: Dan Liew <dan@su-root.co.uk>
Date: Wed, 27 May 2020 15:50:03 -0700
Subject: [PATCH 309/770] Temporarily disable the following failing tests on
 Darwin:

  AddressSanitizer-Unit :: ./Asan-i386-calls-Test/AddressSanitizer.LongJmpTest
  AddressSanitizer-Unit :: ./Asan-i386-calls-Test/AddressSanitizer.SigLongJmpTest
  AddressSanitizer-Unit :: ./Asan-i386-inline-Test/AddressSanitizer.LongJmpTest
  AddressSanitizer-Unit :: ./Asan-i386-inline-Test/AddressSanitizer.SigLongJmpTest

These failures will be examined properly when time permits.

rdar://problem/62141412
---
 compiler-rt/lib/asan/tests/asan_test.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/compiler-rt/lib/asan/tests/asan_test.cpp b/compiler-rt/lib/asan/tests/asan_test.cpp
index edc98ed185202..83b0b0e8d33e9 100644
--- a/compiler-rt/lib/asan/tests/asan_test.cpp
+++ b/compiler-rt/lib/asan/tests/asan_test.cpp
@@ -588,6 +588,9 @@ NOINLINE void TouchStackFunc() {
     A[i] = i*i;
 }
 
+// Disabled due to rdar://problem/62141412
+#if !(defined(__APPLE__) && defined(__i386__))
+
 // Test that we handle longjmp and do not report false positives on stack.
 TEST(AddressSanitizer, LongJmpTest) {
   static jmp_buf buf;
@@ -597,6 +600,7 @@ TEST(AddressSanitizer, LongJmpTest) {
     TouchStackFunc();
   }
 }
+#endif
 
 #if !defined(_WIN32)  // Only basic longjmp is available on Windows.
 NOINLINE void UnderscopeLongJmpFunc1(jmp_buf buf) {
@@ -658,6 +662,8 @@ TEST(AddressSanitizer, UnderscopeLongJmpTest) {
   }
 }
 
+// Disabled due to rdar://problem/62141412
+#if !(defined(__APPLE__) && defined(__i386__))
 TEST(AddressSanitizer, SigLongJmpTest) {
   static sigjmp_buf buf;
   if (!sigsetjmp(buf, 1)) {
@@ -668,6 +674,8 @@ TEST(AddressSanitizer, SigLongJmpTest) {
 }
 #endif
 
+#endif
+
 // FIXME: Why does clang-cl define __EXCEPTIONS?
 #if defined(__EXCEPTIONS) && !defined(_WIN32)
 NOINLINE void ThrowFunc() {

From 660cda572d6e05e55a9d959e61aba51790c0abbd Mon Sep 17 00:00:00 2001
From: Jan Korous <jkorous@apple.com>
Date: Tue, 31 Mar 2020 14:05:17 -0700
Subject: [PATCH 310/770] [Analyzer][WebKit] NoUncountedMembersChecker

Differential Revision: https://reviews.llvm.org/D77178
---
 clang/docs/analyzer/checkers.rst              |  18 +++
 .../clang/StaticAnalyzer/Checkers/Checkers.td |   4 +
 .../StaticAnalyzer/Checkers/CMakeLists.txt    |   1 +
 .../Checkers/WebKit/DiagOutputUtils.h         |   8 +
 .../WebKit/NoUncountedMembersChecker.cpp      | 150 ++++++++++++++++++
 .../Checkers/WebKit/uncounted-members.cpp     |  43 +++++
 6 files changed, 224 insertions(+)
 create mode 100644 clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
 create mode 100644 clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp

diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index dcf1f28994de4..c977dde8c52ff 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1403,6 +1403,25 @@ Ref-counted types hold their ref-countable data by a raw pointer and allow impli
 
  struct Derived : RefCntblBase { }; // warn
 
+.. _webkit-WebKitNoUncountedMemberChecker:
+
+webkit.WebKitNoUncountedMemberChecker
+"""""""""""""""""""""""""""""""""""""
+Raw pointers and references to uncounted types can't be used as class members. Only ref-counted types are allowed.
+
+.. code-block:: cpp
+
+ struct RefCntbl {
+   void ref() {}
+   void deref() {}
+ };
+
+ struct Foo {
+   RefCntbl * ptr; // warn
+   RefCntbl & ref; // warn
+   // ...
+ };
+
 .. _alpha-checkers:
 
 Experimental Checkers
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 2ba3881c61351..2d69d8f344209 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1623,4 +1623,8 @@ let ParentPackage = WebKit in {
 def RefCntblBaseVirtualDtorChecker : Checker<"RefCntblBaseVirtualDtor">,
   HelpText<"Check for any ref-countable base class having virtual destructor.">,
   Documentation<HasDocumentation>;
+
+def WebKitNoUncountedMemberChecker : Checker<"WebKitNoUncountedMemberChecker">,
+  HelpText<"Check for no uncounted member variables.">,
+  Documentation<HasDocumentation>;
 } // end webkit
diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
index 4f885fadf4158..b3dc7a9f63212 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -121,6 +121,7 @@ add_clang_library(clangStaticAnalyzerCheckers
   VLASizeChecker.cpp
   ValistChecker.cpp
   VirtualCallChecker.cpp
+  WebKit/NoUncountedMembersChecker.cpp
   WebKit/PtrTypesSemantics.cpp
   WebKit/RefCntblBaseVirtualDtorChecker.cpp
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/DiagOutputUtils.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/DiagOutputUtils.h
index 4979b8ffc2b20..781a8d746001f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/DiagOutputUtils.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/DiagOutputUtils.h
@@ -23,6 +23,14 @@ void printQuotedQualifiedName(llvm::raw_ostream &Os,
   Os << "'";
 }
 
+template <typename NamedDeclDerivedT>
+void printQuotedName(llvm::raw_ostream &Os, const NamedDeclDerivedT &D) {
+  Os << "'";
+  D->getNameForDiagnostic(Os, D->getASTContext().getPrintingPolicy(),
+                          /*Qualified=*/false);
+  Os << "'";
+}
+
 } // namespace clang
 
 #endif
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
new file mode 100644
index 0000000000000..89caf602a17e5
--- /dev/null
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
@@ -0,0 +1,150 @@
+//=======- NoUncountedMembersChecker.cpp -------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ASTUtils.h"
+#include "DiagOutputUtils.h"
+#include "PtrTypesSemantics.h"
+#include "clang/AST/CXXInheritance.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/Casting.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+
+class NoUncountedMemberChecker
+    : public Checker<check::ASTDecl<TranslationUnitDecl>> {
+private:
+  BugType Bug;
+  mutable BugReporter *BR;
+
+public:
+  NoUncountedMemberChecker()
+      : Bug(this,
+            "Member variable is a raw-pointer/reference to reference-countable "
+            "type",
+            "WebKit coding guidelines") {}
+
+  void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR,
+                    BugReporter &BRArg) const {
+    BR = &BRArg;
+
+    // The calls to checkAST* from AnalysisConsumer don't
+    // visit template instantiations or lambda classes. We
+    // want to visit those, so we make our own RecursiveASTVisitor.
+    struct LocalVisitor : public RecursiveASTVisitor<LocalVisitor> {
+      const NoUncountedMemberChecker *Checker;
+      explicit LocalVisitor(const NoUncountedMemberChecker *Checker)
+          : Checker(Checker) {
+        assert(Checker);
+      }
+
+      bool shouldVisitTemplateInstantiations() const { return true; }
+      bool shouldVisitImplicitCode() const { return false; }
+
+      bool VisitRecordDecl(const RecordDecl *RD) {
+        Checker->visitRecordDecl(RD);
+        return true;
+      }
+    };
+
+    LocalVisitor visitor(this);
+    visitor.TraverseDecl(const_cast<TranslationUnitDecl *>(TUD));
+  }
+
+  void visitRecordDecl(const RecordDecl *RD) const {
+    if (shouldSkipDecl(RD))
+      return;
+
+    for (auto Member : RD->fields()) {
+      const Type *MemberType = Member->getType().getTypePtrOrNull();
+      if (!MemberType)
+        continue;
+
+      if (auto *MemberCXXRD = MemberType->getPointeeCXXRecordDecl()) {
+        if (isRefCountable(MemberCXXRD))
+          reportBug(Member, MemberType, MemberCXXRD, RD);
+      }
+    }
+  }
+
+  bool shouldSkipDecl(const RecordDecl *RD) const {
+    if (!RD->isThisDeclarationADefinition())
+      return true;
+
+    if (RD->isImplicit())
+      return true;
+
+    if (RD->isLambda())
+      return true;
+
+    // If the construct doesn't have a source file, then it's not something
+    // we want to diagnose.
+    const auto RDLocation = RD->getLocation();
+    if (!RDLocation.isValid())
+      return true;
+
+    const auto Kind = RD->getTagKind();
+    // FIXME: Should we check union members too?
+    if (Kind != TTK_Struct && Kind != TTK_Class)
+      return true;
+
+    // Ignore CXXRecords that come from system headers.
+    if (BR->getSourceManager().isInSystemHeader(RDLocation))
+      return true;
+
+    // Ref-counted smartpointers actually have raw-pointer to uncounted type as
+    // a member but we trust them to handle it correctly.
+    return isRefCounted(llvm::dyn_cast_or_null<CXXRecordDecl>(RD));
+  }
+
+  void reportBug(const FieldDecl *Member, const Type *MemberType,
+                 const CXXRecordDecl *MemberCXXRD,
+                 const RecordDecl *ClassCXXRD) const {
+    assert(Member);
+    assert(MemberType);
+    assert(MemberCXXRD);
+
+    SmallString<100> Buf;
+    llvm::raw_svector_ostream Os(Buf);
+
+    Os << "Member variable ";
+    printQuotedName(Os, Member);
+    Os << " in ";
+    printQuotedQualifiedName(Os, ClassCXXRD);
+    Os << " is a "
+       << (isa<PointerType>(MemberType) ? "raw pointer" : "reference")
+       << " to ref-countable type ";
+    printQuotedQualifiedName(Os, MemberCXXRD);
+    Os << "; member variables must be ref-counted.";
+
+    PathDiagnosticLocation BSLoc(Member->getSourceRange().getBegin(),
+                                 BR->getSourceManager());
+    auto Report = std::make_unique<BasicBugReport>(Bug, Os.str(), BSLoc);
+    Report->addRange(Member->getSourceRange());
+    BR->emitReport(std::move(Report));
+  }
+};
+} // namespace
+
+void ento::registerWebKitNoUncountedMemberChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<NoUncountedMemberChecker>();
+}
+
+bool ento::shouldRegisterWebKitNoUncountedMemberChecker(
+    const CheckerManager &Mgr) {
+  return true;
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp
new file mode 100644
index 0000000000000..e88c0b3b0dd03
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-members.cpp
@@ -0,0 +1,43 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.WebKitNoUncountedMemberChecker -verify %s
+
+#include "mock-types.h"
+
+namespace members {
+  struct Foo {
+  private:
+    RefCountable* a = nullptr;
+// expected-warning@-1{{Member variable 'a' in 'members::Foo' is a raw pointer to ref-countable type 'RefCountable'}}
+
+  protected:
+    RefPtr<RefCountable> b;
+
+  public:
+    RefCountable silenceWarningAboutInit;
+    RefCountable& c = silenceWarningAboutInit;
+// expected-warning@-1{{Member variable 'c' in 'members::Foo' is a reference to ref-countable type 'RefCountable'}}
+    Ref<RefCountable> d;
+  };
+
+  template<class T>
+  struct FooTmpl {
+    T* a;
+// expected-warning@-1{{Member variable 'a' in 'members::FooTmpl<RefCountable>' is a raw pointer to ref-countable type 'RefCountable'}}
+  };
+
+  void forceTmplToInstantiate(FooTmpl<RefCountable>) {}
+}
+
+namespace ignore_unions {
+  union Foo {
+    RefCountable* a;
+    RefPtr<RefCountable> b;
+    Ref<RefCountable> c;
+  };
+
+  template<class T>
+  union RefPtr {
+    T* a;
+  };
+
+  void forceTmplToInstantiate(RefPtr<RefCountable>) {}
+}

From f830b406c655ae59888a188302edfbc5d6fa7a13 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 28 May 2020 02:47:12 +0000
Subject: [PATCH 311/770] [gn build] Port 660cda572d6

---
 .../gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
index 0629d9c19231f..1ada201611d5c 100644
--- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
@@ -129,6 +129,7 @@ static_library("Checkers") {
     "ValistChecker.cpp",
     "VforkChecker.cpp",
     "VirtualCallChecker.cpp",
+    "WebKit/NoUncountedMembersChecker.cpp",
     "WebKit/PtrTypesSemantics.cpp",
     "WebKit/RefCntblBaseVirtualDtorChecker.cpp",
     "cert/PutenvWithAutoChecker.cpp",

From 12cd4a51640f5e025043c45a004df66b678ffa9d Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 19:28:42 -0700
Subject: [PATCH 312/770] [NFC,StackSafety] Add StackSafetyGlobalInfo class

---
 .../llvm/Analysis/StackSafetyAnalysis.h       | 20 ++++++++-
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     | 42 +++++++++++--------
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
index 33a4b2c149c36..df7ccac5b4b92 100644
--- a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
+++ b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
@@ -18,6 +18,8 @@
 
 namespace llvm {
 
+class AllocaInst;
+
 /// Interface to access stack safety analysis results for single function.
 class StackSafetyInfo {
 public:
@@ -38,6 +40,22 @@ class StackSafetyInfo {
   void print(raw_ostream &O, const GlobalValue &F) const;
 };
 
+class StackSafetyGlobalInfo {
+public:
+  using GVToSSI = std::map<const GlobalValue *, StackSafetyInfo>;
+
+private:
+  GVToSSI SSGI;
+
+public:
+  StackSafetyGlobalInfo() = default;
+  StackSafetyGlobalInfo(GVToSSI SSGI) : SSGI(std::move(SSGI)) {}
+
+  bool setMetadata(Module &M) const;
+  void print(raw_ostream &O) const;
+  void dump() const;
+};
+
 /// StackSafetyInfo wrapper for the new pass manager.
 class StackSafetyAnalysis : public AnalysisInfoMixin<StackSafetyAnalysis> {
   friend AnalysisInfoMixin<StackSafetyAnalysis>;
@@ -74,8 +92,6 @@ class StackSafetyInfoWrapperPass : public FunctionPass {
   bool runOnFunction(Function &F) override;
 };
 
-using StackSafetyGlobalInfo = std::map<const GlobalValue *, StackSafetyInfo>;
-
 /// This pass performs the global (interprocedural) stack safety analysis (new
 /// pass manager).
 class StackSafetyGlobalAnalysis
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index d33efa5d1d042..6eeffe6066dfc 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -33,6 +33,8 @@ static cl::opt<int> StackSafetyMaxIterations("stack-safety-max-iterations",
 
 namespace {
 
+using GVToSSI = StackSafetyGlobalInfo::GVToSSI;
+
 /// Rewrite an SCEV expression for a memory access address to an expression that
 /// represents offset from the given alloca.
 class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
@@ -457,7 +459,7 @@ class StackSafetyDataFlowAnalysis {
 public:
   StackSafetyDataFlowAnalysis(
       Module &M, std::function<const FunctionInfo &(Function &)> FI);
-  StackSafetyGlobalInfo run();
+  GVToSSI run();
 };
 
 StackSafetyDataFlowAnalysis::StackSafetyDataFlowAnalysis(
@@ -571,19 +573,18 @@ void StackSafetyDataFlowAnalysis::verifyFixedPoint() {
 }
 #endif
 
-StackSafetyGlobalInfo StackSafetyDataFlowAnalysis::run() {
+GVToSSI StackSafetyDataFlowAnalysis::run() {
   runDataFlow();
   LLVM_DEBUG(verifyFixedPoint());
 
-  StackSafetyGlobalInfo SSI;
+  GVToSSI SSI;
   for (auto &F : Functions)
     SSI.emplace(F.first, makeSSI(F.second));
   return SSI;
 }
 
-bool setStackSafetyMetadata(Module &M, const StackSafetyGlobalInfo &SSGI) {
+bool setStackSafetyMetadata(Module &M, const GVToSSI &SSGI) {
   bool Changed = false;
-  unsigned Width = M.getDataLayout().getPointerSizeInBits();
   for (auto &F : M.functions()) {
     if (F.isDeclaration() || F.hasOptNone())
       continue;
@@ -621,23 +622,28 @@ void StackSafetyInfo::print(raw_ostream &O, const GlobalValue &F) const {
   Info->Info.print(O, F.getName(), dyn_cast<Function>(&F));
 }
 
-static void print(const StackSafetyGlobalInfo &SSI, raw_ostream &O,
-                  const Module &M) {
-  size_t Count = 0;
-  for (auto &F : M.functions())
+bool StackSafetyGlobalInfo::setMetadata(Module &M) const {
+  return setStackSafetyMetadata(M, SSGI);
+}
+
+void StackSafetyGlobalInfo::print(raw_ostream &O) const {
+  if (SSGI.empty())
+    return;
+  const Module &M = *SSGI.begin()->first->getParent();
+  for (auto &F : M.functions()) {
     if (!F.isDeclaration()) {
-      SSI.find(&F)->second.print(O, F);
+      SSGI.find(&F)->second.print(O, F);
       O << "\n";
-      ++Count;
     }
+  }
   for (auto &A : M.aliases()) {
-    SSI.find(&A)->second.print(O, A);
+    SSGI.find(&A)->second.print(O, A);
     O << "\n";
-    ++Count;
   }
-  assert(Count == SSI.size() && "Unexpected functions in the result");
 }
 
+LLVM_DUMP_METHOD void StackSafetyGlobalInfo::dump() const { print(dbgs()); }
+
 AnalysisKey StackSafetyAnalysis::Key;
 
 StackSafetyInfo StackSafetyAnalysis::run(Function &F,
@@ -693,14 +699,14 @@ StackSafetyGlobalAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
 PreservedAnalyses StackSafetyGlobalPrinterPass::run(Module &M,
                                                     ModuleAnalysisManager &AM) {
   OS << "'Stack Safety Analysis' for module '" << M.getName() << "'\n";
-  print(AM.getResult<StackSafetyGlobalAnalysis>(M), OS, M);
+  AM.getResult<StackSafetyGlobalAnalysis>(M).print(OS);
   return PreservedAnalyses::all();
 }
 
 PreservedAnalyses
 StackSafetyGlobalAnnotatorPass::run(Module &M, ModuleAnalysisManager &AM) {
   auto &SSGI = AM.getResult<StackSafetyGlobalAnalysis>(M);
-  (void)setStackSafetyMetadata(M, SSGI);
+  SSGI.setMetadata(M);
   return PreservedAnalyses::all();
 }
 
@@ -715,7 +721,7 @@ StackSafetyGlobalInfoWrapperPass::StackSafetyGlobalInfoWrapperPass(
 
 void StackSafetyGlobalInfoWrapperPass::print(raw_ostream &O,
                                              const Module *M) const {
-  ::print(SSGI, O, *M);
+  SSGI.print(O);
 }
 
 void StackSafetyGlobalInfoWrapperPass::getAnalysisUsage(
@@ -732,7 +738,7 @@ bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
             .Info;
       });
   SSGI = SSDFA.run();
-  return SetMetadata ? setStackSafetyMetadata(M, SSGI) : false;
+  return SetMetadata ? SSGI.setMetadata(M) : false;
 }
 
 ModulePass *llvm::createStackSafetyGlobalInfoWrapperPass(bool SetMetadata) {

From 5238b80058a6d096220eb9fbf606d9d983f37b0b Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 27 May 2020 21:01:09 -0700
Subject: [PATCH 313/770] [lldb/Reproducers] Skip or fix the remaining tests.

After this patch all remaining tests should pass on macOS when replayed
from a reproducer.

To capture the reproducers:

  ./bin/llvm-lit ../llvm-project/lldb/test/ --param lldb-run-with-repro=capture

To replay the reproducers:

  ./bin/llvm-lit ../llvm-project/lldb/test/ --param lldb-run-with-repro=replay
---
 .../gdb_remote_client/TestWriteMemory.py        |  1 +
 .../load_unload/TestLoadUnload.py               |  1 +
 .../load_using_paths/TestLoadUsingPaths.py      |  1 +
 .../postmortem/minidump-new/TestMiniDumpNew.py  |  1 +
 .../process_group/TestChangeProcessGroup.py     |  1 +
 .../TestExitDuringExpression.py                 | 17 +++++++++--------
 .../lang/cpp/thread_local/TestThreadLocal.py    | 17 ++++++++++++++++-
 .../version_zero/TestGetVersionZeroVersion.py   |  1 +
 .../symbol-context/TestSymbolContext.py         |  1 +
 9 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestWriteMemory.py b/lldb/test/API/functionalities/gdb_remote_client/TestWriteMemory.py
index 73bd292463f0f..83fa197c2fe3e 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestWriteMemory.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestWriteMemory.py
@@ -6,6 +6,7 @@
 
 class TestWriteMemory(GDBRemoteTestBase):
 
+    @skipIfReproducer # SBProcess::WriteMemory is not instrumented.
     def test(self):
 
         class MyResponder(MockGDBServerResponder):
diff --git a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py
index 853c0b2cea201..538f7b1734ba1 100644
--- a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py
+++ b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py
@@ -223,6 +223,7 @@ def test_lldb_process_load_and_unload_commands_with_svr4(self):
         self.setSvr4Support(True)
         self.run_lldb_process_load_and_unload_commands()
 
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def run_lldb_process_load_and_unload_commands(self):
         """Test that lldb process load/unload command work correctly."""
         self.copy_shlibs_to_remote()
diff --git a/lldb/test/API/functionalities/load_using_paths/TestLoadUsingPaths.py b/lldb/test/API/functionalities/load_using_paths/TestLoadUsingPaths.py
index a7d5f07a09766..9e10bd3ce833d 100644
--- a/lldb/test/API/functionalities/load_using_paths/TestLoadUsingPaths.py
+++ b/lldb/test/API/functionalities/load_using_paths/TestLoadUsingPaths.py
@@ -41,6 +41,7 @@ def setUp(self):
     @skipIfWindows  # Windows doesn't have dlopen and friends, dynamic libraries work differently
     @expectedFlakeyNetBSD
     @expectedFailureAll(oslist=["linux"], archs=['arm'], bugnumber="llvm.org/pr45894")
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def test_load_using_paths(self):
         """Test that we can load a module by providing a set of search paths."""
         if self.platformIsDarwin():
diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py
index 3e1abc3353c32..012f9b67d9e33 100644
--- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py
+++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py
@@ -344,6 +344,7 @@ def test_deeper_stack_in_minidump(self):
                                   "linux-x86_64_not_crashed.dmp",
                                   self._linux_x86_64_not_crashed_pid)
 
+    @skipIfReproducer # VFS is a snapshot.
     def do_change_pid_in_minidump(self, core, newcore, offset, oldpid, newpid):
         """ This assumes that the minidump is breakpad generated on Linux -
         meaning that the PID in the file will be an ascii string part of
diff --git a/lldb/test/API/functionalities/process_group/TestChangeProcessGroup.py b/lldb/test/API/functionalities/process_group/TestChangeProcessGroup.py
index 93597b4edae37..124d13ed97a41 100644
--- a/lldb/test/API/functionalities/process_group/TestChangeProcessGroup.py
+++ b/lldb/test/API/functionalities/process_group/TestChangeProcessGroup.py
@@ -24,6 +24,7 @@ def setUp(self):
     @skipIfWindows  # setpgid call does not exist on Windows
     @expectedFailureAndroid("http://llvm.org/pr23762", api_levels=[16])
     @expectedFailureNetBSD
+    @skipIfReproducer # File synchronization is not supported during replay.
     def test_setpgid(self):
         self.build()
         exe = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py b/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py
index bfdfdf53cdb15..260fe596a39f2 100644
--- a/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py
+++ b/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py
@@ -33,12 +33,13 @@ def test_exit_after_one_thread_unwind(self):
     def test_exit_after_one_thread_no_unwind(self):
         """Test the case where we exit within the one thread timeout"""
         self.exiting_expression_test(False, False)
-    
+
     def setUp(self):
         TestBase.setUp(self)
         self.main_source_file = lldb.SBFileSpec("main.c")
         self.build()
-        
+
+    @skipIfReproducer # Timeouts are not currently modeled.
     def exiting_expression_test(self, before_one_thread_timeout , unwind):
         """function_to_call sleeps for g_timeout microseconds, then calls pthread_exit.
            This test calls function_to_call with an overall timeout of 500
@@ -46,7 +47,7 @@ def exiting_expression_test(self, before_one_thread_timeout , unwind):
            It also sets unwind_on_exit for the call to the unwind passed in.
            This allows you to have the thread exit either before the one thread
            timeout is passed. """
-        
+
         (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(self,
                                    "Break here and cause the thread to exit", self.main_source_file)
 
@@ -59,15 +60,15 @@ def exiting_expression_test(self, before_one_thread_timeout , unwind):
         var_options.SetIncludeArguments(False)
         var_options.SetIncludeLocals(False)
         var_options.SetIncludeStatics(True)
-        
+
         value_list = frame.GetVariables(var_options)
         g_timeout = value_list.GetFirstValueByName("g_timeout")
         self.assertTrue(g_timeout.IsValid(), "Found g_timeout")
-        
+
         error = lldb.SBError()
         timeout_value = g_timeout.GetValueAsUnsigned(error)
         self.assertTrue(error.Success(), "Couldn't get timeout value: %s"%(error.GetCString()))
-        
+
         one_thread_timeout = 0
         if (before_one_thread_timeout):
             one_thread_timeout = timeout_value * 2
@@ -78,7 +79,7 @@ def exiting_expression_test(self, before_one_thread_timeout , unwind):
         options.SetUnwindOnError(unwind)
         options.SetOneThreadTimeoutInMicroSeconds(one_thread_timeout)
         options.SetTimeoutInMicroSeconds(4 * timeout_value)
-            
+
         result = frame.EvaluateExpression("function_to_call()", options)
 
         # Make sure the thread actually exited:
@@ -103,4 +104,4 @@ def exiting_expression_test(self, before_one_thread_timeout , unwind):
         ret_val_value = ret_val.GetValueAsSigned(error)
         self.assertTrue(error.Success(), "Got ret_val's value")
         self.assertEqual(ret_val_value, 10, "We put the right value in ret_val")
-        
+
diff --git a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py
index e7cfa1ca14f27..b92ec90ff77d3 100644
--- a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py
+++ b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py
@@ -30,10 +30,25 @@ def test_thread_local(self):
         self.expect_expr("*tl_global_ptr",
                          result_type="int", result_value="45")
 
+        # Create the filespec by which to locate our a.out module.
+        #
+        #  - Use the absolute path to get the module for the current variant.
+        #  - Use the relative path for reproducers. The modules are never
+        #    orphaned because the SB objects are leaked intentionally. This
+        #    causes LLDB to reuse the same module for every variant, because the
+        #    UUID is the same for all the inferiors. FindModule below only
+        #    compares paths and is oblivious to the fact that the UUIDs are the
+        #    same.
+        if configuration.is_reproducer():
+            filespec = lldb.SBFileSpec('a.out', False)
+        else:
+            filespec = lldb.SBFileSpec(exe, False)
+
         # Now see if we emit the correct error when the TLS is not yet
         # initialized. Let's set a breakpoint on the first instruction
         # of main.
-        main_module = target.FindModule(lldb.SBFileSpec(exe))
+        main_module = target.FindModule(filespec)
+        self.assertTrue(main_module, VALID_MODULE)
         main_address = main_module.FindSymbol("main").GetStartAddress()
         main_bkpt = target.BreakpointCreateBySBAddress(main_address)
 
diff --git a/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py b/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py
index f7e4da73dda65..53a59c923d84d 100644
--- a/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py
+++ b/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py
@@ -19,6 +19,7 @@ class TestGetVersionForZero(TestBase):
     # each debug info format.
     NO_DEBUG_INFO_TESTCASE = True
 
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def test_get_version_zero(self):
         """Read in a library with a version of 0.0.0.  Test SBModule::GetVersion"""
         self.yaml2obj("libDylib.dylib.yaml", self.getBuildArtifact("libDylib.dylib"))
diff --git a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py
index 0baf91e4a3513..cbe4eff0a5e3b 100644
--- a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py
+++ b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py
@@ -23,6 +23,7 @@ def setUp(self):
 
     @add_test_categories(['pyapi'])
     @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24778")
+    @skipIfReproducer # FIXME: Unexpected packet during (passive) replay
     def test(self):
         """Exercise SBSymbolContext API extensively."""
         self.build()

From e5bb542362dfbb6c57a597810d740987afbc4202 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 27 May 2020 21:13:08 -0700
Subject: [PATCH 314/770] [lldb/Test] Import all decorators.

Fixes "NameError: name 'skipIfReproducer' is not defined".
---
 lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py b/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py
index 53a59c923d84d..5f9772b8fb20e 100644
--- a/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py
+++ b/lldb/test/API/macosx/version_zero/TestGetVersionZeroVersion.py
@@ -5,7 +5,7 @@
 
 
 import lldb
-from lldbsuite.test import decorators
+from lldbsuite.test.decorators import *
 import lldbsuite.test.lldbutil as lldbutil
 from lldbsuite.test.lldbtest import *
 

From c1d5b831b1cb095370a01e1749a8e9746f8f3de6 Mon Sep 17 00:00:00 2001
From: Sourabh Singh Tomar <SourabhSingh.Tomar@amd.com>
Date: Wed, 27 May 2020 14:01:35 +0530
Subject: [PATCH 315/770] [docs] Release notes for DIModule metadata

Updated the release notes for the changes in the DIModule metadata.

Reviewed By: aprantl

Differential Revision: https://reviews.llvm.org/D80614
---
 llvm/docs/ReleaseNotes.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index b47222eefe8a3..a55d14c0cfa28 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -70,6 +70,10 @@ Changes to the LLVM IR
   behavior was undocumented. To preserve optimizations, frontends may need to
   be updated to generate appropriate `align` attributes and metadata.
 
+* The DIModule metadata is extended to contain file and line number
+  information. This information is used to represent Fortran modules debug
+  info at IR level.
+
 Changes to building LLVM
 ------------------------
 

From 49544499954912c5a0f02014de53e0bc0234c7af Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan@intel.com>
Date: Mon, 25 May 2020 10:18:44 +0800
Subject: [PATCH 316/770] [Driver][X86] Support branch align options with LTO

Summary: Before this patch, we used two different ways to pass branch-alignment
options depending on whether LTO is enabled. For example, `-mbranches-within-32B-boundaries`
w/o LTO and `-Wl,-plugin-opt=-x86-branches-within-32B-boundaries` w/ LTO. That is
inconvenient, so this patch unifies the two: we only need to pass options like
`-mbranches-within-32B-boundaries` to align branches, regardless of whether LTO is enabled.

Differential Revision: https://reviews.llvm.org/D80289
---
 clang/lib/Driver/ToolChains/Clang.cpp      | 52 +----------------
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 67 +++++++++++++++++++---
 clang/lib/Driver/ToolChains/CommonArgs.h   |  2 +
 clang/test/Driver/x86-malign-branch.c      | 10 +++-
 4 files changed, 73 insertions(+), 58 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f33983db3e1eb..dd83cafb27487 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -2045,57 +2045,10 @@ void Clang::AddSystemZTargetArgs(const ArgList &Args,
   }
 }
 
-static void addX86AlignBranchArgs(const Driver &D, const ArgList &Args,
-                                  ArgStringList &CmdArgs) {
-  if (Args.hasArg(options::OPT_mbranches_within_32B_boundaries)) {
-    CmdArgs.push_back("-mllvm");
-    CmdArgs.push_back("-x86-branches-within-32B-boundaries");
-  }
-  if (const Arg *A = Args.getLastArg(options::OPT_malign_branch_boundary_EQ)) {
-    StringRef Value = A->getValue();
-    unsigned Boundary;
-    if (Value.getAsInteger(10, Boundary) || Boundary < 16 ||
-        !llvm::isPowerOf2_64(Boundary)) {
-      D.Diag(diag::err_drv_invalid_argument_to_option)
-          << Value << A->getOption().getName();
-    } else {
-      CmdArgs.push_back("-mllvm");
-      CmdArgs.push_back(
-          Args.MakeArgString("-x86-align-branch-boundary=" + Twine(Boundary)));
-    }
-  }
-  if (const Arg *A = Args.getLastArg(options::OPT_malign_branch_EQ)) {
-    std::string AlignBranch;
-    for (StringRef T : A->getValues()) {
-      if (T != "fused" && T != "jcc" && T != "jmp" && T != "call" &&
-          T != "ret" && T != "indirect")
-        D.Diag(diag::err_drv_invalid_malign_branch_EQ)
-            << T << "fused, jcc, jmp, call, ret, indirect";
-      if (!AlignBranch.empty())
-        AlignBranch += '+';
-      AlignBranch += T;
-    }
-    CmdArgs.push_back("-mllvm");
-    CmdArgs.push_back(Args.MakeArgString("-x86-align-branch=" + AlignBranch));
-  }
-  if (const Arg *A = Args.getLastArg(options::OPT_mpad_max_prefix_size_EQ)) {
-    StringRef Value = A->getValue();
-    unsigned PrefixSize;
-    if (Value.getAsInteger(10, PrefixSize)) {
-      D.Diag(diag::err_drv_invalid_argument_to_option)
-          << Value << A->getOption().getName();
-    } else {
-      CmdArgs.push_back("-mllvm");
-      CmdArgs.push_back(
-          Args.MakeArgString("-x86-pad-max-prefix-size=" + Twine(PrefixSize)));
-    }
-  }
-}
-
 void Clang::AddX86TargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
-  addX86AlignBranchArgs(D, Args, CmdArgs);
+  addX86AlignBranchArgs(D, Args, CmdArgs, /*IsLTO=*/false);
 
   if (!Args.hasFlag(options::OPT_mred_zone, options::OPT_mno_red_zone, true) ||
       Args.hasArg(options::OPT_mkernel) ||
@@ -6745,7 +6698,8 @@ void ClangAs::AddMIPSTargetArgs(const ArgList &Args,
 
 void ClangAs::AddX86TargetArgs(const ArgList &Args,
                                ArgStringList &CmdArgs) const {
-  addX86AlignBranchArgs(getToolChain().getDriver(), Args, CmdArgs);
+  addX86AlignBranchArgs(getToolChain().getDriver(), Args, CmdArgs,
+                        /*IsLTO=*/false);
 
   if (Arg *A = Args.getLastArg(options::OPT_masm_EQ)) {
     StringRef Value = A->getValue();
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 85a1a4e1ac07d..33c43222b5f9d 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -358,6 +358,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
                           ArgStringList &CmdArgs, const InputInfo &Output,
                           const InputInfo &Input, bool IsThinLTO) {
   const char *Linker = Args.MakeArgString(ToolChain.GetLinkerPath());
+  const Driver &D = ToolChain.getDriver();
   if (llvm::sys::path::filename(Linker) != "ld.lld" &&
       llvm::sys::path::stem(Linker) != "ld.lld") {
     // Tell the linker to load the plugin. This has to come before
@@ -374,10 +375,9 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
 #endif
 
     SmallString<1024> Plugin;
-    llvm::sys::path::native(Twine(ToolChain.getDriver().Dir) +
-                                "/../lib" CLANG_LIBDIR_SUFFIX "/LLVMgold" +
-                                Suffix,
-                            Plugin);
+    llvm::sys::path::native(
+        Twine(D.Dir) + "/../lib" CLANG_LIBDIR_SUFFIX "/LLVMgold" + Suffix,
+        Plugin);
     CmdArgs.push_back(Args.MakeArgString(Plugin));
   }
 
@@ -417,7 +417,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   if (IsThinLTO)
     CmdArgs.push_back("-plugin-opt=thinlto");
 
-  StringRef Parallelism = getLTOParallelism(Args, ToolChain.getDriver());
+  StringRef Parallelism = getLTOParallelism(Args, D);
   if (!Parallelism.empty())
     CmdArgs.push_back(
         Args.MakeArgString("-plugin-opt=jobs=" + Twine(Parallelism)));
@@ -449,7 +449,7 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   if (Arg *A = getLastProfileSampleUseArg(Args)) {
     StringRef FName = A->getValue();
     if (!llvm::sys::fs::exists(FName))
-      ToolChain.getDriver().Diag(diag::err_drv_no_such_file) << FName;
+      D.Diag(diag::err_drv_no_such_file) << FName;
     else
       CmdArgs.push_back(
           Args.MakeArgString(Twine("-plugin-opt=sample-profile=") + FName));
@@ -492,11 +492,12 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   }
 
   // Setup statistics file output.
-  SmallString<128> StatsFile =
-      getStatsFileName(Args, Output, Input, ToolChain.getDriver());
+  SmallString<128> StatsFile = getStatsFileName(Args, Output, Input, D);
   if (!StatsFile.empty())
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-plugin-opt=stats-file=") + StatsFile));
+
+  addX86AlignBranchArgs(D, Args, CmdArgs, /*IsLTO=*/true);
 }
 
 void tools::addArchSpecificRPath(const ToolChain &TC, const ArgList &Args,
@@ -1423,3 +1424,53 @@ void tools::addMultilibFlag(bool Enabled, const char *const Flag,
                             Multilib::flags_list &Flags) {
   Flags.push_back(std::string(Enabled ? "+" : "-") + Flag);
 }
+
+void tools::addX86AlignBranchArgs(const Driver &D, const ArgList &Args,
+                                  ArgStringList &CmdArgs, bool IsLTO) {
+  auto addArg = [&, IsLTO](const Twine &Arg) {
+    if (IsLTO) {
+      CmdArgs.push_back(Args.MakeArgString("-plugin-opt=" + Arg));
+    } else {
+      CmdArgs.push_back("-mllvm");
+      CmdArgs.push_back(Args.MakeArgString(Arg));
+    }
+  };
+
+  if (Args.hasArg(options::OPT_mbranches_within_32B_boundaries)) {
+    addArg(Twine("-x86-branches-within-32B-boundaries"));
+  }
+  if (const Arg *A = Args.getLastArg(options::OPT_malign_branch_boundary_EQ)) {
+    StringRef Value = A->getValue();
+    unsigned Boundary;
+    if (Value.getAsInteger(10, Boundary) || Boundary < 16 ||
+        !llvm::isPowerOf2_64(Boundary)) {
+      D.Diag(diag::err_drv_invalid_argument_to_option)
+          << Value << A->getOption().getName();
+    } else {
+      addArg("-x86-align-branch-boundary=" + Twine(Boundary));
+    }
+  }
+  if (const Arg *A = Args.getLastArg(options::OPT_malign_branch_EQ)) {
+    std::string AlignBranch;
+    for (StringRef T : A->getValues()) {
+      if (T != "fused" && T != "jcc" && T != "jmp" && T != "call" &&
+          T != "ret" && T != "indirect")
+        D.Diag(diag::err_drv_invalid_malign_branch_EQ)
+            << T << "fused, jcc, jmp, call, ret, indirect";
+      if (!AlignBranch.empty())
+        AlignBranch += '+';
+      AlignBranch += T;
+    }
+    addArg("-x86-align-branch=" + Twine(AlignBranch));
+  }
+  if (const Arg *A = Args.getLastArg(options::OPT_mpad_max_prefix_size_EQ)) {
+    StringRef Value = A->getValue();
+    unsigned PrefixSize;
+    if (Value.getAsInteger(10, PrefixSize)) {
+      D.Diag(diag::err_drv_invalid_argument_to_option)
+          << Value << A->getOption().getName();
+    } else {
+      addArg("-x86-pad-max-prefix-size=" + Twine(PrefixSize));
+    }
+  }
+}
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index c94b2b828c9b8..58bc92c9b7569 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -127,6 +127,8 @@ SmallString<128> getStatsFileName(const llvm::opt::ArgList &Args,
 void addMultilibFlag(bool Enabled, const char *const Flag,
                      Multilib::flags_list &Flags);
 
+void addX86AlignBranchArgs(const Driver &D, const llvm::opt::ArgList &Args,
+                           llvm::opt::ArgStringList &CmdArgs, bool IsLTO);
 } // end namespace tools
 } // end namespace driver
 } // end namespace clang
diff --git a/clang/test/Driver/x86-malign-branch.c b/clang/test/Driver/x86-malign-branch.c
index 5180eb0a16196..a71b18105baa3 100644
--- a/clang/test/Driver/x86-malign-branch.c
+++ b/clang/test/Driver/x86-malign-branch.c
@@ -1,8 +1,10 @@
-/// Test that -malign-branch* and -mbranches-within-32B-boundaries are parsed and converted to -mllvm options.
+/// Test that -malign-branch* and -mbranches-within-32B-boundaries are parsed and converted to MC options.
 
 /// Test -malign-branch-boundary=
 // RUN: %clang -target x86_64 -malign-branch-boundary=16 %s -c -### 2>&1 | FileCheck %s --check-prefix=BOUNDARY
 // BOUNDARY: "-mllvm" "-x86-align-branch-boundary=16"
+// RUN: %clang -target x86_64-unknown-linux -malign-branch-boundary=16 -flto %s -### 2>&1 | FileCheck %s --check-prefix=BOUNDARY-LTO
+// BOUNDARY-LTO: "-plugin-opt=-x86-align-branch-boundary=16"
 
 // RUN: %clang -target x86_64 -malign-branch-boundary=8 %s -c -### 2>&1 | FileCheck %s --check-prefix=BOUNDARY-ERR
 // RUN: %clang -target x86_64 -malign-branch-boundary=15 %s -c -### 2>&1 | FileCheck %s --check-prefix=BOUNDARY-ERR
@@ -13,6 +15,8 @@
 // TYPE0: "-mllvm" "-x86-align-branch=fused+jcc+jmp"
 // RUN: %clang -target x86_64 -malign-branch=fused,jcc,jmp,ret,call,indirect %s -c -### %s 2>&1 | FileCheck %s --check-prefix=TYPE1
 // TYPE1: "-mllvm" "-x86-align-branch=fused+jcc+jmp+ret+call+indirect"
+// RUN: %clang -target x86_64-unknown-linux -malign-branch=fused,jcc,jmp -flto %s -### %s 2>&1 | FileCheck %s --check-prefix=TYPE0-LTO
+// TYPE0-LTO: "-plugin-opt=-x86-align-branch=fused+jcc+jmp"
 
 // RUN: %clang -target x86_64 -malign-branch=fused,foo,bar %s -c -### %s 2>&1 | FileCheck %s --check-prefix=TYPE-ERR
 // TYPE-ERR: invalid argument 'foo' to -malign-branch=; each element must be one of: fused, jcc, jmp, call, ret, indirect
@@ -23,10 +27,14 @@
 // PREFIX-0: "-mllvm" "-x86-pad-max-prefix-size=0"
 // RUN: %clang -target x86_64 -mpad-max-prefix-size=15 %s -c -### 2>&1 | FileCheck %s --check-prefix=PREFIX-15
 // PREFIX-15: "-mllvm" "-x86-pad-max-prefix-size=15"
+// RUN: %clang -target x86_64-unknown-linux -mpad-max-prefix-size=0 -flto %s -### 2>&1 | FileCheck %s --check-prefix=PREFIX-0-LTO
+// PREFIX-0-LTO: "-plugin-opt=-x86-pad-max-prefix-size=0"
 
 /// Test -mbranches-within-32B-boundaries
 // RUN: %clang -target x86_64 -mbranches-within-32B-boundaries %s -c -### 2>&1 | FileCheck %s --check-prefix=32B
 // 32B: "-mllvm" "-x86-branches-within-32B-boundaries"
+// RUN: %clang -target x86_64-unknown-linux -mbranches-within-32B-boundaries -flto %s -### 2>&1 | FileCheck %s --check-prefix=32B-LTO
+// 32B-LTO: "-plugin-opt=-x86-branches-within-32B-boundaries"
 
 /// Unsupported on other targets.
 // RUN: %clang -target aarch64 -malign-branch=jmp %s -c -### 2>&1 | FileCheck --check-prefix=UNUSED %s

From c4990a03c6c347df120c0dbf6039e900889c4a92 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 27 May 2020 22:30:10 -0700
Subject: [PATCH 317/770] [JumpThreading] Use emplace_back instead of push_back
 (NFC)

Summary: This patch replaces push_back with emplace_back where appropriate.

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80688
---
 llvm/lib/Transforms/Scalar/JumpThreading.cpp | 28 ++++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index f5b24182edbdd..6f16d6583340d 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -616,7 +616,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
   // If V is a constant, then it is known in all predecessors.
   if (Constant *KC = getKnownConstant(V, Preference)) {
     for (BasicBlock *Pred : predecessors(BB))
-      Result.push_back(std::make_pair(KC, Pred));
+      Result.emplace_back(KC, Pred);
 
     return !Result.empty();
   }
@@ -643,7 +643,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
       // predecessor, use that information to try to thread this block.
       Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
       if (Constant *KC = getKnownConstant(PredCst, Preference))
-        Result.push_back(std::make_pair(KC, P));
+        Result.emplace_back(KC, P);
     }
 
     return !Result.empty();
@@ -654,13 +654,13 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
       Value *InVal = PN->getIncomingValue(i);
       if (Constant *KC = getKnownConstant(InVal, Preference)) {
-        Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+        Result.emplace_back(KC, PN->getIncomingBlock(i));
       } else {
         Constant *CI = LVI->getConstantOnEdge(InVal,
                                               PN->getIncomingBlock(i),
                                               BB, CxtI);
         if (Constant *KC = getKnownConstant(CI, Preference))
-          Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+          Result.emplace_back(KC, PN->getIncomingBlock(i));
       }
     }
 
@@ -759,7 +759,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
         Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
 
         if (Constant *KC = getKnownConstant(Folded, WantInteger))
-          Result.push_back(std::make_pair(KC, LHSVal.second));
+          Result.emplace_back(KC, LHSVal.second);
       }
     }
 
@@ -811,7 +811,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
         }
 
         if (Constant *KC = getKnownConstant(Res, WantInteger))
-          Result.push_back(std::make_pair(KC, PredBB));
+          Result.emplace_back(KC, PredBB);
       }
 
       return !Result.empty();
@@ -834,7 +834,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
             continue;
 
           Constant *ResC = ConstantInt::get(CmpType, Res);
-          Result.push_back(std::make_pair(ResC, P));
+          Result.emplace_back(ResC, P);
         }
 
         return !Result.empty();
@@ -873,7 +873,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
               else
                 continue;
 
-              Result.push_back(std::make_pair(ResC, P));
+              Result.emplace_back(ResC, P);
             }
 
             return !Result.empty();
@@ -891,7 +891,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
         Constant *V = LHSVal.first;
         Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
         if (Constant *KC = getKnownConstant(Folded, WantInteger))
-          Result.push_back(std::make_pair(KC, LHSVal.second));
+          Result.emplace_back(KC, LHSVal.second);
       }
 
       return !Result.empty();
@@ -925,7 +925,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
 
         // See if the select has a known constant value for this predecessor.
         if (Constant *Val = KnownCond ? TrueVal : FalseVal)
-          Result.push_back(std::make_pair(Val, C.second));
+          Result.emplace_back(Val, C.second);
       }
 
       return !Result.empty();
@@ -936,7 +936,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
   Constant *CI = LVI->getConstant(V, BB, CxtI);
   if (Constant *KC = getKnownConstant(CI, Preference)) {
     for (BasicBlock *Pred : predecessors(BB))
-      Result.push_back(std::make_pair(KC, Pred));
+      Result.emplace_back(KC, Pred);
   }
 
   return !Result.empty();
@@ -1345,7 +1345,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
 
     // If so, this load is partially redundant.  Remember this info so that we
     // can create a PHI node.
-    AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+    AvailablePreds.emplace_back(PredBB, PredAvailable);
   }
 
   // If the loaded value isn't available in any predecessor, it isn't partially
@@ -1419,7 +1419,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     if (AATags)
       NewVal->setAAMetadata(AATags);
 
-    AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+    AvailablePreds.emplace_back(UnavailablePred, NewVal);
   }
 
   // Now we know that each predecessor of this block has a value in
@@ -1652,7 +1652,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
         isa<CallBrInst>(Pred->getTerminator()))
       continue;
 
-    PredToDestList.push_back(std::make_pair(Pred, DestBB));
+    PredToDestList.emplace_back(Pred, DestBB);
   }
 
   // If all edges were unthreadable, we fail.

From 9081fa20991d101728434b354a96283b26495b71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= <1.int32@gmail.com>
Date: Wed, 27 May 2020 16:23:42 +0200
Subject: [PATCH 318/770] [Analyzer][StreamChecker] Added check for
 "indeterminate file position".

Summary:
According to the standard, after a failed `fread` or `fwrite` call the file
position becomes "indeterminate". A subsequent read or write can be assumed to
cause undefined behavior, so a (fatal error) warning is added for this case.
The indeterminate position can be cleared by some operations, for example
`fseek` or `freopen`, but not by `clearerr`.

Reviewers: Szelethus, baloghadamsoftware, martong, NoQ, xazax.hun, dcoughlin

Reviewed By: Szelethus

Subscribers: rnkovacs, NoQ, xazax.hun, baloghadamsoftware, szepet, a.sidorin, mikhail.ramalho, Szelethus, donat.nagy, dkrupp, gamesh411, Charusso, martong, ASDenysPetrov, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80018
---
 .../StaticAnalyzer/Checkers/StreamChecker.cpp | 124 ++++++++++++++++--
 clang/test/Analysis/stream-error.c            |  71 +++++++++-
 2 files changed, 181 insertions(+), 14 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 2e90be4350a03..63ebfaf90dc82 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -92,7 +92,27 @@ struct StreamState {
 
   /// State of the error flags.
   /// Ignored in non-opened stream state but must be NoError.
-  StreamErrorState ErrorState;
+  StreamErrorState const ErrorState;
+
+  /// Indicate if the file has an "indeterminate file position indicator".
+  /// This can be set at a failing read or write or seek operation.
+  /// If it is set no more read or write is allowed.
+  /// This value is not dependent on the stream error flags:
+  /// The error flag may be cleared with `clearerr` but the file position
+  /// remains still indeterminate.
+  /// This value applies to all error states in ErrorState except FEOF.
+  /// An EOF+indeterminate state is the same as EOF state.
+  bool const FilePositionIndeterminate = false;
+
+  StreamState(const FnDescription *L, KindTy S, const StreamErrorState &ES,
+              bool IsFilePositionIndeterminate)
+      : LastOperation(L), State(S), ErrorState(ES),
+        FilePositionIndeterminate(IsFilePositionIndeterminate) {
+    assert((!ES.isFEof() || !IsFilePositionIndeterminate) &&
+           "FilePositionIndeterminate should be false in FEof case.");
+    assert((State == Opened || ErrorState.isNoError()) &&
+           "ErrorState should be None in non-opened stream state.");
+  }
 
   bool isOpened() const { return State == Opened; }
   bool isClosed() const { return State == Closed; }
@@ -102,24 +122,27 @@ struct StreamState {
     // In not opened state error state should always NoError, so comparison
     // here is no problem.
     return LastOperation == X.LastOperation && State == X.State &&
-           ErrorState == X.ErrorState;
+           ErrorState == X.ErrorState &&
+           FilePositionIndeterminate == X.FilePositionIndeterminate;
   }
 
   static StreamState getOpened(const FnDescription *L,
-                               const StreamErrorState &ES = {}) {
-    return StreamState{L, Opened, ES};
+                               const StreamErrorState &ES = ErrorNone,
+                               bool IsFilePositionIndeterminate = false) {
+    return StreamState{L, Opened, ES, IsFilePositionIndeterminate};
   }
   static StreamState getClosed(const FnDescription *L) {
-    return StreamState{L, Closed, {}};
+    return StreamState{L, Closed, {}, false};
   }
   static StreamState getOpenFailed(const FnDescription *L) {
-    return StreamState{L, OpenFailed, {}};
+    return StreamState{L, OpenFailed, {}, false};
   }
 
   void Profile(llvm::FoldingSetNodeID &ID) const {
     ID.AddPointer(LastOperation);
     ID.AddInteger(State);
     ID.AddInteger(ErrorState);
+    ID.AddBoolean(FilePositionIndeterminate);
   }
 };
 
@@ -173,7 +196,8 @@ ProgramStateRef bindInt(uint64_t Value, ProgramStateRef State,
 class StreamChecker
     : public Checker<check::PreCall, eval::Call, check::DeadSymbols> {
   mutable std::unique_ptr<BuiltinBug> BT_nullfp, BT_illegalwhence,
-      BT_UseAfterClose, BT_UseAfterOpenFailed, BT_ResourceLeak, BT_StreamEof;
+      BT_UseAfterClose, BT_UseAfterOpenFailed, BT_ResourceLeak, BT_StreamEof,
+      BT_IndeterminatePosition;
 
 public:
   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
@@ -279,6 +303,16 @@ class StreamChecker
   ProgramStateRef ensureStreamOpened(SVal StreamVal, CheckerContext &C,
                                      ProgramStateRef State) const;
 
+  /// Check that the stream does not have an invalid ("indeterminate") file
+  /// position, and generate a warning if it does.
+  /// (EOF is not an invalid position.)
+  /// The returned state can be nullptr if a fatal error was generated.
+  /// It can return non-null state if the stream does not have an invalid
+  /// position or there is an execution path with a non-invalid position.
+  ProgramStateRef
+  ensureNoFilePositionIndeterminate(SVal StreamVal, CheckerContext &C,
+                                    ProgramStateRef State) const;
+
   /// Check the legality of the 'whence' argument of 'fseek'.
   /// Generate error and return nullptr if it is found to be illegal.
   /// Otherwise returns the state.
@@ -447,6 +481,9 @@ void StreamChecker::preFread(const FnDescription *Desc, const CallEvent &Call,
   if (!State)
     return;
   State = ensureStreamOpened(StreamVal, C, State);
+  if (!State)
+    return;
+  State = ensureNoFilePositionIndeterminate(StreamVal, C, State);
   if (!State)
     return;
 
@@ -468,6 +505,9 @@ void StreamChecker::preFwrite(const FnDescription *Desc, const CallEvent &Call,
   if (!State)
     return;
   State = ensureStreamOpened(StreamVal, C, State);
+  if (!State)
+    return;
+  State = ensureNoFilePositionIndeterminate(StreamVal, C, State);
   if (!State)
     return;
 
@@ -548,7 +588,9 @@ void StreamChecker::evalFreadFwrite(const FnDescription *Desc,
     NewES = (SS->ErrorState == ErrorFEof) ? ErrorFEof : ErrorFEof | ErrorFError;
   else
     NewES = ErrorFError;
-  StreamState NewState = StreamState::getOpened(Desc, NewES);
+  // If a (non-EOF) error occurs, the resulting value of the file position
+  // indicator for the stream is indeterminate.
+  StreamState NewState = StreamState::getOpened(Desc, NewES, !NewES.isFEof());
   StateFailed = StateFailed->set<StreamMap>(StreamSym, NewState);
   C.addTransition(StateFailed);
 }
@@ -601,9 +643,11 @@ void StreamChecker::evalFseek(const FnDescription *Desc, const CallEvent &Call,
       StateNotFailed->set<StreamMap>(StreamSym, StreamState::getOpened(Desc));
   // We get error.
   // It is possible that fseek fails but sets none of the error flags.
+  // If fseek failed, assume that the file position becomes indeterminate in any
+  // case.
   StateFailed = StateFailed->set<StreamMap>(
       StreamSym,
-      StreamState::getOpened(Desc, ErrorNone | ErrorFEof | ErrorFError));
+      StreamState::getOpened(Desc, ErrorNone | ErrorFEof | ErrorFError, true));
 
   C.addTransition(StateNotFailed);
   C.addTransition(StateFailed);
@@ -623,7 +667,10 @@ void StreamChecker::evalClearerr(const FnDescription *Desc,
 
   assertStreamStateOpened(SS);
 
-  State = State->set<StreamMap>(StreamSym, StreamState::getOpened(Desc));
+  // FilePositionIndeterminate is not cleared.
+  State = State->set<StreamMap>(
+      StreamSym,
+      StreamState::getOpened(Desc, ErrorNone, SS->FilePositionIndeterminate));
   C.addTransition(State);
 }
 
@@ -651,7 +698,9 @@ void StreamChecker::evalFeofFerror(const FnDescription *Desc,
     // From now on it is the only one error state.
     ProgramStateRef TrueState = bindAndAssumeTrue(State, C, CE);
     C.addTransition(TrueState->set<StreamMap>(
-        StreamSym, StreamState::getOpened(Desc, ErrorKind)));
+        StreamSym, StreamState::getOpened(Desc, ErrorKind,
+                                          SS->FilePositionIndeterminate &&
+                                              !ErrorKind.isFEof())));
   }
   if (StreamErrorState NewES = SS->ErrorState & (~ErrorKind)) {
     // Execution path(s) with ErrorKind not set.
@@ -659,7 +708,9 @@ void StreamChecker::evalFeofFerror(const FnDescription *Desc,
     // New error state is everything before minus ErrorKind.
     ProgramStateRef FalseState = bindInt(0, State, C, CE);
     C.addTransition(FalseState->set<StreamMap>(
-        StreamSym, StreamState::getOpened(Desc, NewES)));
+        StreamSym,
+        StreamState::getOpened(
+            Desc, NewES, SS->FilePositionIndeterminate && !NewES.isFEof())));
   }
 }
 
@@ -767,6 +818,55 @@ ProgramStateRef StreamChecker::ensureStreamOpened(SVal StreamVal,
   return State;
 }
 
+ProgramStateRef StreamChecker::ensureNoFilePositionIndeterminate(
+    SVal StreamVal, CheckerContext &C, ProgramStateRef State) const {
+  SymbolRef Sym = StreamVal.getAsSymbol();
+  if (!Sym)
+    return State;
+
+  const StreamState *SS = State->get<StreamMap>(Sym);
+  if (!SS)
+    return State;
+
+  assert(SS->isOpened() && "First ensure that stream is opened.");
+
+  if (SS->FilePositionIndeterminate) {
+    if (!BT_IndeterminatePosition)
+      BT_IndeterminatePosition.reset(
+          new BuiltinBug(this, "Invalid stream state",
+                         "File position of the stream might be 'indeterminate' "
+                         "after a failed operation. "
+                         "Can cause undefined behavior."));
+
+    if (SS->ErrorState & ErrorFEof) {
+      // The error is unknown but may be FEOF.
+      // Continue analysis with the FEOF error state.
+      // Report a warning because of the other possible error states.
+      ExplodedNode *N = C.generateNonFatalErrorNode(State);
+      if (!N)
+        return nullptr;
+
+      C.emitReport(std::make_unique<PathSensitiveBugReport>(
+          *BT_IndeterminatePosition, BT_IndeterminatePosition->getDescription(),
+          N));
+      return State->set<StreamMap>(
+          Sym, StreamState::getOpened(SS->LastOperation, ErrorFEof, false));
+    }
+
+    // Known or unknown error state without FEOF possible.
+    // Stop analysis, report error.
+    ExplodedNode *N = C.generateErrorNode(State);
+    if (N)
+      C.emitReport(std::make_unique<PathSensitiveBugReport>(
+          *BT_IndeterminatePosition, BT_IndeterminatePosition->getDescription(),
+          N));
+
+    return nullptr;
+  }
+
+  return State;
+}
+
 ProgramStateRef
 StreamChecker::ensureFseekWhenceCorrect(SVal WhenceVal, CheckerContext &C,
                                         ProgramStateRef State) const {
diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c
index cc0147deafdf2..e91ab2c6c28cc 100644
--- a/clang/test/Analysis/stream-error.c
+++ b/clang/test/Analysis/stream-error.c
@@ -76,7 +76,7 @@ void error_fread() {
     }
     if (ferror(F)) {
       clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
-      fread(Buf, 1, 10, F);           // no warning
+      fread(Buf, 1, 10, F);           // expected-warning {{might be 'indeterminate'}}
     }
   }
   fclose(F);
@@ -94,7 +94,7 @@ void error_fwrite() {
   } else {
     clang_analyzer_eval(feof(F));   // expected-warning {{FALSE}}
     clang_analyzer_eval(ferror(F)); // expected-warning {{TRUE}}
-    fwrite(0, 1, 10, F);            // no warning
+    fwrite(0, 1, 10, F);            // expected-warning {{might be 'indeterminate'}}
   }
   fclose(F);
   Ret = fwrite(0, 1, 10, F); // expected-warning {{Stream might be already closed}}
@@ -166,3 +166,70 @@ void error_fseek() {
   }
   fclose(F);
 }
+
+void error_indeterminate() {
+  FILE *F = fopen("file", "r+");
+  if (!F)
+    return;
+  const char *Buf = "123456789";
+  int rc = fseek(F, 0, SEEK_SET);
+  if (rc) {
+    if (feof(F)) {
+      fwrite(Buf, 1, 10, F); // no warning
+    } else if (ferror(F)) {
+      fwrite(Buf, 1, 10, F); // expected-warning {{might be 'indeterminate'}}
+    } else {
+      fwrite(Buf, 1, 10, F); // expected-warning {{might be 'indeterminate'}}
+    }
+  }
+  fclose(F);
+}
+
+void error_indeterminate_clearerr() {
+  FILE *F = fopen("file", "r+");
+  if (!F)
+    return;
+  const char *Buf = "123456789";
+  int rc = fseek(F, 0, SEEK_SET);
+  if (rc) {
+    if (feof(F)) {
+      clearerr(F);
+      fwrite(Buf, 1, 10, F); // no warning
+    } else if (ferror(F)) {
+      clearerr(F);
+      fwrite(Buf, 1, 10, F); // expected-warning {{might be 'indeterminate'}}
+    } else {
+      clearerr(F);
+      fwrite(Buf, 1, 10, F); // expected-warning {{might be 'indeterminate'}}
+    }
+  }
+  fclose(F);
+}
+
+void error_indeterminate_feof1() {
+  FILE *F = fopen("file", "r+");
+  if (!F)
+    return;
+  char Buf[10];
+  if (fread(Buf, 1, 10, F) < 10) {
+    if (feof(F)) {
+      // error is feof, should be non-indeterminate
+      fwrite("1", 1, 1, F); // no warning
+    }
+  }
+  fclose(F);
+}
+
+void error_indeterminate_feof2() {
+  FILE *F = fopen("file", "r+");
+  if (!F)
+    return;
+  char Buf[10];
+  if (fread(Buf, 1, 10, F) < 10) {
+    if (ferror(F) == 0) {
+      // error is feof, should be non-indeterminate
+      fwrite("1", 1, 1, F); // no warning
+    }
+  }
+  fclose(F);
+}

From 880c35a554952c3a64483502f3278431f8f06516 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Thu, 28 May 2020 08:36:04 +0100
Subject: [PATCH 319/770] [HardwareLoops] LangRef Intrinsic descriptions

The HardwareLoop intrinsics were missing and not described in LangRef. This
adds these descriptions/definitions.

Differential Revision: https://reviews.llvm.org/D80316
---
 llvm/docs/LangRef.rst | 163 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 61a0085c6f881..0e18dcc9f99e8 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14871,6 +14871,169 @@ Examples:
       %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
 
 
+Hardware-Loop Intrinsics
+------------------------
+
+LLVM supports several intrinsics to mark a loop as a hardware-loop. They are
+hints to the backend, which is required to lower these intrinsics further to
+target-specific instructions, or to revert the hardware-loop to a normal loop if
+target-specific restrictions are not met and a hardware-loop can't be generated.
+
+These intrinsics may be modified in the future and are not intended to be used
+outside the backend. Thus, front-end and mid-level optimizations should not be
+generating these intrinsics.
+
+
+'``llvm.set.loop.iterations.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic.
+
+::
+
+      declare void @llvm.set.loop.iterations.i32(i32)
+      declare void @llvm.set.loop.iterations.i64(i64)
+
+Overview:
+"""""""""
+
+The '``llvm.set.loop.iterations.*``' intrinsics are used to specify the
+hardware-loop trip count. They are placed in the loop preheader basic block and
+are marked as ``IntrNoDuplicate`` to avoid optimizers duplicating these
+instructions.
+
+Arguments:
+""""""""""
+
+The integer operand is the loop trip count of the hardware-loop, and thus
+not e.g. the loop back-edge taken count.
+
+Semantics:
+""""""""""
+
+The '``llvm.set.loop.iterations.*``' intrinsics do not perform any arithmetic
+on their operand. It's a hint to the backend that can use this to set up the
+hardware-loop count with a target specific instruction, usually a move of this
+value to a special register or a hardware-loop instruction.
+
+'``llvm.test.set.loop.iterations.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic.
+
+::
+
+      declare i1 @llvm.test.set.loop.iterations.i32(i32)
+      declare i1 @llvm.test.set.loop.iterations.i64(i64)
+
+Overview:
+"""""""""
+
+The '``llvm.test.set.loop.iterations.*``' intrinsics are used to specify the
+loop trip count, and also test that the given count is not zero, allowing
+it to control entry to a while-loop.  They are placed in the loop preheader's
+predecessor basic block, and are marked as ``IntrNoDuplicate`` to avoid
+optimizers duplicating these instructions.
+
+Arguments:
+""""""""""
+
+The integer operand is the loop trip count of the hardware-loop, and thus
+not e.g. the loop back-edge taken count.
+
+Semantics:
+""""""""""
+
+The '``llvm.test.set.loop.iterations.*``' intrinsics do not perform any
+arithmetic on their operand. It's a hint to the backend that can use this to
+set up the hardware-loop count with a target specific instruction, usually a
+move of this value to a special register or a hardware-loop instruction.
+
+'``llvm.loop.decrement.reg.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
+      declare i64 @llvm.loop.decrement.reg.i64(i64, i64)
+
+Overview:
+"""""""""
+
+The '``llvm.loop.decrement.reg.*``' intrinsics are used to lower the loop
+iteration counter and return an updated value that will be used in the next
+loop test check.
+
+Arguments:
+""""""""""
+
+Both arguments must have identical integer types. The first operand is the
+loop iteration counter. The second operand is the maximum number of elements
+processed in an iteration.
+
+Semantics:
+""""""""""
+
+The '``llvm.loop.decrement.reg.*``' intrinsics do an integer ``SUB`` of their
+two operands, which is not allowed to wrap. They return the remaining number of
+iterations still to be executed, and can be used together with a ``PHI``,
+``ICMP`` and ``BR`` to control the number of loop iterations executed. Any
+optimisations are allowed to treat it as a ``SUB``, and it is supported by
+SCEV, so it's the backend's responsibility to handle cases where it may be
+optimised. These intrinsics are marked as ``IntrNoDuplicate`` to avoid
+optimizers duplicating these instructions.
+
+
+'``llvm.loop.decrement.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic.
+
+::
+
+      declare i1 @llvm.loop.decrement.i32(i32)
+      declare i1 @llvm.loop.decrement.i64(i64)
+
+Overview:
+"""""""""
+
+The HardwareLoops pass allows the loop decrement value to be specified with an
+option. It defaults to a loop decrement value of 1, but it can be an unsigned
+integer value provided by this option.  The '``llvm.loop.decrement.*``'
+intrinsics decrement the loop iteration counter with this value, and return a
+false predicate if the loop should exit, and true otherwise.
+This is emitted if the loop counter is not updated via a ``PHI`` node, which
+can also be controlled with an option.
+
+Arguments:
+""""""""""
+
+The integer argument is the loop decrement value used to decrement the loop
+iteration counter.
+
+Semantics:
+""""""""""
+
+The '``llvm.loop.decrement.*``' intrinsics do a ``SUB`` of the loop iteration
+counter with the given loop decrement value, and return false if the loop
+should exit. This ``SUB`` is not allowed to wrap. The result is a condition
+that is used by the conditional branch controlling the loop.
+
+
 Experimental Vector Reduction Intrinsics
 ----------------------------------------
 

From 5921782f744deffb5f5bfd96f6a7932a4ff75666 Mon Sep 17 00:00:00 2001
From: "Kazushi (Jam) Marukawa" <marukawa@nec.com>
Date: Thu, 28 May 2020 10:07:21 +0200
Subject: [PATCH 320/770] [VE] Implements minimum MC layer for VE (3/4)

Summary:
Define ELF binary code for VE and modify the code that should use this new code.

Depends on D79544.

Reviewed By: jhenderson

Differential Revision: https://reviews.llvm.org/D79545
---
 llvm/include/llvm/BinaryFormat/ELF.h          |   6 +
 .../llvm/BinaryFormat/ELFRelocs/VE.def        |  48 ++
 llvm/include/llvm/Object/ELFObjectFile.h      |   4 +
 llvm/lib/Object/ELF.cpp                       |   7 +
 llvm/lib/ObjectYAML/ELFYAML.cpp               |   4 +
 .../ELF/file-header-machine-types.test        | 487 ++++++++++++++++++
 llvm/tools/llvm-readobj/ELFDumper.cpp         |   3 +
 llvm/unittests/Object/CMakeLists.txt          |   2 +
 llvm/unittests/Object/ELFObjectFileTest.cpp   | 127 +++++
 llvm/unittests/Object/ELFTest.cpp             |  56 ++
 llvm/unittests/ObjectYAML/CMakeLists.txt      |   1 +
 llvm/unittests/ObjectYAML/ELFYAMLTest.cpp     | 134 +++++
 12 files changed, 879 insertions(+)
 create mode 100644 llvm/include/llvm/BinaryFormat/ELFRelocs/VE.def
 create mode 100644 llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test
 create mode 100644 llvm/unittests/Object/ELFObjectFileTest.cpp
 create mode 100644 llvm/unittests/Object/ELFTest.cpp
 create mode 100644 llvm/unittests/ObjectYAML/ELFYAMLTest.cpp

diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 1b0412bc47be4..1a17135b60788 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -311,6 +311,7 @@ enum {
   EM_RISCV = 243,         // RISC-V
   EM_LANAI = 244,         // Lanai 32-bit processor
   EM_BPF = 247,           // Linux kernel bpf virtual machine
+  EM_VE = 251,            // NEC SX-Aurora VE
 };
 
 // Object file classes.
@@ -764,6 +765,11 @@ enum {
 #include "ELFRelocs/MSP430.def"
 };
 
+// ELF Relocation type for VE.
+enum {
+#include "ELFRelocs/VE.def"
+};
+
 #undef ELF_RELOC
 
 // Section header.
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/VE.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/VE.def
new file mode 100644
index 0000000000000..9bfdbf1b0960f
--- /dev/null
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/VE.def
@@ -0,0 +1,48 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+// Relocation types defined in the following documents.
+//
+//  - System V Application Binary Interface - VE Architecture
+//    Processor Supplement
+//  - ELF Handling For Thread-Local Storage - VE Architecture
+//    Processor Supplement
+
+ELF_RELOC(R_VE_NONE,         0)
+ELF_RELOC(R_VE_REFLONG,      1)
+ELF_RELOC(R_VE_REFQUAD,      2)
+ELF_RELOC(R_VE_SREL32,       3)
+ELF_RELOC(R_VE_HI32,         4)
+ELF_RELOC(R_VE_LO32,         5)
+ELF_RELOC(R_VE_PC_HI32,      6)
+ELF_RELOC(R_VE_PC_LO32,      7)
+ELF_RELOC(R_VE_GOT32,        8)
+ELF_RELOC(R_VE_GOT_HI32,     9)
+ELF_RELOC(R_VE_GOT_LO32,     10)
+ELF_RELOC(R_VE_GOTOFF32,     11)
+ELF_RELOC(R_VE_GOTOFF_HI32,  12)
+ELF_RELOC(R_VE_GOTOFF_LO32,  13)
+ELF_RELOC(R_VE_PLT32,        14)
+ELF_RELOC(R_VE_PLT_HI32,     15)
+ELF_RELOC(R_VE_PLT_LO32,     16)
+ELF_RELOC(R_VE_RELATIVE,     17)
+ELF_RELOC(R_VE_GLOB_DAT,     18)
+ELF_RELOC(R_VE_JUMP_SLOT,    19)
+ELF_RELOC(R_VE_COPY,         20)
+ELF_RELOC(R_VE_DTPMOD64,     22)
+ELF_RELOC(R_VE_DTPOFF64,     23)
+// ELF_RELOC(R_VE_TPOFF64,     24)
+ELF_RELOC(R_VE_TLS_GD_HI32,  25)
+ELF_RELOC(R_VE_TLS_GD_LO32,  26)
+// ELF_RELOC(R_VE_TLS_LD_HI32,  27)
+// ELF_RELOC(R_VE_TLS_LD_LO32,  28)
+// ELF_RELOC(R_VE_DTPOFF32,     29)
+// ELF_RELOC(R_VE_TLS_IE_HI32,  30)
+// ELF_RELOC(R_VE_TLS_IE_LO32,  31)
+ELF_RELOC(R_VE_TPOFF_HI32,   32)
+ELF_RELOC(R_VE_TPOFF_LO32,   33)
+// ELF_RELOC(R_VE_TPOFF32,      34)
+ELF_RELOC(R_VE_CALL_HI32,    35)
+ELF_RELOC(R_VE_CALL_LO32,    36)
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index f6435d8b7ccc8..d7fdc5294a0ab 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -1139,6 +1139,8 @@ StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
       return "elf64-amdgpu";
     case ELF::EM_BPF:
       return "elf64-bpf";
+    case ELF::EM_VE:
+      return "elf64-ve";
     default:
       return "elf64-unknown";
     }
@@ -1217,6 +1219,8 @@ template <class ELFT> Triple::ArchType ELFObjectFile<ELFT>::getArch() const {
   case ELF::EM_BPF:
     return IsLittleEndian ? Triple::bpfel : Triple::bpfeb;
 
+  case ELF::EM_VE:
+    return Triple::ve;
   default:
     return Triple::UnknownArch;
   }
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index e60ddf142ac3d..2515695095a1c 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -145,6 +145,13 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
       break;
     }
     break;
+  case ELF::EM_VE:
+    switch (Type) {
+#include "llvm/BinaryFormat/ELFRelocs/VE.def"
+    default:
+      break;
+    }
+    break;
   default:
     break;
   }
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 445fcbc412ba2..3a621d77a36b5 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -221,6 +221,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(
   ECase(EM_RISCV);
   ECase(EM_LANAI);
   ECase(EM_BPF);
+  ECase(EM_VE);
 #undef ECase
   IO.enumFallback<Hex16>(Value);
 }
@@ -662,6 +663,9 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
   case ELF::EM_BPF:
 #include "llvm/BinaryFormat/ELFRelocs/BPF.def"
     break;
+  case ELF::EM_VE:
+#include "llvm/BinaryFormat/ELFRelocs/VE.def"
+    break;
   case ELF::EM_PPC64:
 #include "llvm/BinaryFormat/ELFRelocs/PowerPC64.def"
     break;
diff --git a/llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test b/llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test
new file mode 100644
index 0000000000000..a497faacf8b3c
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test
@@ -0,0 +1,487 @@
+## Show that all machine codes are correctly printed.
+
+# RUN: yaml2obj %s -o %t.none.o -D MACHINE=EM_NONE
+# RUN: llvm-readelf --file-headers %t.none.o | FileCheck %s -DMACHINE="None"
+
+# RUN: yaml2obj %s -o %t.m32.o -D MACHINE=EM_M32
+# RUN: llvm-readelf --file-headers %t.m32.o | FileCheck %s -DMACHINE="WE32100"
+
+# RUN: yaml2obj %s -o %t.sparc.o -D MACHINE=EM_SPARC
+# RUN: llvm-readelf --file-headers %t.sparc.o | FileCheck %s -DMACHINE="Sparc"
+
+# RUN: yaml2obj %s -o %t.386.o -D MACHINE=EM_386
+# RUN: llvm-readelf --file-headers %t.386.o | FileCheck %s -DMACHINE="Intel 80386"
+
+# RUN: yaml2obj %s -o %t.68k.o -D MACHINE=EM_68K
+# RUN: llvm-readelf --file-headers %t.68k.o | FileCheck %s -DMACHINE="MC68000"
+
+# RUN: yaml2obj %s -o %t.88k.o -D MACHINE=EM_88K
+# RUN: llvm-readelf --file-headers %t.88k.o | FileCheck %s -DMACHINE="MC88000"
+
+# RUN: yaml2obj %s -o %t.iamcu.o -D MACHINE=EM_IAMCU
+# RUN: llvm-readelf --file-headers %t.iamcu.o | FileCheck %s -DMACHINE="EM_IAMCU"
+
+# RUN: yaml2obj %s -o %t.860.o -D MACHINE=EM_860
+# RUN: llvm-readelf --file-headers %t.860.o | FileCheck %s -DMACHINE="Intel 80860"
+
+# RUN: yaml2obj %s -o %t.mips.o -D MACHINE=EM_MIPS
+# RUN: llvm-readelf --file-headers %t.mips.o | FileCheck %s -DMACHINE="MIPS R3000"
+
+# RUN: yaml2obj %s -o %t.s370.o -D MACHINE=EM_S370
+# RUN: llvm-readelf --file-headers %t.s370.o | FileCheck %s -DMACHINE="IBM System/370"
+
+# RUN: yaml2obj %s -o %t.mips_rs3_le.o -D MACHINE=EM_MIPS_RS3_LE
+# RUN: llvm-readelf --file-headers %t.mips_rs3_le.o | FileCheck %s -DMACHINE="MIPS R3000 little-endian"
+
+# RUN: yaml2obj %s -o %t.parisc.o -D MACHINE=EM_PARISC
+# RUN: llvm-readelf --file-headers %t.parisc.o | FileCheck %s -DMACHINE="HPPA"
+
+# RUN: yaml2obj %s -o %t.vpp500.o -D MACHINE=EM_VPP500
+# RUN: llvm-readelf --file-headers %t.vpp500.o | FileCheck %s -DMACHINE="Fujitsu VPP500"
+
+# RUN: yaml2obj %s -o %t.sparc32plus.o -D MACHINE=EM_SPARC32PLUS
+# RUN: llvm-readelf --file-headers %t.sparc32plus.o | FileCheck %s -DMACHINE="Sparc v8+"
+
+# RUN: yaml2obj %s -o %t.960.o -D MACHINE=EM_960
+# RUN: llvm-readelf --file-headers %t.960.o | FileCheck %s -DMACHINE="Intel 80960"
+
+# RUN: yaml2obj %s -o %t.ppc.o -D MACHINE=EM_PPC
+# RUN: llvm-readelf --file-headers %t.ppc.o | FileCheck %s -DMACHINE="PowerPC"
+
+# RUN: yaml2obj %s -o %t.ppc64.o -D MACHINE=EM_PPC64
+# RUN: llvm-readelf --file-headers %t.ppc64.o | FileCheck %s -DMACHINE="PowerPC64"
+
+# RUN: yaml2obj %s -o %t.s390.o -D MACHINE=EM_S390
+# RUN: llvm-readelf --file-headers %t.s390.o | FileCheck %s -DMACHINE="IBM S/390"
+
+# RUN: yaml2obj %s -o %t.spu.o -D MACHINE=EM_SPU
+# RUN: llvm-readelf --file-headers %t.spu.o | FileCheck %s -DMACHINE="SPU"
+
+# RUN: yaml2obj %s -o %t.v800.o -D MACHINE=EM_V800
+# RUN: llvm-readelf --file-headers %t.v800.o | FileCheck %s -DMACHINE="NEC V800 series"
+
+# RUN: yaml2obj %s -o %t.fr20.o -D MACHINE=EM_FR20
+# RUN: llvm-readelf --file-headers %t.fr20.o | FileCheck %s -DMACHINE="Fujistsu FR20"
+
+# RUN: yaml2obj %s -o %t.rh32.o -D MACHINE=EM_RH32
+# RUN: llvm-readelf --file-headers %t.rh32.o | FileCheck %s -DMACHINE="TRW RH-32"
+
+# RUN: yaml2obj %s -o %t.rce.o -D MACHINE=EM_RCE
+# RUN: llvm-readelf --file-headers %t.rce.o | FileCheck %s -DMACHINE="Motorola RCE"
+
+# RUN: yaml2obj %s -o %t.arm.o -D MACHINE=EM_ARM
+# RUN: llvm-readelf --file-headers %t.arm.o | FileCheck %s -DMACHINE="ARM"
+
+# RUN: yaml2obj %s -o %t.alpha.o -D MACHINE=EM_ALPHA
+# RUN: llvm-readelf --file-headers %t.alpha.o | FileCheck %s -DMACHINE="EM_ALPHA"
+
+# RUN: yaml2obj %s -o %t.sh.o -D MACHINE=EM_SH
+# RUN: llvm-readelf --file-headers %t.sh.o | FileCheck %s -DMACHINE="Hitachi SH"
+
+# RUN: yaml2obj %s -o %t.sparcv9.o -D MACHINE=EM_SPARCV9
+# RUN: llvm-readelf --file-headers %t.sparcv9.o | FileCheck %s -DMACHINE="Sparc v9"
+
+# RUN: yaml2obj %s -o %t.tricore.o -D MACHINE=EM_TRICORE
+# RUN: llvm-readelf --file-headers %t.tricore.o | FileCheck %s -DMACHINE="Siemens Tricore"
+
+# RUN: yaml2obj %s -o %t.arc.o -D MACHINE=EM_ARC
+# RUN: llvm-readelf --file-headers %t.arc.o | FileCheck %s -DMACHINE="ARC"
+
+# RUN: yaml2obj %s -o %t.h8_300.o -D MACHINE=EM_H8_300
+# RUN: llvm-readelf --file-headers %t.h8_300.o | FileCheck %s -DMACHINE="Hitachi H8/300"
+
+# RUN: yaml2obj %s -o %t.h8_300h.o -D MACHINE=EM_H8_300H
+# RUN: llvm-readelf --file-headers %t.h8_300h.o | FileCheck %s -DMACHINE="Hitachi H8/300H"
+
+# RUN: yaml2obj %s -o %t.h8s.o -D MACHINE=EM_H8S
+# RUN: llvm-readelf --file-headers %t.h8s.o | FileCheck %s -DMACHINE="Hitachi H8S"
+
+# RUN: yaml2obj %s -o %t.h8_500.o -D MACHINE=EM_H8_500
+# RUN: llvm-readelf --file-headers %t.h8_500.o | FileCheck %s -DMACHINE="Hitachi H8/500"
+
+# RUN: yaml2obj %s -o %t.ia_64.o -D MACHINE=EM_IA_64
+# RUN: llvm-readelf --file-headers %t.ia_64.o | FileCheck %s -DMACHINE="Intel IA-64"
+
+# RUN: yaml2obj %s -o %t.mips_x.o -D MACHINE=EM_MIPS_X
+# RUN: llvm-readelf --file-headers %t.mips_x.o | FileCheck %s -DMACHINE="Stanford MIPS-X"
+
+# RUN: yaml2obj %s -o %t.coldfire.o -D MACHINE=EM_COLDFIRE
+# RUN: llvm-readelf --file-headers %t.coldfire.o | FileCheck %s -DMACHINE="Motorola Coldfire"
+
+# RUN: yaml2obj %s -o %t.68hc12.o -D MACHINE=EM_68HC12
+# RUN: llvm-readelf --file-headers %t.68hc12.o | FileCheck %s -DMACHINE="Motorola MC68HC12 Microcontroller"
+
+# RUN: yaml2obj %s -o %t.mma.o -D MACHINE=EM_MMA
+# RUN: llvm-readelf --file-headers %t.mma.o | FileCheck %s -DMACHINE="Fujitsu Multimedia Accelerator"
+
+# RUN: yaml2obj %s -o %t.pcp.o -D MACHINE=EM_PCP
+# RUN: llvm-readelf --file-headers %t.pcp.o | FileCheck %s -DMACHINE="Siemens PCP"
+
+# RUN: yaml2obj %s -o %t.ncpu.o -D MACHINE=EM_NCPU
+# RUN: llvm-readelf --file-headers %t.ncpu.o | FileCheck %s -DMACHINE="Sony nCPU embedded RISC processor"
+
+# RUN: yaml2obj %s -o %t.ndri.o -D MACHINE=EM_NDR1
+# RUN: llvm-readelf --file-headers %t.ndri.o | FileCheck %s -DMACHINE="Denso NDR1 microprocesspr"
+
+# RUN: yaml2obj %s -o %t.starcore.o -D MACHINE=EM_STARCORE
+# RUN: llvm-readelf --file-headers %t.starcore.o | FileCheck %s -DMACHINE="Motorola Star*Core processor"
+
+# RUN: yaml2obj %s -o %t.me16.o -D MACHINE=EM_ME16
+# RUN: llvm-readelf --file-headers %t.me16.o | FileCheck %s -DMACHINE="Toyota ME16 processor"
+
+# RUN: yaml2obj %s -o %t.st100.o -D MACHINE=EM_ST100
+# RUN: llvm-readelf --file-headers %t.st100.o | FileCheck %s -DMACHINE="STMicroelectronics ST100 processor"
+
+# RUN: yaml2obj %s -o %t.tinyj.o -D MACHINE=EM_TINYJ
+# RUN: llvm-readelf --file-headers %t.tinyj.o | FileCheck %s -DMACHINE="Advanced Logic Corp. TinyJ embedded processor"
+
+# RUN: yaml2obj %s -o %t.x86_64.o -D MACHINE=EM_X86_64
+# RUN: llvm-readelf --file-headers %t.x86_64.o | FileCheck %s -DMACHINE="Advanced Micro Devices X86-64"
+
+# RUN: yaml2obj %s -o %t.pdsp.o -D MACHINE=EM_PDSP
+# RUN: llvm-readelf --file-headers %t.pdsp.o | FileCheck %s -DMACHINE="Sony DSP processor"
+
+# RUN: yaml2obj %s -o %t.pdp10.o -D MACHINE=EM_PDP10
+# RUN: llvm-readelf --file-headers %t.pdp10.o | FileCheck %s -DMACHINE="Digital Equipment Corp. PDP-10"
+
+# RUN: yaml2obj %s -o %t.pdp11.o -D MACHINE=EM_PDP11
+# RUN: llvm-readelf --file-headers %t.pdp11.o | FileCheck %s -DMACHINE="Digital Equipment Corp. PDP-11"
+
+# RUN: yaml2obj %s -o %t.fx66.o -D MACHINE=EM_FX66
+# RUN: llvm-readelf --file-headers %t.fx66.o | FileCheck %s -DMACHINE="Siemens FX66 microcontroller"
+
+# RUN: yaml2obj %s -o %t.st9plus.o -D MACHINE=EM_ST9PLUS
+# RUN: llvm-readelf --file-headers %t.st9plus.o | FileCheck %s -DMACHINE="STMicroelectronics ST9+ 8/16 bit microcontroller"
+
+# RUN: yaml2obj %s -o %t.st7.o -D MACHINE=EM_ST7
+# RUN: llvm-readelf --file-headers %t.st7.o | FileCheck %s -DMACHINE="STMicroelectronics ST7 8-bit microcontroller"
+
+# RUN: yaml2obj %s -o %t.68hc16.o -D MACHINE=EM_68HC16
+# RUN: llvm-readelf --file-headers %t.68hc16.o | FileCheck %s -DMACHINE="Motorola MC68HC16 Microcontroller"
+
+# RUN: yaml2obj %s -o %t.68hc11.o -D MACHINE=EM_68HC11
+# RUN: llvm-readelf --file-headers %t.68hc11.o | FileCheck %s -DMACHINE="Motorola MC68HC11 Microcontroller"
+
+# RUN: yaml2obj %s -o %t.68hc08.o -D MACHINE=EM_68HC08
+# RUN: llvm-readelf --file-headers %t.68hc08.o | FileCheck %s -DMACHINE="Motorola MC68HC08 Microcontroller"
+
+# RUN: yaml2obj %s -o %t.68hc05.o -D MACHINE=EM_68HC05
+# RUN: llvm-readelf --file-headers %t.68hc05.o | FileCheck %s -DMACHINE="Motorola MC68HC05 Microcontroller"
+
+# RUN: yaml2obj %s -o %t.svx.o -D MACHINE=EM_SVX
+# RUN: llvm-readelf --file-headers %t.svx.o | FileCheck %s -DMACHINE="Silicon Graphics SVx"
+
+# RUN: yaml2obj %s -o %t.st19.o -D MACHINE=EM_ST19
+# RUN: llvm-readelf --file-headers %t.st19.o | FileCheck %s -DMACHINE="STMicroelectronics ST19 8-bit microcontroller"
+
+# RUN: yaml2obj %s -o %t.vax.o -D MACHINE=EM_VAX
+# RUN: llvm-readelf --file-headers %t.vax.o | FileCheck %s -DMACHINE="Digital VAX"
+
+# RUN: yaml2obj %s -o %t.cris.o -D MACHINE=EM_CRIS
+# RUN: llvm-readelf --file-headers %t.cris.o | FileCheck %s -DMACHINE="Axis Communications 32-bit embedded processor"
+
+# RUN: yaml2obj %s -o %t.javelin.o -D MACHINE=EM_JAVELIN
+# RUN: llvm-readelf --file-headers %t.javelin.o | FileCheck %s -DMACHINE="Infineon Technologies 32-bit embedded cpu"
+
+# RUN: yaml2obj %s -o %t.firepath.o -D MACHINE=EM_FIREPATH
+# RUN: llvm-readelf --file-headers %t.firepath.o | FileCheck %s -DMACHINE="Element 14 64-bit DSP processor"
+
+# RUN: yaml2obj %s -o %t.zsp.o -D MACHINE=EM_ZSP
+# RUN: llvm-readelf --file-headers %t.zsp.o | FileCheck %s -DMACHINE="LSI Logic's 16-bit DSP processor"
+
+# RUN: yaml2obj %s -o %t.mmix.o -D MACHINE=EM_MMIX
+# RUN: llvm-readelf --file-headers %t.mmix.o | FileCheck %s -DMACHINE="Donald Knuth's educational 64-bit processor"
+
+# RUN: yaml2obj %s -o %t.huany.o -D MACHINE=EM_HUANY
+# RUN: llvm-readelf --file-headers %t.huany.o | FileCheck %s -DMACHINE="Harvard Universitys's machine-independent object format"
+
+# RUN: yaml2obj %s -o %t.prism.o -D MACHINE=EM_PRISM
+# RUN: llvm-readelf --file-headers %t.prism.o | FileCheck %s -DMACHINE="Vitesse Prism"
+
+# RUN: yaml2obj %s -o %t.avr.o -D MACHINE=EM_AVR
+# RUN: llvm-readelf --file-headers %t.avr.o | FileCheck %s -DMACHINE="Atmel AVR 8-bit microcontroller"
+
+# RUN: yaml2obj %s -o %t.fr30.o -D MACHINE=EM_FR30
+# RUN: llvm-readelf --file-headers %t.fr30.o | FileCheck %s -DMACHINE="Fujitsu FR30"
+
+# RUN: yaml2obj %s -o %t.d10v.o -D MACHINE=EM_D10V
+# RUN: llvm-readelf --file-headers %t.d10v.o | FileCheck %s -DMACHINE="Mitsubishi D10V"
+
+# RUN: yaml2obj %s -o %t.d30v.o -D MACHINE=EM_D30V
+# RUN: llvm-readelf --file-headers %t.d30v.o | FileCheck %s -DMACHINE="Mitsubishi D30V"
+
+# RUN: yaml2obj %s -o %t.v850.o -D MACHINE=EM_V850
+# RUN: llvm-readelf --file-headers %t.v850.o | FileCheck %s -DMACHINE="NEC v850"
+
+# RUN: yaml2obj %s -o %t.m32r.o -D MACHINE=EM_M32R
+# RUN: llvm-readelf --file-headers %t.m32r.o | FileCheck %s -DMACHINE="Renesas M32R (formerly Mitsubishi M32r)"
+
+# RUN: yaml2obj %s -o %t.mn10300.o -D MACHINE=EM_MN10300
+# RUN: llvm-readelf --file-headers %t.mn10300.o | FileCheck %s -DMACHINE="Matsushita MN10300"
+
+# RUN: yaml2obj %s -o %t.mn10200.o -D MACHINE=EM_MN10200
+# RUN: llvm-readelf --file-headers %t.mn10200.o | FileCheck %s -DMACHINE="Matsushita MN10200"
+
+# RUN: yaml2obj %s -o %t.pj.o -D MACHINE=EM_PJ
+# RUN: llvm-readelf --file-headers %t.pj.o | FileCheck %s -DMACHINE="picoJava"
+
+# RUN: yaml2obj %s -o %t.openrisc.o -D MACHINE=EM_OPENRISC
+# RUN: llvm-readelf --file-headers %t.openrisc.o | FileCheck %s -DMACHINE="OpenRISC 32-bit embedded processor"
+
+# RUN: yaml2obj %s -o %t.arc_compact.o -D MACHINE=EM_ARC_COMPACT
+# RUN: llvm-readelf --file-headers %t.arc_compact.o | FileCheck %s -DMACHINE="EM_ARC_COMPACT"
+
+# RUN: yaml2obj %s -o %t.xtensa.o -D MACHINE=EM_XTENSA
+# RUN: llvm-readelf --file-headers %t.xtensa.o | FileCheck %s -DMACHINE="Tensilica Xtensa Processor"
+
+# RUN: yaml2obj %s -o %t.videocore.o -D MACHINE=EM_VIDEOCORE
+# RUN: llvm-readelf --file-headers %t.videocore.o | FileCheck %s -DMACHINE="Alphamosaic VideoCore processor"
+
+# RUN: yaml2obj %s -o %t.tmm_gpp.o -D MACHINE=EM_TMM_GPP
+# RUN: llvm-readelf --file-headers %t.tmm_gpp.o | FileCheck %s -DMACHINE="Thompson Multimedia General Purpose Processor"
+
+# RUN: yaml2obj %s -o %t.ns32k.o -D MACHINE=EM_NS32K
+# RUN: llvm-readelf --file-headers %t.ns32k.o | FileCheck %s -DMACHINE="National Semiconductor 32000 series"
+
+# RUN: yaml2obj %s -o %t.tpc.o -D MACHINE=EM_TPC
+# RUN: llvm-readelf --file-headers %t.tpc.o | FileCheck %s -DMACHINE="Tenor Network TPC processor"
+
+# RUN: yaml2obj %s -o %t.snp1k.o -D MACHINE=EM_SNP1K
+# RUN: llvm-readelf --file-headers %t.snp1k.o | FileCheck %s -DMACHINE="EM_SNP1K"
+
+# RUN: yaml2obj %s -o %t.st200.o -D MACHINE=EM_ST200
+# RUN: llvm-readelf --file-headers %t.st200.o | FileCheck %s -DMACHINE="STMicroelectronics ST200 microcontroller"
+
+# RUN: yaml2obj %s -o %t.ip2k.o -D MACHINE=EM_IP2K
+# RUN: llvm-readelf --file-headers %t.ip2k.o | FileCheck %s -DMACHINE="Ubicom IP2xxx 8-bit microcontrollers"
+
+# RUN: yaml2obj %s -o %t.max.o -D MACHINE=EM_MAX
+# RUN: llvm-readelf --file-headers %t.max.o | FileCheck %s -DMACHINE="MAX Processor"
+
+# RUN: yaml2obj %s -o %t.cr.o -D MACHINE=EM_CR
+# RUN: llvm-readelf --file-headers %t.cr.o | FileCheck %s -DMACHINE="National Semiconductor CompactRISC"
+
+# RUN: yaml2obj %s -o %t.f2mc16.o -D MACHINE=EM_F2MC16
+# RUN: llvm-readelf --file-headers %t.f2mc16.o | FileCheck %s -DMACHINE="Fujitsu F2MC16"
+
+# RUN: yaml2obj %s -o %t.msp430.o -D MACHINE=EM_MSP430
+# RUN: llvm-readelf --file-headers %t.msp430.o | FileCheck %s -DMACHINE="Texas Instruments msp430 microcontroller"
+
+# RUN: yaml2obj %s -o %t.blackfin.o -D MACHINE=EM_BLACKFIN
+# RUN: llvm-readelf --file-headers %t.blackfin.o | FileCheck %s -DMACHINE="Analog Devices Blackfin"
+
+# RUN: yaml2obj %s -o %t.se_c33.o -D MACHINE=EM_SE_C33
+# RUN: llvm-readelf --file-headers %t.se_c33.o | FileCheck %s -DMACHINE="S1C33 Family of Seiko Epson processors"
+
+# RUN: yaml2obj %s -o %t.sep.o -D MACHINE=EM_SEP
+# RUN: llvm-readelf --file-headers %t.sep.o | FileCheck %s -DMACHINE="Sharp embedded microprocessor"
+
+# RUN: yaml2obj %s -o %t.arca.o -D MACHINE=EM_ARCA
+# RUN: llvm-readelf --file-headers %t.arca.o | FileCheck %s -DMACHINE="Arca RISC microprocessor"
+
+# RUN: yaml2obj %s -o %t.unicore.o -D MACHINE=EM_UNICORE
+# RUN: llvm-readelf --file-headers %t.unicore.o | FileCheck %s -DMACHINE="Unicore"
+
+# RUN: yaml2obj %s -o %t.excess.o -D MACHINE=EM_EXCESS
+# RUN: llvm-readelf --file-headers %t.excess.o | FileCheck %s -DMACHINE="eXcess 16/32/64-bit configurable embedded CPU"
+
+# RUN: yaml2obj %s -o %t.dxp.o -D MACHINE=EM_DXP
+# RUN: llvm-readelf --file-headers %t.dxp.o | FileCheck %s -DMACHINE="Icera Semiconductor Inc. Deep Execution Processor"
+
+# RUN: yaml2obj %s -o %t.altera_nios2.o -D MACHINE=EM_ALTERA_NIOS2
+# RUN: llvm-readelf --file-headers %t.altera_nios2.o | FileCheck %s -DMACHINE="Altera Nios"
+
+# RUN: yaml2obj %s -o %t.crx.o -D MACHINE=EM_CRX
+# RUN: llvm-readelf --file-headers %t.crx.o | FileCheck %s -DMACHINE="National Semiconductor CRX microprocessor"
+
+# RUN: yaml2obj %s -o %t.xgate.o -D MACHINE=EM_XGATE
+# RUN: llvm-readelf --file-headers %t.xgate.o | FileCheck %s -DMACHINE="Motorola XGATE embedded processor"
+
+# RUN: yaml2obj %s -o %t.c166.o -D MACHINE=EM_C166
+# RUN: llvm-readelf --file-headers %t.c166.o | FileCheck %s -DMACHINE="Infineon Technologies xc16x"
+
+# RUN: yaml2obj %s -o %t.m16c.o -D MACHINE=EM_M16C
+# RUN: llvm-readelf --file-headers %t.m16c.o | FileCheck %s -DMACHINE="Renesas M16C"
+
+# RUN: yaml2obj %s -o %t.dspic30f.o -D MACHINE=EM_DSPIC30F
+# RUN: llvm-readelf --file-headers %t.dspic30f.o | FileCheck %s -DMACHINE="Microchip Technology dsPIC30F Digital Signal Controller"
+
+# RUN: yaml2obj %s -o %t.ce.o -D MACHINE=EM_CE
+# RUN: llvm-readelf --file-headers %t.ce.o | FileCheck %s -DMACHINE="Freescale Communication Engine RISC core"
+
+# RUN: yaml2obj %s -o %t.m32c.o -D MACHINE=EM_M32C
+# RUN: llvm-readelf --file-headers %t.m32c.o | FileCheck %s -DMACHINE="Renesas M32C"
+
+# RUN: yaml2obj %s -o %t.tsk3000.o -D MACHINE=EM_TSK3000
+# RUN: llvm-readelf --file-headers %t.tsk3000.o | FileCheck %s -DMACHINE="Altium TSK3000 core"
+
+# RUN: yaml2obj %s -o %t.rs08.o -D MACHINE=EM_RS08
+# RUN: llvm-readelf --file-headers %t.rs08.o | FileCheck %s -DMACHINE="Freescale RS08 embedded processor"
+
+# RUN: yaml2obj %s -o %t.sharc.o -D MACHINE=EM_SHARC
+# RUN: llvm-readelf --file-headers %t.sharc.o | FileCheck %s -DMACHINE="EM_SHARC"
+
+# RUN: yaml2obj %s -o %t.ecog2.o -D MACHINE=EM_ECOG2
+# RUN: llvm-readelf --file-headers %t.ecog2.o | FileCheck %s -DMACHINE="Cyan Technology eCOG2 microprocessor"
+
+# RUN: yaml2obj %s -o %t.score7.o -D MACHINE=EM_SCORE7
+# RUN: llvm-readelf --file-headers %t.score7.o | FileCheck %s -DMACHINE="SUNPLUS S+Core"
+
+# RUN: yaml2obj %s -o %t.dsp24.o -D MACHINE=EM_DSP24
+# RUN: llvm-readelf --file-headers %t.dsp24.o | FileCheck %s -DMACHINE="New Japan Radio (NJR) 24-bit DSP Processor"
+
+# RUN: yaml2obj %s -o %t.videocore3.o -D MACHINE=EM_VIDEOCORE3
+# RUN: llvm-readelf --file-headers %t.videocore3.o | FileCheck %s -DMACHINE="Broadcom VideoCore III processor"
+
+# RUN: yaml2obj %s -o %t.latticemico32.o -D MACHINE=EM_LATTICEMICO32
+# RUN: llvm-readelf --file-headers %t.latticemico32.o | FileCheck %s -DMACHINE="Lattice Mico32"
+
+# RUN: yaml2obj %s -o %t.se_c17.o -D MACHINE=EM_SE_C17
+# RUN: llvm-readelf --file-headers %t.se_c17.o | FileCheck %s -DMACHINE="Seiko Epson C17 family"
+
+# RUN: yaml2obj %s -o %t.ti_c6000.o -D MACHINE=EM_TI_C6000
+# RUN: llvm-readelf --file-headers %t.ti_c6000.o | FileCheck %s -DMACHINE="Texas Instruments TMS320C6000 DSP family"
+
+# RUN: yaml2obj %s -o %t.ti_c2000.o -D MACHINE=EM_TI_C2000
+# RUN: llvm-readelf --file-headers %t.ti_c2000.o | FileCheck %s -DMACHINE="Texas Instruments TMS320C2000 DSP family"
+
+# RUN: yaml2obj %s -o %t.ti_c5500.o -D MACHINE=EM_TI_C5500
+# RUN: llvm-readelf --file-headers %t.ti_c5500.o | FileCheck %s -DMACHINE="Texas Instruments TMS320C55x DSP family"
+
+# RUN: yaml2obj %s -o %t.mmdsp_plus.o -D MACHINE=EM_MMDSP_PLUS
+# RUN: llvm-readelf --file-headers %t.mmdsp_plus.o | FileCheck %s -DMACHINE="STMicroelectronics 64bit VLIW Data Signal Processor"
+
+# RUN: yaml2obj %s -o %t.cypress_m8c.o -D MACHINE=EM_CYPRESS_M8C
+# RUN: llvm-readelf --file-headers %t.cypress_m8c.o | FileCheck %s -DMACHINE="Cypress M8C microprocessor"
+
+# RUN: yaml2obj %s -o %t.r32c.o -D MACHINE=EM_R32C
+# RUN: llvm-readelf --file-headers %t.r32c.o | FileCheck %s -DMACHINE="Renesas R32C series microprocessors"
+
+# RUN: yaml2obj %s -o %t.trimedia.o -D MACHINE=EM_TRIMEDIA
+# RUN: llvm-readelf --file-headers %t.trimedia.o | FileCheck %s -DMACHINE="NXP Semiconductors TriMedia architecture family"
+
+# RUN: yaml2obj %s -o %t.hexagon.o -D MACHINE=EM_HEXAGON
+# RUN: llvm-readelf --file-headers %t.hexagon.o | FileCheck %s -DMACHINE="Qualcomm Hexagon"
+
+# RUN: yaml2obj %s -o %t.8051.o -D MACHINE=EM_8051
+# RUN: llvm-readelf --file-headers %t.8051.o | FileCheck %s -DMACHINE="Intel 8051 and variants"
+
+# RUN: yaml2obj %s -o %t.stxp7x.o -D MACHINE=EM_STXP7X
+# RUN: llvm-readelf --file-headers %t.stxp7x.o | FileCheck %s -DMACHINE="STMicroelectronics STxP7x family"
+
+# RUN: yaml2obj %s -o %t.nds32.o -D MACHINE=EM_NDS32
+# RUN: llvm-readelf --file-headers %t.nds32.o | FileCheck %s -DMACHINE="Andes Technology compact code size embedded RISC processor family"
+
+# RUN: yaml2obj %s -o %t.ecog1.o -D MACHINE=EM_ECOG1
+# RUN: llvm-readelf --file-headers %t.ecog1.o | FileCheck %s -DMACHINE="Cyan Technology eCOG1 microprocessor"
+
+# RUN: yaml2obj %s -o %t.maxq30.o -D MACHINE=EM_MAXQ30
+# RUN: llvm-readelf --file-headers %t.maxq30.o | FileCheck %s -DMACHINE="Dallas Semiconductor MAXQ30 Core microcontrollers"
+
+# RUN: yaml2obj %s -o %t.ximo16.o -D MACHINE=EM_XIMO16
+# RUN: llvm-readelf --file-headers %t.ximo16.o | FileCheck %s -DMACHINE="New Japan Radio (NJR) 16-bit DSP Processor"
+
+# RUN: yaml2obj %s -o %t.manik.o -D MACHINE=EM_MANIK
+# RUN: llvm-readelf --file-headers %t.manik.o | FileCheck %s -DMACHINE="M2000 Reconfigurable RISC Microprocessor"
+
+# RUN: yaml2obj %s -o %t.craynv2.o -D MACHINE=EM_CRAYNV2
+# RUN: llvm-readelf --file-headers %t.craynv2.o | FileCheck %s -DMACHINE="Cray Inc. NV2 vector architecture"
+
+# RUN: yaml2obj %s -o %t.rx.o -D MACHINE=EM_RX
+# RUN: llvm-readelf --file-headers %t.rx.o | FileCheck %s -DMACHINE="Renesas RX"
+
+# RUN: yaml2obj %s -o %t.metag.o -D MACHINE=EM_METAG
+# RUN: llvm-readelf --file-headers %t.metag.o | FileCheck %s -DMACHINE="Imagination Technologies Meta processor architecture"
+
+# RUN: yaml2obj %s -o %t.mcst_elbrus.o -D MACHINE=EM_MCST_ELBRUS
+# RUN: llvm-readelf --file-headers %t.mcst_elbrus.o | FileCheck %s -DMACHINE="MCST Elbrus general purpose hardware architecture"
+
+# RUN: yaml2obj %s -o %t.ecog16.o -D MACHINE=EM_ECOG16
+# RUN: llvm-readelf --file-headers %t.ecog16.o | FileCheck %s -DMACHINE="Cyan Technology eCOG16 family"
+
+# RUN: yaml2obj %s -o %t.cr16.o -D MACHINE=EM_CR16
+# RUN: llvm-readelf --file-headers %t.cr16.o | FileCheck %s -DMACHINE="Xilinx MicroBlaze"
+
+# RUN: yaml2obj %s -o %t.etpu.o -D MACHINE=EM_ETPU
+# RUN: llvm-readelf --file-headers %t.etpu.o | FileCheck %s -DMACHINE="Freescale Extended Time Processing Unit"
+
+# RUN: yaml2obj %s -o %t.sle9x.o -D MACHINE=EM_SLE9X
+# RUN: llvm-readelf --file-headers %t.sle9x.o | FileCheck %s -DMACHINE="Infineon Technologies SLE9X core"
+
+# RUN: yaml2obj %s -o %t.l10m.o -D MACHINE=EM_L10M
+# RUN: llvm-readelf --file-headers %t.l10m.o | FileCheck %s -DMACHINE="EM_L10M"
+
+# RUN: yaml2obj %s -o %t.k10m.o -D MACHINE=EM_K10M
+# RUN: llvm-readelf --file-headers %t.k10m.o | FileCheck %s -DMACHINE="EM_K10M"
+
+# RUN: yaml2obj %s -o %t.aarch64.o -D MACHINE=EM_AARCH64
+# RUN: llvm-readelf --file-headers %t.aarch64.o | FileCheck %s -DMACHINE="AArch64"
+
+# RUN: yaml2obj %s -o %t.avr32.o -D MACHINE=EM_AVR32
+# RUN: llvm-readelf --file-headers %t.avr32.o | FileCheck %s -DMACHINE="Atmel Corporation 32-bit microprocessor family"
+
+# RUN: yaml2obj %s -o %t.stm8.o -D MACHINE=EM_STM8
+# RUN: llvm-readelf --file-headers %t.stm8.o | FileCheck %s -DMACHINE="STMicroeletronics STM8 8-bit microcontroller"
+
+# RUN: yaml2obj %s -o %t.tile64.o -D MACHINE=EM_TILE64
+# RUN: llvm-readelf --file-headers %t.tile64.o | FileCheck %s -DMACHINE="Tilera TILE64 multicore architecture family"
+
+# RUN: yaml2obj %s -o %t.tilepro.o -D MACHINE=EM_TILEPRO
+# RUN: llvm-readelf --file-headers %t.tilepro.o | FileCheck %s -DMACHINE="Tilera TILEPro multicore architecture family"
+
+# RUN: yaml2obj %s -o %t.cuda.o -D MACHINE=EM_CUDA
+# RUN: llvm-readelf --file-headers %t.cuda.o | FileCheck %s -DMACHINE="NVIDIA CUDA architecture"
+
+# RUN: yaml2obj %s -o %t.tilegx.o -D MACHINE=EM_TILEGX
+# RUN: llvm-readelf --file-headers %t.tilegx.o | FileCheck %s -DMACHINE="Tilera TILE-Gx multicore architecture family"
+
+# RUN: yaml2obj %s -o %t.cloudshield.o -D MACHINE=EM_CLOUDSHIELD
+# RUN: llvm-readelf --file-headers %t.cloudshield.o | FileCheck %s -DMACHINE="EM_CLOUDSHIELD"
+
+# RUN: yaml2obj %s -o %t.corea_1st.o -D MACHINE=EM_COREA_1ST
+# RUN: llvm-readelf --file-headers %t.corea_1st.o | FileCheck %s -DMACHINE="EM_COREA_1ST"
+
+# RUN: yaml2obj %s -o %t.corea_2nd.o -D MACHINE=EM_COREA_2ND
+# RUN: llvm-readelf --file-headers %t.corea_2nd.o | FileCheck %s -DMACHINE="EM_COREA_2ND"
+
+# RUN: yaml2obj %s -o %t.arc_compact2.o -D MACHINE=EM_ARC_COMPACT2
+# RUN: llvm-readelf --file-headers %t.arc_compact2.o | FileCheck %s -DMACHINE="EM_ARC_COMPACT2"
+
+# RUN: yaml2obj %s -o %t.open8.o -D MACHINE=EM_OPEN8
+# RUN: llvm-readelf --file-headers %t.open8.o | FileCheck %s -DMACHINE="EM_OPEN8"
+
+# RUN: yaml2obj %s -o %t.rl78.o -D MACHINE=EM_RL78
+# RUN: llvm-readelf --file-headers %t.rl78.o | FileCheck %s -DMACHINE="Renesas RL78"
+
+# RUN: yaml2obj %s -o %t.videocore5.o -D MACHINE=EM_VIDEOCORE5
+# RUN: llvm-readelf --file-headers %t.videocore5.o | FileCheck %s -DMACHINE="Broadcom VideoCore V processor"
+
+# RUN: yaml2obj %s -o %t.78kor.o -D MACHINE=EM_78KOR
+# RUN: llvm-readelf --file-headers %t.78kor.o | FileCheck %s -DMACHINE="EM_78KOR"
+
+# RUN: yaml2obj %s -o %t.56800ex.o -D MACHINE=EM_56800EX
+# RUN: llvm-readelf --file-headers %t.56800ex.o | FileCheck %s -DMACHINE="EM_56800EX"
+
+# RUN: yaml2obj %s -o %t.amdgpu.o -D MACHINE=EM_AMDGPU
+# RUN: llvm-readelf --file-headers %t.amdgpu.o | FileCheck %s -DMACHINE="EM_AMDGPU"
+
+# RUN: yaml2obj %s -o %t.riscv.o -D MACHINE=EM_RISCV
+# RUN: llvm-readelf --file-headers %t.riscv.o | FileCheck %s -DMACHINE="RISC-V"
+
+# RUN: yaml2obj %s -o %t.lanai.o -D MACHINE=EM_LANAI
+# RUN: llvm-readelf --file-headers %t.lanai.o | FileCheck %s -DMACHINE="EM_LANAI"
+
+# RUN: yaml2obj %s -o %t.bpf.o -D MACHINE=EM_BPF
+# RUN: llvm-readelf --file-headers %t.bpf.o | FileCheck %s -DMACHINE="EM_BPF"
+
+# RUN: yaml2obj %s -o %t.ve.o -D MACHINE=EM_VE
+# RUN: llvm-readelf --file-headers %t.ve.o | FileCheck %s -DMACHINE="NEC SX-Aurora Vector Engine"
+
+# CHECK: Machine: [[MACHINE]]
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS32
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: [[MACHINE]]
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 83132869cc2c2..84a68b17b298f 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1435,6 +1435,8 @@ static const EnumEntry<unsigned> ElfMachineType[] = {
   ENUM_ENT(EM_STXP7X,        "STMicroelectronics STxP7x family"),
   ENUM_ENT(EM_NDS32,         "Andes Technology compact code size embedded RISC processor family"),
   ENUM_ENT(EM_ECOG1,         "Cyan Technology eCOG1 microprocessor"),
+  // FIXME: The following EM_ECOG1X entry is dead code since EM_ECOG1X has the
+  //        same value as EM_ECOG1.
   ENUM_ENT(EM_ECOG1X,        "Cyan Technology eCOG1X family"),
   ENUM_ENT(EM_MAXQ30,        "Dallas Semiconductor MAXQ30 Core microcontrollers"),
   ENUM_ENT(EM_XIMO16,        "New Japan Radio (NJR) 16-bit DSP Processor"),
@@ -1469,6 +1471,7 @@ static const EnumEntry<unsigned> ElfMachineType[] = {
   ENUM_ENT(EM_RISCV,         "RISC-V"),
   ENUM_ENT(EM_LANAI,         "EM_LANAI"),
   ENUM_ENT(EM_BPF,           "EM_BPF"),
+  ENUM_ENT(EM_VE,            "NEC SX-Aurora Vector Engine"),
 };
 
 static const EnumEntry<unsigned> ElfSymbolBindings[] = {
diff --git a/llvm/unittests/Object/CMakeLists.txt b/llvm/unittests/Object/CMakeLists.txt
index c5d1f5476ccda..1d419eb187d8b 100644
--- a/llvm/unittests/Object/CMakeLists.txt
+++ b/llvm/unittests/Object/CMakeLists.txt
@@ -5,6 +5,8 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(ObjectTests
   ArchiveTest.cpp
+  ELFObjectFileTest.cpp
+  ELFTest.cpp
   MinidumpTest.cpp
   ObjectFileTest.cpp
   SymbolSizeTest.cpp
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
new file mode 100644
index 0000000000000..3bbc56b61c6ce
--- /dev/null
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -0,0 +1,127 @@
+//===- ELFObjectFileTest.cpp - Tests for ELFObjectFile --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+template <class ELFT>
+static Expected<ELFObjectFile<ELFT>> create(ArrayRef<uint8_t> Data) {
+  MemoryBufferRef Buffer(toStringRef(Data), "Test buffer");
+  return ELFObjectFile<ELFT>::create(Buffer); // Parse the bytes as ELFT.
+}
+
+// Holds the raw bytes of a minimal ELF file: just a file header.
+struct DataForTest {
+  std::vector<uint8_t> Data;
+
+  // Returns the bytes of a header of type T, in the byte order of Encoding.
+  template <typename T>
+  std::vector<uint8_t> makeElfData(uint8_t Class, uint8_t Encoding,
+                                   uint16_t Machine) {
+    T Ehdr{}; // Zero-initialise the header.
+    Ehdr.e_ident[ELF::EI_MAG0] = 0x7f;
+    Ehdr.e_ident[ELF::EI_MAG1] = 'E';
+    Ehdr.e_ident[ELF::EI_MAG2] = 'L';
+    Ehdr.e_ident[ELF::EI_MAG3] = 'F';
+    Ehdr.e_ident[ELF::EI_CLASS] = Class;
+    Ehdr.e_ident[ELF::EI_DATA] = Encoding;
+    Ehdr.e_ident[ELF::EI_VERSION] = 1;
+    Ehdr.e_type = ELF::ET_REL;
+    Ehdr.e_machine = Machine;
+    Ehdr.e_version = 1;
+    Ehdr.e_ehsize = sizeof(T);
+
+    bool IsLittleEndian = Encoding == ELF::ELFDATA2LSB;
+    if (sys::IsLittleEndianHost != IsLittleEndian) { // Host order != file order.
+      sys::swapByteOrder(Ehdr.e_type);
+      sys::swapByteOrder(Ehdr.e_machine);
+      sys::swapByteOrder(Ehdr.e_version);
+      sys::swapByteOrder(Ehdr.e_ehsize);
+    }
+
+    const uint8_t *EhdrBytes = reinterpret_cast<const uint8_t *>(&Ehdr);
+    return std::vector<uint8_t>(EhdrBytes, EhdrBytes + sizeof(Ehdr));
+  }
+
+  DataForTest(uint8_t Class, uint8_t Encoding, uint16_t Machine) {
+    if (Class == ELF::ELFCLASS64)
+      Data = makeElfData<ELF::Elf64_Ehdr>(Class, Encoding, Machine);
+    else {
+      assert(Class == ELF::ELFCLASS32);
+      // Use the 32-bit header so its size and layout match ELFCLASS32.
+      Data = makeElfData<ELF::Elf32_Ehdr>(Class, Encoding, Machine);
+    }
+  }
+};
+
+TEST(ELFObjectFileTest, MachineTestForVE) { // 64-bit LE header, EM_VE.
+  DataForTest Input(ELF::ELFCLASS64, ELF::ELFDATA2LSB, ELF::EM_VE);
+  auto ObjOrErr = create<ELF64LE>(Input.Data);
+  ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded());
+  const ELFObjectFile<ELF64LE> &Obj = *ObjOrErr;
+  EXPECT_EQ("elf64-ve", Obj.getFileFormatName());
+  EXPECT_EQ(Triple::ve, Obj.getArch());
+}
+
+TEST(ELFObjectFileTest, MachineTestForX86_64) { // 64-bit LE, EM_X86_64.
+  DataForTest Input(ELF::ELFCLASS64, ELF::ELFDATA2LSB, ELF::EM_X86_64);
+  auto ObjOrErr = create<ELF64LE>(Input.Data);
+  ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded());
+  const ELFObjectFile<ELF64LE> &Obj = *ObjOrErr;
+  EXPECT_EQ("elf64-x86-64", Obj.getFileFormatName());
+  EXPECT_EQ(Triple::x86_64, Obj.getArch());
+}
+
+TEST(ELFObjectFileTest, MachineTestFor386) { // 32-bit LE header, EM_386.
+  DataForTest Input(ELF::ELFCLASS32, ELF::ELFDATA2LSB, ELF::EM_386);
+  auto ObjOrErr = create<ELF32LE>(Input.Data);
+  ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded());
+  const ELFObjectFile<ELF32LE> &Obj = *ObjOrErr;
+  EXPECT_EQ("elf32-i386", Obj.getFileFormatName());
+  EXPECT_EQ(Triple::x86, Obj.getArch());
+}
+
+TEST(ELFObjectFileTest, MachineTestForMIPS) { // All class/endian combos.
+  {
+    DataForTest Input(ELF::ELFCLASS64, ELF::ELFDATA2LSB, ELF::EM_MIPS);
+    auto ObjOrErr = create<ELF64LE>(Input.Data);
+    ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded());
+    const ELFObjectFile<ELF64LE> &Obj = *ObjOrErr;
+    EXPECT_EQ("elf64-mips", Obj.getFileFormatName());
+    EXPECT_EQ(Triple::mips64el, Obj.getArch());
+  }
+  {
+    DataForTest Input(ELF::ELFCLASS64, ELF::ELFDATA2MSB, ELF::EM_MIPS);
+    auto ObjOrErr = create<ELF64BE>(Input.Data);
+    ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded());
+    const ELFObjectFile<ELF64BE> &Obj = *ObjOrErr;
+    EXPECT_EQ("elf64-mips", Obj.getFileFormatName());
+    EXPECT_EQ(Triple::mips64, Obj.getArch());
+  }
+  {
+    DataForTest Input(ELF::ELFCLASS32, ELF::ELFDATA2LSB, ELF::EM_MIPS);
+    auto ObjOrErr = create<ELF32LE>(Input.Data);
+    ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded());
+    const ELFObjectFile<ELF32LE> &Obj = *ObjOrErr;
+    EXPECT_EQ("elf32-mips", Obj.getFileFormatName());
+    EXPECT_EQ(Triple::mipsel, Obj.getArch());
+  }
+  {
+    DataForTest Input(ELF::ELFCLASS32, ELF::ELFDATA2MSB, ELF::EM_MIPS);
+    auto ObjOrErr = create<ELF32BE>(Input.Data);
+    ASSERT_THAT_EXPECTED(ObjOrErr, Succeeded());
+    const ELFObjectFile<ELF32BE> &Obj = *ObjOrErr;
+    EXPECT_EQ("elf32-mips", Obj.getFileFormatName());
+    EXPECT_EQ(Triple::mips, Obj.getArch());
+  }
+}
diff --git a/llvm/unittests/Object/ELFTest.cpp b/llvm/unittests/Object/ELFTest.cpp
new file mode 100644
index 0000000000000..b815d5cdd8390
--- /dev/null
+++ b/llvm/unittests/Object/ELFTest.cpp
@@ -0,0 +1,56 @@
+//===- ELFTest.cpp - Tests for ELF.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELF.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::ELF;
+
+TEST(ELFTest, getELFRelocationTypeNameForVE) { // Name == enumerator text.
+  EXPECT_EQ("R_VE_NONE", getELFRelocationTypeName(EM_VE, R_VE_NONE));
+  EXPECT_EQ("R_VE_REFLONG", getELFRelocationTypeName(EM_VE, R_VE_REFLONG));
+  EXPECT_EQ("R_VE_REFQUAD", getELFRelocationTypeName(EM_VE, R_VE_REFQUAD));
+  EXPECT_EQ("R_VE_SREL32", getELFRelocationTypeName(EM_VE, R_VE_SREL32));
+  EXPECT_EQ("R_VE_HI32", getELFRelocationTypeName(EM_VE, R_VE_HI32));
+  EXPECT_EQ("R_VE_LO32", getELFRelocationTypeName(EM_VE, R_VE_LO32));
+  EXPECT_EQ("R_VE_PC_HI32", getELFRelocationTypeName(EM_VE, R_VE_PC_HI32));
+  EXPECT_EQ("R_VE_PC_LO32", getELFRelocationTypeName(EM_VE, R_VE_PC_LO32));
+  EXPECT_EQ("R_VE_GOT32", getELFRelocationTypeName(EM_VE, R_VE_GOT32));
+  EXPECT_EQ("R_VE_GOT_HI32", getELFRelocationTypeName(EM_VE, R_VE_GOT_HI32));
+  EXPECT_EQ("R_VE_GOT_LO32", getELFRelocationTypeName(EM_VE, R_VE_GOT_LO32));
+  EXPECT_EQ("R_VE_GOTOFF32", getELFRelocationTypeName(EM_VE, R_VE_GOTOFF32));
+  EXPECT_EQ("R_VE_GOTOFF_HI32",
+            getELFRelocationTypeName(EM_VE, R_VE_GOTOFF_HI32));
+  EXPECT_EQ("R_VE_GOTOFF_LO32",
+            getELFRelocationTypeName(EM_VE, R_VE_GOTOFF_LO32));
+  EXPECT_EQ("R_VE_PLT32", getELFRelocationTypeName(EM_VE, R_VE_PLT32));
+  EXPECT_EQ("R_VE_PLT_HI32", getELFRelocationTypeName(EM_VE, R_VE_PLT_HI32));
+  EXPECT_EQ("R_VE_PLT_LO32", getELFRelocationTypeName(EM_VE, R_VE_PLT_LO32));
+  EXPECT_EQ("R_VE_RELATIVE", getELFRelocationTypeName(EM_VE, R_VE_RELATIVE));
+  EXPECT_EQ("R_VE_GLOB_DAT", getELFRelocationTypeName(EM_VE, R_VE_GLOB_DAT));
+  EXPECT_EQ("R_VE_JUMP_SLOT", getELFRelocationTypeName(EM_VE, R_VE_JUMP_SLOT));
+  EXPECT_EQ("R_VE_COPY", getELFRelocationTypeName(EM_VE, R_VE_COPY));
+  EXPECT_EQ("R_VE_DTPMOD64", getELFRelocationTypeName(EM_VE, R_VE_DTPMOD64));
+  EXPECT_EQ("R_VE_DTPOFF64", getELFRelocationTypeName(EM_VE, R_VE_DTPOFF64));
+  EXPECT_EQ("R_VE_TLS_GD_HI32",
+            getELFRelocationTypeName(EM_VE, R_VE_TLS_GD_HI32));
+  EXPECT_EQ("R_VE_TLS_GD_LO32",
+            getELFRelocationTypeName(EM_VE, R_VE_TLS_GD_LO32));
+  EXPECT_EQ("R_VE_TPOFF_HI32",
+            getELFRelocationTypeName(EM_VE, R_VE_TPOFF_HI32));
+  EXPECT_EQ("R_VE_TPOFF_LO32",
+            getELFRelocationTypeName(EM_VE, R_VE_TPOFF_LO32));
+  EXPECT_EQ("R_VE_CALL_HI32", getELFRelocationTypeName(EM_VE, R_VE_CALL_HI32));
+  EXPECT_EQ("R_VE_CALL_LO32", getELFRelocationTypeName(EM_VE, R_VE_CALL_LO32));
+}
+
+TEST(ELFTest, getELFRelativeRelocationType) { // EM_VE must yield 0.
+  EXPECT_EQ(0U, getELFRelativeRelocationType(EM_VE));
+}
diff --git a/llvm/unittests/ObjectYAML/CMakeLists.txt b/llvm/unittests/ObjectYAML/CMakeLists.txt
index 45e9c672966d9..04a770a46eb38 100644
--- a/llvm/unittests/ObjectYAML/CMakeLists.txt
+++ b/llvm/unittests/ObjectYAML/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_llvm_unittest(ObjectYAMLTests
+  ELFYAMLTest.cpp
   MinidumpYAMLTest.cpp
   YAML2ObjTest.cpp
   YAMLTest.cpp
diff --git a/llvm/unittests/ObjectYAML/ELFYAMLTest.cpp b/llvm/unittests/ObjectYAML/ELFYAMLTest.cpp
new file mode 100644
index 0000000000000..fdbafc28f0d73
--- /dev/null
+++ b/llvm/unittests/ObjectYAML/ELFYAMLTest.cpp
@@ -0,0 +1,134 @@
+//===- ELFYAMLTest.cpp - Tests for ELFYAML.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+template <class ELFT>
+static Expected<ELFObjectFile<ELFT>> toBinary(SmallVectorImpl<char> &Storage,
+                                              StringRef Yaml) {
+  // Converts Yaml into Storage, then parses Storage's bytes as an ELFT file.
+  Storage.clear();
+  raw_svector_ostream Out(Storage);
+  yaml::Input Doc(Yaml);
+  if (yaml::convertYAML(Doc, Out, [](const Twine &) {}))
+    return ELFObjectFile<ELFT>::create(MemoryBufferRef(Out.str(), "Binary"));
+  return createStringError(std::errc::invalid_argument,
+                           "unable to convert YAML");
+}
+
+TEST(ELFRelocationTypeTest, RelocationTestForVE) {
+  SmallString<0> Storage;
+  Expected<ELFObjectFile<ELF64LE>> ExpectedFile = toBinary<ELF64LE>(Storage, R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_VE
+Sections:
+  - Name: .rela.text
+    Type: SHT_RELA
+    Relocations:
+      - Type: R_VE_NONE
+      - Type: R_VE_REFLONG
+      - Type: R_VE_REFQUAD
+      - Type: R_VE_SREL32
+      - Type: R_VE_HI32
+      - Type: R_VE_LO32
+      - Type: R_VE_PC_HI32
+      - Type: R_VE_PC_LO32
+      - Type: R_VE_GOT32
+      - Type: R_VE_GOT_HI32
+      - Type: R_VE_GOT_LO32
+      - Type: R_VE_GOTOFF32
+      - Type: R_VE_GOTOFF_HI32
+      - Type: R_VE_GOTOFF_LO32
+      - Type: R_VE_PLT32
+      - Type: R_VE_PLT_HI32
+      - Type: R_VE_PLT_LO32
+      - Type: R_VE_RELATIVE
+      - Type: R_VE_GLOB_DAT
+      - Type: R_VE_JUMP_SLOT
+      - Type: R_VE_COPY
+      - Type: R_VE_DTPMOD64
+      - Type: R_VE_DTPOFF64
+      - Type: R_VE_TLS_GD_HI32
+      - Type: R_VE_TLS_GD_LO32
+      - Type: R_VE_TPOFF_HI32
+      - Type: R_VE_TPOFF_LO32
+      - Type: R_VE_CALL_HI32
+      - Type: R_VE_CALL_LO32)");
+  ASSERT_THAT_EXPECTED(ExpectedFile, Succeeded());
+  const ELFObjectFile<ELF64LE> &File = *ExpectedFile;
+  EXPECT_EQ("elf64-ve", File.getFileFormatName());
+  EXPECT_EQ(Triple::ve, File.getArch());
+
+  // Each relocation's type name must match the spelling of its enumerator.
+  for (SectionRef Sec : File.sections()) {
+    Expected<StringRef> NameOrErr = Sec.getName();
+    ASSERT_THAT_EXPECTED(NameOrErr, Succeeded());
+    StringRef SectionName = *NameOrErr;
+    if (SectionName != ".rela.text")
+      continue;
+
+    for (RelocationRef R : Sec.relocations()) {
+      SmallString<32> RelTypeName;
+      using namespace llvm::ELF;
+
+#define NAME_CHECK(ID)                                                         \
+  case ID:                                                                     \
+    R.getTypeName(RelTypeName);                                                \
+    EXPECT_EQ(#ID, RelTypeName);                                               \
+    break
+
+      switch (R.getType()) {
+        NAME_CHECK(R_VE_NONE);
+        NAME_CHECK(R_VE_REFLONG);
+        NAME_CHECK(R_VE_REFQUAD);
+        NAME_CHECK(R_VE_SREL32);
+        NAME_CHECK(R_VE_HI32);
+        NAME_CHECK(R_VE_LO32);
+        NAME_CHECK(R_VE_PC_HI32);
+        NAME_CHECK(R_VE_PC_LO32);
+        NAME_CHECK(R_VE_GOT32);
+        NAME_CHECK(R_VE_GOT_HI32);
+        NAME_CHECK(R_VE_GOT_LO32);
+        NAME_CHECK(R_VE_GOTOFF32);
+        NAME_CHECK(R_VE_GOTOFF_HI32);
+        NAME_CHECK(R_VE_GOTOFF_LO32);
+        NAME_CHECK(R_VE_PLT32);
+        NAME_CHECK(R_VE_PLT_HI32);
+        NAME_CHECK(R_VE_PLT_LO32);
+        NAME_CHECK(R_VE_RELATIVE);
+        NAME_CHECK(R_VE_GLOB_DAT);
+        NAME_CHECK(R_VE_JUMP_SLOT);
+        NAME_CHECK(R_VE_COPY);
+        NAME_CHECK(R_VE_DTPMOD64);
+        NAME_CHECK(R_VE_DTPOFF64);
+        NAME_CHECK(R_VE_TLS_GD_HI32);
+        NAME_CHECK(R_VE_TLS_GD_LO32);
+        NAME_CHECK(R_VE_TPOFF_HI32);
+        NAME_CHECK(R_VE_TPOFF_LO32);
+        NAME_CHECK(R_VE_CALL_HI32);
+        NAME_CHECK(R_VE_CALL_LO32);
+      default:
+        FAIL() << "Found unexpected relocation type: " + Twine(R.getType());
+        break;
+      }
+    }
+  }
+}

From 4b94cee650ce9753214d562826b7f1b9663c2268 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 28 May 2020 08:08:39 +0000
Subject: [PATCH 321/770] [gn build] Port 5921782f744

---
 llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn     | 2 ++
 llvm/utils/gn/secondary/llvm/unittests/ObjectYAML/BUILD.gn | 1 +
 2 files changed, 3 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn
index 1339981833759..0272e9247f410 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Object/BUILD.gn
@@ -8,6 +8,8 @@ unittest("ObjectTests") {
   ]
   sources = [
     "ArchiveTest.cpp",
+    "ELFObjectFileTest.cpp",
+    "ELFTest.cpp",
     "MinidumpTest.cpp",
     "ObjectFileTest.cpp",
     "SymbolSizeTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ObjectYAML/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ObjectYAML/BUILD.gn
index 2fbcaa94334d9..7a5855292c403 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ObjectYAML/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ObjectYAML/BUILD.gn
@@ -7,6 +7,7 @@ unittest("ObjectYAMLTests") {
     "//llvm/lib/Testing/Support",
   ]
   sources = [
+    "ELFYAMLTest.cpp",
     "MinidumpYAMLTest.cpp",
     "YAML2ObjTest.cpp",
     "YAMLTest.cpp",

From 213c6cdf2e7a30d722cee4cd66b7d48fc396d44b Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Thu, 28 May 2020 08:08:20 +0000
Subject: [PATCH 322/770] Harden MLIR detection of misconfiguration when
 missing dialect registration

This change catches errors where C++ ops are used without being
registered, either when they are created through the OpBuilder or when
trying to cast to the C++ op.

Differential Revision: https://reviews.llvm.org/D80651
---
 mlir/include/mlir/IR/Builders.h     |  8 ++++++++
 mlir/include/mlir/IR/MLIRContext.h  |  3 +++
 mlir/include/mlir/IR/OpDefinition.h |  5 ++++-
 mlir/lib/IR/MLIRContext.cpp         | 12 ++++++++++--
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 424eb980cd33a..0dcf4daf656fd 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -374,6 +374,10 @@ class OpBuilder : public Builder {
   template <typename OpTy, typename... Args>
   OpTy create(Location location, Args &&... args) {
     OperationState state(location, OpTy::getOperationName());
+    if (!state.name.getAbstractOperation())
+      llvm::report_fatal_error("Building op `" +
+                               state.name.getStringRef().str() +
+                               "` but it isn't registered in this MLIRContext");
     OpTy::build(*this, state, std::forward<Args>(args)...);
     auto *op = createOperation(state);
     auto result = dyn_cast<OpTy>(op);
@@ -390,6 +394,10 @@ class OpBuilder : public Builder {
     // Create the operation without using 'createOperation' as we don't want to
     // insert it yet.
     OperationState state(location, OpTy::getOperationName());
+    if (!state.name.getAbstractOperation())
+      llvm::report_fatal_error("Building op `" +
+                               state.name.getStringRef().str() +
+                               "` but it isn't registered in this MLIRContext");
     OpTy::build(*this, state, std::forward<Args>(args)...);
     Operation *op = Operation::create(state);
 
diff --git a/mlir/include/mlir/IR/MLIRContext.h b/mlir/include/mlir/IR/MLIRContext.h
index da0b0bd826ced..8e75bb6244493 100644
--- a/mlir/include/mlir/IR/MLIRContext.h
+++ b/mlir/include/mlir/IR/MLIRContext.h
@@ -85,6 +85,9 @@ class MLIRContext {
   /// directly.
   std::vector<AbstractOperation *> getRegisteredOperations();
 
+  /// Return true if this operation name is registered in this context.
+  bool isOperationRegistered(StringRef name);
+
   // This is effectively private given that only MLIRContext.cpp can see the
   // MLIRContextImpl type.
   MLIRContextImpl &getImpl() { return *impl; }
diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h
index bf5bd70c2b7fe..e92d54ec84f9b 100644
--- a/mlir/include/mlir/IR/OpDefinition.h
+++ b/mlir/include/mlir/IR/OpDefinition.h
@@ -1235,7 +1235,10 @@ class Op : public OpState,
   static bool classof(Operation *op) {
     if (auto *abstractOp = op->getAbstractOperation())
       return TypeID::get<ConcreteType>() == abstractOp->typeID;
-    return op->getName().getStringRef() == ConcreteType::getOperationName();
+    assert(op->getContext()->isOperationRegistered(
+               ConcreteType::getOperationName()) &&
+           "Casting attempt to an unregistered operation");
+    return false;
   }
 
   /// This is the hook used by the AsmParser to parse the custom form of this
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 0728f294be861..da607a2319bfc 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -543,6 +543,13 @@ std::vector<AbstractOperation *> MLIRContext::getRegisteredOperations() {
   return result;
 }
 
+bool MLIRContext::isOperationRegistered(StringRef name) {
+  // Lock access to the context registry.
+  ScopedReaderLock registryLock(impl->contextMutex, impl->threadingIsEnabled);
+
+  return impl->registeredOperations.count(name);
+}
+
 void Dialect::addOperation(AbstractOperation opInfo) {
   assert((getNamespace().empty() ||
           opInfo.name.split('.').first == getNamespace()) &&
@@ -621,8 +628,9 @@ Identifier Identifier::get(StringRef str, MLIRContext *context) {
 static Dialect &lookupDialectForSymbol(MLIRContext *ctx, TypeID typeID) {
   auto &impl = ctx->getImpl();
   auto it = impl.registeredDialectSymbols.find(typeID);
-  assert(it != impl.registeredDialectSymbols.end() &&
-         "symbol is not registered.");
+  if (it == impl.registeredDialectSymbols.end())
+    llvm::report_fatal_error(
+        "Trying to create a type that was not registered in this MLIRContext.");
   return *it->second;
 }
 

From d20bf5a7258d4b6a7f017a81b125275dac1aa166 Mon Sep 17 00:00:00 2001
From: Alok Kumar Sharma <AlokKumar.Sharma@amd.com>
Date: Thu, 28 May 2020 13:31:22 +0530
Subject: [PATCH 323/770] [DebugInfo] Upgrade DISubrange to support Fortran
 dynamic arrays

This patch upgrades DISubrange to support Fortran requirements.

Summary:
Below are the updated and newly added fields.
lowerBound - Now accepts a signed integer, DIVariable, or DIExpression;
earlier it accepted only a signed integer.
upperBound - This field is newly added and accepts a signed integer,
DIVariable, or DIExpression.
stride - This field is newly added and accepts a signed integer,
DIVariable, or DIExpression.
This is required to describe array bounds that are known at runtime.

Testing:
unit test cases added (hand-written)
check clang
check llvm
check debug-info

Reviewed By: aprantl

Differential Revision: https://reviews.llvm.org/D80197
---
 clang/lib/CodeGen/CGDebugInfo.cpp             |  43 ++++--
 llvm/include/llvm/IR/DIBuilder.h              |   2 +
 llvm/include/llvm/IR/DebugInfoMetadata.h      |  42 ++++--
 llvm/lib/AsmParser/LLParser.cpp               |  36 ++++-
 llvm/lib/Bitcode/Reader/MetadataLoader.cpp    |  14 +-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     |   6 +-
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp |   2 +-
 .../CodeGen/AsmPrinter/DwarfCompileUnit.cpp   |  15 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp     |  26 +++-
 llvm/lib/IR/AsmWriter.cpp                     |  31 +++-
 llvm/lib/IR/DIBuilder.cpp                     |  15 +-
 llvm/lib/IR/DebugInfoMetadata.cpp             |  96 +++++++++++-
 llvm/lib/IR/LLVMContextImpl.h                 |  50 ++++---
 llvm/lib/IR/Verifier.cpp                      |  26 +++-
 llvm/test/Assembler/debug-info.ll             |   4 +-
 llvm/test/Assembler/disubrange-empty-array.ll |   4 +-
 .../invalid-disubrange-count-missing.ll       |   3 +-
 llvm/test/Bindings/llvm-c/debug_info.ll       |   2 +-
 llvm/test/Bitcode/fortranSubrange.ll          |  44 ++++++
 llvm/test/Bitcode/fortranSubrangeBackward.ll  |  50 +++++++
 .../Bitcode/fortranSubrangeBackward.ll.bc     | Bin 0 -> 2064 bytes
 .../DebugInfo/X86/default-subrange-array.ll   |   2 +-
 .../X86/nondefault-subrange-array.ll          |   2 +-
 llvm/test/DebugInfo/cDefaultLower.ll          |  35 +++++
 llvm/test/DebugInfo/fortranDefaultLower.ll    |  35 +++++
 llvm/test/DebugInfo/fortranSubrangeExpr.ll    |  44 ++++++
 llvm/test/DebugInfo/fortranSubrangeInt.ll     |  43 ++++++
 llvm/test/DebugInfo/fortranSubrangeVar.ll     |  62 ++++++++
 .../Verifier/disubrange-count-upperBound.ll   |   5 +
 .../Verifier/disubrange-missing-upperBound.ll |   5 +
 .../Verifier/invalid-disubrange-lowerBound.ll |   6 +
 .../Verifier/invalid-disubrange-stride.ll     |   6 +
 .../Verifier/invalid-disubrange-upperBound.ll |   6 +
 llvm/unittests/IR/MetadataTest.cpp            | 139 +++++++++++++++++-
 34 files changed, 815 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/Bitcode/fortranSubrange.ll
 create mode 100644 llvm/test/Bitcode/fortranSubrangeBackward.ll
 create mode 100644 llvm/test/Bitcode/fortranSubrangeBackward.ll.bc
 create mode 100644 llvm/test/DebugInfo/cDefaultLower.ll
 create mode 100644 llvm/test/DebugInfo/fortranDefaultLower.ll
 create mode 100644 llvm/test/DebugInfo/fortranSubrangeExpr.ll
 create mode 100644 llvm/test/DebugInfo/fortranSubrangeInt.ll
 create mode 100644 llvm/test/DebugInfo/fortranSubrangeVar.ll
 create mode 100644 llvm/test/Verifier/disubrange-count-upperBound.ll
 create mode 100644 llvm/test/Verifier/disubrange-missing-upperBound.ll
 create mode 100644 llvm/test/Verifier/invalid-disubrange-lowerBound.ll
 create mode 100644 llvm/test/Verifier/invalid-disubrange-stride.ll
 create mode 100644 llvm/test/Verifier/invalid-disubrange-upperBound.ll

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 7ec792ca0e1f4..4e0b6aa0dca67 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -2732,9 +2732,17 @@ llvm::DIType *CGDebugInfo::CreateType(const VectorType *Ty,
   QualType QTy(Ty, 0);
   auto SizeExpr = SizeExprCache.find(QTy);
   if (SizeExpr != SizeExprCache.end())
-    Subscript = DBuilder.getOrCreateSubrange(0, SizeExpr->getSecond());
-  else
-    Subscript = DBuilder.getOrCreateSubrange(0, Count ? Count : -1);
+    Subscript = DBuilder.getOrCreateSubrange(
+        SizeExpr->getSecond() /*count*/, nullptr /*lowerBound*/,
+        nullptr /*upperBound*/, nullptr /*stride*/);
+  else {
+    auto *CountNode =
+        llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(
+            llvm::Type::getInt64Ty(CGM.getLLVMContext()), Count ? Count : -1));
+    Subscript = DBuilder.getOrCreateSubrange(
+        CountNode /*count*/, nullptr /*lowerBound*/, nullptr /*upperBound*/,
+        nullptr /*stride*/);
+  }
   llvm::DINodeArray SubscriptArray = DBuilder.getOrCreateArray(Subscript);
 
   uint64_t Size = CGM.getContext().getTypeSize(Ty);
@@ -2754,8 +2762,18 @@ llvm::DIType *CGDebugInfo::CreateType(const ConstantMatrixType *Ty,
 
   // Create ranges for both dimensions.
   llvm::SmallVector<llvm::Metadata *, 2> Subscripts;
-  Subscripts.push_back(DBuilder.getOrCreateSubrange(0, Ty->getNumColumns()));
-  Subscripts.push_back(DBuilder.getOrCreateSubrange(0, Ty->getNumRows()));
+  auto *ColumnCountNode =
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(
+          llvm::Type::getInt64Ty(CGM.getLLVMContext()), Ty->getNumColumns()));
+  auto *RowCountNode =
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(
+          llvm::Type::getInt64Ty(CGM.getLLVMContext()), Ty->getNumRows()));
+  Subscripts.push_back(DBuilder.getOrCreateSubrange(
+      ColumnCountNode /*count*/, nullptr /*lowerBound*/, nullptr /*upperBound*/,
+      nullptr /*stride*/));
+  Subscripts.push_back(DBuilder.getOrCreateSubrange(
+      RowCountNode /*count*/, nullptr /*lowerBound*/, nullptr /*upperBound*/,
+      nullptr /*stride*/));
   llvm::DINodeArray SubscriptArray = DBuilder.getOrCreateArray(Subscripts);
   return DBuilder.createArrayType(Size, Align, ElementTy, SubscriptArray);
 }
@@ -2810,10 +2828,17 @@ llvm::DIType *CGDebugInfo::CreateType(const ArrayType *Ty, llvm::DIFile *Unit) {
 
     auto SizeNode = SizeExprCache.find(EltTy);
     if (SizeNode != SizeExprCache.end())
-      Subscripts.push_back(
-          DBuilder.getOrCreateSubrange(0, SizeNode->getSecond()));
-    else
-      Subscripts.push_back(DBuilder.getOrCreateSubrange(0, Count));
+      Subscripts.push_back(DBuilder.getOrCreateSubrange(
+          SizeNode->getSecond() /*count*/, nullptr /*lowerBound*/,
+          nullptr /*upperBound*/, nullptr /*stride*/));
+    else {
+      auto *CountNode =
+          llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(
+              llvm::Type::getInt64Ty(CGM.getLLVMContext()), Count));
+      Subscripts.push_back(DBuilder.getOrCreateSubrange(
+          CountNode /*count*/, nullptr /*lowerBound*/, nullptr /*upperBound*/,
+          nullptr /*stride*/));
+    }
     EltTy = Ty->getElementType();
   }
 
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index d63ca34c573b8..d1c7d126b5a9e 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -573,6 +573,8 @@ namespace llvm {
     /// implicitly uniques the values returned.
     DISubrange *getOrCreateSubrange(int64_t Lo, int64_t Count);
     DISubrange *getOrCreateSubrange(int64_t Lo, Metadata *CountNode);
+    DISubrange *getOrCreateSubrange(Metadata *Count, Metadata *LowerBound,
+                                    Metadata *UpperBound, Metadata *Stride);
 
     /// Create a new descriptor for the specified variable.
     /// \param Context     Variable scope.
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 7dca44247c04f..900a4b561cda9 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -287,12 +287,8 @@ class DISubrange : public DINode {
   friend class LLVMContextImpl;
   friend class MDNode;
 
-  int64_t LowerBound;
-
-  DISubrange(LLVMContext &C, StorageType Storage, Metadata *Node,
-             int64_t LowerBound, ArrayRef<Metadata *> Ops)
-      : DINode(C, DISubrangeKind, Storage, dwarf::DW_TAG_subrange_type, Ops),
-        LowerBound(LowerBound) {}
+  DISubrange(LLVMContext &C, StorageType Storage, ArrayRef<Metadata *> Ops)
+      : DINode(C, DISubrangeKind, Storage, dwarf::DW_TAG_subrange_type, Ops) {}
 
   ~DISubrange() = default;
 
@@ -304,8 +300,14 @@ class DISubrange : public DINode {
                              int64_t LowerBound, StorageType Storage,
                              bool ShouldCreate = true);
 
+  static DISubrange *getImpl(LLVMContext &Context, Metadata *CountNode,
+                             Metadata *LowerBound, Metadata *UpperBound,
+                             Metadata *Stride, StorageType Storage,
+                             bool ShouldCreate = true);
+
   TempDISubrange cloneImpl() const {
-    return getTemporary(getContext(), getRawCountNode(), getLowerBound());
+    return getTemporary(getContext(), getRawCountNode(), getRawLowerBound(),
+                        getRawUpperBound(), getRawStride());
   }
 
 public:
@@ -315,25 +317,33 @@ class DISubrange : public DINode {
   DEFINE_MDNODE_GET(DISubrange, (Metadata *CountNode, int64_t LowerBound = 0),
                     (CountNode, LowerBound))
 
-  TempDISubrange clone() const { return cloneImpl(); }
+  DEFINE_MDNODE_GET(DISubrange,
+                    (Metadata * CountNode, Metadata *LowerBound,
+                     Metadata *UpperBound, Metadata *Stride),
+                    (CountNode, LowerBound, UpperBound, Stride))
 
-  int64_t getLowerBound() const { return LowerBound; }
+  TempDISubrange clone() const { return cloneImpl(); }
 
   Metadata *getRawCountNode() const {
     return getOperand(0).get();
   }
 
+  Metadata *getRawLowerBound() const { return getOperand(1).get(); }
+
+  Metadata *getRawUpperBound() const { return getOperand(2).get(); }
+
+  Metadata *getRawStride() const { return getOperand(3).get(); }
+
   typedef PointerUnion<ConstantInt*, DIVariable*> CountType;
+  typedef PointerUnion<ConstantInt *, DIVariable *, DIExpression *> BoundType;
 
-  CountType getCount() const {
-    if (auto *MD = dyn_cast<ConstantAsMetadata>(getRawCountNode()))
-      return CountType(cast<ConstantInt>(MD->getValue()));
+  CountType getCount() const;
 
-    if (auto *DV = dyn_cast<DIVariable>(getRawCountNode()))
-      return CountType(DV);
+  BoundType getLowerBound() const;
 
-    return CountType();
-  }
+  BoundType getUpperBound() const;
+
+  BoundType getStride() const;
 
   static bool classof(const Metadata *MD) {
     return MD->getMetadataID() == DISubrangeKind;
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 9e8fe96ac3a83..a2c1b3f632af8 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4500,21 +4500,41 @@ bool LLParser::ParseGenericDINode(MDNode *&Result, bool IsDistinct) {
 /// ParseDISubrange:
 ///   ::= !DISubrange(count: 30, lowerBound: 2)
 ///   ::= !DISubrange(count: !node, lowerBound: 2)
+///   ::= !DISubrange(lowerBound: !node1, upperBound: !node2, stride: !node3)
 bool LLParser::ParseDISubrange(MDNode *&Result, bool IsDistinct) {
 #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
-  REQUIRED(count, MDSignedOrMDField, (-1, -1, INT64_MAX, false));              \
-  OPTIONAL(lowerBound, MDSignedField, );
+  OPTIONAL(count, MDSignedOrMDField, (-1, -1, INT64_MAX, false));              \
+  OPTIONAL(lowerBound, MDSignedOrMDField, );                                   \
+  OPTIONAL(upperBound, MDSignedOrMDField, );                                   \
+  OPTIONAL(stride, MDSignedOrMDField, );
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
+  Metadata *Count = nullptr;
+  Metadata *LowerBound = nullptr;
+  Metadata *UpperBound = nullptr;
+  Metadata *Stride = nullptr;
   if (count.isMDSignedField())
-    Result = GET_OR_DISTINCT(
-        DISubrange, (Context, count.getMDSignedValue(), lowerBound.Val));
+    Count = ConstantAsMetadata::get(ConstantInt::getSigned(
+        Type::getInt64Ty(Context), count.getMDSignedValue()));
   else if (count.isMDField())
-    Result = GET_OR_DISTINCT(
-        DISubrange, (Context, count.getMDFieldValue(), lowerBound.Val));
-  else
-    return true;
+    Count = count.getMDFieldValue();
+
+  auto convToMetadata = [&](MDSignedOrMDField Bound) -> Metadata * {
+    if (Bound.isMDSignedField())
+      return ConstantAsMetadata::get(ConstantInt::getSigned(
+          Type::getInt64Ty(Context), Bound.getMDSignedValue()));
+    if (Bound.isMDField())
+      return Bound.getMDFieldValue();
+    return nullptr;
+  };
+
+  LowerBound = convToMetadata(lowerBound);
+  UpperBound = convToMetadata(upperBound);
+  Stride = convToMetadata(stride);
+
+  Result = GET_OR_DISTINCT(DISubrange,
+                           (Context, Count, LowerBound, UpperBound, Stride));
 
   return false;
 }
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 7338d170cb48b..34c93beebb013 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1258,14 +1258,24 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     // Operand 'count' is interpreted as:
     // - Signed integer (version 0)
     // - Metadata node  (version 1)
+    // Operand 'lowerBound' is interpreted as:
+    // - Signed integer (version 0 and 1)
+    // - Metadata node  (version 2)
+    // Operands 'upperBound' and 'stride' are interpreted as:
+    // - Metadata node  (version 2)
     switch (Record[0] >> 1) {
     case 0:
       Val = GET_OR_DISTINCT(DISubrange,
-                            (Context, Record[1], unrotateSign(Record.back())));
+                            (Context, Record[1], unrotateSign(Record[2])));
       break;
     case 1:
       Val = GET_OR_DISTINCT(DISubrange, (Context, getMDOrNull(Record[1]),
-                                         unrotateSign(Record.back())));
+                                         unrotateSign(Record[2])));
+      break;
+    case 2:
+      Val = GET_OR_DISTINCT(
+          DISubrange, (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]),
+                       getMDOrNull(Record[3]), getMDOrNull(Record[4])));
       break;
     default:
       return error("Invalid record: Unsupported version of DISubrange");
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index fd4275f7d569a..9da1437b56b2e 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1527,10 +1527,12 @@ static uint64_t rotateSign(int64_t I) {
 void ModuleBitcodeWriter::writeDISubrange(const DISubrange *N,
                                           SmallVectorImpl<uint64_t> &Record,
                                           unsigned Abbrev) {
-  const uint64_t Version = 1 << 1;
+  const uint64_t Version = 2 << 1;
   Record.push_back((uint64_t)N->isDistinct() | Version);
   Record.push_back(VE.getMetadataOrNullID(N->getRawCountNode()));
-  Record.push_back(rotateSign(N->getLowerBound()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawLowerBound()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawUpperBound()));
+  Record.push_back(VE.getMetadataOrNullID(N->getRawStride()));
 
   Stream.EmitRecord(bitc::METADATA_SUBRANGE, Record, Abbrev);
   Record.clear();
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index de2b9bcc58c7e..f7041c0cc9263 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1592,7 +1592,7 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
     assert(Element->getTag() == dwarf::DW_TAG_subrange_type);
 
     const DISubrange *Subrange = cast<DISubrange>(Element);
-    assert(Subrange->getLowerBound() == 0 &&
+    assert(!Subrange->getRawLowerBound() &&
            "codeview doesn't support subranges with lower bounds");
     int64_t Count = -1;
     if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt*>())
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index dce90b3c17c0d..8d6849b4e1e35 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -768,9 +768,18 @@ static SmallVector<const DIVariable *, 2> dependencies(DbgVariable *Var) {
     Result.push_back(DLVar);
   for (auto *El : Array->getElements()) {
     if (auto *Subrange = dyn_cast<DISubrange>(El)) {
-      auto Count = Subrange->getCount();
-      if (auto *Dependency = Count.dyn_cast<DIVariable *>())
-        Result.push_back(Dependency);
+      if (auto Count = Subrange->getCount())
+        if (auto *Dependency = Count.dyn_cast<DIVariable *>())
+          Result.push_back(Dependency);
+      if (auto LB = Subrange->getLowerBound())
+        if (auto *Dependency = LB.dyn_cast<DIVariable *>())
+          Result.push_back(Dependency);
+      if (auto UB = Subrange->getUpperBound())
+        if (auto *Dependency = UB.dyn_cast<DIVariable *>())
+          Result.push_back(Dependency);
+      if (auto ST = Subrange->getStride())
+        if (auto *Dependency = ST.dyn_cast<DIVariable *>())
+          Result.push_back(Dependency);
     }
   }
   return Result;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 62bf51d422067..e958f38e486b0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1349,20 +1349,40 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR,
   // C/C++. The Count value is the number of elements.  Values are 64 bit. If
   // Count == -1 then the array is unbounded and we do not emit
   // DW_AT_lower_bound and DW_AT_count attributes.
-  int64_t LowerBound = SR->getLowerBound();
   int64_t DefaultLowerBound = getDefaultLowerBound();
   int64_t Count = -1;
   if (auto *CI = SR->getCount().dyn_cast<ConstantInt*>())
     Count = CI->getSExtValue();
 
-  if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
-    addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
+  auto addBoundTypeEntry = [&](dwarf::Attribute Attr,
+                               DISubrange::BoundType Bound) -> void {
+    if (auto *BV = Bound.dyn_cast<DIVariable *>()) {
+      if (auto *VarDIE = getDIE(BV))
+        addDIEEntry(DW_Subrange, Attr, *VarDIE);
+    } else if (auto *BE = Bound.dyn_cast<DIExpression *>()) {
+      DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+      DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+      DwarfExpr.setMemoryLocationKind();
+      DwarfExpr.addExpression(BE);
+      addBlock(DW_Subrange, Attr, DwarfExpr.finalize());
+    } else if (auto *BI = Bound.dyn_cast<ConstantInt *>()) {
+      if (Attr != dwarf::DW_AT_lower_bound || DefaultLowerBound == -1 ||
+          BI->getSExtValue() != DefaultLowerBound)
+        addSInt(DW_Subrange, Attr, dwarf::DW_FORM_sdata, BI->getSExtValue());
+    }
+  };
+
+  addBoundTypeEntry(dwarf::DW_AT_lower_bound, SR->getLowerBound());
 
   if (auto *CV = SR->getCount().dyn_cast<DIVariable*>()) {
     if (auto *CountVarDIE = getDIE(CV))
       addDIEEntry(DW_Subrange, dwarf::DW_AT_count, *CountVarDIE);
   } else if (Count != -1)
     addUInt(DW_Subrange, dwarf::DW_AT_count, None, Count);
+
+  addBoundTypeEntry(dwarf::DW_AT_upper_bound, SR->getUpperBound());
+
+  addBoundTypeEntry(dwarf::DW_AT_byte_stride, SR->getStride());
 }
 
 DIE *DwarfUnit::getIndexTyDie() {
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 72da461ddcb86..68edb6bad9396 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -1858,9 +1858,34 @@ static void writeDISubrange(raw_ostream &Out, const DISubrange *N,
   if (auto *CE = N->getCount().dyn_cast<ConstantInt*>())
     Printer.printInt("count", CE->getSExtValue(), /* ShouldSkipZero */ false);
   else
-    Printer.printMetadata("count", N->getCount().dyn_cast<DIVariable*>(),
-                          /*ShouldSkipNull */ false);
-  Printer.printInt("lowerBound", N->getLowerBound());
+    Printer.printMetadata("count", N->getCount().dyn_cast<DIVariable *>(),
+                          /*ShouldSkipNull */ true);
+
+  // A lowerBound of constant 0 should not be skipped, since it is different
+  // from an unspecified lower bound (= nullptr).
+  auto *LBound = N->getRawLowerBound();
+  if (auto *LE = dyn_cast_or_null<ConstantAsMetadata>(LBound)) {
+    auto *LV = cast<ConstantInt>(LE->getValue());
+    Printer.printInt("lowerBound", LV->getSExtValue(),
+                     /* ShouldSkipZero */ false);
+  } else
+    Printer.printMetadata("lowerBound", LBound, /*ShouldSkipNull */ true);
+
+  auto *UBound = N->getRawUpperBound();
+  if (auto *UE = dyn_cast_or_null<ConstantAsMetadata>(UBound)) {
+    auto *UV = cast<ConstantInt>(UE->getValue());
+    Printer.printInt("upperBound", UV->getSExtValue(),
+                     /* ShouldSkipZero */ false);
+  } else
+    Printer.printMetadata("upperBound", UBound, /*ShouldSkipNull */ true);
+
+  auto *Stride = N->getRawStride();
+  if (auto *SE = dyn_cast_or_null<ConstantAsMetadata>(Stride)) {
+    auto *SV = cast<ConstantInt>(SE->getValue());
+    Printer.printInt("stride", SV->getSExtValue(), /* ShouldSkipZero */ false);
+  } else
+    Printer.printMetadata("stride", Stride, /*ShouldSkipNull */ true);
+
   Out << ")";
 }
 
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 95841be534777..45cbbb3a60370 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -625,11 +625,22 @@ DITypeRefArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) {
 }
 
 DISubrange *DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
-  return DISubrange::get(VMContext, Count, Lo);
+  auto *LB = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(VMContext), Lo));
+  auto *CountNode = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(VMContext), Count));
+  return DISubrange::get(VMContext, CountNode, LB, nullptr, nullptr);
 }
 
 DISubrange *DIBuilder::getOrCreateSubrange(int64_t Lo, Metadata *CountNode) {
-  return DISubrange::get(VMContext, CountNode, Lo);
+  auto *LB = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(VMContext), Lo));
+  return DISubrange::get(VMContext, CountNode, LB, nullptr, nullptr);
+}
+
+DISubrange *DIBuilder::getOrCreateSubrange(Metadata *CountNode, Metadata *LB,
+                                           Metadata *UB, Metadata *Stride) {
+  return DISubrange::get(VMContext, CountNode, LB, UB, Stride);
 }
 
 static void checkGlobalVariableScope(DIScope *Context) {
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index f1e946c1902e6..ea90d6842c317 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -336,15 +336,103 @@ DISubrange *DISubrange::getImpl(LLVMContext &Context, int64_t Count, int64_t Lo,
                                 StorageType Storage, bool ShouldCreate) {
   auto *CountNode = ConstantAsMetadata::get(
       ConstantInt::getSigned(Type::getInt64Ty(Context), Count));
-  return getImpl(Context, CountNode, Lo, Storage, ShouldCreate);
+  auto *LB = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), Lo));
+  return getImpl(Context, CountNode, LB, nullptr, nullptr, Storage,
+                 ShouldCreate);
 }
 
 DISubrange *DISubrange::getImpl(LLVMContext &Context, Metadata *CountNode,
                                 int64_t Lo, StorageType Storage,
                                 bool ShouldCreate) {
-  DEFINE_GETIMPL_LOOKUP(DISubrange, (CountNode, Lo));
-  Metadata *Ops[] = { CountNode };
-  DEFINE_GETIMPL_STORE(DISubrange, (CountNode, Lo), Ops);
+  auto *LB = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), Lo));
+  return getImpl(Context, CountNode, LB, nullptr, nullptr, Storage,
+                 ShouldCreate);
+}
+
+DISubrange *DISubrange::getImpl(LLVMContext &Context, Metadata *CountNode,
+                                Metadata *LB, Metadata *UB, Metadata *Stride,
+                                StorageType Storage, bool ShouldCreate) {
+  DEFINE_GETIMPL_LOOKUP(DISubrange, (CountNode, LB, UB, Stride));
+  Metadata *Ops[] = {CountNode, LB, UB, Stride};
+  DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DISubrange, Ops);
+}
+
+DISubrange::CountType DISubrange::getCount() const {
+  if (!getRawCountNode())
+    return CountType();
+
+  if (auto *MD = dyn_cast<ConstantAsMetadata>(getRawCountNode()))
+    return CountType(cast<ConstantInt>(MD->getValue()));
+
+  if (auto *DV = dyn_cast<DIVariable>(getRawCountNode()))
+    return CountType(DV);
+
+  return CountType();
+}
+
+DISubrange::BoundType DISubrange::getLowerBound() const {
+  Metadata *LB = getRawLowerBound();
+  if (!LB)
+    return BoundType();
+
+  assert((isa<ConstantAsMetadata>(LB) || isa<DIVariable>(LB) ||
+          isa<DIExpression>(LB)) &&
+         "LowerBound must be signed constant or DIVariable or DIExpression");
+
+  if (auto *MD = dyn_cast<ConstantAsMetadata>(LB))
+    return BoundType(cast<ConstantInt>(MD->getValue()));
+
+  if (auto *MD = dyn_cast<DIVariable>(LB))
+    return BoundType(MD);
+
+  if (auto *MD = dyn_cast<DIExpression>(LB))
+    return BoundType(MD);
+
+  return BoundType();
+}
+
+DISubrange::BoundType DISubrange::getUpperBound() const {
+  Metadata *UB = getRawUpperBound();
+  if (!UB)
+    return BoundType();
+
+  assert((isa<ConstantAsMetadata>(UB) || isa<DIVariable>(UB) ||
+          isa<DIExpression>(UB)) &&
+         "UpperBound must be signed constant or DIVariable or DIExpression");
+
+  if (auto *MD = dyn_cast<ConstantAsMetadata>(UB))
+    return BoundType(cast<ConstantInt>(MD->getValue()));
+
+  if (auto *MD = dyn_cast<DIVariable>(UB))
+    return BoundType(MD);
+
+  if (auto *MD = dyn_cast<DIExpression>(UB))
+    return BoundType(MD);
+
+  return BoundType();
+}
+
+DISubrange::BoundType DISubrange::getStride() const {
+  Metadata *ST = getRawStride();
+  if (!ST)
+    return BoundType();
+
+  assert((isa<ConstantAsMetadata>(ST) || isa<DIVariable>(ST) ||
+          isa<DIExpression>(ST)) &&
+         "Stride must be signed constant or DIVariable or DIExpression");
+
+  if (auto *MD = dyn_cast<ConstantAsMetadata>(ST))
+    return BoundType(cast<ConstantInt>(MD->getValue()));
+
+  if (auto *MD = dyn_cast<DIVariable>(ST))
+    return BoundType(MD);
+
+  if (auto *MD = dyn_cast<DIExpression>(ST))
+    return BoundType(MD);
+
+  return BoundType();
 }
 
 DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, APInt Value,
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 9912808c53c2d..1c7d8746d242f 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -323,32 +323,46 @@ template <> struct MDNodeKeyImpl<GenericDINode> : MDNodeOpsKey {
 
 template <> struct MDNodeKeyImpl<DISubrange> {
   Metadata *CountNode;
-  int64_t LowerBound;
-
-  MDNodeKeyImpl(Metadata *CountNode, int64_t LowerBound)
-      : CountNode(CountNode), LowerBound(LowerBound) {}
+  Metadata *LowerBound;
+  Metadata *UpperBound;
+  Metadata *Stride;
+  // Build a key from raw operands, any of which may be null.
+  MDNodeKeyImpl(Metadata *CountNode, Metadata *LowerBound, Metadata *UpperBound,
+                Metadata *Stride)
+      : CountNode(CountNode), LowerBound(LowerBound), UpperBound(UpperBound),
+        Stride(Stride) {}
   MDNodeKeyImpl(const DISubrange *N)
-      : CountNode(N->getRawCountNode()),
-        LowerBound(N->getLowerBound()) {}
+      : CountNode(N->getRawCountNode()), LowerBound(N->getRawLowerBound()),
+        UpperBound(N->getRawUpperBound()), Stride(N->getRawStride()) {}
 
   bool isKeyOf(const DISubrange *RHS) const {
-    if (LowerBound != RHS->getLowerBound())
-      return false;
-
-    if (auto *RHSCount = RHS->getCount().dyn_cast<ConstantInt*>())
-      if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode))
-        if (RHSCount->getSExtValue() ==
-            cast<ConstantInt>(MD->getValue())->getSExtValue())
+    auto BoundsEqual = [](Metadata *Node1, Metadata *Node2) -> bool {
+      if (Node1 == Node2)
+        return true;
+      // Distinct nodes still match when both are equal-valued constants.
+      ConstantAsMetadata *MD1 = dyn_cast_or_null<ConstantAsMetadata>(Node1);
+      ConstantAsMetadata *MD2 = dyn_cast_or_null<ConstantAsMetadata>(Node2);
+      if (MD1 && MD2) {
+        ConstantInt *CV1 = cast<ConstantInt>(MD1->getValue());
+        ConstantInt *CV2 = cast<ConstantInt>(MD2->getValue());
+        if (CV1->getSExtValue() == CV2->getSExtValue())
           return true;
+      }
+      return false;
+    };
 
-    return CountNode == RHS->getRawCountNode();
+    return BoundsEqual(CountNode, RHS->getRawCountNode()) &&
+           BoundsEqual(LowerBound, RHS->getRawLowerBound()) &&
+           BoundsEqual(UpperBound, RHS->getRawUpperBound()) &&
+           BoundsEqual(Stride, RHS->getRawStride());
   }
 
   unsigned getHashValue() const {
-    if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode))
-      return hash_combine(cast<ConstantInt>(MD->getValue())->getSExtValue(),
-                          LowerBound);
-    return hash_combine(CountNode, LowerBound);
+    if (CountNode) // dyn_cast is only applied to a non-null count.
+      if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode)) // Hash by value.
+        return hash_combine(cast<ConstantInt>(MD->getValue())->getSExtValue(),
+                            LowerBound, UpperBound, Stride);
+    return hash_combine(CountNode, LowerBound, UpperBound, Stride);
   }
 };
 
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c39fb0edc714b..388fc72417ade 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -893,12 +893,30 @@ void Verifier::visitDIScope(const DIScope &N) {
 
 void Verifier::visitDISubrange(const DISubrange &N) {
   AssertDI(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N);
+  AssertDI(N.getRawCountNode() || N.getRawUpperBound(),
+           "Subrange must contain count or upperBound", &N); // At least one.
+  AssertDI(!N.getRawCountNode() || !N.getRawUpperBound(),
+           "Subrange can have any one of count or upperBound", &N); // Not both.
+  AssertDI(!N.getRawCountNode() || N.getCount(),
+           "Count must either be a signed constant or a DIVariable", &N);
   auto Count = N.getCount();
-  AssertDI(Count, "Count must either be a signed constant or a DIVariable",
-           &N);
-  AssertDI(!Count.is<ConstantInt*>() ||
-               Count.get<ConstantInt*>()->getSExtValue() >= -1,
+  AssertDI(!Count || !Count.is<ConstantInt *>() ||
+               Count.get<ConstantInt *>()->getSExtValue() >= -1, // -1 is ok.
            "invalid subrange count", &N);
+  auto *LBound = N.getRawLowerBound(); // Optional: null passes the check.
+  AssertDI(!LBound || isa<ConstantAsMetadata>(LBound) ||
+               isa<DIVariable>(LBound) || isa<DIExpression>(LBound),
+           "LowerBound must be signed constant or DIVariable or DIExpression",
+           &N);
+  auto *UBound = N.getRawUpperBound(); // Optional: null passes the check.
+  AssertDI(!UBound || isa<ConstantAsMetadata>(UBound) ||
+               isa<DIVariable>(UBound) || isa<DIExpression>(UBound),
+           "UpperBound must be signed constant or DIVariable or DIExpression",
+           &N);
+  auto *Stride = N.getRawStride(); // Optional: null passes the check.
+  AssertDI(!Stride || isa<ConstantAsMetadata>(Stride) ||
+               isa<DIVariable>(Stride) || isa<DIExpression>(Stride),
+           "Stride must be signed constant or DIVariable or DIExpression", &N);
 }
 
 void Verifier::visitDIEnumerator(const DIEnumerator &N) {
diff --git a/llvm/test/Assembler/debug-info.ll b/llvm/test/Assembler/debug-info.ll
index d54dba07ac1e0..419623a2cb7d1 100644
--- a/llvm/test/Assembler/debug-info.ll
+++ b/llvm/test/Assembler/debug-info.ll
@@ -4,10 +4,10 @@
 ; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39}
 !named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42}
 
-; CHECK:      !0 = !DISubrange(count: 3)
+; CHECK:      !0 = !DISubrange(count: 3, lowerBound: 0)
 ; CHECK-NEXT: !1 = !DISubrange(count: 3, lowerBound: 4)
 ; CHECK-NEXT: !2 = !DISubrange(count: 3, lowerBound: -5)
-!0 = !DISubrange(count: 3)
+!0 = !DISubrange(count: 3, lowerBound: 0)
 !1 = !DISubrange(count: 3, lowerBound: 0)
 
 !2 = !DISubrange(count: 3, lowerBound: 4)
diff --git a/llvm/test/Assembler/disubrange-empty-array.ll b/llvm/test/Assembler/disubrange-empty-array.ll
index 7b5279e3d3c2c..ef0ca0e81a270 100644
--- a/llvm/test/Assembler/disubrange-empty-array.ll
+++ b/llvm/test/Assembler/disubrange-empty-array.ll
@@ -4,10 +4,10 @@
 ; CHECK: !named = !{!0, !0, !1, !2}
 !named = !{!0, !1, !2, !3}
 
-; CHECK:      !0 = !DISubrange(count: -1)
+; CHECK:      !0 = !DISubrange(count: -1, lowerBound: 0)
 ; CHECK-NEXT: !1 = !DISubrange(count: -1, lowerBound: 4)
 ; CHECK-NEXT: !2 = !DISubrange(count: -1, lowerBound: -5)
-!0 = !DISubrange(count: -1)
+!0 = !DISubrange(count: -1, lowerBound: 0)
 !1 = !DISubrange(count: -1, lowerBound: 0)
 
 !2 = !DISubrange(count: -1, lowerBound: 4)
diff --git a/llvm/test/Assembler/invalid-disubrange-count-missing.ll b/llvm/test/Assembler/invalid-disubrange-count-missing.ll
index 8fc4487117f68..8b7bf713a8e91 100644
--- a/llvm/test/Assembler/invalid-disubrange-count-missing.ll
+++ b/llvm/test/Assembler/invalid-disubrange-count-missing.ll
@@ -1,4 +1,5 @@
 ; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
 
-; CHECK: [[@LINE+1]]:32: error: missing required field 'count'
+!named = !{!0}
+; CHECK: Subrange must contain count or upperBound
 !0 = !DISubrange(lowerBound: -3)
diff --git a/llvm/test/Bindings/llvm-c/debug_info.ll b/llvm/test/Bindings/llvm-c/debug_info.ll
index 59d9628ff009d..d56873f1cb251 100644
--- a/llvm/test/Bindings/llvm-c/debug_info.ll
+++ b/llvm/test/Bindings/llvm-c/debug_info.ll
@@ -60,7 +60,7 @@
 ; CHECK-NEXT: !33 = !{!6, !6, !34}
 ; CHECK-NEXT: !34 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !35)
 ; CHECK-NEXT: !35 = !{!36}
-; CHECK-NEXT: !36 = !DISubrange(count: 10)
+; CHECK-NEXT: !36 = !DISubrange(count: 10, lowerBound: 0)
 ; CHECK-NEXT: !37 = !{!38, !39, !40, !41}
 ; CHECK-NEXT: !38 = !DILocalVariable(name: "a", arg: 1, scope: !31, file: !1, line: 42, type: !6)
 ; CHECK-NEXT: !39 = !DILocalVariable(name: "b", arg: 2, scope: !31, file: !1, line: 42, type: !6)
diff --git a/llvm/test/Bitcode/fortranSubrange.ll b/llvm/test/Bitcode/fortranSubrange.ll
new file mode 100644
index 0000000000000..7b97be5b352dc
--- /dev/null
+++ b/llvm/test/Bitcode/fortranSubrange.ll
@@ -0,0 +1,44 @@
+;; This test checks DISubrange bounds
+
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+
+;; Test whether bounds are generated correctly.
+; CHECK: !{{[0-9]+}} = !DISubrange(lowerBound: 3, upperBound: ![[NODE:[0-9]+]], stride: !DIExpression(DW_OP_constu, 4))
+; CHECK: ![[NODE]] = distinct !DILocalVariable
+
+
+; ModuleID = 'fortsubrange.ll'
+source_filename = "fortsubrange.ll"
+
+define void @MAIN_() !dbg !5 {
+L.entry:
+  %.Z0640_333 = alloca i32*, align 8
+  %"arr$sd1_349" = alloca [16 x i64], align 8
+  call void @llvm.dbg.declare(metadata i32** %.Z0640_333, metadata !8, metadata !DIExpression(DW_OP_deref)), !dbg !15
+  call void @llvm.dbg.declare(metadata [16 x i64]* %"arr$sd1_349", metadata !13, metadata !DIExpression(DW_OP_plus_uconst, 120)), !dbg !15
+  ret void, !dbg !16
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5 2017-05-01", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4, globals: !4, imports: !4)
+!3 = !DIFile(filename: "fortsubrange.f90", directory: "/dir")
+!4 = !{}
+!5 = distinct !DISubprogram(name: "main", scope: !2, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagMainSubprogram, unit: !2)
+!6 = !DISubroutineType(cc: DW_CC_program, types: !7)
+!7 = !{null}
+!8 = !DILocalVariable(name: "arr", scope: !5, file: !3, type: !9)
+!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 32, align: 32, elements: !11)
+!10 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DISubrange(lowerBound: 3, upperBound: !13, stride: !DIExpression(DW_OP_constu, 4))
+!13 = distinct !DILocalVariable(scope: !5, file: !3, type: !14, flags: DIFlagArtificial)
+!14 = !DIBasicType(name: "integer*8", size: 64, align: 64, encoding: DW_ATE_signed)
+!15 = !DILocation(line: 0, scope: !5)
+!16 = !DILocation(line: 6, column: 1, scope: !5)
diff --git a/llvm/test/Bitcode/fortranSubrangeBackward.ll b/llvm/test/Bitcode/fortranSubrangeBackward.ll
new file mode 100644
index 0000000000000..ffa987e2f01ed
--- /dev/null
+++ b/llvm/test/Bitcode/fortranSubrangeBackward.ll
@@ -0,0 +1,50 @@
+;; This test checks Backward compatibility of DISubrange bounds
+; REQUIRES: x86_64-linux
+
+; RUN: llvm-dis -o - %s.bc | FileCheck %s
+
+;; Test whether bounds are generated correctly.
+; CHECK: !DISubrange(count: 15, lowerBound: 3)
+; CHECK: !DISubrange(count: ![[NODE:[0-9]+]], lowerBound: 3)
+; CHECK: ![[NODE]] = distinct !DILocalVariable
+
+
+; ModuleID = 'fortsubrange.ll'
+source_filename = "fortsubrange.ll"
+
+define void @MAIN_() !dbg !10 {
+L.entry:
+  %.Z0640_333 = alloca i32*, align 8
+  %"arr$sd1_349" = alloca [16 x i64], align 8
+  call void @llvm.dbg.declare(metadata i32** %.Z0640_333, metadata !13, metadata !DIExpression(DW_OP_deref)), !dbg !19
+  call void @llvm.dbg.declare(metadata [16 x i64]* %"arr$sd1_349", metadata !17, metadata !DIExpression(DW_OP_plus_uconst, 120)), !dbg !19
+  ret void, !dbg !20
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5 2017-05-01", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !4, imports: !4)
+!3 = !DIFile(filename: "fortsubrange.f90", directory: "/dir")
+!4 = !{}
+!5 = !{!6}
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 32, align: 32, elements: !8)
+!7 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = !{!9}
+!9 = !DISubrange(count: 15, lowerBound: 3)
+!10 = distinct !DISubprogram(name: "main", scope: !2, file: !3, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagMainSubprogram, unit: !2)
+!11 = !DISubroutineType(cc: DW_CC_program, types: !12)
+!12 = !{null}
+!13 = !DILocalVariable(name: "arr", scope: !10, file: !3, type: !14)
+!14 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 32, align: 32, elements: !15)
+!15 = !{!16}
+!16 = !DISubrange(count: !17, lowerBound: 3)
+!17 = distinct !DILocalVariable(scope: !10, file: !3, type: !18, flags: DIFlagArtificial)
+!18 = !DIBasicType(name: "integer*8", size: 64, align: 64, encoding: DW_ATE_signed)
+!19 = !DILocation(line: 0, scope: !10)
+!20 = !DILocation(line: 6, column: 1, scope: !10)
diff --git a/llvm/test/Bitcode/fortranSubrangeBackward.ll.bc b/llvm/test/Bitcode/fortranSubrangeBackward.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..00e427b1cdaa2a65525d6002c20b53b41cdf6e63
GIT binary patch
literal 2064
zcmY*aeN0=|6~F#`PH>)|2~CW!y=Nean+5W02gk;y!+w;2N>-z771dJ3&tL<caUL&#
z4T&a$ojtd8E0ad5SLF|*MOCZ)<5c;h>blh*l)90Z(Wt6e8X*`TO)8qwHJh>~T6dn4
zRN9fwIrpA(-?_hY?>YCm>D+Ql4MH-6kWWc99)JEXffw#ydA6?9*da2@kU@#iyCQ_D
z#0DgWHG6=u>4sTXN!l$Z%=Yu5xN@INQLC4};)@bbi{0m|+;PHOV^e6J*=cqsy4ZO`
zLq|2ABwXjk10}`f8j|izke`Um?zrZl@nFhyHhW^0oG(G#mq>(g(B^u@4szg}ba(xw
zIEc0QT&kwJCy}g*$9o8>0qsLCPe<LkGVxb+G_MsIQ|kTmvjDDFgOC)yue-4F0}$3O
zHfHeXQ>Yv4kOJC015BOGNgVfNj!ew@HWu!~!V&%q%Uiwt7|T!Pc&iNyzs>J@%Y%<B
z;!_}zieoW0gPkU^DGK8j<=Z3{CMx5K@!<k4gH5z!Q(kOrQGOd_Wjf-;BM6%!FfNLX
ztl*IhHtK!!AuJxDszymHl2I2@&#rd7-XT*jYT_Rrq7P;ifb~b(>FxSeyOgLTP%*ZP
zx`Kr9#X5viGBJ{a&8$x=Yt=oe<c|Hv7NqpM^~Sh4^HUYN40*o}ev2ZionO@kYD8@b
z+7>S+-BD~Zf}Ku}DmRhpYXZL;#Vty=zL3{lo6`m;S!G$Qt}JOQ(QQddTRD;(W3e*|
zY_f}IS!{}xOhkB=<ZH`nd@F^oQmPQ8THD6AJ-CIK*DV{gO9owm(FdrP93R9Si?scU
z#Q8zY8J9T9+cgtj?932$bcp9bMtL^HvtDe%h)vv+LD+DVKdInHH9VK*C)3l1?rh_C
zR`7r2@GVN>E6r)w=d|UDyvb;<8npA%x~q)*^Av3_(e`vjzE`weplu0><3jN;Z2Dlv
z@tMStEh;}A=Gi`;V`F1OSOo4{!;fY7+H1d1-9f5<dhqoWzSgc<CGjoDf;8%G=Jf@a
z{<=YXb6S6MMZ3PL`%0t_XnJi~y7gkrQT8-GO2o^DBUmKe+jBRi5>7z>qW&tQy=BlX
z=XJjo=>@<B-8Lit<++%xTy$jVHaL(61?M{LNWiq1{Aed+%7+1u0-G{oTzdM%8i8-+
z@C^_C9o3M(?$WQX?p<2dEdzkAT1Ph4b|qF9FWQo{y<Bv@m$#RSkfXEQb!Ak60jg0r
zz&^hAiv@hktNMBcN>;T&sWwQ}zdd+}z}F}orr)i|Z?E9%9{e6fuih}|Hs`e4q6%`R
z^RvGfZE#uH;^8H_wM@4r=vK&jk$wtIk_fq=rZ{~UNhg%GJ2-O}*)W})o&B^ULl_5~
zCbRpsl;ICU<e$;2QuP-?-*to;E<ir#=S~lw3U&B><AY>(C~)erC2)!xJ3a2>n9!iV
zIbf#_4-9i;r=21@gJ2??NK<nQX{Jm^o2Zs1$}}7r^AGwth})p}9W~=sHYnt#8McOg
zUi1%m$WWdZ{Y_G77pllgOUk05QSwBt9p#icC)O^PYoabuk45pai6&lc+#%qT%Do6x
z!vK|u9&>d%lU!Qbx^d#ha_Ot3t0&4!g=?Fg>OK4STCIPw?loVwym9&cUNJnF??8VE
z?_t$W&IomjP4%&9x(&)NOWPNVGn?9lxm~whLR)PbbPwxc&yh7R{-p=XOtmf)o9ass
zem5JP#wLxGE{O1OgF+1mXg(!SrN6PV^HR)l5eiz;mWer`xKl*|H$x)yH0a$@*CAM1
z$$4NOLf=vd>0w@ftd#1-rLy@HLL<y@h~c<Lhj-BS$4mY1paV-{2;dcBE$oxXZAgPM
zD5!`YkE5Z&o32uFm*{ABweajKlsSZ2VSqGmDAF53_%nD^+3Z2*#C-=e3-9MOlCx>D
z@nb3DLKby`NfmnHKFuY{$0AZ8mvaAw2f6q?2-R&6KPzi$P*jvl<V8_ZLnQtqIKCAe
z84y2E47!zcF;WeRO8}hk9-hm_H@NE;wfC<L*&Ut-zyEyn`wzdlk6yce;LRrs;1`)5
z=KpKyihvi5k_bsan{lE(j=rhb1iK&pXK+NQT5`cC<SjU!05${n01LK0V1WcJ_!Ghe
z=om0uIXVff0+s;_^w=+qf7}ShkNtuc(n5_7ZNgm%HbDz^#UU5~y#UyL0vfR05$4BT
z-9JAW3{H$R5BLU~2mJj(hVz?D%~UfrI6Ow0C|`fS)!f1yHQD-o6l1Y4cBaMdvs?Oo
vffhSs8nE;q@!KsCoAspiNYi-eH=$GEP*ZR?G#+Uh42}O^-vxuhjtKn^YBz&?

literal 0
HcmV?d00001

diff --git a/llvm/test/DebugInfo/X86/default-subrange-array.ll b/llvm/test/DebugInfo/X86/default-subrange-array.ll
index 1374cd888861d..fde789a106bbb 100644
--- a/llvm/test/DebugInfo/X86/default-subrange-array.ll
+++ b/llvm/test/DebugInfo/X86/default-subrange-array.ll
@@ -24,7 +24,7 @@ source_filename = "test/DebugInfo/X86/default-subrange-array.ll"
 ; CHECK-NEXT:         DW_AT_type
 ; CHECK:            DW_TAG_subrange_type
 ; CHECK-NEXT:         DW_AT_type
-; DWARF4-NEXT:        DW_AT_lower_bound [DW_FORM_data1] (0x00)
+; DWARF4-NEXT:        DW_AT_lower_bound [DW_FORM_sdata] (0)
 ; CHECK-NEXT:         DW_AT_count [DW_FORM_data1]       (0x2a)
 ; DWARF5-NOT:         DW_AT_lower_bound
 
diff --git a/llvm/test/DebugInfo/X86/nondefault-subrange-array.ll b/llvm/test/DebugInfo/X86/nondefault-subrange-array.ll
index 59deb7a7b9da1..7089030b873ac 100644
--- a/llvm/test/DebugInfo/X86/nondefault-subrange-array.ll
+++ b/llvm/test/DebugInfo/X86/nondefault-subrange-array.ll
@@ -19,7 +19,7 @@ source_filename = "test/DebugInfo/X86/nondefault-subrange-array.ll"
 
 ; CHECK: DW_TAG_subrange_type
 ; CHECK-NEXT:                   DW_AT_type [DW_FORM_ref4]  (cu + 0x{{[0-9a-f]*}} => {[[BASE2:0x[0-9a-f]*]]}
-; CHECK-NEXT:                   DW_AT_lower_bound [DW_FORM_data8]       (0xfffffffffffffffd)
+; CHECK-NEXT:                   DW_AT_lower_bound [DW_FORM_sdata] (-3)
 ; CHECK-NEXT:                   DW_AT_count [DW_FORM_data1]       (0x2a)
 
 ; CHECK: [[BASE]]: DW_TAG_base_type
diff --git a/llvm/test/DebugInfo/cDefaultLower.ll b/llvm/test/DebugInfo/cDefaultLower.ll
new file mode 100644
index 0000000000000..7cd37fe845b35
--- /dev/null
+++ b/llvm/test/DebugInfo/cDefaultLower.ll
@@ -0,0 +1,35 @@
+;; This test checks whether c default lowerBound is removed.
+; REQUIRES: x86_64-linux
+
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+
+;; c default DW_AT_lower_bound(0) is not dumped.
+; CHECK-LABEL: DW_TAG_subrange_type
+; CHECK-NEXT:   DW_AT_type
+; CHECK-NEXT:   DW_AT_upper_bound     (4)
+
+;; c non-default lowerBound=1 is dumped.
+; CHECK-LABEL: DW_TAG_subrange_type
+; CHECK-NEXT:   DW_AT_type
+; CHECK-NEXT:   DW_AT_lower_bound     (1)
+; CHECK-NEXT:   DW_AT_upper_bound     (5)
+
+; ModuleID = 'cDefaultLower.c'
+source_filename = "cDefaultLower.c"
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !4, imports: !4)
+!3 = !DIFile(filename: "cDefaultLower.c", directory: "dir")
+!4 = !{}
+!5 = !{!6, !10}
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 160, align: 32, elements: !8)
+!7 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = !{!9}
+!9 = !DISubrange(lowerBound: 0, upperBound: 4)
+!10 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 160, align: 32, elements: !11)
+!11 = !{!12}
+!12 = !DISubrange(lowerBound: 1, upperBound: 5)
diff --git a/llvm/test/DebugInfo/fortranDefaultLower.ll b/llvm/test/DebugInfo/fortranDefaultLower.ll
new file mode 100644
index 0000000000000..face5d12fc4fc
--- /dev/null
+++ b/llvm/test/DebugInfo/fortranDefaultLower.ll
@@ -0,0 +1,35 @@
+;; This test checks whether fortran default lowerBound is removed.
+; REQUIRES: x86_64-linux
+
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+
+;; fortran default DW_AT_lower_bound(1) is not dumped.
+; CHECK-LABEL: DW_TAG_subrange_type
+; CHECK-NEXT:   DW_AT_type
+; CHECK-NEXT:   DW_AT_upper_bound     (5)
+
+;; fortran non-default lowerBound=2 is dumped.
+; CHECK-LABEL: DW_TAG_subrange_type
+; CHECK-NEXT:   DW_AT_type
+; CHECK-NEXT:   DW_AT_lower_bound     (2)
+; CHECK-NEXT:   DW_AT_upper_bound     (6)
+
+; ModuleID = 'fortranDefaultLower.ll'
+source_filename = "fortranDefaultLower.f90"
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5 2017-05-01", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !4, imports: !4)
+!3 = !DIFile(filename: "fortranDefaultLower.f90", directory: "dir")
+!4 = !{}
+!5 = !{!6, !10}
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 160, align: 32, elements: !8)
+!7 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = !{!9}
+!9 = !DISubrange(lowerBound: 1, upperBound: 5)
+!10 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 160, align: 32, elements: !11)
+!11 = !{!12}
+!12 = !DISubrange(lowerBound: 2, upperBound: 6)
diff --git a/llvm/test/DebugInfo/fortranSubrangeExpr.ll b/llvm/test/DebugInfo/fortranSubrangeExpr.ll
new file mode 100644
index 0000000000000..5ad5635cc6dc5
--- /dev/null
+++ b/llvm/test/DebugInfo/fortranSubrangeExpr.ll
@@ -0,0 +1,44 @@
+;; This test checks DISubrange bounds for DIExpression
+; REQUIRES: x86_64-linux
+
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+
+;;  Test whether bounds are generated correctly.
+; CHECK-LABEL:  DW_TAG_array_type
+; CHECK:       DW_TAG_subrange_type
+; DW_AT_lower_bound     (DW_OP_push_object_address, DW_OP_plus_uconst 0x50, DW_OP_deref)
+; CHECK:           DW_AT_lower_bound     (DW_OP_push_object_address, DW_OP_plus_uconst 0x50, DW_OP_deref)
+; CHECK-NEXT:      DW_AT_upper_bound     (DW_OP_push_object_address, DW_OP_plus_uconst 0x78, DW_OP_deref)
+; CHECK-NEXT:      DW_AT_byte_stride     (DW_OP_push_object_address, DW_OP_plus_uconst 0x70, DW_OP_deref, DW_OP_plus_uconst 0x4, DW_OP_mul)
+
+; ModuleID = 'fortsubrange.modified.strategy3check-in.ll'
+source_filename = "fortsubrange.ll"
+
+define void @MAIN_() !dbg !5 {
+L.entry:
+  %"arr$sd1_349" = alloca [16 x i64], align 8
+  call void @llvm.dbg.declare(metadata [16 x i64]* %"arr$sd1_349", metadata !8, metadata !DIExpression()), !dbg !13
+  ret void, !dbg !14
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5 2017-05-01", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4, globals: !4, imports: !4)
+!3 = !DIFile(filename: "fortsubrange.f90", directory: "/dir")
+!4 = !{}
+!5 = distinct !DISubprogram(name: "main", scope: !2, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagMainSubprogram, unit: !2)
+!6 = !DISubroutineType(cc: DW_CC_program, types: !7)
+!7 = !{null}
+!8 = !DILocalVariable(name: "arr", scope: !5, file: !3, type: !9)
+!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 32, align: 32, elements: !11)
+!10 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DISubrange(lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 80, DW_OP_deref), upperBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 120, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 112, DW_OP_deref, DW_OP_plus_uconst, 4, DW_OP_mul))
+!13 = !DILocation(line: 0, scope: !5)
+!14 = !DILocation(line: 6, column: 1, scope: !5)
diff --git a/llvm/test/DebugInfo/fortranSubrangeInt.ll b/llvm/test/DebugInfo/fortranSubrangeInt.ll
new file mode 100644
index 0000000000000..34290b929cdb2
--- /dev/null
+++ b/llvm/test/DebugInfo/fortranSubrangeInt.ll
@@ -0,0 +1,43 @@
+;; This test checks DISubrange bounds for constants
+; REQUIRES: x86_64-linux
+
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+
+;; Test whether bounds are generated correctly.
+; CHECK-LABEL:  DW_TAG_array_type
+; CHECK:       DW_TAG_subrange_type
+; CHECK:           DW_AT_lower_bound     (-10)
+; CHECK-NEXT:      DW_AT_upper_bound     (10)
+; CHECK-NEXT:      DW_AT_byte_stride     (4)
+
+; ModuleID = 'fortsubrange.ll'
+source_filename = "fortsubrange.ll"
+
+define void @MAIN_() !dbg !5 {
+L.entry:
+  %"arr$sd1_349" = alloca [16 x i64], align 8
+  call void @llvm.dbg.declare(metadata [16 x i64]* %"arr$sd1_349", metadata !8, metadata !DIExpression()), !dbg !13
+  ret void, !dbg !14
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5 2017-05-01", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4, globals: !4, imports: !4)
+!3 = !DIFile(filename: "fortsubrange.f90", directory: "/dir")
+!4 = !{}
+!5 = distinct !DISubprogram(name: "main", scope: !2, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagMainSubprogram, unit: !2)
+!6 = !DISubroutineType(cc: DW_CC_program, types: !7)
+!7 = !{null}
+!8 = !DILocalVariable(name: "arr", scope: !5, file: !3, type: !9)
+!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 32, align: 32, elements: !11)
+!10 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DISubrange(lowerBound: -10, upperBound: 10, stride: 4)
+!13 = !DILocation(line: 0, scope: !5)
+!14 = !DILocation(line: 6, column: 1, scope: !5)
diff --git a/llvm/test/DebugInfo/fortranSubrangeVar.ll b/llvm/test/DebugInfo/fortranSubrangeVar.ll
new file mode 100644
index 0000000000000..5ee283e4b21e5
--- /dev/null
+++ b/llvm/test/DebugInfo/fortranSubrangeVar.ll
@@ -0,0 +1,62 @@
+;; This test checks DISubrange bounds for DIVariable
+
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+
+;; Test whether bounds are generated correctly.
+; CHECK: [[DIE1:0x.+]]:       DW_TAG_variable
+; CHECK:                    DW_AT_location
+; CHECK-SAME: DW_OP_plus_uconst 0x70, DW_OP_deref, DW_OP_lit4, DW_OP_mul
+; CHECK: [[DIE2:0x.+]]:       DW_TAG_variable
+; CHECK:                    DW_AT_location
+; CHECK-SAME: DW_OP_plus_uconst 0x78
+; CHECK: [[DIE3:0x.+]]:       DW_TAG_variable
+; CHECK:                    DW_AT_location
+; CHECK-SAME: DW_OP_plus_uconst 0x50
+; CHECK:     DW_TAG_subrange_type
+; CHECK:                  DW_AT_lower_bound     ([[DIE3]])
+; CHECK-NEXT:             DW_AT_upper_bound     ([[DIE2]])
+; CHECK-NEXT:             DW_AT_byte_stride     ([[DIE1]])
+
+
+; ModuleID = 'fortsubrange.ll'
+source_filename = "fortsubrange.ll"
+
+define void @MAIN_() !dbg !5 {
+L.entry:
+  %.Z0640_333 = alloca i32*, align 8
+  %"arr$sd1_349" = alloca [16 x i64], align 8
+  call void @llvm.dbg.declare(metadata i32** %.Z0640_333, metadata !8, metadata !DIExpression(DW_OP_deref)), !dbg !17
+  call void @llvm.dbg.declare(metadata [16 x i64]* %"arr$sd1_349", metadata !13, metadata !DIExpression(DW_OP_plus_uconst, 80)), !dbg !17
+  call void @llvm.dbg.value(metadata [16 x i64]* %"arr$sd1_349", metadata !16, metadata !DIExpression(DW_OP_plus_uconst, 112, DW_OP_deref, DW_OP_constu, 4, DW_OP_mul)), !dbg !17
+  call void @llvm.dbg.declare(metadata [16 x i64]* %"arr$sd1_349", metadata !15, metadata !DIExpression(DW_OP_plus_uconst, 120)), !dbg !17
+  ret void, !dbg !18
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: " F90 Flang - 1.5 2017-05-01", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4, globals: !4, imports: !4)
+!3 = !DIFile(filename: "fortsubrange.f90", directory: "/dir")
+!4 = !{}
+!5 = distinct !DISubprogram(name: "main", scope: !2, file: !3, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagMainSubprogram, unit: !2)
+!6 = !DISubroutineType(cc: DW_CC_program, types: !7)
+!7 = !{null}
+!8 = !DILocalVariable(name: "arr", scope: !5, file: !3, type: !9)
+!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 32, align: 32, elements: !11)
+!10 = !DIBasicType(name: "integer", size: 32, align: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DISubrange(lowerBound: !13, upperBound: !15, stride: !16)
+!13 = distinct !DILocalVariable(scope: !5, file: !3, type: !14, flags: DIFlagArtificial)
+!14 = !DIBasicType(name: "integer*8", size: 64, align: 64, encoding: DW_ATE_signed)
+!15 = distinct !DILocalVariable(scope: !5, file: !3, type: !14, flags: DIFlagArtificial)
+!16 = distinct !DILocalVariable(scope: !5, file: !3, type: !14, flags: DIFlagArtificial)
+!17 = !DILocation(line: 0, scope: !5)
+!18 = !DILocation(line: 6, column: 1, scope: !5)
diff --git a/llvm/test/Verifier/disubrange-count-upperBound.ll b/llvm/test/Verifier/disubrange-count-upperBound.ll
new file mode 100644
index 0000000000000..3dbc79004f022
--- /dev/null
+++ b/llvm/test/Verifier/disubrange-count-upperBound.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!named = !{!0}
+; CHECK: Subrange can have any one of count or upperBound
+!0 = !DISubrange(count: 20, lowerBound: 1, upperBound: 10)
diff --git a/llvm/test/Verifier/disubrange-missing-upperBound.ll b/llvm/test/Verifier/disubrange-missing-upperBound.ll
new file mode 100644
index 0000000000000..26b707caa6093
--- /dev/null
+++ b/llvm/test/Verifier/disubrange-missing-upperBound.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!named = !{!0}
+; CHECK: Subrange must contain count or upperBound
+!0 = !DISubrange(lowerBound: 1, stride: 4)
diff --git a/llvm/test/Verifier/invalid-disubrange-lowerBound.ll b/llvm/test/Verifier/invalid-disubrange-lowerBound.ll
new file mode 100644
index 0000000000000..37a449a832908
--- /dev/null
+++ b/llvm/test/Verifier/invalid-disubrange-lowerBound.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+; CHECK: LowerBound must be signed constant or DIVariable or DIExpression
+!0 = !DISubrange(lowerBound: !1, upperBound: 1)
+!1 = !DIBasicType(name: "integer*8", size: 64, align: 64, encoding: DW_ATE_signed)
diff --git a/llvm/test/Verifier/invalid-disubrange-stride.ll b/llvm/test/Verifier/invalid-disubrange-stride.ll
new file mode 100644
index 0000000000000..eae6b625911e4
--- /dev/null
+++ b/llvm/test/Verifier/invalid-disubrange-stride.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+; CHECK: Stride must be signed constant or DIVariable or DIExpression
+!0 = !DISubrange(upperBound: 1, stride: !1)
+!1 = !DIBasicType(name: "integer*8", size: 64, align: 64, encoding: DW_ATE_signed)
diff --git a/llvm/test/Verifier/invalid-disubrange-upperBound.ll b/llvm/test/Verifier/invalid-disubrange-upperBound.ll
new file mode 100644
index 0000000000000..d4daa6ba7e1e2
--- /dev/null
+++ b/llvm/test/Verifier/invalid-disubrange-upperBound.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+; CHECK: UpperBound must be signed constant or DIVariable or DIExpression
+!0 = !DISubrange(lowerBound: 1, upperBound: !1)
+!1 = !DIBasicType(name: "integer*8", size: 64, align: 64, encoding: DW_ATE_signed)
diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp
index d785e469e728e..038899ada9647 100644
--- a/llvm/unittests/IR/MetadataTest.cpp
+++ b/llvm/unittests/IR/MetadataTest.cpp
@@ -1139,11 +1139,12 @@ typedef MetadataTest DISubrangeTest;
 TEST_F(DISubrangeTest, get) {
   auto *N = DISubrange::get(Context, 5, 7);
   auto Count = N->getCount();
+  auto Lower = N->getLowerBound();
   EXPECT_EQ(dwarf::DW_TAG_subrange_type, N->getTag());
   ASSERT_TRUE(Count);
   ASSERT_TRUE(Count.is<ConstantInt*>());
   EXPECT_EQ(5, Count.get<ConstantInt*>()->getSExtValue());
-  EXPECT_EQ(7, N->getLowerBound());
+  EXPECT_EQ(7, Lower.get<ConstantInt *>()->getSExtValue());
   EXPECT_EQ(N, DISubrange::get(Context, 5, 7));
   EXPECT_EQ(DISubrange::get(Context, 5, 0), DISubrange::get(Context, 5));
 
@@ -1154,11 +1155,12 @@ TEST_F(DISubrangeTest, get) {
 TEST_F(DISubrangeTest, getEmptyArray) {
   auto *N = DISubrange::get(Context, -1, 0);
   auto Count = N->getCount();
+  auto Lower = N->getLowerBound();
   EXPECT_EQ(dwarf::DW_TAG_subrange_type, N->getTag());
   ASSERT_TRUE(Count);
   ASSERT_TRUE(Count.is<ConstantInt*>());
   EXPECT_EQ(-1, Count.get<ConstantInt*>()->getSExtValue());
-  EXPECT_EQ(0, N->getLowerBound());
+  EXPECT_EQ(0, Lower.get<ConstantInt *>()->getSExtValue());
   EXPECT_EQ(N, DISubrange::get(Context, -1, 0));
 }
 
@@ -1172,15 +1174,146 @@ TEST_F(DISubrangeTest, getVariableCount) {
 
   auto *N = DISubrange::get(Context, VlaExpr, 0);
   auto Count = N->getCount();
+  auto Lower = N->getLowerBound();
   ASSERT_TRUE(Count);
   ASSERT_TRUE(Count.is<DIVariable*>());
   EXPECT_EQ(VlaExpr, Count.get<DIVariable*>());
   ASSERT_TRUE(isa<DIVariable>(N->getRawCountNode()));
-  EXPECT_EQ(0, N->getLowerBound());
+  EXPECT_EQ(0, Lower.get<ConstantInt *>()->getSExtValue());
   EXPECT_EQ("vla_expr", Count.get<DIVariable*>()->getName());
   EXPECT_EQ(N, DISubrange::get(Context, VlaExpr, 0));
 }
 
+TEST_F(DISubrangeTest, fortranAllocatableInt) {
+  DILocalScope *Scope = getSubprogram();
+  DIFile *File = getFile();
+  DIType *Type = getDerivedType();
+  DINode::DIFlags Flags = static_cast<DINode::DIFlags>(7);
+  auto *LI = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), -10));
+  auto *UI = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), 10));
+  auto *SI = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), 4));
+  auto *UIother = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), 20));
+  auto *UVother = DILocalVariable::get(Context, Scope, "ubother", File, 8, Type,
+                                       2, Flags, 8);
+  auto *UEother = DIExpression::get(Context, {5, 6});
+  auto *LIZero = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), 0));
+  auto *UIZero = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), 0));
+
+  auto *N = DISubrange::get(Context, nullptr, LI, UI, SI);
+
+  auto Lower = N->getLowerBound();
+  ASSERT_TRUE(Lower);
+  ASSERT_TRUE(Lower.is<ConstantInt *>());
+  EXPECT_EQ(cast<ConstantInt>(LI->getValue()), Lower.get<ConstantInt *>());
+
+  auto Upper = N->getUpperBound();
+  ASSERT_TRUE(Upper);
+  ASSERT_TRUE(Upper.is<ConstantInt *>());
+  EXPECT_EQ(cast<ConstantInt>(UI->getValue()), Upper.get<ConstantInt *>());
+
+  auto Stride = N->getStride();
+  ASSERT_TRUE(Stride);
+  ASSERT_TRUE(Stride.is<ConstantInt *>());
+  EXPECT_EQ(cast<ConstantInt>(SI->getValue()), Stride.get<ConstantInt *>());
+
+  EXPECT_EQ(N, DISubrange::get(Context, nullptr, LI, UI, SI));
+
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LI, UIother, SI));
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LI, UEother, SI));
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LI, UVother, SI));
+
+  auto *NZeroLower = DISubrange::get(Context, nullptr, LIZero, UI, SI);
+  EXPECT_NE(NZeroLower, DISubrange::get(Context, nullptr, nullptr, UI, SI));
+
+  auto *NZeroUpper = DISubrange::get(Context, nullptr, LI, UIZero, SI);
+  EXPECT_NE(NZeroUpper, DISubrange::get(Context, nullptr, LI, nullptr, SI));
+}
+
+TEST_F(DISubrangeTest, fortranAllocatableVar) {
+  DILocalScope *Scope = getSubprogram();
+  DIFile *File = getFile();
+  DIType *Type = getDerivedType();
+  DINode::DIFlags Flags = static_cast<DINode::DIFlags>(7);
+  auto *LV =
+      DILocalVariable::get(Context, Scope, "lb", File, 8, Type, 2, Flags, 8);
+  auto *UV =
+      DILocalVariable::get(Context, Scope, "ub", File, 8, Type, 2, Flags, 8);
+  auto *SV =
+      DILocalVariable::get(Context, Scope, "st", File, 8, Type, 2, Flags, 8);
+  auto *SVother = DILocalVariable::get(Context, Scope, "stother", File, 8, Type,
+                                       2, Flags, 8);
+  auto *SIother = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), 20));
+  auto *SEother = DIExpression::get(Context, {5, 6});
+
+  auto *N = DISubrange::get(Context, nullptr, LV, UV, SV);
+
+  auto Lower = N->getLowerBound();
+  ASSERT_TRUE(Lower);
+  ASSERT_TRUE(Lower.is<DIVariable *>());
+  EXPECT_EQ(LV, Lower.get<DIVariable *>());
+
+  auto Upper = N->getUpperBound();
+  ASSERT_TRUE(Upper);
+  ASSERT_TRUE(Upper.is<DIVariable *>());
+  EXPECT_EQ(UV, Upper.get<DIVariable *>());
+
+  auto Stride = N->getStride();
+  ASSERT_TRUE(Stride);
+  ASSERT_TRUE(Stride.is<DIVariable *>());
+  EXPECT_EQ(SV, Stride.get<DIVariable *>());
+
+  EXPECT_EQ(N, DISubrange::get(Context, nullptr, LV, UV, SV));
+
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LV, UV, SVother));
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LV, UV, SEother));
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LV, UV, SIother));
+}
+
+TEST_F(DISubrangeTest, fortranAllocatableExpr) {
+  DILocalScope *Scope = getSubprogram();
+  DIFile *File = getFile();
+  DIType *Type = getDerivedType();
+  DINode::DIFlags Flags = static_cast<DINode::DIFlags>(7);
+  auto *LE = DIExpression::get(Context, {1, 2});
+  auto *UE = DIExpression::get(Context, {2, 3});
+  auto *SE = DIExpression::get(Context, {3, 4});
+  auto *LEother = DIExpression::get(Context, {5, 6});
+  auto *LIother = ConstantAsMetadata::get(
+      ConstantInt::getSigned(Type::getInt64Ty(Context), 20));
+  auto *LVother = DILocalVariable::get(Context, Scope, "lbother", File, 8, Type,
+                                       2, Flags, 8);
+
+  auto *N = DISubrange::get(Context, nullptr, LE, UE, SE);
+
+  auto Lower = N->getLowerBound();
+  ASSERT_TRUE(Lower);
+  ASSERT_TRUE(Lower.is<DIExpression *>());
+  EXPECT_EQ(LE, Lower.get<DIExpression *>());
+
+  auto Upper = N->getUpperBound();
+  ASSERT_TRUE(Upper);
+  ASSERT_TRUE(Upper.is<DIExpression *>());
+  EXPECT_EQ(UE, Upper.get<DIExpression *>());
+
+  auto Stride = N->getStride();
+  ASSERT_TRUE(Stride);
+  ASSERT_TRUE(Stride.is<DIExpression *>());
+  EXPECT_EQ(SE, Stride.get<DIExpression *>());
+
+  EXPECT_EQ(N, DISubrange::get(Context, nullptr, LE, UE, SE));
+
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LEother, UE, SE));
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LIother, UE, SE));
+  EXPECT_NE(N, DISubrange::get(Context, nullptr, LVother, UE, SE));
+}
+
 typedef MetadataTest DIEnumeratorTest;
 
 TEST_F(DIEnumeratorTest, get) {

From ec0b66c318ea42ec229fd3a9ef4ad92bf81d41cf Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Thu, 28 May 2020 09:27:00 +0100
Subject: [PATCH 324/770] [CodeGen] Specify meaning of ISD opcodes for scalable
 vectors

This patch contains changes to the description of EXTRACT_SUBVECTOR,
INSERT_SUBVECTOR, INSERT_VECTOR_ELT, EXTRACT_VECTOR_ELT and
CONCAT_VECTORS to specify their behaviour for scalable vectors.

For EXTRACT_SUBVECTOR it specifies that the IDX is scaled by the
same runtime scaling as the extracted (or inserted) vector. This
definition is the most natural extension to EXTRACT_SUBVECTOR for
scalable vectors, as most use-cases that work on fixed-width types
will have the same meaning for scalable types. For legalization for
example, it is common to split the vector operation to operate on
the LO and HI halves of a vector.

For a fixed width vector <16 x i8> this would be expressed with:

  v16i8 %res = EXTRACT_SUBVECTOR v32i8 %v, i32 16

For a scalable vector, this would similarly be expressed as:

  nxv16i8 %res = EXTRACT_SUBVECTOR nxv32i8 %V, i32 16

By extending the meaning of IDX for scalable vectors, most existing
optimisations on EXTRACT/INSERT_SUBVECTOR work for scalable vectors
without any changes. This definition also allows extracting a
fixed-width subvector from a scalable vector, which is useful to
e.g. extract the low N lanes of a scalable vector.

This patch is not NFC because it sets the meaning of these nodes
for scalable vectors, which future patches will build upon.

Reviewers: efriedma, ctetreau, rogfer01, craig.topper

Reviewed By: efriedma

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79806
---
 llvm/include/llvm/CodeGen/ISDOpcodes.h | 67 +++++++++++++++++---------
 1 file changed, 45 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index f081a53263eff..cf3afd8aeabc0 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -446,44 +446,67 @@ enum NodeType {
   /// Returns platform specific canonical encoding of a floating point number.
   FCANONICALIZE,
 
-  /// BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a vector with the
-  /// specified, possibly variable, elements.  The number of elements is
-  /// required to be a power of two.  The types of the operands must all be
-  /// the same and must match the vector element type, except that integer
-  /// types are allowed to be larger than the element type, in which case
-  /// the operands are implicitly truncated.
+  /// BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector
+  /// with the specified, possibly variable, elements.  The number of elements
+  /// is required to be a power of two. The types of the operands must all be
+  /// the same and must match the vector element type, except that integer types
+  /// are allowed to be larger than the element type, in which case the operands
+  /// are implicitly truncated.
   BUILD_VECTOR,
 
   /// INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element
-  /// at IDX replaced with VAL.  If the type of VAL is larger than the vector
+  /// at IDX replaced with VAL. If the type of VAL is larger than the vector
   /// element type then VAL is truncated before replacement.
+  ///
+  /// If VECTOR is a scalable vector, then IDX may be larger than the minimum
+  /// vector width. IDX is not first scaled by the runtime scaling factor of
+  /// VECTOR.
   INSERT_VECTOR_ELT,
 
   /// EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR
-  /// identified by the (potentially variable) element number IDX.  If the
-  /// return type is an integer type larger than the element type of the
-  /// vector, the result is extended to the width of the return type. In
-  /// that case, the high bits are undefined.
+  /// identified by the (potentially variable) element number IDX. If the return
+  /// type is an integer type larger than the element type of the vector, the
+  /// result is extended to the width of the return type. In that case, the high
+  /// bits are undefined.
+  ///
+  /// If VECTOR is a scalable vector, then IDX may be larger than the minimum
+  /// vector width. IDX is not first scaled by the runtime scaling factor of
+  /// VECTOR.
   EXTRACT_VECTOR_ELT,
 
   /// CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of
   /// vector type with the same length and element type, this produces a
   /// concatenated vector result value, with length equal to the sum of the
-  /// lengths of the input vectors.
+  /// lengths of the input vectors. If VECTOR0 is a fixed-width vector, then
+  /// VECTOR1..VECTORN must all be fixed-width vectors. Similarly, if VECTOR0
+  /// is a scalable vector, then VECTOR1..VECTORN must all be scalable vectors.
   CONCAT_VECTORS,
 
-  /// INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector
-  /// with VECTOR2 inserted into VECTOR1 at the constant element number
-  /// IDX, which must be a multiple of the VECTOR2 vector length. The
-  /// elements of VECTOR1 starting at IDX are overwritten with VECTOR2.
-  /// Elements IDX through vector_length(VECTOR2) must be valid VECTOR1
-  /// indices.
+  /// INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2
+  /// inserted into VECTOR1. IDX represents the starting element number at which
+  /// VECTOR2 will be inserted. Let the type of VECTOR2 be T, then IDX must be a
+  /// constant multiple of T's known minimum vector length. If T is a scalable
+  /// vector, IDX is first scaled by the runtime scaling factor of T.
+  /// The elements of VECTOR1 starting at IDX are overwritten with VECTOR2.
+  /// Elements IDX through (IDX + num_elements(T) - 1) must be valid VECTOR1
+  /// indices. If this condition cannot be determined statically but is false at
+  /// runtime, then the result vector is undefined.
+  ///
+  /// This operation supports inserting a fixed-width vector into a scalable
+  /// vector, but not the other way around.
   INSERT_SUBVECTOR,
 
-  /// EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR (an
-  /// vector value) starting with the constant element number IDX, which
-  /// must be a multiple of the result vector length. Elements IDX through
-  /// vector_length(VECTOR) must be valid VECTOR indices.
+  /// EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
+  /// Let the result type be T, then IDX represents the starting element number
+  /// from which a subvector of type T is extracted. IDX must be a constant
+  /// multiple of T's known minimum vector length. If T is a scalable vector,
+  /// IDX is first scaled by the runtime scaling factor of T. Elements IDX
+  /// through (IDX + num_elements(T) - 1) must be valid VECTOR indices. If this
+  /// condition cannot be determined statically but is false at runtime, then
+  /// the result vector is undefined.
+  ///
+  /// This operation supports extracting a fixed-width vector from a scalable
+  /// vector, but not the other way around.
   EXTRACT_SUBVECTOR,
 
   /// VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as

From 0969541ffcb24ae1af59fcb8778063becf17dbca Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Thu, 28 May 2020 11:03:02 +0200
Subject: [PATCH 325/770] tsan: disable java_finalizer2 test on darwin

pthread_barrier_t is not supported on darwin.
Do what other tests that use pthread_barrier_t do.
---
 compiler-rt/test/tsan/java_finalizer2.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/compiler-rt/test/tsan/java_finalizer2.cpp b/compiler-rt/test/tsan/java_finalizer2.cpp
index 0cacf3f9adfd7..87528900541a8 100644
--- a/compiler-rt/test/tsan/java_finalizer2.cpp
+++ b/compiler-rt/test/tsan/java_finalizer2.cpp
@@ -1,5 +1,9 @@
 // RUN: %clangxx_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
 // Regression test for https://github.com/golang/go/issues/39186
+
+// pthread barriers are not available on OS X
+// UNSUPPORTED: darwin
+
 #include "java.h"
 #include <string.h>
 

From 69935d86aed1b691c5f33a2141f15cb3aaee1af6 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 27 May 2020 23:55:53 +0200
Subject: [PATCH 326/770] [Clang][Sanitizers] Expect test failure on
 {arm,thumb}v7

Summary:
Versions of LLVM built on {arm,thumb}v7 appear to have differently
configured pass managers, which causes restrictions on which sanitizers
we may use.

As such, expect failure of the recently added "sanitize-coverage.c" test
on these architectures until we can investigate armv7's restrictions.

Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=46117

Reviewers: vitalybuka, glider

Reviewed By: glider

Subscribers: glider, kristof.beyls, danielkiss, cfe-commits, vvereschaka

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80668
---
 clang/test/CodeGen/sanitize-coverage.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/test/CodeGen/sanitize-coverage.c b/clang/test/CodeGen/sanitize-coverage.c
index 6fc8e39354d4f..ea4ac9296b48f 100644
--- a/clang/test/CodeGen/sanitize-coverage.c
+++ b/clang/test/CodeGen/sanitize-coverage.c
@@ -4,6 +4,9 @@
 // RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=memory     -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,MSAN
 // RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=thread     -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,TSAN
 // RUN: %clang %s -target x86_64-unknown-linux-gnu -emit-llvm -S -fsanitize=undefined  -fsanitize-coverage=trace-pc,trace-cmp -o - | FileCheck %s --check-prefixes=CHECK,UBSAN
+//
+// Host armv7 is currently unsupported: https://bugs.llvm.org/show_bug.cgi?id=46117
+// XFAIL: armv7, thumbv7
 
 int x[10];
 

From e533a176b3d4d936a4870cd1a3273941ba699882 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes@arm.com>
Date: Thu, 28 May 2020 09:00:51 +0000
Subject: [PATCH 327/770] [TableGen] Fix non-standard escape warnings for
 braces in InstAlias

Summary:
TableGen interprets braces ('{}') in the asm string of instruction aliases as
variants but when defining aliases with literal braces they have to be escaped
to prevent them being removed.

Braces are escaped with '\\', for example:

  def FooBraces : InstAlias<"foo \\{$imm\\}", (foo IntOperand:$imm)>;

However, when TableGen emits the assembly writer (-gen-asm-writer),
the AsmString that gets emitted is:

  AsmString = "foo \{$\x01\}";

In C/C++ braces don't need to be escaped, so the emitted string causes
compilation warnings:

  warning: use of non-standard escape character '\{' [-Wpedantic]

This patch fixes the issue by unescaping the flattened alias asm string
in the asm writer, by replacing '\{\}' with '{}'.

Reviewed By: hfinkel

Differential Revision: https://reviews.llvm.org/D79991
---
 llvm/test/TableGen/AliasAsmString.td     | 28 ++++++++++++++++++++++++
 llvm/utils/TableGen/AsmWriterEmitter.cpp | 22 +++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 llvm/test/TableGen/AliasAsmString.td

diff --git a/llvm/test/TableGen/AliasAsmString.td b/llvm/test/TableGen/AliasAsmString.td
new file mode 100644
index 0000000000000..dedcc4b2af89c
--- /dev/null
+++ b/llvm/test/TableGen/AliasAsmString.td
@@ -0,0 +1,28 @@
+// RUN: llvm-tblgen -gen-asm-writer -I %p/../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def ArchInstrInfo : InstrInfo { }
+
+def Arch : Target {
+  let InstructionSet = ArchInstrInfo;
+}
+
+def Reg : Register<"reg">;
+
+def RegClass : RegisterClass<"foo", [i32], 0, (add Reg)>;
+
+def IntOperand: Operand<i32>;
+
+def foo : Instruction {
+  let Size = 2;
+  let OutOperandList = (outs);
+  let InOperandList = (ins IntOperand:$imm);
+  let AsmString = "foo $imm";
+  let Namespace = "Arch";
+}
+
+def FooBraces : InstAlias<"foo \\{$imm\\}", (foo IntOperand:$imm)>;
+
+// CHECK: static const char AsmStrings[] =
+// CHECK-NEXT: /* 0 */ "foo {$\x01}\0"
diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp
index 72c7baa139a9d..d10ea71e97e3f 100644
--- a/llvm/utils/TableGen/AsmWriterEmitter.cpp
+++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp
@@ -267,6 +267,27 @@ static void UnescapeString(std::string &Str) {
   }
 }
 
+/// UnescapeAliasString - Supports literal braces in InstAlias asm string which
+/// are escaped with '\\' to avoid being interpreted as variants. Braces must
+/// be unescaped before c++ code is generated as (e.g.):
+///
+///   AsmString = "foo \{$\x01\}";
+///
+/// causes non-standard escape character warnings.
+static void UnescapeAliasString(std::string &Str) {
+  for (unsigned i = 0; i != Str.size(); ++i) {
+    if (Str[i] == '\\' && i != Str.size()-1) {
+      switch (Str[i+1]) {
+      default: continue;  // Don't execute the code after the switch.
+      case '{': Str[i] = '{'; break;
+      case '}': Str[i] = '}'; break;
+      }
+      // Nuke the second character.
+      Str.erase(Str.begin()+i+1);
+    }
+  }
+}
+
 /// EmitPrintInstruction - Generate the code for the "printInstruction" method
 /// implementation. Destroys all instances of AsmWriterInst information, by
 /// clearing the Instructions vector.
@@ -803,6 +824,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
 
       std::string FlatAliasAsmString =
           CodeGenInstruction::FlattenAsmStringVariants(CGA.AsmString, Variant);
+      UnescapeAliasString(FlatAliasAsmString);
 
       // Don't emit the alias if it has more operands than what it's aliasing.
       if (NumResultOps < CountNumOperands(FlatAliasAsmString, Variant))

From 23ac16cf9bd4cc0bb434efcf6385baf083a2ff7b Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomasp@graphcore.ai>
Date: Tue, 5 Mar 2019 23:20:29 +0000
Subject: [PATCH 328/770] FileCheck [10/12]: Add support for signed numeric
 values

Summary:
This patch is part of a patch series to add support for FileCheck
numeric expressions. This specific patch adds support for signed numeric
values, thus allowing negative numeric values.

As such, the patch adds a new class to represent a signed or unsigned
value and adds the logic for type promotion and type conversion in
numeric expressions mixing signed and unsigned values. It also adds
the %d format specifier to represent signed values.

Finally, it also adds underflow and overflow detection when performing a
binary operation.

Copyright:
    - Linaro (changes up to diff 183612 of revision D55940)
    - GraphCore (changes in later versions of revision D55940 and
                 in new revision created off D55940)

Reviewers: jhenderson, chandlerc, jdenny, probinson, grimar, arichardson

Reviewed By: jhenderson, arichardson

Subscribers: MaskRay, hiraditya, llvm-commits, probinson, dblaikie, grimar, arichardson, kristina, hfinkel, rogfer01, JonChesterfield

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D60390
---
 llvm/docs/CommandGuide/FileCheck.rst       |  13 +-
 llvm/lib/Support/FileCheck.cpp             | 262 +++++++++++++---
 llvm/lib/Support/FileCheckImpl.h           |  83 ++++-
 llvm/test/FileCheck/numeric-expression.txt |  42 ++-
 llvm/unittests/Support/FileCheckTest.cpp   | 347 ++++++++++++++++++---
 5 files changed, 631 insertions(+), 116 deletions(-)

diff --git a/llvm/docs/CommandGuide/FileCheck.rst b/llvm/docs/CommandGuide/FileCheck.rst
index d8a2e343026bf..0512133f2e995 100644
--- a/llvm/docs/CommandGuide/FileCheck.rst
+++ b/llvm/docs/CommandGuide/FileCheck.rst
@@ -660,8 +660,8 @@ The syntax to define a numeric variable is ``[[#%<fmtspec>,<NUMVAR>:]]`` where:
 
 * ``%<fmtspec>`` is an optional scanf-style matching format specifier to
   indicate what number format to match (e.g. hex number).  Currently accepted
-  format specifiers are ``%u``, ``%x`` and ``%X``.  If absent, the format
-  specifier defaults to ``%u``.
+  format specifiers are ``%u``, ``%d``, ``%x`` and ``%X``.  If absent, the
+  format specifier defaults to ``%u``.
 
 * ``<NUMVAR>`` is the name of the numeric variable to define to the matching
   value.
@@ -692,10 +692,11 @@ The syntax of a numeric substitution is ``[[#%<fmtspec>,<expr>]]`` where:
   * an expression followed by an operator and a numeric operand.
 
   A numeric operand is a previously defined numeric variable, or an integer
-  literal. The supported operators are ``+`` and ``-``. Spaces are accepted
-  before, after and between any of these elements.
-  There is currently no support for operator precendence, but parentheses can
-  be used to change the evaluation order.
+  literal, and has 64-bit precision. The supported operators are ``+`` and
+  ``-``. Spaces are accepted before, after and between any of these elements.
+  Overflow and underflow are rejected. There is currently no support for
+  operator precedence, but parentheses can be used to change the evaluation
+  order.
 
 For example:
 
diff --git a/llvm/lib/Support/FileCheck.cpp b/llvm/lib/Support/FileCheck.cpp
index 300eea865f91b..454f38132f6be 100644
--- a/llvm/lib/Support/FileCheck.cpp
+++ b/llvm/lib/Support/FileCheck.cpp
@@ -17,6 +17,7 @@
 #include "FileCheckImpl.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/CheckedArithmetic.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <list>
@@ -31,6 +32,8 @@ StringRef ExpressionFormat::toString() const {
     return StringRef("<none>");
   case Kind::Unsigned:
     return StringRef("%u");
+  case Kind::Signed:
+    return StringRef("%d");
   case Kind::HexUpper:
     return StringRef("%X");
   case Kind::HexLower:
@@ -43,6 +46,8 @@ Expected<StringRef> ExpressionFormat::getWildcardRegex() const {
   switch (Value) {
   case Kind::Unsigned:
     return StringRef("[0-9]+");
+  case Kind::Signed:
+    return StringRef("-?[0-9]+");
   case Kind::HexUpper:
     return StringRef("[0-9A-F]+");
   case Kind::HexLower:
@@ -54,43 +59,188 @@ Expected<StringRef> ExpressionFormat::getWildcardRegex() const {
 }
 
 Expected<std::string>
-ExpressionFormat::getMatchingString(uint64_t IntegerValue) const {
+ExpressionFormat::getMatchingString(ExpressionValue IntegerValue) const {
+  if (Value == Kind::Signed) {
+    Expected<int64_t> SignedValue = IntegerValue.getSignedValue();
+    if (!SignedValue)
+      return SignedValue.takeError();
+    return itostr(*SignedValue);
+  }
+
+  Expected<uint64_t> UnsignedValue = IntegerValue.getUnsignedValue();
+  if (!UnsignedValue)
+    return UnsignedValue.takeError();
   switch (Value) {
   case Kind::Unsigned:
-    return utostr(IntegerValue);
+    return utostr(*UnsignedValue);
   case Kind::HexUpper:
-    return utohexstr(IntegerValue, /*LowerCase=*/false);
+    return utohexstr(*UnsignedValue, /*LowerCase=*/false);
   case Kind::HexLower:
-    return utohexstr(IntegerValue, /*LowerCase=*/true);
+    return utohexstr(*UnsignedValue, /*LowerCase=*/true);
   default:
     return createStringError(std::errc::invalid_argument,
                              "trying to match value with invalid format");
   }
 }
 
-Expected<uint64_t>
+Expected<ExpressionValue>
 ExpressionFormat::valueFromStringRepr(StringRef StrVal,
                                       const SourceMgr &SM) const {
+  bool ValueIsSigned = Value == Kind::Signed;
+  StringRef OverflowErrorStr = "unable to represent numeric value";
+  if (ValueIsSigned) {
+    int64_t SignedValue;
+
+    if (StrVal.getAsInteger(10, SignedValue))
+      return ErrorDiagnostic::get(SM, StrVal, OverflowErrorStr);
+
+    return ExpressionValue(SignedValue);
+  }
+
   bool Hex = Value == Kind::HexUpper || Value == Kind::HexLower;
-  uint64_t IntegerValue;
-  if (StrVal.getAsInteger(Hex ? 16 : 10, IntegerValue))
-    return ErrorDiagnostic::get(SM, StrVal,
-                                "unable to represent numeric value");
+  uint64_t UnsignedValue;
+  if (StrVal.getAsInteger(Hex ? 16 : 10, UnsignedValue))
+    return ErrorDiagnostic::get(SM, StrVal, OverflowErrorStr);
 
-  return IntegerValue;
+  return ExpressionValue(UnsignedValue);
 }
 
-Expected<uint64_t> NumericVariableUse::eval() const {
-  Optional<uint64_t> Value = Variable->getValue();
+static int64_t getAsSigned(uint64_t UnsignedValue) {
+  // Use memcpy to reinterpret the bit pattern of UnsignedValue since casting
+  // to signed is implementation-defined if the unsigned value is too big to be
+  // represented in the signed type and using a union violates type aliasing
+  // rules.
+  int64_t SignedValue;
+  memcpy(&SignedValue, &UnsignedValue, sizeof(SignedValue));
+  return SignedValue;
+}
+
+Expected<int64_t> ExpressionValue::getSignedValue() const {
+  if (Negative)
+    return getAsSigned(Value);
+
+  if (Value > std::numeric_limits<int64_t>::max())
+    return make_error<OverflowError>();
+
+  // Value is in the representable range of int64_t so we can use cast.
+  return static_cast<int64_t>(Value);
+}
+
+Expected<uint64_t> ExpressionValue::getUnsignedValue() const {
+  if (Negative)
+    return make_error<OverflowError>();
+
+  return Value;
+}
+
+ExpressionValue ExpressionValue::getAbsolute() const {
+  if (!Negative)
+    return *this;
+
+  int64_t SignedValue = getAsSigned(Value);
+  int64_t MaxInt64 = std::numeric_limits<int64_t>::max();
+  // Absolute value can be represented as int64_t.
+  if (SignedValue >= -MaxInt64)
+    return ExpressionValue(-getAsSigned(Value));
+
+  // -X == -(max int64_t + Rem), negate each component independently.
+  SignedValue += MaxInt64;
+  uint64_t RemainingValueAbsolute = -SignedValue;
+  return ExpressionValue(MaxInt64 + RemainingValueAbsolute);
+}
+
+Expected<ExpressionValue> llvm::operator+(const ExpressionValue &LeftOperand,
+                                          const ExpressionValue &RightOperand) {
+  if (LeftOperand.isNegative() && RightOperand.isNegative()) {
+    int64_t LeftValue = cantFail(LeftOperand.getSignedValue());
+    int64_t RightValue = cantFail(RightOperand.getSignedValue());
+    Optional<int64_t> Result = checkedAdd<int64_t>(LeftValue, RightValue);
+    if (!Result)
+      return make_error<OverflowError>();
+
+    return ExpressionValue(*Result);
+  }
+
+  // (-A) + B == B - A.
+  if (LeftOperand.isNegative())
+    return RightOperand - LeftOperand.getAbsolute();
+
+  // A + (-B) == A - B.
+  if (RightOperand.isNegative())
+    return LeftOperand - RightOperand.getAbsolute();
+
+  // Both values are positive at this point.
+  uint64_t LeftValue = cantFail(LeftOperand.getUnsignedValue());
+  uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+  Optional<uint64_t> Result =
+      checkedAddUnsigned<uint64_t>(LeftValue, RightValue);
+  if (!Result)
+    return make_error<OverflowError>();
+
+  return ExpressionValue(*Result);
+}
+
+Expected<ExpressionValue> llvm::operator-(const ExpressionValue &LeftOperand,
+                                          const ExpressionValue &RightOperand) {
+  // Result will be negative and thus might underflow.
+  if (LeftOperand.isNegative() && !RightOperand.isNegative()) {
+    int64_t LeftValue = cantFail(LeftOperand.getSignedValue());
+    uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+    // Result <= -1 - (max int64_t) which overflows on 1- and 2-complement.
+    if (RightValue > std::numeric_limits<int64_t>::max())
+      return make_error<OverflowError>();
+    Optional<int64_t> Result =
+        checkedSub(LeftValue, static_cast<int64_t>(RightValue));
+    if (!Result)
+      return make_error<OverflowError>();
+
+    return ExpressionValue(*Result);
+  }
+
+  // (-A) - (-B) == B - A.
+  if (LeftOperand.isNegative())
+    return RightOperand.getAbsolute() - LeftOperand.getAbsolute();
+
+  // A - (-B) == A + B.
+  if (RightOperand.isNegative())
+    return LeftOperand + RightOperand.getAbsolute();
+
+  // Both values are positive at this point.
+  uint64_t LeftValue = cantFail(LeftOperand.getUnsignedValue());
+  uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+  if (LeftValue >= RightValue)
+    return ExpressionValue(LeftValue - RightValue);
+  else {
+    uint64_t AbsoluteDifference = RightValue - LeftValue;
+    uint64_t MaxInt64 = std::numeric_limits<int64_t>::max();
+    // Value might underflow.
+    if (AbsoluteDifference > MaxInt64) {
+      AbsoluteDifference -= MaxInt64;
+      int64_t Result = -MaxInt64;
+      int64_t MinInt64 = std::numeric_limits<int64_t>::min();
+      // Underflow, tested by:
+      //   abs(Result + (max int64_t)) > abs((min int64_t) + (max int64_t))
+      if (AbsoluteDifference > static_cast<uint64_t>(-(MinInt64 - Result)))
+        return make_error<OverflowError>();
+      Result -= static_cast<int64_t>(AbsoluteDifference);
+      return ExpressionValue(Result);
+    }
+
+    return ExpressionValue(-static_cast<int64_t>(AbsoluteDifference));
+  }
+}
+
+Expected<ExpressionValue> NumericVariableUse::eval() const {
+  Optional<ExpressionValue> Value = Variable->getValue();
   if (Value)
     return *Value;
 
   return make_error<UndefVarError>(getExpressionStr());
 }
 
-Expected<uint64_t> BinaryOperation::eval() const {
-  Expected<uint64_t> LeftOp = LeftOperand->eval();
-  Expected<uint64_t> RightOp = RightOperand->eval();
+Expected<ExpressionValue> BinaryOperation::eval() const {
+  Expected<ExpressionValue> LeftOp = LeftOperand->eval();
+  Expected<ExpressionValue> RightOp = RightOperand->eval();
 
   // Bubble up any error (e.g. undefined variables) in the recursive
   // evaluation.
@@ -136,7 +286,8 @@ BinaryOperation::getImplicitFormat(const SourceMgr &SM) const {
 Expected<std::string> NumericSubstitution::getResult() const {
   assert(ExpressionPointer->getAST() != nullptr &&
          "Substituting empty expression");
-  Expected<uint64_t> EvaluatedValue = ExpressionPointer->getAST()->eval();
+  Expected<ExpressionValue> EvaluatedValue =
+      ExpressionPointer->getAST()->eval();
   if (!EvaluatedValue)
     return EvaluatedValue.takeError();
   ExpressionFormat Format = ExpressionPointer->getFormat();
@@ -192,6 +343,7 @@ static char popFront(StringRef &S) {
   return C;
 }
 
+char OverflowError::ID = 0;
 char UndefVarError::ID = 0;
 char ErrorDiagnostic::ID = 0;
 char NotFoundError::ID = 0;
@@ -295,13 +447,18 @@ Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericOperand(
   }
 
   // Otherwise, parse it as a literal.
-  uint64_t LiteralValue;
-  StringRef OperandExpr = Expr;
+  int64_t SignedLiteralValue;
+  uint64_t UnsignedLiteralValue;
+  StringRef SaveExpr = Expr;
+  // Accept both signed and unsigned literals, trying the unsigned parse first.
   if (!Expr.consumeInteger((AO == AllowedOperand::LegacyLiteral) ? 10 : 0,
-                           LiteralValue)) {
-    return std::make_unique<ExpressionLiteral>(
-        OperandExpr.drop_back(Expr.size()), LiteralValue);
-  }
+                           UnsignedLiteralValue))
+    return std::make_unique<ExpressionLiteral>(SaveExpr.drop_back(Expr.size()),
+                                               UnsignedLiteralValue);
+  Expr = SaveExpr;
+  if (AO == AllowedOperand::Any && !Expr.consumeInteger(0, SignedLiteralValue))
+    return std::make_unique<ExpressionLiteral>(SaveExpr.drop_back(Expr.size()),
+                                               SignedLiteralValue);
 
   return ErrorDiagnostic::get(SM, Expr,
                               "invalid operand format '" + Expr + "'");
@@ -339,14 +496,6 @@ Pattern::parseParenExpr(StringRef &Expr, Optional<size_t> LineNumber,
   return SubExprResult;
 }
 
-static uint64_t add(uint64_t LeftOp, uint64_t RightOp) {
-  return LeftOp + RightOp;
-}
-
-static uint64_t sub(uint64_t LeftOp, uint64_t RightOp) {
-  return LeftOp - RightOp;
-}
-
 Expected<std::unique_ptr<ExpressionAST>>
 Pattern::parseBinop(StringRef Expr, StringRef &RemainingExpr,
                     std::unique_ptr<ExpressionAST> LeftOp,
@@ -363,10 +512,10 @@ Pattern::parseBinop(StringRef Expr, StringRef &RemainingExpr,
   binop_eval_t EvalBinop;
   switch (Operator) {
   case '+':
-    EvalBinop = add;
+    EvalBinop = operator+;
     break;
   case '-':
-    EvalBinop = sub;
+    EvalBinop = operator-;
     break;
   default:
     return ErrorDiagnostic::get(
@@ -415,6 +564,9 @@ Expected<std::unique_ptr<Expression>> Pattern::parseNumericSubstitutionBlock(
     case 'u':
       ExplicitFormat = ExpressionFormat(ExpressionFormat::Kind::Unsigned);
       break;
+    case 'd':
+      ExplicitFormat = ExpressionFormat(ExpressionFormat::Kind::Signed);
+      break;
     case 'x':
       ExplicitFormat = ExpressionFormat(ExpressionFormat::Kind::HexLower);
       break;
@@ -819,7 +971,7 @@ Expected<size_t> Pattern::match(StringRef Buffer, size_t &MatchLen,
   if (!Substitutions.empty()) {
     TmpStr = RegExStr;
     if (LineNumber)
-      Context->LineVariable->setValue(*LineNumber);
+      Context->LineVariable->setValue(ExpressionValue(*LineNumber));
 
     size_t InsertOffset = 0;
     // Substitute all string variables and expressions whose values are only
@@ -828,8 +980,18 @@ Expected<size_t> Pattern::match(StringRef Buffer, size_t &MatchLen,
     for (const auto &Substitution : Substitutions) {
       // Substitute and check for failure (e.g. use of undefined variable).
       Expected<std::string> Value = Substitution->getResult();
-      if (!Value)
-        return Value.takeError();
+      if (!Value) {
+        // Convert to an ErrorDiagnostic to get location information. This is
+        // done here rather than PrintNoMatch since now we know which
+        // substitution block caused the overflow.
+        Error Err =
+            handleErrors(Value.takeError(), [&](const OverflowError &E) {
+              return ErrorDiagnostic::get(SM, Substitution->getFromString(),
+                                          "unable to substitute variable or "
+                                          "numeric expression: overflow error");
+            });
+        return std::move(Err);
+      }
 
       // Plop it into the regex at the adjusted offset.
       TmpStr.insert(TmpStr.begin() + Substitution->getIndex() + InsertOffset,
@@ -870,7 +1032,8 @@ Expected<size_t> Pattern::match(StringRef Buffer, size_t &MatchLen,
 
     StringRef MatchedValue = MatchInfo[CaptureParenGroup];
     ExpressionFormat Format = DefinedNumericVariable->getImplicitFormat();
-    Expected<uint64_t> Value = Format.valueFromStringRepr(MatchedValue, SM);
+    Expected<ExpressionValue> Value =
+        Format.valueFromStringRepr(MatchedValue, SM);
     if (!Value)
       return Value.takeError();
     DefinedNumericVariable->setValue(*Value);
@@ -914,17 +1077,20 @@ void Pattern::printSubstitutions(const SourceMgr &SM, StringRef Buffer,
       // variables it uses.
       if (!MatchedValue) {
         bool UndefSeen = false;
-        handleAllErrors(MatchedValue.takeError(), [](const NotFoundError &E) {},
-                        // Handled in PrintNoMatch().
-                        [](const ErrorDiagnostic &E) {},
-                        [&](const UndefVarError &E) {
-                          if (!UndefSeen) {
-                            OS << "uses undefined variable(s):";
-                            UndefSeen = true;
-                          }
-                          OS << " ";
-                          E.log(OS);
-                        });
+        handleAllErrors(
+            MatchedValue.takeError(), [](const NotFoundError &E) {},
+            // Handled in PrintNoMatch().
+            [](const ErrorDiagnostic &E) {},
+            // Handled in match().
+            [](const OverflowError &E) {},
+            [&](const UndefVarError &E) {
+              if (!UndefSeen) {
+                OS << "uses undefined variable(s):";
+                UndefSeen = true;
+              }
+              OS << " ";
+              E.log(OS);
+            });
       } else {
         // Substitution succeeded. Print substituted value.
         OS << "with \"";
@@ -2086,7 +2252,7 @@ Error FileCheckPatternContext::defineCmdlineVariables(
       // to, since the expression of a command-line variable definition should
       // only use variables defined earlier on the command-line. If not, this
       // is an error and we report it.
-      Expected<uint64_t> Value = Expression->getAST()->eval();
+      Expected<ExpressionValue> Value = Expression->getAST()->eval();
       if (!Value) {
         Errs = joinErrors(std::move(Errs), Value.takeError());
         continue;
diff --git a/llvm/lib/Support/FileCheckImpl.h b/llvm/lib/Support/FileCheckImpl.h
index f4f2fc21a2084..068de3da1c692 100644
--- a/llvm/lib/Support/FileCheckImpl.h
+++ b/llvm/lib/Support/FileCheckImpl.h
@@ -31,6 +31,8 @@ namespace llvm {
 // Numeric substitution handling code.
 //===----------------------------------------------------------------------===//
 
+class ExpressionValue;
+
 /// Type representing the format an expression value should be textualized into
 /// for matching. Used to represent both explicit format specifiers as well as
 /// implicit format from using numeric variables.
@@ -41,6 +43,8 @@ struct ExpressionFormat {
     NoFormat,
     /// Value is an unsigned integer and should be printed as a decimal number.
     Unsigned,
+    /// Value is a signed integer and should be printed as a decimal number.
+    Signed,
     /// Value should be printed as an uppercase hex number.
     HexUpper,
     /// Value should be printed as a lowercase hex number.
@@ -80,17 +84,64 @@ struct ExpressionFormat {
   Expected<StringRef> getWildcardRegex() const;
 
   /// \returns the string representation of \p Value in the format represented
-  /// by this instance, or an error if the format is NoFormat.
-  Expected<std::string> getMatchingString(uint64_t Value) const;
+  /// by this instance, or an error if conversion to this format failed or the
+  /// format is NoFormat.
+  Expected<std::string> getMatchingString(ExpressionValue Value) const;
 
   /// \returns the value corresponding to string representation \p StrVal
   /// according to the matching format represented by this instance or an error
   /// with diagnostic against \p SM if \p StrVal does not correspond to a valid
   /// and representable value.
-  Expected<uint64_t> valueFromStringRepr(StringRef StrVal,
-                                         const SourceMgr &SM) const;
+  Expected<ExpressionValue> valueFromStringRepr(StringRef StrVal,
+                                                const SourceMgr &SM) const;
 };
 
+/// Class to represent an overflow error that might result when manipulating a
+/// value.
+class OverflowError : public ErrorInfo<OverflowError> {
+public:
+  static char ID;
+
+  std::error_code convertToErrorCode() const override {
+    return std::make_error_code(std::errc::value_too_large);
+  }
+
+  void log(raw_ostream &OS) const override { OS << "overflow error"; }
+};
+
+/// Class representing a numeric value, stored as a uint64_t and a sign flag.
+class ExpressionValue {
+private:
+  uint64_t Value;
+  bool Negative;
+
+public:
+  template <class T>
+  explicit ExpressionValue(T Val) : Value(Val), Negative(Val < 0) {}
+
+  /// \returns true if value is signed and negative, false otherwise.
+  bool isNegative() const { return Negative; }
+
+  /// \returns the value as a signed integer or an error if the value is out of
+  /// range.
+  Expected<int64_t> getSignedValue() const;
+
+  /// \returns the value as an unsigned integer or an error if the value is out
+  /// of range.
+  Expected<uint64_t> getUnsignedValue() const;
+
+  /// \returns an unsigned ExpressionValue instance whose value is the absolute
+  /// value of this object's value.
+  ExpressionValue getAbsolute() const;
+};
+
+/// Performs operation and \returns its result or an error in case of failure,
+/// such as if an overflow occurs.
+Expected<ExpressionValue> operator+(const ExpressionValue &Lhs,
+                                    const ExpressionValue &Rhs);
+Expected<ExpressionValue> operator-(const ExpressionValue &Lhs,
+                                    const ExpressionValue &Rhs);
+
 /// Base class representing the AST of a given expression.
 class ExpressionAST {
 private:
@@ -105,7 +156,7 @@ class ExpressionAST {
 
   /// Evaluates and \returns the value of the expression represented by this
   /// AST or an error if evaluation fails.
-  virtual Expected<uint64_t> eval() const = 0;
+  virtual Expected<ExpressionValue> eval() const = 0;
 
   /// \returns either the implicit format of this AST, a diagnostic against
   /// \p SM if implicit formats of the AST's components conflict, or NoFormat
@@ -121,16 +172,15 @@ class ExpressionAST {
 class ExpressionLiteral : public ExpressionAST {
 private:
   /// Actual value of the literal.
-  uint64_t Value;
+  ExpressionValue Value;
 
 public:
-  /// Constructs a literal with the specified value parsed from
-  /// \p ExpressionStr.
-  ExpressionLiteral(StringRef ExpressionStr, uint64_t Val)
+  template <class T>
+  explicit ExpressionLiteral(StringRef ExpressionStr, T Val)
       : ExpressionAST(ExpressionStr), Value(Val) {}
 
   /// \returns the literal's value.
-  Expected<uint64_t> eval() const override { return Value; }
+  Expected<ExpressionValue> eval() const override { return Value; }
 };
 
 /// Class to represent an undefined variable error, which quotes that
@@ -190,7 +240,7 @@ class NumericVariable {
   ExpressionFormat ImplicitFormat;
 
   /// Value of numeric variable, if defined, or None otherwise.
-  Optional<uint64_t> Value;
+  Optional<ExpressionValue> Value;
 
   /// Line number where this variable is defined, or None if defined before
   /// input is parsed. Used to determine whether a variable is defined on the
@@ -213,10 +263,10 @@ class NumericVariable {
   ExpressionFormat getImplicitFormat() const { return ImplicitFormat; }
 
   /// \returns this variable's value.
-  Optional<uint64_t> getValue() const { return Value; }
+  Optional<ExpressionValue> getValue() const { return Value; }
 
   /// Sets value of this numeric variable to \p NewValue.
-  void setValue(uint64_t NewValue) { Value = NewValue; }
+  void setValue(ExpressionValue NewValue) { Value = NewValue; }
 
   /// Clears value of this numeric variable, regardless of whether it is
   /// currently defined or not.
@@ -238,7 +288,7 @@ class NumericVariableUse : public ExpressionAST {
   NumericVariableUse(StringRef Name, NumericVariable *Variable)
       : ExpressionAST(Name), Variable(Variable) {}
   /// \returns the value of the variable referenced by this instance.
-  Expected<uint64_t> eval() const override;
+  Expected<ExpressionValue> eval() const override;
 
   /// \returns implicit format of this numeric variable.
   Expected<ExpressionFormat>
@@ -248,7 +298,8 @@ class NumericVariableUse : public ExpressionAST {
 };
 
 /// Type of functions evaluating a given binary operation.
-using binop_eval_t = uint64_t (*)(uint64_t, uint64_t);
+using binop_eval_t = Expected<ExpressionValue> (*)(const ExpressionValue &,
+                                                   const ExpressionValue &);
 
 /// Class representing a single binary operation in the AST of an expression.
 class BinaryOperation : public ExpressionAST {
@@ -275,7 +326,7 @@ class BinaryOperation : public ExpressionAST {
   /// using EvalBinop on the result of recursively evaluating the operands.
   /// \returns the expression value or an error if an undefined numeric
   /// variable is used in one of the operands.
-  Expected<uint64_t> eval() const override;
+  Expected<ExpressionValue> eval() const override;
 
   /// \returns the implicit format of this AST, if any, a diagnostic against
   /// \p SM if the implicit formats of the AST's components conflict, or no
diff --git a/llvm/test/FileCheck/numeric-expression.txt b/llvm/test/FileCheck/numeric-expression.txt
index 3d33e64a0a9ed..d5b4db7d30ea0 100644
--- a/llvm/test/FileCheck/numeric-expression.txt
+++ b/llvm/test/FileCheck/numeric-expression.txt
@@ -19,8 +19,9 @@ REDEF NO SPC  // CHECK-LABEL: REDEF
 
 ; Numeric variable definition with explicit matching format.
 DEF FMT  // CHECK-LABEL: DEF FMT
-c  // CHECK-NEXT: {{^}}[[#%x,LHEX:]]
-D  // CHECK-NEXT: {{^}}[[#%X,UHEX:]]
+c    // CHECK-NEXT: {{^}}[[#%x,LHEX:]]
+D    // CHECK-NEXT: {{^}}[[#%X,UHEX:]]
+-30  // CHECK-NEXT: {{^}}[[#%d,SIGN:]]
 
 ; Numeric variable definition with explicit matching format with different
 ; spacing.
@@ -64,6 +65,10 @@ E   // CHECK-NEXT: {{^}}[[#%X,UHEX+1]]
 C   // CHECK-NEXT: {{^}}[[#%X,UHEX-1]]
 1B  // CHECK-NEXT: {{^}}[[#%X,UHEX+0xe]]
 1B  // CHECK-NEXT: {{^}}[[#%X,UHEX+0xE]]
+-30 // CHECK-NEXT: {{^}}[[#%d,SIGN]]
+-29 // CHECK-NEXT: {{^}}[[#%d,SIGN+1]]
+-31 // CHECK-NEXT: {{^}}[[#%d,SIGN-1]]
+42  // CHECK-NEXT: {{^}}[[#%d,SIGN+72]]
 11  // CHECK-NEXT: {{^}}[[#%u,UNSIa]]
 11  // CHECK-NEXT: {{^}}[[#%u,UNSIb]]
 11  // CHECK-NEXT: {{^}}[[#%u,UNSIc]]
@@ -104,6 +109,9 @@ E   // CHECK-NEXT: {{^}}[[#UHEX+1]]
 C   // CHECK-NEXT: {{^}}[[#UHEX-1]]
 1B  // CHECK-NEXT: {{^}}[[#UHEX+0xe]]
 1B  // CHECK-NEXT: {{^}}[[#UHEX+0xE]]
+-30 // CHECK-NEXT: {{^}}[[#SIGN]]
+-29 // CHECK-NEXT: {{^}}[[#SIGN+1]]
+-31 // CHECK-NEXT: {{^}}[[#SIGN-1]]
 
 ; Numeric expressions using variables defined on other lines and an immediate
 ; interpreted as an unsigned value.
@@ -118,10 +126,16 @@ CHECK-NEXT: [[#UNSI+0x8000000000000000]]
 USE CONV FMT IMPL MATCH  // CHECK-LABEL: USE CONV FMT IMPL MATCH
 b   // CHECK-NEXT: {{^}}[[# %x, UNSI]]
 B   // CHECK-NEXT: {{^}}[[# %X, UNSI]]
+-1  // CHECK-NEXT: {{^}}[[# %d, UNSI-12]]
 12  // CHECK-NEXT: {{^}}[[# %u, LHEX]]
 C   // CHECK-NEXT: {{^}}[[# %X, LHEX]]
+-2  // CHECK-NEXT: {{^}}[[# %d, LHEX-14]]
 13  // CHECK-NEXT: {{^}}[[# %u, UHEX]]
 d   // CHECK-NEXT: {{^}}[[# %x, UHEX]]
+-5  // CHECK-NEXT: {{^}}[[# %d, UHEX-18]]
+15  // CHECK-NEXT: {{^}}[[# %u, SIGN+45]]
+f   // CHECK-NEXT: {{^}}[[# %x, SIGN+45]]
+F   // CHECK-NEXT: {{^}}[[# %X, SIGN+45]]
 
 ; Conflicting implicit format.
 RUN: %ProtectFileCheckOutput \
@@ -329,3 +343,27 @@ REDEF-NEW-FMT-NEXT: [[#%X,UNSI:]]
 REDEF-NEW-FMT-MSG: numeric-expression.txt:[[#@LINE-1]]:31: error: format different from previous variable definition
 REDEF-NEW-FMT-MSG-NEXT: {{R}}EDEF-NEW-FMT-NEXT: {{\[\[#%X,UNSI:\]\]}}
 REDEF-NEW-FMT-MSG-NEXT:    {{^}}                              ^{{$}}
+
+; Numeric expression with overflow.
+RUN: not FileCheck --check-prefix OVERFLOW --input-file %s %s 2>&1 \
+RUN:   | FileCheck --check-prefix OVERFLOW-MSG --strict-whitespace %s
+
+OVERFLOW
+BIGVAR=10000000000000000
+OVERFLOW-LABEL: OVERFLOW
+OVERFLOW-NEXT: BIGVAR: [[#BIGVAR:0x8000000000000000+0x8000000000000000]]
+OVERFLOW-MSG: numeric-expression.txt:[[#@LINE-1]]:27: error: unable to substitute variable or numeric expression
+OVERFLOW-MSG-NEXT: {{O}}VERFLOW-NEXT: BIGVAR: {{\[\[#BIGVAR:0x8000000000000000\+0x8000000000000000\]\]}}
+OVERFLOW-MSG-NEXT:    {{^}}                          ^{{$}}
+
+; Numeric expression with underflow.
+RUN: not FileCheck --check-prefix UNDERFLOW --input-file %s %s 2>&1 \
+RUN:   | FileCheck --check-prefix UNDERFLOW-MSG --strict-whitespace %s
+
+UNDERFLOW
+TINYVAR=-10000000000000000
+UNDERFLOW-LABEL: UNDERFLOW
+UNDERFLOW-NEXT: TINYVAR: [[#%d,TINYVAR:-0x8000000000000000-0x8000000000000000]]
+UNDERFLOW-MSG: numeric-expression.txt:[[#@LINE-1]]:29: error: unable to substitute variable or numeric expression
+UNDERFLOW-MSG-NEXT: {{U}}NDERFLOW-NEXT: TINYVAR: {{\[\[#%d,TINYVAR:-0x8000000000000000-0x8000000000000000\]\]}}
+UNDERFLOW-MSG-NEXT:    {{^}}                            ^{{$}}
diff --git a/llvm/unittests/Support/FileCheckTest.cpp b/llvm/unittests/Support/FileCheckTest.cpp
index 75b7fba8759d8..54646a036f73f 100644
--- a/llvm/unittests/Support/FileCheckTest.cpp
+++ b/llvm/unittests/Support/FileCheckTest.cpp
@@ -88,13 +88,16 @@ struct ExpressionFormatParameterisedFixture
   bool AllowUpperHex;
 };
 
+const uint64_t MaxUint64 = std::numeric_limits<uint64_t>::max();
+
 TEST_P(ExpressionFormatParameterisedFixture, Format) {
   SourceMgr SM;
   ExpressionFormat Format(Kind);
+  bool Signed = Kind == ExpressionFormat::Kind::Signed;
 
   Expected<StringRef> WildcardPattern = Format.getWildcardRegex();
   ASSERT_THAT_EXPECTED(WildcardPattern, Succeeded());
-  Regex WildcardRegex(*WildcardPattern);
+  Regex WildcardRegex((Twine("^") + *WildcardPattern).str());
   ASSERT_TRUE(WildcardRegex.isValid());
   // Does not match empty string.
   EXPECT_FALSE(WildcardRegex.match(""));
@@ -103,6 +106,14 @@ TEST_P(ExpressionFormatParameterisedFixture, Format) {
   StringRef DecimalDigits = "0123456789";
   ASSERT_TRUE(WildcardRegex.match(DecimalDigits, &Matches));
   EXPECT_EQ(Matches[0], DecimalDigits);
+  // Matches negative numbers for the signed format only.
+  StringRef MinusFortyTwo = "-42";
+  bool MatchSuccess = WildcardRegex.match(MinusFortyTwo, &Matches);
+  if (Signed) {
+    ASSERT_TRUE(MatchSuccess);
+    EXPECT_EQ(Matches[0], MinusFortyTwo);
+  } else
+    EXPECT_FALSE(MatchSuccess);
   // Check non digits or digits with wrong casing are not matched.
   if (AllowHex) {
     StringRef HexOnlyDigits[] = {"abcdef", "ABCDEF"};
@@ -121,42 +132,75 @@ TEST_P(ExpressionFormatParameterisedFixture, Format) {
     EXPECT_FALSE(WildcardRegex.match("A"));
   }
 
-  Expected<std::string> MatchingString = Format.getMatchingString(0U);
+  Expected<std::string> MatchingString =
+      Format.getMatchingString(ExpressionValue(0u));
   ASSERT_THAT_EXPECTED(MatchingString, Succeeded());
   EXPECT_EQ(*MatchingString, "0");
-  MatchingString = Format.getMatchingString(9U);
+  MatchingString = Format.getMatchingString(ExpressionValue(9u));
   ASSERT_THAT_EXPECTED(MatchingString, Succeeded());
   EXPECT_EQ(*MatchingString, "9");
-  Expected<std::string> TenMatchingString = Format.getMatchingString(10U);
+  MatchingString = Format.getMatchingString(ExpressionValue(-5));
+  if (Signed) {
+    ASSERT_THAT_EXPECTED(MatchingString, Succeeded());
+    EXPECT_EQ(*MatchingString, "-5");
+  } else {
+    // Error message tested in ExpressionValue unit tests.
+    EXPECT_THAT_EXPECTED(MatchingString, Failed());
+  }
+  Expected<std::string> MaxUint64MatchingString =
+      Format.getMatchingString(ExpressionValue(MaxUint64));
+  Expected<std::string> TenMatchingString =
+      Format.getMatchingString(ExpressionValue(10u));
   ASSERT_THAT_EXPECTED(TenMatchingString, Succeeded());
-  Expected<std::string> FifteenMatchingString = Format.getMatchingString(15U);
+  Expected<std::string> FifteenMatchingString =
+      Format.getMatchingString(ExpressionValue(15u));
   ASSERT_THAT_EXPECTED(FifteenMatchingString, Succeeded());
   StringRef ExpectedTenMatchingString, ExpectedFifteenMatchingString;
+  std::string MaxUint64Str;
   if (AllowHex) {
     if (AllowUpperHex) {
+      MaxUint64Str = "FFFFFFFFFFFFFFFF";
       ExpectedTenMatchingString = "A";
       ExpectedFifteenMatchingString = "F";
     } else {
+      MaxUint64Str = "ffffffffffffffff";
       ExpectedTenMatchingString = "a";
       ExpectedFifteenMatchingString = "f";
     }
   } else {
+    MaxUint64Str = std::to_string(MaxUint64);
     ExpectedTenMatchingString = "10";
     ExpectedFifteenMatchingString = "15";
   }
+  if (Signed) {
+    // Error message tested in ExpressionValue unit tests.
+    EXPECT_THAT_EXPECTED(MaxUint64MatchingString, Failed());
+  } else {
+    ASSERT_THAT_EXPECTED(MaxUint64MatchingString, Succeeded());
+    EXPECT_EQ(*MaxUint64MatchingString, MaxUint64Str);
+  }
   EXPECT_EQ(*TenMatchingString, ExpectedTenMatchingString);
   EXPECT_EQ(*FifteenMatchingString, ExpectedFifteenMatchingString);
 
   StringRef BufferizedValidValueStr = bufferize(SM, "0");
-  Expected<uint64_t> Val =
+  Expected<ExpressionValue> Val =
       Format.valueFromStringRepr(BufferizedValidValueStr, SM);
   ASSERT_THAT_EXPECTED(Val, Succeeded());
-  EXPECT_EQ(*Val, 0U);
+  EXPECT_EQ(cantFail(Val->getSignedValue()), 0);
   BufferizedValidValueStr = bufferize(SM, "9");
   Val = Format.valueFromStringRepr(BufferizedValidValueStr, SM);
   ASSERT_THAT_EXPECTED(Val, Succeeded());
-  EXPECT_EQ(*Val, 9U);
-  StringRef BufferizedTenStr, BufferizedInvalidTenStr, BufferizedFifteenStr;
+  EXPECT_EQ(cantFail(Val->getSignedValue()), 9);
+  StringRef BufferizedMinusFiveStr = bufferize(SM, "-5");
+  Val = Format.valueFromStringRepr(BufferizedMinusFiveStr, SM);
+  StringRef OverflowErrorStr = "unable to represent numeric value";
+  if (Signed) {
+    ASSERT_THAT_EXPECTED(Val, Succeeded());
+    EXPECT_EQ(cantFail(Val->getSignedValue()), -5);
+  } else
+    expectDiagnosticError(OverflowErrorStr, Val.takeError());
+  StringRef BufferizedMaxUint64Str, BufferizedTenStr, BufferizedInvalidTenStr,
+      BufferizedFifteenStr;
   StringRef TenStr, FifteenStr, InvalidTenStr;
   if (AllowHex) {
     if (AllowUpperHex) {
@@ -173,19 +217,27 @@ TEST_P(ExpressionFormatParameterisedFixture, Format) {
     FifteenStr = "15";
     InvalidTenStr = "A";
   }
+  BufferizedMaxUint64Str = bufferize(SM, MaxUint64Str);
+  Val = Format.valueFromStringRepr(BufferizedMaxUint64Str, SM);
+  if (Signed)
+    expectDiagnosticError(OverflowErrorStr, Val.takeError());
+  else {
+    ASSERT_THAT_EXPECTED(Val, Succeeded());
+    EXPECT_EQ(cantFail(Val->getUnsignedValue()), MaxUint64);
+  }
   BufferizedTenStr = bufferize(SM, TenStr);
   Val = Format.valueFromStringRepr(BufferizedTenStr, SM);
   ASSERT_THAT_EXPECTED(Val, Succeeded());
-  EXPECT_EQ(*Val, 10U);
+  EXPECT_EQ(cantFail(Val->getSignedValue()), 10);
   BufferizedFifteenStr = bufferize(SM, FifteenStr);
   Val = Format.valueFromStringRepr(BufferizedFifteenStr, SM);
   ASSERT_THAT_EXPECTED(Val, Succeeded());
-  EXPECT_EQ(*Val, 15U);
+  EXPECT_EQ(cantFail(Val->getSignedValue()), 15);
   // Wrong casing is not tested because valueFromStringRepr() relies on
   // StringRef's getAsInteger() which does not allow to restrict casing.
   BufferizedInvalidTenStr = bufferize(SM, InvalidTenStr);
   expectDiagnosticError(
-      "unable to represent numeric value",
+      OverflowErrorStr,
       Format.valueFromStringRepr(bufferize(SM, "G"), SM).takeError());
 
   // Check boolean operator.
@@ -197,6 +249,8 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         std::make_tuple(ExpressionFormat::Kind::Unsigned, /*AllowHex=*/false,
                         /*AllowUpperHex=*/false),
+        std::make_tuple(ExpressionFormat::Kind::Signed, /*AllowHex=*/false,
+                        /*AllowUpperHex=*/false),
         std::make_tuple(ExpressionFormat::Kind::HexLower, /*AllowHex=*/true,
                         /*AllowUpperHex=*/false),
         std::make_tuple(ExpressionFormat::Kind::HexUpper, /*AllowHex=*/true,
@@ -206,8 +260,9 @@ TEST_F(FileCheckTest, NoFormatProperties) {
   ExpressionFormat NoFormat(ExpressionFormat::Kind::NoFormat);
   expectError<StringError>("trying to match value with invalid format",
                            NoFormat.getWildcardRegex().takeError());
-  expectError<StringError>("trying to match value with invalid format",
-                           NoFormat.getMatchingString(18).takeError());
+  expectError<StringError>(
+      "trying to match value with invalid format",
+      NoFormat.getMatchingString(ExpressionValue(18u)).takeError());
   EXPECT_FALSE(bool(NoFormat));
 }
 
@@ -238,31 +293,221 @@ TEST_F(FileCheckTest, FormatKindEqualityOperators) {
   EXPECT_FALSE(NoFormat != ExpressionFormat::Kind::NoFormat);
 }
 
+template <class T1, class T2>
+static Expected<ExpressionValue> doValueOperation(binop_eval_t Operation,
+                                                  T1 LeftValue, T2 RightValue) {
+  ExpressionValue LeftOperand(LeftValue);
+  ExpressionValue RightOperand(RightValue);
+  return Operation(LeftOperand, RightOperand);
+}
+
+template <class T>
+static void expectValueEqual(ExpressionValue ActualValue, T ExpectedValue) {
+  EXPECT_EQ(ExpectedValue < 0, ActualValue.isNegative());
+  if (ExpectedValue < 0) {
+    Expected<int64_t> SignedActualValue = ActualValue.getSignedValue();
+    ASSERT_THAT_EXPECTED(SignedActualValue, Succeeded());
+    EXPECT_EQ(*SignedActualValue, static_cast<int64_t>(ExpectedValue));
+  } else {
+    Expected<uint64_t> UnsignedActualValue = ActualValue.getUnsignedValue();
+    ASSERT_THAT_EXPECTED(UnsignedActualValue, Succeeded());
+    EXPECT_EQ(*UnsignedActualValue, static_cast<uint64_t>(ExpectedValue));
+  }
+}
+
+template <class T1, class T2, class TR>
+static void expectOperationValueResult(binop_eval_t Operation, T1 LeftValue,
+                                       T2 RightValue, TR ResultValue) {
+  Expected<ExpressionValue> OperationResult =
+      doValueOperation(Operation, LeftValue, RightValue);
+  ASSERT_THAT_EXPECTED(OperationResult, Succeeded());
+  expectValueEqual(*OperationResult, ResultValue);
+}
+
+template <class T1, class T2>
+static void expectOperationValueResult(binop_eval_t Operation, T1 LeftValue,
+                                       T2 RightValue) {
+  expectError<OverflowError>(
+      "overflow error",
+      doValueOperation(Operation, LeftValue, RightValue).takeError());
+}
+
+const int64_t MinInt64 = std::numeric_limits<int64_t>::min();
+const int64_t MaxInt64 = std::numeric_limits<int64_t>::max();
+
+TEST_F(FileCheckTest, ExpressionValueGetUnsigned) {
+  // Test positive value.
+  Expected<uint64_t> UnsignedValue = ExpressionValue(10).getUnsignedValue();
+  ASSERT_THAT_EXPECTED(UnsignedValue, Succeeded());
+  EXPECT_EQ(*UnsignedValue, 10U);
+
+  // Test 0.
+  UnsignedValue = ExpressionValue(0).getUnsignedValue();
+  ASSERT_THAT_EXPECTED(UnsignedValue, Succeeded());
+  EXPECT_EQ(*UnsignedValue, 0U);
+
+  // Test max positive value.
+  UnsignedValue = ExpressionValue(MaxUint64).getUnsignedValue();
+  ASSERT_THAT_EXPECTED(UnsignedValue, Succeeded());
+  EXPECT_EQ(*UnsignedValue, MaxUint64);
+
+  // Test failure with negative value.
+  expectError<OverflowError>(
+      "overflow error", ExpressionValue(-1).getUnsignedValue().takeError());
+
+  // Test failure with min negative value.
+  expectError<OverflowError>(
+      "overflow error",
+      ExpressionValue(MinInt64).getUnsignedValue().takeError());
+}
+
+TEST_F(FileCheckTest, ExpressionValueGetSigned) {
+  // Test positive value.
+  Expected<int64_t> SignedValue = ExpressionValue(10).getSignedValue();
+  ASSERT_THAT_EXPECTED(SignedValue, Succeeded());
+  EXPECT_EQ(*SignedValue, 10);
+
+  // Test 0.
+  SignedValue = ExpressionValue(0).getSignedValue();
+  ASSERT_THAT_EXPECTED(SignedValue, Succeeded());
+  EXPECT_EQ(*SignedValue, 0);
+
+  // Test max int64_t.
+  SignedValue = ExpressionValue(MaxInt64).getSignedValue();
+  ASSERT_THAT_EXPECTED(SignedValue, Succeeded());
+  EXPECT_EQ(*SignedValue, MaxInt64);
+
+  // Test failure with too big positive value.
+  expectError<OverflowError>(
+      "overflow error", ExpressionValue(static_cast<uint64_t>(MaxInt64) + 1)
+                            .getSignedValue()
+                            .takeError());
+
+  // Test failure with max uint64_t.
+  expectError<OverflowError>(
+      "overflow error",
+      ExpressionValue(MaxUint64).getSignedValue().takeError());
+
+  // Test negative value.
+  SignedValue = ExpressionValue(-10).getSignedValue();
+  ASSERT_THAT_EXPECTED(SignedValue, Succeeded());
+  EXPECT_EQ(*SignedValue, -10);
+
+  // Test min int64_t.
+  SignedValue = ExpressionValue(MinInt64).getSignedValue();
+  ASSERT_THAT_EXPECTED(SignedValue, Succeeded());
+  EXPECT_EQ(*SignedValue, MinInt64);
+}
+
+TEST_F(FileCheckTest, ExpressionValueAbsolute) {
+  // Test positive value.
+  expectValueEqual(ExpressionValue(10).getAbsolute(), 10);
+
+  // Test 0.
+  expectValueEqual(ExpressionValue(0).getAbsolute(), 0);
+
+  // Test max uint64_t.
+  expectValueEqual(ExpressionValue(MaxUint64).getAbsolute(), MaxUint64);
+
+  // Test negative value.
+  expectValueEqual(ExpressionValue(-10).getAbsolute(), 10);
+
+  // Test absence of overflow on min int64_t.
+  expectValueEqual(ExpressionValue(MinInt64).getAbsolute(),
+                   static_cast<uint64_t>(-(MinInt64 + 10)) + 10);
+}
+
+TEST_F(FileCheckTest, ExpressionValueAddition) {
+  // Test both negative values.
+  expectOperationValueResult(operator+, -10, -10, -20);
+
+  // Test both negative values with underflow.
+  expectOperationValueResult(operator+, MinInt64, -1);
+  expectOperationValueResult(operator+, MinInt64, MinInt64);
+
+  // Test negative and positive value.
+  expectOperationValueResult(operator+, -10, 10, 0);
+  expectOperationValueResult(operator+, -10, 11, 1);
+  expectOperationValueResult(operator+, -11, 10, -1);
+
+  // Test positive and negative value.
+  expectOperationValueResult(operator+, 10, -10, 0);
+  expectOperationValueResult(operator+, 10, -11, -1);
+  expectOperationValueResult(operator+, 11, -10, 1);
+
+  // Test both positive values.
+  expectOperationValueResult(operator+, 10, 10, 20);
+
+  // Test both positive values with overflow.
+  expectOperationValueResult(operator+, MaxUint64, 1);
+  expectOperationValueResult(operator+, MaxUint64, MaxUint64);
+}
+
+TEST_F(FileCheckTest, ExpressionValueSubtraction) {
+  // Test negative value and value bigger than int64_t max.
+  expectOperationValueResult(operator-, -10, MaxUint64);
+
+  // Test negative and positive value with underflow.
+  expectOperationValueResult(operator-, MinInt64, 1);
+
+  // Test negative and positive value.
+  expectOperationValueResult(operator-, -10, 10, -20);
+
+  // Test both negative values.
+  expectOperationValueResult(operator-, -10, -10, 0);
+  expectOperationValueResult(operator-, -11, -10, -1);
+  expectOperationValueResult(operator-, -10, -11, 1);
+
+  // Test positive and negative values.
+  expectOperationValueResult(operator-, 10, -10, 20);
+
+  // Test both positive values with result positive.
+  expectOperationValueResult(operator-, 10, 5, 5);
+
+  // Test both positive values with underflow.
+  expectOperationValueResult(operator-, 0, MaxUint64);
+  expectOperationValueResult(operator-, 0,
+                             static_cast<uint64_t>(-(MinInt64 + 10)) + 11);
+
+  // Test both positive values with result < -(max int64_t)
+  expectOperationValueResult(operator-, 10,
+                             static_cast<uint64_t>(MaxInt64) + 11,
+                             -MaxInt64 - 1);
+
+  // Test both positive values with 0 > result > -(max int64_t)
+  expectOperationValueResult(operator-, 10, 11, -1);
+}
+
 TEST_F(FileCheckTest, Literal) {
   SourceMgr SM;
 
   // Eval returns the literal's value.
-  ExpressionLiteral Ten(bufferize(SM, "10"), 10);
-  Expected<uint64_t> Value = Ten.eval();
+  ExpressionLiteral Ten(bufferize(SM, "10"), 10u);
+  Expected<ExpressionValue> Value = Ten.eval();
   ASSERT_THAT_EXPECTED(Value, Succeeded());
-  EXPECT_EQ(10U, *Value);
+  EXPECT_EQ(10, cantFail(Value->getSignedValue()));
   Expected<ExpressionFormat> ImplicitFormat = Ten.getImplicitFormat(SM);
   ASSERT_THAT_EXPECTED(ImplicitFormat, Succeeded());
   EXPECT_EQ(*ImplicitFormat, ExpressionFormat::Kind::NoFormat);
 
+  // Min value can be correctly represented.
+  ExpressionLiteral Min(bufferize(SM, std::to_string(MinInt64)), MinInt64);
+  Value = Min.eval();
+  ASSERT_THAT_EXPECTED(Value, Succeeded());
+  EXPECT_EQ(MinInt64, cantFail(Value->getSignedValue()));
+
   // Max value can be correctly represented.
-  uint64_t MaxUint64 = std::numeric_limits<uint64_t>::max();
   ExpressionLiteral Max(bufferize(SM, std::to_string(MaxUint64)), MaxUint64);
   Value = Max.eval();
   ASSERT_THAT_EXPECTED(Value, Succeeded());
-  EXPECT_EQ(std::numeric_limits<uint64_t>::max(), *Value);
+  EXPECT_EQ(MaxUint64, cantFail(Value->getUnsignedValue()));
 }
 
 TEST_F(FileCheckTest, Expression) {
   SourceMgr SM;
 
   std::unique_ptr<ExpressionLiteral> Ten =
-      std::make_unique<ExpressionLiteral>(bufferize(SM, "10"), 10);
+      std::make_unique<ExpressionLiteral>(bufferize(SM, "10"), 10u);
   ExpressionLiteral *TenPtr = Ten.get();
   Expression Expr(std::move(Ten),
                   ExpressionFormat(ExpressionFormat::Kind::HexLower));
@@ -283,8 +528,6 @@ expectUndefErrors(std::unordered_set<std::string> ExpectedUndefVarNames,
   EXPECT_TRUE(ExpectedUndefVarNames.empty()) << toString(ExpectedUndefVarNames);
 }
 
-uint64_t doAdd(uint64_t OpL, uint64_t OpR) { return OpL + OpR; }
-
 TEST_F(FileCheckTest, NumericVariable) {
   SourceMgr SM;
 
@@ -299,18 +542,18 @@ TEST_F(FileCheckTest, NumericVariable) {
   ASSERT_THAT_EXPECTED(ImplicitFormat, Succeeded());
   EXPECT_EQ(*ImplicitFormat, ExpressionFormat::Kind::Unsigned);
   EXPECT_FALSE(FooVar.getValue());
-  Expected<uint64_t> EvalResult = FooVarUse.eval();
+  Expected<ExpressionValue> EvalResult = FooVarUse.eval();
   expectUndefErrors({"FOO"}, EvalResult.takeError());
 
-  FooVar.setValue(42);
+  FooVar.setValue(ExpressionValue(42u));
 
   // Defined variable: getValue and eval return value set.
-  Optional<uint64_t> Value = FooVar.getValue();
+  Optional<ExpressionValue> Value = FooVar.getValue();
   ASSERT_TRUE(Value);
-  EXPECT_EQ(42U, *Value);
+  EXPECT_EQ(42, cantFail(Value->getSignedValue()));
   EvalResult = FooVarUse.eval();
   ASSERT_THAT_EXPECTED(EvalResult, Succeeded());
-  EXPECT_EQ(42U, *EvalResult);
+  EXPECT_EQ(42, cantFail(EvalResult->getSignedValue()));
 
   // Clearing variable: getValue and eval fail. Error returned by eval holds
   // the name of the cleared variable.
@@ -327,23 +570,24 @@ TEST_F(FileCheckTest, Binop) {
   StringRef FooStr = ExprStr.take_front(3);
   NumericVariable FooVar(FooStr,
                          ExpressionFormat(ExpressionFormat::Kind::Unsigned), 1);
-  FooVar.setValue(42);
+  FooVar.setValue(ExpressionValue(42u));
   std::unique_ptr<NumericVariableUse> FooVarUse =
       std::make_unique<NumericVariableUse>(FooStr, &FooVar);
   StringRef BarStr = ExprStr.take_back(3);
   NumericVariable BarVar(BarStr,
                          ExpressionFormat(ExpressionFormat::Kind::Unsigned), 2);
-  BarVar.setValue(18);
+  BarVar.setValue(ExpressionValue(18u));
   std::unique_ptr<NumericVariableUse> BarVarUse =
       std::make_unique<NumericVariableUse>(BarStr, &BarVar);
+  binop_eval_t doAdd = operator+;
   BinaryOperation Binop(ExprStr, doAdd, std::move(FooVarUse),
                         std::move(BarVarUse));
 
   // Defined variables: eval returns right value; implicit format is as
   // expected.
-  Expected<uint64_t> Value = Binop.eval();
+  Expected<ExpressionValue> Value = Binop.eval();
   ASSERT_THAT_EXPECTED(Value, Succeeded());
-  EXPECT_EQ(60U, *Value);
+  EXPECT_EQ(60, cantFail(Value->getSignedValue()));
   Expected<ExpressionFormat> ImplicitFormat = Binop.getImplicitFormat(SM);
   ASSERT_THAT_EXPECTED(ImplicitFormat, Succeeded());
   EXPECT_EQ(*ImplicitFormat, ExpressionFormat::Kind::Unsigned);
@@ -366,7 +610,7 @@ TEST_F(FileCheckTest, Binop) {
   StringRef EighteenStr = ExprStr.take_back(2);
   FooVarUse = std::make_unique<NumericVariableUse>(FooStr, &FooVar);
   std::unique_ptr<ExpressionLiteral> Eighteen =
-      std::make_unique<ExpressionLiteral>(EighteenStr, 18);
+      std::make_unique<ExpressionLiteral>(EighteenStr, 18u);
   Binop = BinaryOperation(ExprStr, doAdd, std::move(FooVarUse),
                           std::move(Eighteen));
   ImplicitFormat = Binop.getImplicitFormat(SM);
@@ -376,7 +620,7 @@ TEST_F(FileCheckTest, Binop) {
   FooStr = ExprStr.take_back(3);
   EighteenStr = ExprStr.take_front(2);
   FooVarUse = std::make_unique<NumericVariableUse>(FooStr, &FooVar);
-  Eighteen = std::make_unique<ExpressionLiteral>(EighteenStr, 18);
+  Eighteen = std::make_unique<ExpressionLiteral>(EighteenStr, 18u);
   Binop = BinaryOperation(ExprStr, doAdd, std::move(Eighteen),
                           std::move(FooVarUse));
   ImplicitFormat = Binop.getImplicitFormat(SM);
@@ -655,6 +899,13 @@ TEST_F(FileCheckTest, ParseNumericSubstitutionBlock) {
 
   // Valid single operand expression.
   EXPECT_THAT_EXPECTED(Tester.parseSubst("FOO"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("18"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst(std::to_string(MaxUint64)),
+                       Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("0x12"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("-30"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst(std::to_string(MinInt64)),
+                       Succeeded());
 
   // Invalid format.
   expectDiagnosticError("invalid matching format specification in expression",
@@ -697,6 +948,7 @@ TEST_F(FileCheckTest, ParseNumericSubstitutionBlock) {
 
   // Valid expression with format specifier.
   EXPECT_THAT_EXPECTED(Tester.parseSubst("%u, FOO"), Succeeded());
+  EXPECT_THAT_EXPECTED(Tester.parseSubst("%d, FOO"), Succeeded());
   EXPECT_THAT_EXPECTED(Tester.parseSubst("%x, FOO"), Succeeded());
   EXPECT_THAT_EXPECTED(Tester.parseSubst("%X, FOO"), Succeeded());
 
@@ -804,7 +1056,14 @@ TEST_F(FileCheckTest, ParsePattern) {
 TEST_F(FileCheckTest, Match) {
   PatternTester Tester;
 
+  // Check a substitution error is diagnosed.
+  ASSERT_FALSE(Tester.parsePattern("[[#%u, -1]]"));
+  expectDiagnosticError(
+      "unable to substitute variable or numeric expression: overflow error",
+      Tester.match("").takeError());
+
   // Check matching an empty expression only matches a number.
+  Tester.initNextPattern();
   ASSERT_FALSE(Tester.parsePattern("[[#]]"));
   expectNotFoundError(Tester.match("FAIL").takeError());
   EXPECT_THAT_EXPECTED(Tester.match("18"), Succeeded());
@@ -946,7 +1205,7 @@ TEST_F(FileCheckTest, Substitution) {
   // substituted for the variable's value.
   NumericVariable NVar("N", ExpressionFormat(ExpressionFormat::Kind::Unsigned),
                        1);
-  NVar.setValue(10);
+  NVar.setValue(ExpressionValue(10u));
   auto NVarUse = std::make_unique<NumericVariableUse>("N", &NVar);
   auto ExpressionN = std::make_unique<Expression>(
       std::move(NVarUse), ExpressionFormat(ExpressionFormat::Kind::HexUpper));
@@ -1056,24 +1315,24 @@ TEST_F(FileCheckTest, FileCheckContext) {
   Expected<StringRef> EmptyVar = Cxt.getPatternVarValue(EmptyVarStr);
   Expected<StringRef> UnknownVar = Cxt.getPatternVarValue(UnknownVarStr);
   ASSERT_THAT_EXPECTED(ExpressionPointer, Succeeded());
-  Expected<uint64_t> ExpressionVal = (*ExpressionPointer)->getAST()->eval();
+  Expected<ExpressionValue> ExpressionVal =
+      (*ExpressionPointer)->getAST()->eval();
   ASSERT_THAT_EXPECTED(ExpressionVal, Succeeded());
-  EXPECT_EQ(*ExpressionVal, 18U);
+  EXPECT_EQ(cantFail(ExpressionVal->getSignedValue()), 18);
   ExpressionPointer = P.parseNumericSubstitutionBlock(
       LocalNumVar2Ref, DefinedNumericVariable,
       /*IsLegacyLineExpr=*/false, LineNumber, &Cxt, SM);
   ASSERT_THAT_EXPECTED(ExpressionPointer, Succeeded());
   ExpressionVal = (*ExpressionPointer)->getAST()->eval();
   ASSERT_THAT_EXPECTED(ExpressionVal, Succeeded());
-  EXPECT_EQ(*ExpressionVal, 20U);
-  ExpressionPointer =
-      P.parseNumericSubstitutionBlock(LocalNumVar3Ref, DefinedNumericVariable,
-                                      /*IsLegacyLineExpr=*/false,
-                                      LineNumber, &Cxt, SM);
+  EXPECT_EQ(cantFail(ExpressionVal->getSignedValue()), 20);
+  ExpressionPointer = P.parseNumericSubstitutionBlock(
+      LocalNumVar3Ref, DefinedNumericVariable,
+      /*IsLegacyLineExpr=*/false, LineNumber, &Cxt, SM);
   ASSERT_THAT_EXPECTED(ExpressionPointer, Succeeded());
   ExpressionVal = (*ExpressionPointer)->getAST()->eval();
   ASSERT_THAT_EXPECTED(ExpressionVal, Succeeded());
-  EXPECT_EQ(*ExpressionVal, 12U);
+  EXPECT_EQ(cantFail(ExpressionVal->getSignedValue()), 12);
   ASSERT_THAT_EXPECTED(EmptyVar, Succeeded());
   EXPECT_EQ(*EmptyVar, "");
   expectUndefErrors({std::string(UnknownVarStr)}, UnknownVar.takeError());
@@ -1123,7 +1382,7 @@ TEST_F(FileCheckTest, FileCheckContext) {
   ASSERT_THAT_EXPECTED(ExpressionPointer, Succeeded());
   ExpressionVal = (*ExpressionPointer)->getAST()->eval();
   ASSERT_THAT_EXPECTED(ExpressionVal, Succeeded());
-  EXPECT_EQ(*ExpressionVal, 36U);
+  EXPECT_EQ(cantFail(ExpressionVal->getSignedValue()), 36);
 
   // Clear local variables and check global variables remain defined.
   Cxt.clearLocalVars();
@@ -1135,6 +1394,6 @@ TEST_F(FileCheckTest, FileCheckContext) {
   ASSERT_THAT_EXPECTED(ExpressionPointer, Succeeded());
   ExpressionVal = (*ExpressionPointer)->getAST()->eval();
   ASSERT_THAT_EXPECTED(ExpressionVal, Succeeded());
-  EXPECT_EQ(*ExpressionVal, 36U);
+  EXPECT_EQ(cantFail(ExpressionVal->getSignedValue()), 36);
 }
 } // namespace

From c010d4d195506aaea76a1cc8afb5a6b5884dba44 Mon Sep 17 00:00:00 2001
From: Victor Campos <victor.campos@arm.com>
Date: Mon, 9 Mar 2020 13:29:37 +0000
Subject: [PATCH 329/770] [ARM] Improve codegen of volatile load/store of i64

Summary:
Instead of generating two i32 instructions for each load or store of a volatile
i64 value (two LDRs or STRs), now emit LDRD/STRD.

These improvements cover architectures implementing ARMv5TE or Thumb-2.

The code generation explicitly avoids using the register-offset
variant of LDRD/STRD. In this variant, the register allocated to the
offset register cannot be reused in any of the remaining operands. Such
a restriction seems to be non-trivial to implement in LLVM, so it is
left as a to-do.

Differential Revision: https://reviews.llvm.org/D70072
---
 llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp  |  18 ++
 llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp       |  82 ++++++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |  62 +++++-
 llvm/lib/Target/ARM/ARMISelLowering.h         |   8 +-
 llvm/lib/Target/ARM/ARMInstrInfo.td           |  22 ++
 llvm/lib/Target/ARM/ARMInstrThumb2.td         |   9 +-
 .../CodeGen/ARM/i64_volatile_load_store.ll    | 191 ++++++++++++++++++
 7 files changed, 386 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/i64_volatile_load_store.ll

diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 2aef2e8610a37..49056d7830289 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2735,6 +2735,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       MI.eraseFromParent();
       return true;
     }
+    case ARM::LOADDUAL:
+    case ARM::STOREDUAL: {
+      Register PairReg = MI.getOperand(0).getReg();
+
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                  TII->get(Opcode == ARM::LOADDUAL ? ARM::LDRD : ARM::STRD))
+              .addReg(TRI->getSubReg(PairReg, ARM::gsub_0),
+                      Opcode == ARM::LOADDUAL ? RegState::Define : 0)
+              .addReg(TRI->getSubReg(PairReg, ARM::gsub_1),
+                      Opcode == ARM::LOADDUAL ? RegState::Define : 0);
+      for (unsigned i = 1; i < MI.getNumOperands(); i++)
+        MIB.add(MI.getOperand(i));
+      MIB.add(predOps(ARMCC::AL));
+      MIB.cloneMemRefs(MI);
+      MI.eraseFromParent();
+      return true;
+    }
   }
 }
 
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index b49152e316721..3c6f446580bbe 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -145,6 +145,8 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
 
   // Thumb 2 Addressing Modes:
   bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  template <unsigned Shift>
+  bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm);
   bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
                             SDValue &OffImm);
   bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
@@ -1312,6 +1314,33 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
   return true;
 }
 
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -255, 256, RHSC)) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+
+      if (N.getOpcode() == ISD::SUB)
+        RHSC = -RHSC;
+      OffImm =
+          CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+  return true;
+}
+
 bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
                                            SDValue &Base, SDValue &OffImm) {
   // Match simple R - imm8 operands.
@@ -3655,6 +3684,59 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     CurDAG->RemoveDeadNode(N);
     return;
   }
+  case ARMISD::LDRD: {
+    if (Subtarget->isThumb2())
+      break; // TableGen handles isel in this case.
+    SDValue Base, RegOffset, ImmOffset;
+    const SDValue &Chain = N->getOperand(0);
+    const SDValue &Addr = N->getOperand(1);
+    SelectAddrMode3(Addr, Base, RegOffset, ImmOffset);
+    if (RegOffset != CurDAG->getRegister(0, MVT::i32)) {
+      // The register-offset variant of LDRD mandates that the register
+      // allocated to RegOffset is not reused in any of the remaining operands.
+      // This restriction is currently not enforced. Therefore emitting this
+      // variant is explicitly avoided.
+      Base = Addr;
+      RegOffset = CurDAG->getRegister(0, MVT::i32);
+    }
+    SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain};
+    SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl,
+                                         {MVT::Untyped, MVT::Other}, Ops);
+    SDValue Lo = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32,
+                                                SDValue(New, 0));
+    SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
+                                                SDValue(New, 0));
+    transferMemOperands(N, New);
+    ReplaceUses(SDValue(N, 0), Lo);
+    ReplaceUses(SDValue(N, 1), Hi);
+    ReplaceUses(SDValue(N, 2), SDValue(New, 1));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+  case ARMISD::STRD: {
+    if (Subtarget->isThumb2())
+      break; // TableGen handles isel in this case.
+    SDValue Base, RegOffset, ImmOffset;
+    const SDValue &Chain = N->getOperand(0);
+    const SDValue &Addr = N->getOperand(3);
+    SelectAddrMode3(Addr, Base, RegOffset, ImmOffset);
+    if (RegOffset != CurDAG->getRegister(0, MVT::i32)) {
+      // The register-offset variant of STRD mandates that the register
+      // allocated to RegOffset is not reused in any of the remaining operands.
+      // This restriction is currently not enforced. Therefore emitting this
+      // variant is explicitly avoided.
+      Base = Addr;
+      RegOffset = CurDAG->getRegister(0, MVT::i32);
+    }
+    SDNode *RegPair =
+        createGPRPairNode(MVT::Untyped, N->getOperand(1), N->getOperand(2));
+    SDValue Ops[] = {SDValue(RegPair, 0), Base, RegOffset, ImmOffset, Chain};
+    SDNode *New = CurDAG->getMachineNode(ARM::STOREDUAL, dl, MVT::Other, Ops);
+    transferMemOperands(N, New);
+    ReplaceUses(SDValue(N, 0), SDValue(New, 0));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
   case ARMISD::LOOP_DEC: {
     SDValue Ops[] = { N->getOperand(1),
                       N->getOperand(2),
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index c5c99610dd3ab..98161c3494445 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1082,6 +1082,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SRA,       MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+  setOperationAction(ISD::LOAD, MVT::i64, Custom);
+  setOperationAction(ISD::STORE, MVT::i64, Custom);
 
   // MVE lowers 64 bit shifts to lsll and lsrl
   // assuming that ISD::SRL and SRA of i64 are already marked custom
@@ -1624,6 +1626,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
 
+  case ARMISD::LDRD:          return "ARMISD::LDRD";
+  case ARMISD::STRD:          return "ARMISD::STRD";
+
   case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
 
@@ -9151,6 +9156,25 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
   return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
 }
 
+void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT MemVT = LD->getMemoryVT();
+  assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
+
+  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+      !Subtarget->isThumb1Only() && LD->isVolatile()) {
+    SDLoc dl(N);
+    SDValue Result = DAG.getMemIntrinsicNode(
+        ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
+        {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
+    SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
+    SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
+    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+    Results.append({Pair, Result.getValue(2)});
+  }
+}
+
 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
   EVT MemVT = ST->getMemoryVT();
@@ -9180,6 +9204,38 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
       ST->getMemOperand());
 }
 
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) {
+  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+  EVT MemVT = ST->getMemoryVT();
+  assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
+
+  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+      !Subtarget->isThumb1Only() && ST->isVolatile()) {
+    SDNode *N = Op.getNode();
+    SDLoc dl(N);
+
+    SDValue Lo = DAG.getNode(
+        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
+                              MVT::i32));
+    SDValue Hi = DAG.getNode(
+        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
+                              MVT::i32));
+
+    return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
+                                   {ST->getChain(), Lo, Hi, ST->getBasePtr()},
+                                   MemVT, ST->getMemOperand());
+  } else if (Subtarget->hasMVEIntegerOps() &&
+             ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
+               MemVT == MVT::v16i1))) {
+    return LowerPredicateStore(Op, DAG);
+  }
+
+  return SDValue();
+}
+
 static bool isZeroVector(SDValue N) {
   return (ISD::isBuildVectorAllZeros(N.getNode()) ||
           (N->getOpcode() == ARMISD::VMOVIMM &&
@@ -9414,7 +9470,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::LOAD:
     return LowerPredicateLoad(Op, DAG);
   case ISD::STORE:
-    return LowerPredicateStore(Op, DAG);
+    return LowerSTORE(Op, DAG, Subtarget);
   case ISD::MLOAD:
     return LowerMLOAD(Op, DAG);
   case ISD::ATOMIC_LOAD:
@@ -9518,7 +9574,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::ABS:
      lowerABS(N, Results, DAG);
      return ;
-
+  case ISD::LOAD:
+    LowerLOAD(N, Results, DAG);
+    break;
   }
   if (Res.getNode())
     Results.push_back(Res);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 8d26b39b42100..4323f00f8dbce 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -305,7 +305,11 @@ class VectorType;
       VST4_UPD,
       VST2LN_UPD,
       VST3LN_UPD,
-      VST4LN_UPD
+      VST4LN_UPD,
+
+      // Load/Store of dual registers
+      LDRD,
+      STRD
     };
 
   } // end namespace ARMISD
@@ -771,6 +775,8 @@ class VectorType;
     SDValue LowerFSETCC(SDValue Op, SelectionDAG &DAG) const;
     void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                   SelectionDAG &DAG) const;
+    void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                   SelectionDAG &DAG) const;
 
     Register getRegisterByName(const char* RegName, LLT VT,
                                const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index bb701b03991d9..6b990a59ed0e9 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -245,6 +245,12 @@ def ARMqsub8b       : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>;
 def ARMqadd16b      : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>;
 def ARMqsub16b      : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>;
 
+def SDT_ARMldrd     : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def ARMldrd         : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDT_ARMstrd     : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def ARMstrd         : SDNode<"ARMISD::STRD", SDT_ARMstrd, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 // Vector operations shared between NEON and MVE
 
 def ARMvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
@@ -2736,6 +2742,14 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
              Requires<[IsARM, HasV5TE]>;
 }
 
+let mayLoad = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in {
+def LOADDUAL : ARMPseudoInst<(outs GPRPairOp:$Rt), (ins addrmode3:$addr),
+                             64, IIC_iLoad_d_r, []>,
+               Requires<[IsARM, HasV5TE]> {
+  let AM = AddrMode3;
+}
+}
+
 def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
                     NoItinerary, "lda", "\t$Rt, $addr", []>;
 def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
@@ -3014,6 +3028,14 @@ let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
   }
 }
 
+let mayStore = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in {
+def STOREDUAL : ARMPseudoInst<(outs), (ins GPRPairOp:$Rt, addrmode3:$addr),
+                              64, IIC_iStore_d_r, []>,
+                Requires<[IsARM, HasV5TE]> {
+  let AM = AddrMode3;
+}
+}
+
 // Indexed stores
 multiclass AI2_stridx<bit isByte, string opc,
                       InstrItinClass iii, InstrItinClass iir> {
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index b4f3901a4603f..e2235b1c25013 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -270,7 +270,8 @@ def t2am_imm8_offset : MemOperand,
 
 // t2addrmode_imm8s4  := reg +/- (imm8 << 2)
 def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
-class T2AddrMode_Imm8s4 : MemOperand {
+class T2AddrMode_Imm8s4 : MemOperand,
+                          ComplexPattern<i32, 2, "SelectT2AddrModeImm8<2>", []> {
   let EncoderMethod = "getT2AddrModeImm8s4OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8s4";
   let ParserMatchClass = MemImm8s4OffsetAsmOperand;
@@ -1448,7 +1449,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
 def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
                         (ins t2addrmode_imm8s4:$addr),
-                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>,
+                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "",
+                        [(set rGPR:$Rt, rGPR:$Rt2, (ARMldrd t2addrmode_imm8s4:$addr))]>,
                  Sched<[WriteLd]>;
 } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
 
@@ -1629,7 +1631,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
 let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
 def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
                        (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
-               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>,
+               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "",
+               [(ARMstrd rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr)]>,
                Sched<[WriteST]>;
 
 // Indexed stores
diff --git a/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll b/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll
new file mode 100644
index 0000000000000..43479b7e541b7
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll
@@ -0,0 +1,191 @@
+; RUN: llc -mtriple=armv5e-arm-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK-ARMV5TE,CHECK
+; RUN: llc -mtriple=thumbv6t2-arm-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK-T2,CHECK
+; RUN: llc -mtriple=armv4t-arm-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK-ARMV4T,CHECK
+
+@x = common dso_local global i64 0, align 8
+@y = common dso_local global i64 0, align 8
+
+define void @test() {
+entry:
+; CHECK-LABEL: test:
+; CHECK-ARMV5TE:      ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-T2:           movw [[ADDR0:r[0-9]+]], :lower16:x
+; CHECK-T2-NEXT:      movw [[ADDR1:r[0-9]+]], :lower16:y
+; CHECK-T2-NEXT:      movt [[ADDR0]], :upper16:x
+; CHECK-T2-NEXT:      movt [[ADDR1]], :upper16:y
+; CHECK-T2-NEXT:      ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-T2-NEXT:      strd [[R0]], [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-ARMV4T:       ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-ARMV4T-NEXT:  ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #4]
+; CHECK-ARMV4T-NEXT:  str [[R0]], {{\[}}[[ADDR1]], #4]
+; CHECK-ARMV4T-NEXT:  str [[R1]], {{\[}}[[ADDR1]]]
+  %0 = load volatile i64, i64* @x, align 8
+  store volatile i64 %0, i64* @y, align 8
+  ret void
+}
+
+define void @test_offset() {
+entry:
+; CHECK-LABEL: test_offset:
+; CHECK-ARMV5TE:      ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #-4]
+; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #-4]
+; CHECK-T2:           movw [[ADDR0:r[0-9]+]], :lower16:x
+; CHECK-T2-NEXT:      movw [[ADDR1:r[0-9]+]], :lower16:y
+; CHECK-T2-NEXT:      movt [[ADDR0]], :upper16:x
+; CHECK-T2-NEXT:      movt [[ADDR1]], :upper16:y
+; CHECK-T2-NEXT:      ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #-4]
+; CHECK-T2-NEXT:      strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #-4]
+; CHECK-ARMV4T:       ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #-4]
+; CHECK-ARMV4T-NEXT:  ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-ARMV4T-NEXT:  str [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-ARMV4T-NEXT:  str [[R0]], {{\[}}[[ADDR1]], #-4]
+  %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 -4) to i64*), align 8
+  store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 -4) to i64*), align 8
+  ret void
+}
+
+define void @test_offset_1() {
+; CHECK-LABEL: test_offset_1:
+; CHECK-ARMV5TE:      ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #255]
+; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #255]
+; CHECK-T2:           adds [[ADDR0:r[0-9]+]], #255
+; CHECK-T2-NEXT:      adds [[ADDR1:r[0-9]+]], #255
+; CHECK-T2-NEXT:      ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-T2-NEXT:      strd [[R0]], [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-ARMV4T:       ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #255]
+; CHECK-ARMV4T-NEXT:  ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #259]
+; CHECK-ARMV4T-NEXT:  str [[R1]], {{\[}}[[ADDR1]], #259]
+; CHECK-ARMV4T-NEXT:  str [[R0]], {{\[}}[[ADDR1]], #255]
+entry:
+  %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 255) to i64*), align 8
+  store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 255) to i64*), align 8
+  ret void
+}
+
+define void @test_offset_2() {
+; CHECK-LABEL: test_offset_2:
+; CHECK-ARMV5TE:      ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: add [[ADDR0]], [[ADDR0]], #256
+; CHECK-ARMV5TE-NEXT: add [[ADDR1]], [[ADDR1]], #256
+; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-T2:           movw [[ADDR0:r[0-9]+]], :lower16:x
+; CHECK-T2-NEXT:      movw [[ADDR1:r[0-9]+]], :lower16:y
+; CHECK-T2-NEXT:      movt [[ADDR0]], :upper16:x
+; CHECK-T2-NEXT:      movt [[ADDR1]], :upper16:y
+; CHECK-T2-NEXT:      ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #256]
+; CHECK-T2-NEXT:      strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #256]
+; CHECK-ARMV4T:       ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #256]
+; CHECK-ARMV4T-NEXT:  ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #260]
+; CHECK-ARMV4T-NEXT:  str [[R1]], {{\[}}[[ADDR1]], #260]
+; CHECK-ARMV4T-NEXT:  str [[R0]], {{\[}}[[ADDR1]], #256]
+entry:
+  %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 256) to i64*), align 8
+  store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 256) to i64*), align 8
+  ret void
+}
+
+define void @test_offset_3() {
+; CHECK-LABEL: test_offset_3:
+; CHECK-ARMV5TE:      ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: add [[ADDR0]], [[ADDR0]], #1020
+; CHECK-ARMV5TE-NEXT: add [[ADDR1]], [[ADDR1]], #1020
+; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-T2:           movw [[ADDR0:r[0-9]+]], :lower16:x
+; CHECK-T2-NEXT:      movw [[ADDR1:r[0-9]+]], :lower16:y
+; CHECK-T2-NEXT:      movt [[ADDR0]], :upper16:x
+; CHECK-T2-NEXT:      movt [[ADDR1]], :upper16:y
+; CHECK-T2-NEXT:      ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1020]
+; CHECK-T2-NEXT:      strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #1020]
+; CHECK-ARMV4T:       ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #1020]
+; CHECK-ARMV4T-NEXT:  ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1024]
+; CHECK-ARMV4T-NEXT:  str [[R1]], {{\[}}[[ADDR1]], #1024]
+; CHECK-ARMV4T-NEXT:  str [[R0]], {{\[}}[[ADDR1]], #1020]
+entry:
+  %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 1020) to i64*), align 8
+  store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 1020) to i64*), align 8
+  ret void
+}
+
+define void @test_offset_4() {
+; CHECK-LABEL: test_offset_4:
+; CHECK-ARMV5TE:      ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV5TE:      ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV5TE-NEXT: add [[ADDR0]], [[ADDR0]], #1024
+; CHECK-ARMV5TE-NEXT: add [[ADDR1]], [[ADDR1]], #1024
+; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-T2:           movw [[ADDR1:r[0-9]+]], :lower16:y
+; CHECK-T2-NEXT:      movw [[ADDR0:r[0-9]+]], :lower16:x
+; CHECK-T2-NEXT:      movt [[ADDR1]], :upper16:y
+; CHECK-T2-NEXT:      movt [[ADDR0]], :upper16:x
+; CHECK-T2-NEXT:      add.w [[ADDR0]], [[ADDR0]], #1024
+; CHECK-T2-NEXT:      add.w [[ADDR1]], [[ADDR1]], #1024
+; CHECK-T2-NEXT:      ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]]
+; CHECK-T2-NEXT:      strd [[R0]], [[R1]], {{\[}}[[ADDR1]]]
+; CHECK-ARMV4T:       ldr [[ADDR0:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[ADDR1:r[0-9]+]]
+; CHECK-ARMV4T-NEXT:  ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #1024]
+; CHECK-ARMV4T-NEXT:  ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1028]
+; CHECK-ARMV4T-NEXT:  str [[R1]], {{\[}}[[ADDR1]], #1028]
+; CHECK-ARMV4T-NEXT:  str [[R0]], {{\[}}[[ADDR1]], #1024]
+entry:
+  %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 1024) to i64*), align 8
+  store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 1024) to i64*), align 8
+  ret void
+}
+
+define i64 @test_stack() {
+; CHECK-LABEL: test_stack:
+; CHECK-ARMV5TE:      sub sp, sp, #80
+; CHECK-ARMV5TE-NEXT: mov [[R0:r[0-9]+]], #0
+; CHECK-ARMV5TE-NEXT: mov [[R1:r[0-9]+]], #1
+; CHECK-ARMV5TE-NEXT: strd [[R1]], [[R0]], [sp, #8]
+; CHECK-ARMV5TE-NEXT: ldrd r0, r1, [sp, #8]
+; CHECK-ARMV5TE-NEXT: add sp, sp, #80
+; CHECK-ARMV5TE-NEXT: bx lr
+; CHECK-T2:      sub sp, #80
+; CHECK-T2-NEXT: movs [[R0:r[0-9]+]], #0
+; CHECK-T2-NEXT: movs [[R1:r[0-9]+]], #1
+; CHECK-T2-NEXT: strd [[R1]], [[R0]], [sp, #8]
+; CHECK-T2-NEXT: ldrd r0, r1, [sp, #8]
+; CHECK-T2-NEXT: add sp, #80
+; CHECK-T2-NEXT: bx lr
+; CHECK-ARMV4T:      sub sp, sp, #80
+; CHECK-ARMV4T-NEXT: mov [[R0:r[0-9]+]], #0
+; CHECK-ARMV4T-NEXT: str [[R0]], [sp, #12]
+; CHECK-ARMV4T-NEXT: mov [[R1:r[0-9]+]], #1
+; CHECK-ARMV4T-NEXT: str [[R1]], [sp, #8]
+; CHECK-ARMV4T-NEXT: ldr r0, [sp, #8]
+; CHECK-ARMV4T-NEXT: ldr r1, [sp, #12]
+; CHECK-ARMV4T-NEXT: add sp, sp, #80
+; CHECK-ARMV4T-NEXT: bx lr
+entry:
+  %a = alloca [10 x i64], align 8
+  %arrayidx = getelementptr inbounds [10 x i64], [10 x i64]* %a, i32 0, i32 1
+  store volatile i64 1, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [10 x i64], [10 x i64]* %a, i32 0, i32 1
+  %0 = load volatile i64, i64* %arrayidx1, align 8
+  ret i64 %0
+}
+

From 8a397b66b2c672999e9e6d63334d5bffd7db1a3f Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes@arm.com>
Date: Thu, 28 May 2020 09:37:55 +0000
Subject: [PATCH 330/770] [AArch64][SVE] Add support for spilling/filling
 ZPR2/3/4

Summary:
This patch enables the register allocator to spill/fill lists of 2, 3
and 4 SVE vector registers to/from the stack. This is implemented with
pseudo instructions that get expanded to individual LDR_ZXI/STR_ZXI
instructions in AArch64ExpandPseudoInsts.

Patch by Sander de Smalen.

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D75988
---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |  37 ++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  85 ++++++++++---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  14 ++
 llvm/test/CodeGen/AArch64/spillfill-sve.mir   | 120 ++++++++++++++++++
 4 files changed, 236 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 381bf86c7d62b..b9034862c2707 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -80,6 +80,9 @@ class AArch64ExpandPseudo : public MachineFunctionPass {
   bool expandSetTagLoop(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI,
                         MachineBasicBlock::iterator &NextMBBI);
+  bool expandSVESpillFill(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, unsigned Opc,
+                          unsigned N);
 };
 
 } // end anonymous namespace
@@ -595,6 +598,28 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
   return true;
 }
 
+bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MBBI,
+                                             unsigned Opc, unsigned N) {
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  MachineInstr &MI = *MBBI;
+  for (unsigned Offset = 0; Offset < N; ++Offset) {
+    int ImmOffset = MI.getOperand(2).getImm() + Offset;
+    bool Kill = (Offset + 1 == N) ? MI.getOperand(1).isKill() : false;
+    assert(ImmOffset >= -256 && ImmOffset < 256 &&
+           "Immediate spill offset out of range");
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+        .addReg(
+            TRI->getSubReg(MI.getOperand(0).getReg(), AArch64::zsub0 + Offset),
+            Opc == AArch64::LDR_ZXI ? RegState::Define : 0)
+        .addReg(MI.getOperand(1).getReg(), getKillRegState(Kill))
+        .addImm(ImmOffset);
+  }
+  MI.eraseFromParent();
+  return true;
+}
+
 /// If MBBI references a pseudo instruction that should be expanded here,
 /// do the expansion and return true.  Otherwise return false.
 bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -970,6 +995,18 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
      report_fatal_error(
          "Non-writeback variants of STGloop / STZGloop should not "
          "survive past PrologEpilogInserter.");
+   case AArch64::STR_ZZZZXI:
+     return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
+   case AArch64::STR_ZZZXI:
+     return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
+   case AArch64::STR_ZZXI:
+     return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
+   case AArch64::LDR_ZZZZXI:
+     return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
+   case AArch64::LDR_ZZZXI:
+     return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
+   case AArch64::LDR_ZZXI:
+     return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
   }
   return false;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 564fd33ca596a..fd07c32e5496f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2278,6 +2278,27 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
     MinOffset = -256;
     MaxOffset = 255;
     break;
+  case AArch64::STR_ZZZZXI:
+  case AArch64::LDR_ZZZZXI:
+    Scale = TypeSize::Scalable(16);
+    Width = SVEMaxBytesPerVector * 4;
+    MinOffset = -256;
+    MaxOffset = 252;
+    break;
+  case AArch64::STR_ZZZXI:
+  case AArch64::LDR_ZZZXI:
+    Scale = TypeSize::Scalable(16);
+    Width = SVEMaxBytesPerVector * 3;
+    MinOffset = -256;
+    MaxOffset = 253;
+    break;
+  case AArch64::STR_ZZXI:
+  case AArch64::LDR_ZZXI:
+    Scale = TypeSize::Scalable(16);
+    Width = SVEMaxBytesPerVector * 2;
+    MinOffset = -256;
+    MaxOffset = 254;
+    break;
   case AArch64::LDR_PXI:
   case AArch64::STR_PXI:
     Scale = TypeSize::Scalable(2);
@@ -2984,6 +3005,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
   unsigned Opc = 0;
   bool Offset = true;
+  unsigned StackID = TargetStackID::Default;
   switch (TRI->getSpillSize(*RC)) {
   case 1:
     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
@@ -2992,6 +3014,11 @@ void AArch64InstrInfo::storeRegToStackSlot(
   case 2:
     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
       Opc = AArch64::STRHui;
+    else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+      Opc = AArch64::STR_PXI;
+      StackID = TargetStackID::SVEVector;
+    }
     break;
   case 4:
     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
@@ -3031,6 +3058,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
                               get(AArch64::STPXi), SrcReg, isKill,
                               AArch64::sube64, AArch64::subo64, FI, MMO);
       return;
+    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+      Opc = AArch64::STR_ZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   case 24:
@@ -3049,6 +3080,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Twov2d;
       Offset = false;
+    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+      Opc = AArch64::STR_ZZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   case 48:
@@ -3056,6 +3091,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Threev2d;
       Offset = false;
+    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+      Opc = AArch64::STR_ZZZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   case 64:
@@ -3063,19 +3102,13 @@ void AArch64InstrInfo::storeRegToStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Fourv2d;
       Offset = false;
+    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+      Opc = AArch64::STR_ZZZZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   }
-  unsigned StackID = TargetStackID::Default;
-  if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
-    assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
-    Opc = AArch64::STR_PXI;
-    StackID = TargetStackID::SVEVector;
-  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
-    assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
-    Opc = AArch64::STR_ZXI;
-    StackID = TargetStackID::SVEVector;
-  }
   assert(Opc && "Unknown register class");
   MFI.setStackID(FI, StackID);
 
@@ -3126,6 +3159,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
 
   unsigned Opc = 0;
   bool Offset = true;
+  unsigned StackID = TargetStackID::Default;
   switch (TRI->getSpillSize(*RC)) {
   case 1:
     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
@@ -3134,6 +3168,11 @@ void AArch64InstrInfo::loadRegFromStackSlot(
   case 2:
     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
       Opc = AArch64::LDRHui;
+    else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+      Opc = AArch64::LDR_PXI;
+      StackID = TargetStackID::SVEVector;
+    }
     break;
   case 4:
     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
@@ -3173,6 +3212,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
                                AArch64::subo64, FI, MMO);
       return;
+    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+      Opc = AArch64::LDR_ZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   case 24:
@@ -3191,6 +3234,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Twov2d;
       Offset = false;
+    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+      Opc = AArch64::LDR_ZZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   case 48:
@@ -3198,6 +3245,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Threev2d;
       Offset = false;
+    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+      Opc = AArch64::LDR_ZZZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   case 64:
@@ -3205,20 +3256,14 @@ void AArch64InstrInfo::loadRegFromStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Fourv2d;
       Offset = false;
+    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+      Opc = AArch64::LDR_ZZZZXI;
+      StackID = TargetStackID::SVEVector;
     }
     break;
   }
 
-  unsigned StackID = TargetStackID::Default;
-  if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
-    assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
-    Opc = AArch64::LDR_PXI;
-    StackID = TargetStackID::SVEVector;
-  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
-    assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
-    Opc = AArch64::LDR_ZXI;
-    StackID = TargetStackID::SVEVector;
-  }
   assert(Opc && "Unknown register class");
   MFI.setStackID(FI, StackID);
 
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index df82680b1f6db..54a764337324c 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1334,6 +1334,20 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
   def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
                   (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
 
+  // Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4.
+  // These get expanded to individual LDR_ZXI/STR_ZXI instructions in
+  // AArch64ExpandPseudoInsts.
+  let mayLoad = 1, hasSideEffects = 0 in {
+    def LDR_ZZXI   : Pseudo<(outs   ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZZXI  : Pseudo<(outs  ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+  }
+  let mayStore = 1, hasSideEffects = 0 in {
+    def STR_ZZXI   : Pseudo<(outs), (ins   ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZZXI  : Pseudo<(outs), (ins  ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+  }
+
   def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
             (PTEST_PP PPR:$pg, PPR:$src)>;
   def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 21bdb45965bd9..982d232f12f49 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -8,6 +8,9 @@
 
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr() #0 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable }
 
   attributes #0 = { nounwind "target-features"="+sve" }
 
@@ -90,3 +93,120 @@ body:             |
     $z0 = COPY %0
     RET_ReallyLR
 ...
+---
+name: spills_fills_stack_id_zpr2
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: zpr2 }
+stack:
+liveins:
+  - { reg: '$z0_z1', virtual-reg: '%0' }
+body:             |
+  bb.0.entry:
+    liveins: $z0_z1
+
+    ; CHECK-LABEL: name: spills_fills_stack_id_zpr2
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 16
+    ; CHECK-NEXT:     stack-id: sve-vec
+
+    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2
+    ; EXPAND: STR_ZXI $z0, $sp, 0
+    ; EXPAND: STR_ZXI $z1, $sp, 1
+    ; EXPAND: $z0 = LDR_ZXI $sp, 0
+    ; EXPAND: $z1 = LDR_ZXI $sp, 1
+
+    %0:zpr2 = COPY $z0_z1
+
+    $z0_z1_z2_z3     = IMPLICIT_DEF
+    $z4_z5_z6_z7     = IMPLICIT_DEF
+    $z8_z9_z10_z11   = IMPLICIT_DEF
+    $z12_z13_z14_z15 = IMPLICIT_DEF
+    $z16_z17_z18_z19 = IMPLICIT_DEF
+    $z20_z21_z22_z23 = IMPLICIT_DEF
+    $z24_z25_z26_z27 = IMPLICIT_DEF
+    $z28_z29_z30_z31 = IMPLICIT_DEF
+
+    $z0_z1 = COPY %0
+    RET_ReallyLR
+...
+---
+name: spills_fills_stack_id_zpr3
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: zpr3 }
+stack:
+liveins:
+  - { reg: '$z0_z1_z2', virtual-reg: '%0' }
+body:             |
+  bb.0.entry:
+    liveins: $z0_z1_z2
+
+    ; CHECK-LABEL: name: spills_fills_stack_id_zpr3
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 48, alignment: 16
+    ; CHECK-NEXT:     stack-id: sve-vec
+
+    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr3
+    ; EXPAND: STR_ZXI $z0, $sp, 0
+    ; EXPAND: STR_ZXI $z1, $sp, 1
+    ; EXPAND: STR_ZXI $z2, $sp, 2
+    ; EXPAND: $z0 = LDR_ZXI $sp, 0
+    ; EXPAND: $z1 = LDR_ZXI $sp, 1
+    ; EXPAND: $z2 = LDR_ZXI $sp, 2
+
+    %0:zpr3 = COPY $z0_z1_z2
+
+    $z0_z1_z2_z3     = IMPLICIT_DEF
+    $z4_z5_z6_z7     = IMPLICIT_DEF
+    $z8_z9_z10_z11   = IMPLICIT_DEF
+    $z12_z13_z14_z15 = IMPLICIT_DEF
+    $z16_z17_z18_z19 = IMPLICIT_DEF
+    $z20_z21_z22_z23 = IMPLICIT_DEF
+    $z24_z25_z26_z27 = IMPLICIT_DEF
+    $z28_z29_z30_z31 = IMPLICIT_DEF
+
+    $z0_z1_z2 = COPY %0
+    RET_ReallyLR
+...
+---
+name: spills_fills_stack_id_zpr4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: zpr4 }
+stack:
+liveins:
+  - { reg: '$z0_z1_z2_z3', virtual-reg: '%0' }
+body:             |
+  bb.0.entry:
+    liveins: $z0_z1_z2_z3
+
+    ; CHECK-LABEL: name: spills_fills_stack_id_zpr4
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, alignment: 16
+    ; CHECK-NEXT:     stack-id: sve-vec
+
+    ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4
+    ; EXPAND: STR_ZXI $z0, $sp, 0
+    ; EXPAND: STR_ZXI $z1, $sp, 1
+    ; EXPAND: STR_ZXI $z2, $sp, 2
+    ; EXPAND: STR_ZXI $z3, $sp, 3
+    ; EXPAND: $z0 = LDR_ZXI $sp, 0
+    ; EXPAND: $z1 = LDR_ZXI $sp, 1
+    ; EXPAND: $z2 = LDR_ZXI $sp, 2
+    ; EXPAND: $z3 = LDR_ZXI $sp, 3
+
+    %0:zpr4 = COPY $z0_z1_z2_z3
+
+    $z0_z1_z2_z3     = IMPLICIT_DEF
+    $z4_z5_z6_z7     = IMPLICIT_DEF
+    $z8_z9_z10_z11   = IMPLICIT_DEF
+    $z12_z13_z14_z15 = IMPLICIT_DEF
+    $z16_z17_z18_z19 = IMPLICIT_DEF
+    $z20_z21_z22_z23 = IMPLICIT_DEF
+    $z24_z25_z26_z27 = IMPLICIT_DEF
+    $z28_z29_z30_z31 = IMPLICIT_DEF
+
+    $z0_z1_z2_z3 = COPY %0
+    RET_ReallyLR
+...

From a0d847c6cdcbe167213d91313577c57073d5c013 Mon Sep 17 00:00:00 2001
From: Alok Kumar Sharma <AlokKumar.Sharma@amd.com>
Date: Thu, 28 May 2020 15:12:28 +0530
Subject: [PATCH 331/770] Fixed bot failure after d20bf5a7258d4b6a7

There were some bot failures due to the unused function `rotateSign`
left in the code.

http://lab.llvm.org:8011/builders/clang-ppc64le-rhel/builds/3731

error: unused function 'rotateSign' [-Werror,-Wunused-function]
static uint64_t rotateSign(int64_t I)
---
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 9da1437b56b2e..a46339a4ec1fd 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1519,11 +1519,6 @@ void ModuleBitcodeWriter::writeGenericDINode(const GenericDINode *N,
   Record.clear();
 }
 
-static uint64_t rotateSign(int64_t I) {
-  uint64_t U = I;
-  return I < 0 ? ~(U << 1) : U << 1;
-}
-
 void ModuleBitcodeWriter::writeDISubrange(const DISubrange *N,
                                           SmallVectorImpl<uint64_t> &Record,
                                           unsigned Abbrev) {

From ab95ac013234189ad797f36d95c96b2d0999a653 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 28 May 2020 10:43:47 +0100
Subject: [PATCH 332/770] [AArch64] Precommit new fp extraction/insertion test.

---
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 0d4d2c7460071..7820734e366d0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -207,6 +207,17 @@ define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
   ret <2 x double> %tmp4
 }
 
+define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1) {
+; CHECK-LABEL: ins1f2_args_flipped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill:  def $d1 killed $d1 def $q1
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %tmp3 = extractelement <1 x double> %tmp1, i32 0
+  %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
+  ret <2 x double> %tmp4
+}
+
 define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
 ; CHECK-LABEL: ins16b8:
 ; CHECK:       // %bb.0:

From d283fc4f9d07a5f3334fe682ccabfc16e8d2933b Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Tue, 26 May 2020 15:43:52 +0200
Subject: [PATCH 333/770] [DebugInfo] Use SplitTemplateClosers (foo<bar<baz> >)
 in DWARF too

Summary:
D76801 caused some regressions in debuginfo compatibility by changing how
certain functions were named.

For CodeView we try to mirror MSVC exactly: this was fixed in a549c0d00486
For DWARF the situation is murkier. Per David Blaikie:
> In general DWARF doesn't specify this at all.
> [...]
> This isn't the only naming divergence between GCC and Clang

Nevertheless, including the space seems to provide better compatibility with
GCC and GDB. E.g. cpexprs.cc in the GDB testsuite requires this formatting.
And there was no particular desire to change the printing of names in debug
info in the first place (just in diagnostics and other more user-facing text).

Fixes PR46052

Reviewers: dblaikie, labath

Subscribers: aprantl, cfe-commits, dyung

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80554
---
 clang/lib/CodeGen/CGDebugInfo.cpp                           | 4 ++++
 .../debug-info-template-explicit-specialization.cpp         | 2 +-
 clang/test/Modules/ExtDebugInfo.cpp                         | 6 +++---
 clang/test/Modules/ModuleDebugInfo.cpp                      | 6 +++---
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 4e0b6aa0dca67..31f8df2430176 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -236,6 +236,10 @@ PrintingPolicy CGDebugInfo::getPrintingPolicy() const {
   if (CGM.getCodeGenOpts().EmitCodeView) {
     PP.MSVCFormatting = true;
     PP.SplitTemplateClosers = true;
+  } else {
+    // For DWARF, printing rules are underspecified.
+    // SplitTemplateClosers yields better interop with GCC and GDB (PR46052).
+    PP.SplitTemplateClosers = true;
   }
 
   // Apply -fdebug-prefix-map.
diff --git a/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp b/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp
index d97d82b769657..4e41c4092bf4e 100644
--- a/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp
+++ b/clang/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp
@@ -110,7 +110,7 @@ struct j_wrap {
 };
 j_wrap<j<int>> j_wrap_j;
 // CHECK: DICompositeType(tag: DW_TAG_structure_type, name: "j<int, int>"
-// CHECK: DICompositeType(tag: DW_TAG_structure_type, name: "j_wrap<j<int, int>>"
+// CHECK: DICompositeType(tag: DW_TAG_structure_type, name: "j_wrap<j<int, int> >"
 
 template <typename T>
 struct k {
diff --git a/clang/test/Modules/ExtDebugInfo.cpp b/clang/test/Modules/ExtDebugInfo.cpp
index 6781810d592cc..aff2953b4bb51 100644
--- a/clang/test/Modules/ExtDebugInfo.cpp
+++ b/clang/test/Modules/ExtDebugInfo.cpp
@@ -85,14 +85,14 @@ void foo() {
 
 // This type is not anchored in the module by an explicit template instantiation.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
-// CHECK-SAME:             name: "Template<long, DebugCXX::traits<long>>",
+// CHECK-SAME:             name: "Template<long, DebugCXX::traits<long> >",
 // CHECK-SAME:             scope: ![[NS]],
 // CHECK-SAME:             elements:
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIlNS_6traitsIlEEEE")
 
 // This type is anchored in the module by an explicit template instantiation.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
-// CHECK-SAME:             name: "Template<int, DebugCXX::traits<int>>",
+// CHECK-SAME:             name: "Template<int, DebugCXX::traits<int> >",
 // CHECK-SAME:             scope: ![[NS]],
 // CHECK-SAME:             flags: DIFlagFwdDecl
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIiNS_6traitsIiEEEE")
@@ -103,7 +103,7 @@ void foo() {
 
 // This one isn't.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
-// CHECK-SAME:             name: "Template<float, DebugCXX::traits<float>>",
+// CHECK-SAME:             name: "Template<float, DebugCXX::traits<float> >",
 // CHECK-SAME:             scope: ![[NS]],
 // CHECK-SAME:             elements:
 // CHECK-SAME:             templateParams:
diff --git a/clang/test/Modules/ModuleDebugInfo.cpp b/clang/test/Modules/ModuleDebugInfo.cpp
index 26369c8960581..e6e99ed4e5379 100644
--- a/clang/test/Modules/ModuleDebugInfo.cpp
+++ b/clang/test/Modules/ModuleDebugInfo.cpp
@@ -65,7 +65,7 @@
 
 // This type is anchored by an explicit template instantiation.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
-// CHECK-SAME:             name: "Template<int, DebugCXX::traits<int>>"
+// CHECK-SAME:             name: "Template<int, DebugCXX::traits<int> >"
 // CHECK-SAME:             elements:
 // CHECK-SAME:             templateParams:
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIiNS_6traitsIiEEEE")
@@ -80,7 +80,7 @@
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX6traitsIfEE")
 
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
-// CHECK-SAME:             name: "Template<long, DebugCXX::traits<long>>"
+// CHECK-SAME:             name: "Template<long, DebugCXX::traits<long> >"
 // CHECK-SAME:             elements:
 // CHECK-SAME:             templateParams:
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIlNS_6traitsIlEEEE")
@@ -89,7 +89,7 @@
 // no mangled name here yet.
 
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
-// CHECK-SAME:             name: "Template<float, DebugCXX::traits<float>>"
+// CHECK-SAME:             name: "Template<float, DebugCXX::traits<float> >"
 // CHECK-SAME:             flags: DIFlagFwdDecl
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIfNS_6traitsIfEEEE")
 

From ad07d5f39425d4b7013346f4eb52a1e99e6c19a8 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Thu, 14 May 2020 14:15:40 +0300
Subject: [PATCH 334/770] [yaml2obj] - Implement the "SectionHeaderTable" tag.

With the "SectionHeaderTable" it is now possible to reorder
entries in the section header table.

It also makes it possible to stop emitting the table.

Differential revision: https://reviews.llvm.org/D80002
---
 llvm/include/llvm/ObjectYAML/ELFYAML.h        |  18 ++
 llvm/lib/ObjectYAML/ELFEmitter.cpp            |  69 ++++++-
 llvm/lib/ObjectYAML/ELFYAML.cpp               |  11 ++
 .../tools/yaml2obj/ELF/section-headers.yaml   | 184 ++++++++++++++++++
 4 files changed, 275 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/tools/yaml2obj/ELF/section-headers.yaml

diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index 22ed82289ca8c..5d3384925631f 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -86,6 +86,14 @@ struct FileHeader {
   Optional<llvm::yaml::Hex16> SHStrNdx;
 };
 
+struct SectionHeader {
+  StringRef Name;
+};
+
+struct SectionHeaderTable {
+  std::vector<SectionHeader> Sections;
+};
+
 struct SectionName {
   StringRef Section;
 };
@@ -508,6 +516,7 @@ struct ProgramHeader {
 
 struct Object {
   FileHeader Header;
+  Optional<SectionHeaderTable> SectionHeaders;
   std::vector<ProgramHeader> ProgramHeaders;
 
   // An object might contain output section descriptions as well as
@@ -539,6 +548,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::LinkerOption)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::CallGraphEntry)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::NoteEntry)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::ProgramHeader)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::SectionHeader)
 LLVM_YAML_IS_SEQUENCE_VECTOR(std::unique_ptr<llvm::ELFYAML::Chunk>)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::Symbol)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::VerdefEntry)
@@ -670,6 +680,14 @@ struct MappingTraits<ELFYAML::FileHeader> {
   static void mapping(IO &IO, ELFYAML::FileHeader &FileHdr);
 };
 
+template <> struct MappingTraits<ELFYAML::SectionHeaderTable> {
+  static void mapping(IO &IO, ELFYAML::SectionHeaderTable &SecHdrTable);
+};
+
+template <> struct MappingTraits<ELFYAML::SectionHeader> {
+  static void mapping(IO &IO, ELFYAML::SectionHeader &SHdr);
+};
+
 template <> struct MappingTraits<ELFYAML::ProgramHeader> {
   static void mapping(IO &IO, ELFYAML::ProgramHeader &FileHdr);
 };
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index 78093491704be..2b7bad674fa46 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -218,6 +218,8 @@ template <class ELFT> class ELFState {
 
   void assignSectionAddress(Elf_Shdr &SHeader, ELFYAML::Section *YAMLSec);
 
+  DenseMap<StringRef, size_t> buildSectionHeaderReorderMap();
+
   BumpPtrAllocator StringAlloc;
   uint64_t alignToOffset(ContiguousBlobAccumulator &CBA, uint64_t Align,
                          llvm::Optional<llvm::yaml::Hex64> Offset);
@@ -318,12 +320,29 @@ void ELFState<ELFT>::writeELFHeader(ContiguousBlobAccumulator &CBA, raw_ostream
   // other sections to the end of the file.
   uint64_t SHOff =
       alignToOffset(CBA, sizeof(typename ELFT::uint), /*Offset=*/None);
-  Header.e_shoff =
-      Doc.Header.SHOff ? typename ELFT::uint(*Doc.Header.SHOff) : SHOff;
-  Header.e_shnum =
-      Doc.Header.SHNum ? (uint16_t)*Doc.Header.SHNum : Doc.getSections().size();
-  Header.e_shstrndx = Doc.Header.SHStrNdx ? (uint16_t)*Doc.Header.SHStrNdx
-                                          : SN2I.get(".shstrtab");
+
+  if (Doc.Header.SHOff)
+    Header.e_shoff = *Doc.Header.SHOff;
+  else if (Doc.SectionHeaders && Doc.SectionHeaders->Sections.empty())
+    Header.e_shoff = 0;
+  else
+    Header.e_shoff = SHOff;
+
+  if (Doc.Header.SHNum)
+    Header.e_shnum = *Doc.Header.SHNum;
+  else if (!Doc.SectionHeaders)
+    Header.e_shnum = Doc.getSections().size();
+  else if (Doc.SectionHeaders->Sections.empty())
+    Header.e_shnum = 0;
+  else
+    Header.e_shnum = Doc.SectionHeaders->Sections.size() + /*Null section*/ 1;
+
+  if (Doc.Header.SHStrNdx)
+    Header.e_shstrndx = *Doc.Header.SHStrNdx;
+  else if (!Doc.SectionHeaders || !Doc.SectionHeaders->Sections.empty())
+    Header.e_shstrndx = SN2I.get(".shstrtab");
+  else
+    Header.e_shstrndx = 0;
 
   OS.write((const char *)&Header, sizeof(Header));
 }
@@ -1447,14 +1466,50 @@ void ELFState<ELFT>::writeFill(ELFYAML::Fill &Fill,
   Fill.Pattern->writeAsBinary(OS, Fill.Size - Written);
 }
 
+template <class ELFT>
+DenseMap<StringRef, size_t> ELFState<ELFT>::buildSectionHeaderReorderMap() {
+  if (!Doc.SectionHeaders || Doc.SectionHeaders->Sections.empty())
+    return DenseMap<StringRef, size_t>();
+
+  DenseMap<StringRef, size_t> Ret;
+  size_t SecNdx = 0;
+  StringSet<> Seen;
+  for (const ELFYAML::SectionHeader &Hdr : Doc.SectionHeaders->Sections) {
+    if (!Ret.try_emplace(Hdr.Name, ++SecNdx).second)
+      reportError("repeated section name: '" + Hdr.Name +
+                  "' in the section header description");
+    Seen.insert(Hdr.Name);
+  }
+
+  for (const ELFYAML::Section *S : Doc.getSections()) {
+    // Ignore special first SHT_NULL section.
+    if (S == Doc.getSections().front())
+      continue;
+    if (!Seen.count(S->Name))
+      reportError("section '" + S->Name +
+                  "' should be present in the 'Sections' list");
+    Seen.erase(S->Name);
+  }
+
+  for (const auto &It : Seen)
+    reportError("section header contains undefined section '" + It.getKey() +
+                "'");
+  return Ret;
+}
+
 template <class ELFT> void ELFState<ELFT>::buildSectionIndex() {
+  // A YAML description can have an explicit section header declaration that
+  // allows changing the order of section headers.
+  DenseMap<StringRef, size_t> ReorderMap = buildSectionHeaderReorderMap();
+
   size_t SecNdx = -1;
   for (const std::unique_ptr<ELFYAML::Chunk> &C : Doc.Chunks) {
     if (!isa<ELFYAML::Section>(C.get()))
       continue;
     ++SecNdx;
 
-    if (!SN2I.addName(C->Name, SecNdx))
+    size_t Index = ReorderMap.empty() ? SecNdx : ReorderMap.lookup(C->Name);
+    if (!SN2I.addName(C->Name, Index))
       llvm_unreachable("buildSectionIndex() failed");
     DotShStrtab.add(ELFYAML::dropUniqueSuffix(C->Name));
   }
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 3a621d77a36b5..d3e4d2ee3bd85 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -832,6 +832,16 @@ void ScalarBitSetTraits<ELFYAML::MIPS_AFL_FLAGS1>::bitset(
 #undef BCase
 }
 
+void MappingTraits<ELFYAML::SectionHeader>::mapping(
+    IO &IO, ELFYAML::SectionHeader &SHdr) {
+  IO.mapRequired("Name", SHdr.Name);
+}
+
+void MappingTraits<ELFYAML::SectionHeaderTable>::mapping(
+    IO &IO, ELFYAML::SectionHeaderTable &SectionHeader) {
+  IO.mapRequired("Sections", SectionHeader.Sections);
+}
+
 void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO,
                                                  ELFYAML::FileHeader &FileHdr) {
   IO.mapRequired("Class", FileHdr.Class);
@@ -1638,6 +1648,7 @@ void MappingTraits<ELFYAML::Object>::mapping(IO &IO, ELFYAML::Object &Object) {
   IO.setContext(&Object);
   IO.mapTag("!ELF", true);
   IO.mapRequired("FileHeader", Object.Header);
+  IO.mapOptional("SectionHeaderTable", Object.SectionHeaders);
   IO.mapOptional("ProgramHeaders", Object.ProgramHeaders);
   IO.mapOptional("Sections", Object.Chunks);
   IO.mapOptional("Symbols", Object.Symbols);
diff --git a/llvm/test/tools/yaml2obj/ELF/section-headers.yaml b/llvm/test/tools/yaml2obj/ELF/section-headers.yaml
new file mode 100644
index 0000000000000..ee0049d4d31d5
--- /dev/null
+++ b/llvm/test/tools/yaml2obj/ELF/section-headers.yaml
@@ -0,0 +1,184 @@
+## Check we can use "SectionHeaderTable" tag to reorder section header entries.
+
+## This is a general test that has sections with unique prefixes, a fill and a
+## section without the unique prefix. The section header table describes sections
+## in the same order they are listed in the YAML.
+# RUN: yaml2obj %s --docnum=1 -o %t1 -DSEC1=".section (1)" -DSEC2=".section (2)" -DSEC3=".section.foo"
+# RUN: llvm-readelf --section-headers %t1 | FileCheck %s --check-prefix=NO-OP
+
+# NO-OP:      Section Headers:
+# NO-OP-NEXT:   [Nr] Name         Type     Address          Off    Size   ES Flg Lk Inf Al
+# NO-OP-NEXT:   [ 0]              NULL     0000000000000000 000000 000000 00      0   0  0
+# NO-OP-NEXT:   [ 1] .section     PROGBITS 0000000000000000 000040 000010 00      0   0  0
+# NO-OP-NEXT:   [ 2] .section     PROGBITS 0000000000000000 000050 000020 00      0   0  0
+# NO-OP-NEXT:   [ 3] .section.foo PROGBITS 0000000000000000 0000a0 000040 00      0   0  0
+# NO-OP-NEXT:   [ 4] .strtab      STRTAB   0000000000000000 0000e0 000001 00      0   0  1
+# NO-OP-NEXT:   [ 5] .shstrtab    STRTAB   0000000000000000 0000e1 000029 00      0   0  1
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name: .section (1)
+    Type: SHT_PROGBITS
+    Size: 0x10
+  - Name: .section (2)
+    Type: SHT_PROGBITS
+    Size: 0x20
+  - Type:    Fill
+    Name:    .filler
+    Size:    0x30
+    Pattern: ""
+  - Name: .section.foo
+    Type: SHT_PROGBITS
+    Size: 0x40
+SectionHeaderTable:
+  Sections:
+    - Name: [[SEC1]]
+    - Name: [[SEC2]]
+    - Name: [[SEC3]]
+    - Name: .strtab
+    - Name: .shstrtab
+
+## Show we are able to reorder sections.
+# RUN: yaml2obj %s -o %t2 -DSEC3=".section (1)" -DSEC2=".section (2)" -DSEC1=".section.foo"
+# RUN: llvm-readelf --section-headers %t2 | FileCheck %s --check-prefix=REORDERED
+
+# REORDERED:      Section Headers:
+# REORDERED-NEXT:   [Nr] Name         Type     Address          Off    Size   ES Flg Lk Inf Al
+# REORDERED-NEXT:   [ 0]              NULL     0000000000000000 000000 000000 00      0   0  0
+# REORDERED-NEXT:   [ 1] .section.foo PROGBITS 0000000000000000 0000a0 000040 00      0   0  0
+# REORDERED-NEXT:   [ 2] .section     PROGBITS 0000000000000000 000050 000020 00      0   0  0
+# REORDERED-NEXT:   [ 3] .section     PROGBITS 0000000000000000 000040 000010 00      0   0  0
+# REORDERED-NEXT:   [ 4] .strtab      STRTAB   0000000000000000 0000e0 000001 00      0   0  1
+# REORDERED-NEXT:   [ 5] .shstrtab    STRTAB   0000000000000000 0000e1 000029 00      0   0  1
+
+## Show we report proper errors when the section header description:
+##  a) contains a repeated section name.
+##  b) omits any section that exists.
+##  c) contains a non-existent section.
+# RUN: not yaml2obj %s -o /dev/null -DSEC1=".section.foo" -DSEC2="unknown" -DSEC3=".section.foo" 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=ERR1
+##  d) contains a repeated implicit section name.
+##  e) contains a fill name.
+# RUN: not yaml2obj %s -o /dev/null -DSEC1=".strtab" -DSEC2=".shstrtab" -DSEC3=".filler" 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=ERR2
+
+# ERR1:      error: repeated section name: '.section.foo' in the section header description
+# ERR1-NEXT: error: section '.section (1)' should be present in the 'Sections' list
+# ERR1-NEXT: error: section '.section (2)' should be present in the 'Sections' list
+# ERR1-NEXT: error: section header contains undefined section 'unknown'
+
+# ERR2:      error: repeated section name: '.strtab' in the section header description
+# ERR2-NEXT: error: repeated section name: '.shstrtab' in the section header description
+# ERR2-NEXT: error: section '.section (1)' should be present in the 'Sections' list
+# ERR2-NEXT: error: section '.section (2)' should be present in the 'Sections' list
+# ERR2-NEXT: error: section '.section.foo' should be present in the 'Sections' list
+# ERR2-NEXT: error: section header contains undefined section '.filler'
+
+## Test that we are able to specify an empty sections list for
+## the "SectionHeaderTable" tag to produce no section header.
+# RUN: yaml2obj %s --docnum=2 -o %t3
+# RUN: llvm-readelf --file-headers %t3 | FileCheck %s --check-prefix=NO-HEADERS
+
+# NO-HEADERS: Start of section headers:          0  (bytes into file)
+# NO-HEADERS: Size of section headers:           64 (bytes)
+# NO-HEADERS: Number of section headers:         0
+# NO-HEADERS: Section header string table index: 0
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name: .foo
+    Type: SHT_PROGBITS
+SectionHeaderTable:
+  Sections: []
+
+## Test that we are still able to override e_shoff, e_shnum and e_shstrndx
+## fields even when we do not produce section headers.
+# RUN: yaml2obj %s --docnum=3 -o %t4
+# RUN: llvm-readelf --file-headers %t4 | FileCheck %s --check-prefix=NO-HEADERS-OVERRIDE
+
+# NO-HEADERS-OVERRIDE: Start of section headers:          2 (bytes into file)
+# NO-HEADERS-OVERRIDE: Number of section headers:         3
+# NO-HEADERS-OVERRIDE: Section header string table index: 4
+
+--- !ELF
+FileHeader:
+  Class:     ELFCLASS64
+  Data:      ELFDATA2LSB
+  Type:      ET_REL
+  Machine:   EM_X86_64
+  SHOff:     0x2
+  SHNum:     0x3
+  SHStrNdx:  0x4
+Sections:
+  - Name: .foo
+    Type: SHT_PROGBITS
+SectionHeaderTable:
+  Sections: []
+
+## Check that section indices are updated properly in other places when we
+## reorder sections in the section header table.
+# RUN: yaml2obj %s --docnum=4 -o %t5 -DSEC1=".foo" -DSEC2=".bar"
+# RUN: llvm-readelf --section-headers --symbols %t5 | FileCheck %s --check-prefix=INDICES-A
+# RUN: yaml2obj %s --docnum=4 -o %t6 -DSEC2=".foo" -DSEC1=".bar"
+# RUN: llvm-readelf --section-headers --symbols %t6 | FileCheck %s --check-prefix=INDICES-B
+
+# INDICES-A:      [Nr] Name       Type     Address          Off    Size   ES Flg Lk
+# INDICES-A:      [ 1] .foo       PROGBITS 0000000000000000 000040 000000 00      0
+# INDICES-A-NEXT: [ 2] .bar       PROGBITS 0000000000000000 000040 000000 00      0
+# INDICES-A-NEXT: [ 3] .another.1 PROGBITS 0000000000000000 000040 000000 00      1
+# INDICES-A-NEXT: [ 4] .another.2 PROGBITS 0000000000000000 000040 000000 00      2
+
+# INDICES-A:      Num:    Value          Size Type    Bind   Vis       Ndx Name
+# INDICES-A:        1: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT     1 foo
+# INDICES-A-NEXT:   2: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT     2 bar
+
+# INDICES-B:      [ 1] .bar       PROGBITS 0000000000000000 000040 000000 00      0
+# INDICES-B-NEXT: [ 2] .foo       PROGBITS 0000000000000000 000040 000000 00      0
+# INDICES-B-NEXT: [ 3] .another.1 PROGBITS 0000000000000000 000040 000000 00      2
+# INDICES-B-NEXT: [ 4] .another.2 PROGBITS 0000000000000000 000040 000000 00      1
+
+# INDICES-B:      Num: Value            Size Type   Bind  Vis     Ndx Name
+# INDICES-B:        1: 0000000000000000    0 NOTYPE LOCAL DEFAULT   2 foo
+# INDICES-B-NEXT:   2: 0000000000000000    0 NOTYPE LOCAL DEFAULT   1 bar
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name: .foo
+    Type: SHT_PROGBITS
+  - Name: .bar
+    Type: SHT_PROGBITS
+  - Name: .another.1
+    Link: .foo
+    Type: SHT_PROGBITS
+  - Name: .another.2
+    Link: .bar
+    Type: SHT_PROGBITS
+SectionHeaderTable:
+  Sections:
+    - Name: [[SEC1]]
+    - Name: [[SEC2]]
+    - Name: .another.1
+    - Name: .another.2
+    - Name: .symtab
+    - Name: .strtab
+    - Name: .shstrtab
+Symbols:
+  - Name:    foo
+    Section: .foo
+  - Name:    bar
+    Section: .bar

From bd06c417e6c717cbe33b566d7bbaf27fb47e763a Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Tue, 28 Apr 2020 12:21:39 +0300
Subject: [PATCH 335/770] [analyzer] Allow bindings of the
 CompoundLiteralRegion

Summary:
CompoundLiteralRegions have been properly modeled before, but
`getBindingForElement` was not changed to accommodate this change
properly.

rdar://problem/46144644

Differential Revision: https://reviews.llvm.org/D78990
---
 clang/lib/StaticAnalyzer/Core/RegionStore.cpp |   4 -
 clang/test/Analysis/compound-literals.c       |  17 ++-
 .../retain-release-compound-literal.m         |  25 +++
 clang/unittests/StaticAnalyzer/StoreTest.cpp  | 142 ++++++++++++------
 4 files changed, 137 insertions(+), 51 deletions(-)
 create mode 100644 clang/test/Analysis/retain-release-compound-literal.m

diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
index 2a55c99647124..57fde32bc01d0 100644
--- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -1628,10 +1628,6 @@ RegionStoreManager::findLazyBinding(RegionBindingsConstRef B,
 
 SVal RegionStoreManager::getBindingForElement(RegionBindingsConstRef B,
                                               const ElementRegion* R) {
-  // We do not currently model bindings of the CompoundLiteralregion.
-  if (isa<CompoundLiteralRegion>(R->getBaseRegion()))
-    return UnknownVal();
-
   // Check if the region has a binding.
   if (const Optional<SVal> &V = B.getDirectBinding(R))
     return *V;
diff --git a/clang/test/Analysis/compound-literals.c b/clang/test/Analysis/compound-literals.c
index f8b9121494c12..42e6a55a30c7c 100644
--- a/clang/test/Analysis/compound-literals.c
+++ b/clang/test/Analysis/compound-literals.c
@@ -1,4 +1,7 @@
-// RUN: %clang_cc1 -triple=i386-apple-darwin10 -analyze -analyzer-checker=debug.ExprInspection -verify %s
+// RUN: %clang_cc1 -triple=i386-apple-darwin10 -verify %s -analyze \
+// RUN:   -analyzer-checker=debug.ExprInspection
+
+#define NULL 0
 void clang_analyzer_eval(int);
 
 // pr28449: Used to crash.
@@ -6,3 +9,15 @@ void foo(void) {
   static const unsigned short array[] = (const unsigned short[]){0x0F00};
   clang_analyzer_eval(array[0] == 0x0F00); // expected-warning{{TRUE}}
 }
+
+// check that we propagate info through compound literal regions
+void bar() {
+  int *integers = (int[]){1, 2, 3};
+  clang_analyzer_eval(integers[0] == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(integers[1] == 2); // expected-warning{{TRUE}}
+  clang_analyzer_eval(integers[2] == 3); // expected-warning{{TRUE}}
+
+  int **pointers = (int *[]){&integers[0], NULL};
+  clang_analyzer_eval(pointers[0] == NULL); // expected-warning{{FALSE}}
+  clang_analyzer_eval(pointers[1] == NULL); // expected-warning{{TRUE}}
+}
diff --git a/clang/test/Analysis/retain-release-compound-literal.m b/clang/test/Analysis/retain-release-compound-literal.m
new file mode 100644
index 0000000000000..29a125346363d
--- /dev/null
+++ b/clang/test/Analysis/retain-release-compound-literal.m
@@ -0,0 +1,25 @@
+// RUN: %clang_analyze_cc1 -verify -Wno-objc-root-class %s \
+// RUN:   -analyzer-checker=core,osx.cocoa.RetainCount
+
+#define NULL 0
+#define CF_RETURNS_RETAINED __attribute__((cf_returns_retained))
+#define CF_CONSUMED __attribute__((cf_consumed))
+
+void clang_analyzer_eval(int);
+
+typedef const void *CFTypeRef;
+
+extern CFTypeRef CFCreate() CF_RETURNS_RETAINED;
+extern CFTypeRef CFRetain(CFTypeRef cf);
+extern void CFRelease(CFTypeRef cf);
+
+void bar(CFTypeRef *v) {}
+
+void test1() {
+  CFTypeRef *values = (CFTypeRef[]){
+      CFCreate(),  // no-warning
+      CFCreate(),  // expected-warning{{leak}}
+      CFCreate()}; // no-warning
+  CFRelease(values[0]);
+  CFRelease(values[2]);
+}
diff --git a/clang/unittests/StaticAnalyzer/StoreTest.cpp b/clang/unittests/StaticAnalyzer/StoreTest.cpp
index c8b930bf3247c..17b64ce622f89 100644
--- a/clang/unittests/StaticAnalyzer/StoreTest.cpp
+++ b/clang/unittests/StaticAnalyzer/StoreTest.cpp
@@ -15,89 +15,139 @@ namespace clang {
 namespace ento {
 namespace {
 
+class StoreTestConsumer : public ExprEngineConsumer {
+public:
+  StoreTestConsumer(CompilerInstance &C) : ExprEngineConsumer(C) {}
+
+  bool HandleTopLevelDecl(DeclGroupRef DG) override {
+    for (const auto *D : DG)
+      performTest(D);
+    return true;
+  }
+
+private:
+  virtual void performTest(const Decl *D) = 0;
+};
+
+template <class ConsumerTy> class TestAction : public ASTFrontendAction {
+public:
+  std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &Compiler,
+                                                 StringRef File) override {
+    return std::make_unique<ConsumerTy>(Compiler);
+  }
+};
+
 // Test that we can put a value into an int-type variable and load it
 // back from that variable. Test what happens if default bindings are used.
-class VariableBindConsumer : public ExprEngineConsumer {
-  void performTest(const Decl *D) {
-    StoreManager &StMgr = Eng.getStoreManager();
-    SValBuilder &SVB = Eng.getSValBuilder();
-    MemRegionManager &MRMgr = StMgr.getRegionManager();
-    const ASTContext &ACtx = Eng.getContext();
+class VariableBindConsumer : public StoreTestConsumer {
+  void performTest(const Decl *D) override {
+    StoreManager &SManager = Eng.getStoreManager();
+    SValBuilder &Builder = Eng.getSValBuilder();
+    MemRegionManager &MRManager = SManager.getRegionManager();
+    const ASTContext &ASTCtxt = Eng.getContext();
 
     const auto *VDX0 = findDeclByName<VarDecl>(D, "x0");
     const auto *VDY0 = findDeclByName<VarDecl>(D, "y0");
     const auto *VDZ0 = findDeclByName<VarDecl>(D, "z0");
     const auto *VDX1 = findDeclByName<VarDecl>(D, "x1");
     const auto *VDY1 = findDeclByName<VarDecl>(D, "y1");
-    assert(VDX0 && VDY0 && VDZ0 && VDX1 && VDY1);
+
+    ASSERT_TRUE(VDX0 && VDY0 && VDZ0 && VDX1 && VDY1);
 
     const StackFrameContext *SFC =
         Eng.getAnalysisDeclContextManager().getStackFrame(D);
 
-    Loc LX0 = loc::MemRegionVal(MRMgr.getVarRegion(VDX0, SFC));
-    Loc LY0 = loc::MemRegionVal(MRMgr.getVarRegion(VDY0, SFC));
-    Loc LZ0 = loc::MemRegionVal(MRMgr.getVarRegion(VDZ0, SFC));
-    Loc LX1 = loc::MemRegionVal(MRMgr.getVarRegion(VDX1, SFC));
-    Loc LY1 = loc::MemRegionVal(MRMgr.getVarRegion(VDY1, SFC));
+    Loc LX0 = loc::MemRegionVal(MRManager.getVarRegion(VDX0, SFC));
+    Loc LY0 = loc::MemRegionVal(MRManager.getVarRegion(VDY0, SFC));
+    Loc LZ0 = loc::MemRegionVal(MRManager.getVarRegion(VDZ0, SFC));
+    Loc LX1 = loc::MemRegionVal(MRManager.getVarRegion(VDX1, SFC));
+    Loc LY1 = loc::MemRegionVal(MRManager.getVarRegion(VDY1, SFC));
 
-    Store StInit = StMgr.getInitialStore(SFC).getStore();
-    SVal Zero = SVB.makeZeroVal(ACtx.IntTy);
-    SVal One = SVB.makeIntVal(1, ACtx.IntTy);
-    SVal NarrowZero = SVB.makeZeroVal(ACtx.CharTy);
+    Store StInit = SManager.getInitialStore(SFC).getStore();
+    SVal Zero = Builder.makeZeroVal(ASTCtxt.IntTy);
+    SVal One = Builder.makeIntVal(1, ASTCtxt.IntTy);
+    SVal NarrowZero = Builder.makeZeroVal(ASTCtxt.CharTy);
 
     // Bind(Zero)
-    Store StX0 =
-        StMgr.Bind(StInit, LX0, Zero).getStore();
-    ASSERT_EQ(Zero, StMgr.getBinding(StX0, LX0, ACtx.IntTy));
+    Store StX0 = SManager.Bind(StInit, LX0, Zero).getStore();
+    EXPECT_EQ(Zero, SManager.getBinding(StX0, LX0, ASTCtxt.IntTy));
 
     // BindDefaultInitial(Zero)
     Store StY0 =
-        StMgr.BindDefaultInitial(StInit, LY0.getAsRegion(), Zero).getStore();
-    ASSERT_EQ(Zero, StMgr.getBinding(StY0, LY0, ACtx.IntTy));
-    ASSERT_EQ(Zero, *StMgr.getDefaultBinding(StY0, LY0.getAsRegion()));
+        SManager.BindDefaultInitial(StInit, LY0.getAsRegion(), Zero).getStore();
+    EXPECT_EQ(Zero, SManager.getBinding(StY0, LY0, ASTCtxt.IntTy));
+    EXPECT_EQ(Zero, *SManager.getDefaultBinding(StY0, LY0.getAsRegion()));
 
     // BindDefaultZero()
-    Store StZ0 =
-        StMgr.BindDefaultZero(StInit, LZ0.getAsRegion()).getStore();
+    Store StZ0 = SManager.BindDefaultZero(StInit, LZ0.getAsRegion()).getStore();
     // BindDefaultZero wipes the region with '0 S8b', not with out Zero.
     // Direct load, however, does give us back the object of the type
     // that we specify for loading.
-    ASSERT_EQ(Zero, StMgr.getBinding(StZ0, LZ0, ACtx.IntTy));
-    ASSERT_EQ(NarrowZero, *StMgr.getDefaultBinding(StZ0, LZ0.getAsRegion()));
+    EXPECT_EQ(Zero, SManager.getBinding(StZ0, LZ0, ASTCtxt.IntTy));
+    EXPECT_EQ(NarrowZero, *SManager.getDefaultBinding(StZ0, LZ0.getAsRegion()));
 
     // Bind(One)
-    Store StX1 =
-        StMgr.Bind(StInit, LX1, One).getStore();
-    ASSERT_EQ(One, StMgr.getBinding(StX1, LX1, ACtx.IntTy));
+    Store StX1 = SManager.Bind(StInit, LX1, One).getStore();
+    EXPECT_EQ(One, SManager.getBinding(StX1, LX1, ASTCtxt.IntTy));
 
     // BindDefaultInitial(One)
     Store StY1 =
-        StMgr.BindDefaultInitial(StInit, LY1.getAsRegion(), One).getStore();
-    ASSERT_EQ(One, StMgr.getBinding(StY1, LY1, ACtx.IntTy));
-    ASSERT_EQ(One, *StMgr.getDefaultBinding(StY1, LY1.getAsRegion()));
+        SManager.BindDefaultInitial(StInit, LY1.getAsRegion(), One).getStore();
+    EXPECT_EQ(One, SManager.getBinding(StY1, LY1, ASTCtxt.IntTy));
+    EXPECT_EQ(One, *SManager.getDefaultBinding(StY1, LY1.getAsRegion()));
   }
 
 public:
-  VariableBindConsumer(CompilerInstance &C) : ExprEngineConsumer(C) {}
+  using StoreTestConsumer::StoreTestConsumer;
+};
 
-  bool HandleTopLevelDecl(DeclGroupRef DG) override {
-    for (const auto *D : DG)
-      performTest(D);
-    return true;
+TEST(Store, VariableBind) {
+  EXPECT_TRUE(tooling::runToolOnCode(
+      std::make_unique<TestAction<VariableBindConsumer>>(),
+      "void foo() { int x0, y0, z0, x1, y1; }"));
+}
+
+class LiteralCompoundConsumer : public StoreTestConsumer {
+  void performTest(const Decl *D) override {
+    StoreManager &SManager = Eng.getStoreManager();
+    SValBuilder &Builder = Eng.getSValBuilder();
+    MemRegionManager &MRManager = SManager.getRegionManager();
+    ASTContext &ASTCtxt = Eng.getContext();
+
+    using namespace ast_matchers;
+
+    const auto *CL = findNode<CompoundLiteralExpr>(D, compoundLiteralExpr());
+
+    const StackFrameContext *SFC =
+        Eng.getAnalysisDeclContextManager().getStackFrame(D);
+
+    QualType Int = ASTCtxt.IntTy;
+
+    // Get region for 'test'
+    const SubRegion *CLRegion = MRManager.getCompoundLiteralRegion(CL, SFC);
+
+    // Get value for 'test[0]'
+    NonLoc Zero = Builder.makeIntVal(0, false);
+    loc::MemRegionVal ZeroElement(
+        MRManager.getElementRegion(ASTCtxt.IntTy, Zero, CLRegion, ASTCtxt));
+
+    Store StInit = SManager.getInitialStore(SFC).getStore();
+    // Let's bind constant 1 to 'test[0]'
+    SVal One = Builder.makeIntVal(1, Int);
+    Store StX = SManager.Bind(StInit, ZeroElement, One).getStore();
+
+    // And make sure that we can read this binding back as it was
+    EXPECT_EQ(One, SManager.getBinding(StX, ZeroElement, Int));
   }
-};
 
-class VariableBindAction : public ASTFrontendAction {
 public:
-  std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &Compiler,
-                                                 StringRef File) override {
-    return std::make_unique<VariableBindConsumer>(Compiler);
-  }
+  using StoreTestConsumer::StoreTestConsumer;
 };
 
-TEST(Store, VariableBind) {
-  EXPECT_TRUE(tooling::runToolOnCode(std::make_unique<VariableBindAction>(),
-                                     "void foo() { int x0, y0, z0, x1, y1; }"));
+TEST(Store, LiteralCompound) {
+  EXPECT_TRUE(tooling::runToolOnCode(
+      std::make_unique<TestAction<LiteralCompoundConsumer>>(),
+      "void foo() { int *test = (int[]){ 1, 2, 3 }; }", "input.c"));
 }
 
 } // namespace

From bab5dadfcd0fc3a77c47aec7e885e8b70b9f9756 Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky <dmitry.preobrazhensky@amd.com>
Date: Thu, 28 May 2020 14:15:29 +0300
Subject: [PATCH 336/770] [AMDGPU][MC][DISASSEMBLER] Corrected decoder to
 consume each code fragment only once

Summary: disabled disassembly of successfully decoded fragments of code.

See detailed bug description: https://bugs.llvm.org/show_bug.cgi?id=46101

Reviewers: arsenm, rampitec

Differential Revision: https://reviews.llvm.org/D80637
---
 llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 7 -------
 llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt        | 4 ++--
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 277e476907d0b..b15c98c878eb5 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -362,13 +362,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
   } while (false);
 
-  if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral ||
-        !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) {
-    MaxInstBytesNum = 8;
-    Bytes = Bytes_.slice(0, MaxInstBytesNum);
-    eatBytes<uint64_t>(Bytes);
-  }
-
   if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
               MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
               MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 ||
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
index 89cbaa7f02998..21344f344fa9b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
@@ -98025,8 +98025,8 @@
 # GFX10: v_trunc_f64_e64 v[5:6], |v[1:2]| ; encoding: [0x05,0x01,0x97,0xd5,0x01,0x01,0x00,0x00]
 0x05,0x01,0x97,0xd5,0x01,0x01,0x00,0x00
 
-# GFX10: v_writelane_b32 v255, 0, s2     ; encoding: [0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00]
-0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00
+# GFX10-FIXME: v_writelane_b32 v255, 0, s2     ; encoding: [0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00]
+# 0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00
 
 # GFX10: v_writelane_b32 v5, -1, s2      ; encoding: [0x05,0x00,0x61,0xd7,0xc1,0x04,0x00,0x00]
 0x05,0x00,0x61,0xd7,0xc1,0x04,0x00,0x00

From a56141b8f9fea112c1ea078c974d91949b6e7a5c Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Sun, 19 Apr 2020 02:19:25 +0200
Subject: [PATCH 337/770] [clangd] Highlight related control flow.

Summary:
This means e.g. highlighting "return" will show other returns/throws
from the same function, highlighting a case will show all the
return/breaks etc.

This is a bit of an abuse of textDocument/highlight, but seems useful.

Reviewers: adamcz

Subscribers: ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D78454
---
 clang-tools-extra/clangd/XRefs.cpp            | 308 +++++++++++++++++-
 .../clangd/unittests/XRefsTests.cpp           | 135 ++++++++
 2 files changed, 426 insertions(+), 17 deletions(-)

diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp
index 1fc0e0348d093..7de1dc53596e9 100644
--- a/clang-tools-extra/clangd/XRefs.cpp
+++ b/clang-tools-extra/clangd/XRefs.cpp
@@ -27,8 +27,12 @@
 #include "clang/AST/Attrs.inc"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+#include "clang/AST/StmtCXX.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/LLVM.h"
@@ -45,6 +49,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -712,35 +717,304 @@ findRefs(const std::vector<const NamedDecl *> &Decls, ParsedAST &AST) {
   return std::move(RefFinder).take();
 }
 
+const Stmt *getFunctionBody(DynTypedNode N) {
+  if (const auto *FD = N.get<FunctionDecl>())
+    return FD->getBody();
+  if (const auto *FD = N.get<BlockDecl>())
+    return FD->getBody();
+  if (const auto *FD = N.get<LambdaExpr>())
+    return FD->getBody();
+  if (const auto *FD = N.get<ObjCMethodDecl>())
+    return FD->getBody();
+  return nullptr;
+}
+
+const Stmt *getLoopBody(DynTypedNode N) {
+  if (const auto *LS = N.get<ForStmt>())
+    return LS->getBody();
+  if (const auto *LS = N.get<CXXForRangeStmt>())
+    return LS->getBody();
+  if (const auto *LS = N.get<WhileStmt>())
+    return LS->getBody();
+  if (const auto *LS = N.get<DoStmt>())
+    return LS->getBody();
+  return nullptr;
+}
+
+// AST traversal to highlight control flow statements under some root.
+// Once we hit further control flow we prune the tree (or at least restrict
+// what we highlight) so we capture e.g. breaks from the outer loop only.
+class FindControlFlow : public RecursiveASTVisitor<FindControlFlow> {
+  // Types of control-flow statements we might highlight.
+  enum Target {
+    Break = 1,
+    Continue = 2,
+    Return = 4,
+    Case = 8,
+    Throw = 16,
+    Goto = 32,
+    All = Break | Continue | Return | Case | Throw | Goto,
+  };
+  int Ignore = 0;     // bitmask of Target - what are we *not* highlighting?
+  SourceRange Bounds; // Half-open, restricts reported targets.
+  std::vector<SourceLocation> &Result;
+  const SourceManager &SM;
+
+  // Masks out targets for a traversal into D.
+  // Traverses the subtree using Delegate() if any targets remain.
+  template <typename Func>
+  bool filterAndTraverse(DynTypedNode D, const Func &Delegate) {
+    auto RestoreIgnore = llvm::make_scope_exit(
+        [OldIgnore(Ignore), this] { Ignore = OldIgnore; });
+    if (getFunctionBody(D))
+      Ignore = All;
+    else if (getLoopBody(D))
+      Ignore |= Continue | Break;
+    else if (D.get<SwitchStmt>())
+      Ignore |= Break | Case;
+    // Prune tree if we're not looking for anything.
+    return (Ignore == All) ? true : Delegate();
+  }
+
+  void found(Target T, SourceLocation Loc) {
+    if (T & Ignore)
+      return;
+    if (SM.isBeforeInTranslationUnit(Loc, Bounds.getBegin()) ||
+        SM.isBeforeInTranslationUnit(Bounds.getEnd(), Loc))
+      return;
+    Result.push_back(Loc);
+  }
+
+public:
+  FindControlFlow(SourceRange Bounds, std::vector<SourceLocation> &Result,
+                  const SourceManager &SM)
+      : Bounds(Bounds), Result(Result), SM(SM) {}
+
+  // When traversing function or loops, limit targets to those that still
+  // refer to the original root.
+  bool TraverseDecl(Decl *D) {
+    return !D || filterAndTraverse(DynTypedNode::create(*D), [&] {
+      return RecursiveASTVisitor::TraverseDecl(D);
+    });
+  }
+  bool TraverseStmt(Stmt *S) {
+    return !S || filterAndTraverse(DynTypedNode::create(*S), [&] {
+      return RecursiveASTVisitor::TraverseStmt(S);
+    });
+  }
+
+  // Add leaves that we found and want.
+  bool VisitReturnStmt(ReturnStmt *R) {
+    found(Return, R->getReturnLoc());
+    return true;
+  }
+  bool VisitBreakStmt(BreakStmt *B) {
+    found(Break, B->getBreakLoc());
+    return true;
+  }
+  bool VisitContinueStmt(ContinueStmt *C) {
+    found(Continue, C->getContinueLoc());
+    return true;
+  }
+  bool VisitSwitchCase(SwitchCase *C) {
+    found(Case, C->getKeywordLoc());
+    return true;
+  }
+  bool VisitCXXThrowExpr(CXXThrowExpr *T) {
+    found(Throw, T->getThrowLoc());
+    return true;
+  }
+  bool VisitGotoStmt(GotoStmt *G) {
+    // Goto is interesting if its target is outside the root.
+    if (const auto *LD = G->getLabel()) {
+      if (SM.isBeforeInTranslationUnit(LD->getLocation(), Bounds.getBegin()) ||
+          SM.isBeforeInTranslationUnit(Bounds.getEnd(), LD->getLocation()))
+        found(Goto, G->getGotoLoc());
+    }
+    return true;
+  }
+};
+
+// Given a location within a switch statement, return the half-open range that
+// covers the case it's contained in.
+// We treat `case X: case Y: ...` as one case, and assume no other fallthrough.
+SourceRange findCaseBounds(const SwitchStmt &Switch, SourceLocation Loc,
+                           const SourceManager &SM) {
+  // Cases are not stored in order, sort them first.
+  // (In fact they seem to be stored in reverse order, don't rely on this)
+  std::vector<const SwitchCase *> Cases;
+  for (const SwitchCase *Case = Switch.getSwitchCaseList(); Case;
+       Case = Case->getNextSwitchCase())
+    Cases.push_back(Case);
+  llvm::sort(Cases, [&](const SwitchCase *L, const SwitchCase *R) {
+    return SM.isBeforeInTranslationUnit(L->getKeywordLoc(), R->getKeywordLoc());
+  });
+
+  // Find the first case after the target location, the end of our range.
+  auto CaseAfter = llvm::partition_point(Cases, [&](const SwitchCase *C) {
+    return !SM.isBeforeInTranslationUnit(Loc, C->getKeywordLoc());
+  });
+  SourceLocation End = CaseAfter == Cases.end() ? Switch.getEndLoc()
+                                                : (*CaseAfter)->getKeywordLoc();
+
+  // Our target can be before the first case - cases are optional!
+  if (CaseAfter == Cases.begin())
+    return SourceRange(Switch.getBeginLoc(), End);
+  // The start of our range is usually the previous case, but...
+  auto CaseBefore = std::prev(CaseAfter);
+  // ... rewind CaseBefore to the first in a `case A: case B: ...` sequence.
+  while (CaseBefore != Cases.begin() &&
+         (*std::prev(CaseBefore))->getSubStmt() == *CaseBefore)
+    --CaseBefore;
+  return SourceRange((*CaseBefore)->getKeywordLoc(), End);
+}
+
+// Returns the locations of control flow statements related to N. e.g.:
+//   for    => branches: break/continue/return/throw
+//   break  => controlling loop (for/while/do), and its related control flow
+//   return => all returns/throws from the same function
+// When an inner block is selected, we include branches bound to outer blocks
+// as these are exits from the inner block. e.g. return in a for loop.
+// FIXME: We don't analyze catch blocks, throw is treated the same as return.
+std::vector<SourceLocation> relatedControlFlow(const SelectionTree::Node &N) {
+  const SourceManager &SM =
+      N.getDeclContext().getParentASTContext().getSourceManager();
+  std::vector<SourceLocation> Result;
+
+  // First, classify the selected node: which kind of leaf (if any) is it?
+  enum class Cur { None, Break, Continue, Return, Case, Throw } Cursor;
+  if (N.ASTNode.get<BreakStmt>()) {
+    Cursor = Cur::Break;
+  } else if (N.ASTNode.get<ContinueStmt>()) {
+    Cursor = Cur::Continue;
+  } else if (N.ASTNode.get<ReturnStmt>()) {
+    Cursor = Cur::Return;
+  } else if (N.ASTNode.get<CXXThrowExpr>()) {
+    Cursor = Cur::Throw;
+  } else if (N.ASTNode.get<SwitchCase>()) {
+    Cursor = Cur::Case;
+  } else if (const GotoStmt *GS = N.ASTNode.get<GotoStmt>()) {
+    // We don't know what root to associate with, but highlight the goto/label.
+    Result.push_back(GS->getGotoLoc());
+    if (const auto *LD = GS->getLabel())
+      Result.push_back(LD->getLocation());
+    Cursor = Cur::None;
+  } else {
+    Cursor = Cur::None;
+  }
+
+  const Stmt *Root = nullptr; // Loop or function body to traverse.
+  SourceRange Bounds;
+  // Look up the tree for a root (or just at this node if we didn't find a leaf).
+  for (const auto *P = &N; P; P = P->Parent) {
+    // return/throw associate with the enclosing function.
+    if (const Stmt *FunctionBody = getFunctionBody(P->ASTNode)) {
+      if (Cursor == Cur::Return || Cursor == Cur::Throw) {
+        Root = FunctionBody;
+      }
+      break; // other leaves don't cross functions.
+    }
+    // break/continue (or the loop node itself) associate with the loop.
+    if (const Stmt *LoopBody = getLoopBody(P->ASTNode)) {
+      if (Cursor == Cur::None || Cursor == Cur::Break ||
+          Cursor == Cur::Continue) {
+        Root = LoopBody;
+        // Highlight the loop keyword itself.
+        // FIXME: for do-while, this only covers the `do`.
+        Result.push_back(P->ASTNode.getSourceRange().getBegin());
+        break;
+      }
+    }
+    // For switches, users think of case statements as control flow blocks.
+    // We highlight only occurrences surrounded by the same case.
+    // We don't detect fallthrough (other than 'case X, case Y').
+    if (const auto *SS = P->ASTNode.get<SwitchStmt>()) {
+      if (Cursor == Cur::Break || Cursor == Cur::Case) {
+        Result.push_back(SS->getSwitchLoc()); // Highlight the switch.
+        Root = SS->getBody();
+        // Limit to enclosing case, if there is one.
+        Bounds = findCaseBounds(*SS, N.ASTNode.getSourceRange().getBegin(), SM);
+        break;
+      }
+    }
+    // If we didn't start at some interesting node, only N itself is examined.
+    if (Cursor == Cur::None)
+      break;
+  }
+  if (Root) {
+    if (!Bounds.isValid())
+      Bounds = Root->getSourceRange();
+    FindControlFlow(Bounds, Result, SM).TraverseStmt(const_cast<Stmt *>(Root));
+  }
+  return Result;
+}
+
+DocumentHighlight toHighlight(const ReferenceFinder::Reference &Ref,
+                              const SourceManager &SM) {
+  DocumentHighlight DH;
+  DH.range = Ref.range(SM);
+  if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Write))
+    DH.kind = DocumentHighlightKind::Write;
+  else if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Read))
+    DH.kind = DocumentHighlightKind::Read;
+  else
+    DH.kind = DocumentHighlightKind::Text;
+  return DH;
+}
+
+llvm::Optional<DocumentHighlight> toHighlight(SourceLocation Loc,
+                                              const syntax::TokenBuffer &TB) {
+  Loc = TB.sourceManager().getFileLoc(Loc);
+  if (const auto *Tok = TB.spelledTokenAt(Loc)) {
+    DocumentHighlight Result;
+    Result.range = halfOpenToRange(
+        TB.sourceManager(),
+        CharSourceRange::getCharRange(Tok->location(), Tok->endLocation()));
+    return Result;
+  }
+  return llvm::None;
+}
+
 } // namespace
 
 std::vector<DocumentHighlight> findDocumentHighlights(ParsedAST &AST,
                                                       Position Pos) {
   const SourceManager &SM = AST.getSourceManager();
   // FIXME: show references to macro within file?
-  DeclRelationSet Relations =
-      DeclRelation::TemplatePattern | DeclRelation::Alias;
   auto CurLoc = sourceLocationInMainFile(SM, Pos);
   if (!CurLoc) {
     llvm::consumeError(CurLoc.takeError());
     return {};
   }
-  auto References = findRefs(getDeclAtPosition(AST, *CurLoc, Relations), AST);
-
-  // FIXME: we may get multiple DocumentHighlights with the same location and
-  // different kinds, deduplicate them.
   std::vector<DocumentHighlight> Result;
-  for (const auto &Ref : References) {
-    DocumentHighlight DH;
-    DH.range = Ref.range(SM);
-    if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Write))
-      DH.kind = DocumentHighlightKind::Write;
-    else if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Read))
-      DH.kind = DocumentHighlightKind::Read;
-    else
-      DH.kind = DocumentHighlightKind::Text;
-    Result.push_back(std::move(DH));
-  }
+  // Returns true if this tree yielded target decls or related control flow.
+  auto TryTree = [&](SelectionTree ST) {
+    if (const SelectionTree::Node *N = ST.commonAncestor()) {
+      DeclRelationSet Relations =
+          DeclRelation::TemplatePattern | DeclRelation::Alias;
+      auto Decls = targetDecl(N->ASTNode, Relations);
+      if (!Decls.empty()) {
+        // FIXME: we may get multiple DocumentHighlights with the same location
+        // and different kinds, deduplicate them.
+        for (const auto &Ref : findRefs({Decls.begin(), Decls.end()}, AST))
+          Result.push_back(toHighlight(Ref, SM));
+        return true;
+      }
+      auto ControlFlow = relatedControlFlow(*N);
+      if (!ControlFlow.empty()) {
+        for (SourceLocation Loc : ControlFlow)
+          if (auto Highlight = toHighlight(Loc, AST.getTokens()))
+            Result.push_back(std::move(*Highlight));
+        return true;
+      }
+    }
+    return false;
+  };
+
+  unsigned Offset =
+      AST.getSourceManager().getDecomposedSpellingLoc(*CurLoc).second;
+  SelectionTree::createEach(AST.getASTContext(), AST.getTokens(), Offset,
+                            Offset, TryTree);
   return Result;
 }
 
diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
index 77e863895f803..b73a310e95fb2 100644
--- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
@@ -116,6 +116,141 @@ TEST(HighlightsTest, All) {
   }
 }
 
+TEST(HighlightsTest, ControlFlow) {
+  const char *Tests[] = {
+      R"cpp(
+        // Highlight same-function returns.
+        int fib(unsigned n) {
+          if (n <= 1) [[ret^urn]] 1;
+          [[return]] fib(n - 1) + fib(n - 2);
+
+          // Returns from other functions not highlighted.
+          auto Lambda = [] { return; };
+          class LocalClass { void x() { return; } };
+        }
+      )cpp",
+
+      R"cpp(
+        #define FAIL() return false
+        #define DO(x) { x; }
+        bool foo(int n) {
+          if (n < 0) [[FAIL]]();
+          DO([[re^turn]] true)
+        }
+      )cpp",
+
+      R"cpp(
+        // Highlight loop control flow
+        int magic() {
+          int counter = 0;
+          [[^for]] (char c : "fruit loops!") {
+            if (c == ' ') [[continue]];
+            counter += c;
+            if (c == '!') [[break]];
+            if (c == '?') [[return]] -1;
+          }
+          return counter;
+        }
+      )cpp",
+
+      R"cpp(
+        // Highlight loop and same-loop control flow
+        void nonsense() {
+          [[while]] (true) {
+            if (false) [[bre^ak]];
+            switch (1) break;
+            [[continue]];
+          }
+        }
+      )cpp",
+
+      R"cpp(
+        // Highlight switch for break (but not other breaks).
+        void describe(unsigned n) {
+          [[switch]](n) {
+          case 0:
+            break;
+          [[default]]:
+            [[^break]];
+          }
+        }
+      )cpp",
+
+      R"cpp(
+        // Highlight case and exits for switch-break (but not other cases).
+        void describe(unsigned n) {
+          [[switch]](n) {
+          case 0:
+            break;
+          [[case]] 1:
+          [[default]]:
+            [[return]];
+            [[^break]];
+          }
+        }
+      )cpp",
+
+      R"cpp(
+        // Highlight exits and switch for case
+        void describe(unsigned n) {
+          [[switch]](n) {
+          case 0:
+            break;
+          [[case]] 1:
+          [[d^efault]]:
+            [[return]];
+            [[break]];
+          }
+        }
+      )cpp",
+
+      R"cpp(
+        // Highlight nothing for switch.
+        void describe(unsigned n) {
+          s^witch(n) {
+          case 0:
+            break;
+          case 1:
+          default:
+            return;
+            break;
+          }
+        }
+      )cpp",
+
+      R"cpp(
+        // FIXME: match exception type against catch blocks
+        int catchy() {
+          try {                     // wrong: highlight try with matching catch
+            try {                   // correct: has no matching catch
+              [[thr^ow]] "oh no!";
+            } catch (int) { }       // correct: catch doesn't match type
+            [[return]] -1;          // correct: exits the matching catch
+          } catch (const char*) { } // wrong: highlight matching catch
+          [[return]] 42;            // wrong: throw doesn't exit function
+        }
+      )cpp",
+
+      R"cpp(
+        // Loop highlights goto exiting the loop, but not jumping within it.
+        void jumpy() {
+          [[wh^ile]](1) {
+            up:
+            if (0) [[goto]] out;
+            goto up;
+          }
+          out: return;
+        }
+      )cpp",
+  };
+  for (const char *Test : Tests) {
+    Annotations T(Test);
+    auto AST = TestTU::withCode(T.code()).build();
+    EXPECT_THAT(findDocumentHighlights(AST, T.point()), HighlightsFrom(T))
+        << Test;
+  }
+}
+
 MATCHER_P3(Sym, Name, Decl, DefOrNone, "") {
   llvm::Optional<Range> Def = DefOrNone;
   if (Name != arg.Name) {

From 45251ef5345b3c81c8f394d42d252de039c72566 Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky <dmitry.preobrazhensky@amd.com>
Date: Thu, 28 May 2020 14:25:29 +0300
Subject: [PATCH 338/770] [AMDGPU][MC] Corrected v_writelane_b32 to fix a
 decoding bug

Corrected vdst_in to match vdst operand type.
See bug 45193: https://bugs.llvm.org/show_bug.cgi?id=45193

Reviewers: arsenm, rampitec

Differential Revision: https://reviews.llvm.org/D80636
---
 llvm/lib/Target/AMDGPU/VOP2Instructions.td          | 4 ++--
 llvm/lib/Target/AMDGPU/VOP3Instructions.td          | 4 ++--
 llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 4927c6c2f3f27..86dc179f94214 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1262,9 +1262,9 @@ defm V_SUBBREV_U32        : VOP2be_Real_gfx6_gfx7<0x02a>;
 
 defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>;
 
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
   defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
 
 let SubtargetPredicate = isGFX6GFX7 in {
   defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 7e1ac7509719c..66a4e62a3be45 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -839,9 +839,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
 
 defm V_READLANE_B32  : VOP3_Real_gfx10<0x360>;
 
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
   defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
 
 defm V_XOR3_B32           : VOP3_Real_gfx10<0x178>;
 defm V_LSHLREV_B64        : VOP3_Real_gfx10<0x2ff>;
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
index 21344f344fa9b..89cbaa7f02998 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
@@ -98025,8 +98025,8 @@
 # GFX10: v_trunc_f64_e64 v[5:6], |v[1:2]| ; encoding: [0x05,0x01,0x97,0xd5,0x01,0x01,0x00,0x00]
 0x05,0x01,0x97,0xd5,0x01,0x01,0x00,0x00
 
-# GFX10-FIXME: v_writelane_b32 v255, 0, s2     ; encoding: [0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00]
-# 0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00
+# GFX10: v_writelane_b32 v255, 0, s2     ; encoding: [0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00]
+0xff,0x00,0x61,0xd7,0x80,0x04,0x00,0x00
 
 # GFX10: v_writelane_b32 v5, -1, s2      ; encoding: [0x05,0x00,0x61,0xd7,0xc1,0x04,0x00,0x00]
 0x05,0x00,0x61,0xd7,0xc1,0x04,0x00,0x00

From f9e94eb8688d1fe1727360462e957fbbfb754e59 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanjai@ca.ibm.com>
Date: Thu, 28 May 2020 06:55:37 -0500
Subject: [PATCH 339/770] [Clang] Enable _Complex __float128

When I added __float128 a while ago, I neglected to add support for the complex
variant of the type. This patch just adds that.

Differential revision: https://reviews.llvm.org/D80533
---
 clang/lib/Sema/DeclSpec.cpp               |  3 ++-
 clang/test/CodeGen/ppc64-complex-parms.c  | 11 +++++++++++
 clang/test/CodeGen/ppc64-complex-return.c | 22 ++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp
index 276e35a3497e6..834e2533342d4 100644
--- a/clang/lib/Sema/DeclSpec.cpp
+++ b/clang/lib/Sema/DeclSpec.cpp
@@ -1269,7 +1269,8 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) {
       // Note that this intentionally doesn't include _Complex _Bool.
       if (!S.getLangOpts().CPlusPlus)
         S.Diag(TSTLoc, diag::ext_integer_complex);
-    } else if (TypeSpecType != TST_float && TypeSpecType != TST_double) {
+    } else if (TypeSpecType != TST_float && TypeSpecType != TST_double &&
+               TypeSpecType != TST_float128) {
       S.Diag(TSCLoc, diag::err_invalid_complex_spec)
         << getSpecifierName((TST)TypeSpecType, Policy);
       TypeSpecComplex = TSC_unspecified;
diff --git a/clang/test/CodeGen/ppc64-complex-parms.c b/clang/test/CodeGen/ppc64-complex-parms.c
index c0e1794bf47c6..1c8aa1d568cf7 100644
--- a/clang/test/CodeGen/ppc64-complex-parms.c
+++ b/clang/test/CodeGen/ppc64-complex-parms.c
@@ -1,8 +1,19 @@
+// REQUIRES: powerpc-registered-target
 // RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -target-feature +float128 -DTEST_F128 -triple \
+// RUN:   powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefix CHECK-F128
 
 float crealf(_Complex float);
 double creal(_Complex double);
 long double creall(_Complex long double);
+#ifdef TEST_F128
+__float128 crealf128(_Complex __float128);
+__float128 foo_f128(_Complex __float128 x) {
+  return crealf128(x);
+}
+// CHECK-F128: define fp128 @foo_f128(fp128 {{[%A-Za-z0-9.]+}}, fp128 {{[%A-Za-z0-9.]+}})
+#endif
 
 float foo_float(_Complex float x) {
   return crealf(x);
diff --git a/clang/test/CodeGen/ppc64-complex-return.c b/clang/test/CodeGen/ppc64-complex-return.c
index 02bfe82d4efec..a27286d85b8fd 100644
--- a/clang/test/CodeGen/ppc64-complex-return.c
+++ b/clang/test/CodeGen/ppc64-complex-return.c
@@ -1,9 +1,20 @@
 // REQUIRES: powerpc-registered-target
 // RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -target-feature +float128 -DTEST_F128 -triple \
+// RUN:   powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefix CHECK-F128
 
 float crealf(_Complex float);
 double creal(_Complex double);
 long double creall(_Complex long double);
+#ifdef TEST_F128
+__float128 crealf128(_Complex __float128);
+_Complex __float128 foo_f128(_Complex __float128 x) {
+  return x;
+}
+
+// CHECK-F128: define { fp128, fp128 } @foo_f128(fp128 {{[%A-Za-z0-9.]+}}, fp128 {{[%A-Za-z0-9.]+}}) [[NUW:#[0-9]+]] {
+#endif
 
 _Complex float foo_float(_Complex float x) {
   return x;
@@ -80,6 +91,17 @@ long double bar_long_double(void) {
 // CHECK: extractvalue { ppc_fp128, ppc_fp128 } [[VAR3]], 0
 // CHECK: extractvalue { ppc_fp128, ppc_fp128 } [[VAR3]], 1
 
+#ifdef TEST_F128
+__float128 bar_f128(void) {
+  return crealf128(foo_f128(2.0Q - 2.5Qi));
+}
+
+// CHECK-F128: define fp128 @bar_f128() [[NUW]] {
+// CHECK-F128: [[VAR3:[%A-Za-z0-9.]+]] = call { fp128, fp128 } @foo_f128
+// CHECK-F128: extractvalue { fp128, fp128 } [[VAR3]], 0
+// CHECK-F128: extractvalue { fp128, fp128 } [[VAR3]], 1
+#endif
+
 int bar_int(void) {
   return __real__(foo_int(2 - 3i));
 }

From 84be4278e7966c8747aad48aa49de9bcd9f1d730 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 13:01:53 +0100
Subject: [PATCH 340/770] llvm-dwarfdump.h - remove unnecessary WithColor.h
 include. NFC.

---
 llvm/tools/llvm-dwarfdump/llvm-dwarfdump.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.h b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.h
index 86a2f1b553a96..dc41298265d2a 100644
--- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.h
+++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.h
@@ -12,7 +12,6 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace llvm {

From ab5abce23ccf8369c39d734c6279949815a9b9dc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 13:02:44 +0100
Subject: [PATCH 341/770] DWARFDebugMacro.h - remove unnecessary WithColor.h
 include. NFC.

---
 llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h
index c3d737ca52393..d35fb17113234 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h
@@ -14,7 +14,6 @@
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/WithColor.h"
 #include <cstdint>
 
 namespace llvm {

From 73ae678363fb42418a8959955d05488191045b31 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 13:07:06 +0100
Subject: [PATCH 342/770] Fix MSVC signed/unsigned comparison warnings. NFC.

---
 llvm/lib/Support/FileCheck.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Support/FileCheck.cpp b/llvm/lib/Support/FileCheck.cpp
index 454f38132f6be..a1a37c972b8cb 100644
--- a/llvm/lib/Support/FileCheck.cpp
+++ b/llvm/lib/Support/FileCheck.cpp
@@ -119,7 +119,7 @@ Expected<int64_t> ExpressionValue::getSignedValue() const {
   if (Negative)
     return getAsSigned(Value);
 
-  if (Value > std::numeric_limits<int64_t>::max())
+  if (Value > (uint64_t)std::numeric_limits<int64_t>::max())
     return make_error<OverflowError>();
 
   // Value is in the representable range of int64_t so we can use cast.
@@ -187,7 +187,7 @@ Expected<ExpressionValue> llvm::operator-(const ExpressionValue &LeftOperand,
     int64_t LeftValue = cantFail(LeftOperand.getSignedValue());
     uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
     // Result <= -1 - (max int64_t) which overflows on 1- and 2-complement.
-    if (RightValue > std::numeric_limits<int64_t>::max())
+    if (RightValue > (uint64_t)std::numeric_limits<int64_t>::max())
       return make_error<OverflowError>();
     Optional<int64_t> Result =
         checkedSub(LeftValue, static_cast<int64_t>(RightValue));

From f47e27e260e3e06167a7e1de8a4c092b95717e15 Mon Sep 17 00:00:00 2001
From: Dmitry Preobrazhensky <dmitry.preobrazhensky@amd.com>
Date: Thu, 28 May 2020 15:07:58 +0300
Subject: [PATCH 343/770] [AMDGPU][MC][GFX908] Corrected src0 of
 v_accvgpr_write to accept only VGPRs and inline constants.

This change disables use of special SGPR registers like scc, vccz, execz, etc as operands of v_accvgpr_write.

See bug 45414: https://bugs.llvm.org/show_bug.cgi?id=45414

Reviewers: arsenm, rampitec

Differential Revision: https://reviews.llvm.org/D80530
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 28 +++++++++++++++++++
 llvm/test/MC/AMDGPU/mai.s                     | 21 +++++++++++++-
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 806cc482f634e..4221e3f053718 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1346,6 +1346,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool validateOpSel(const MCInst &Inst);
   bool validateVccOperand(unsigned Reg) const;
   bool validateVOP3Literal(const MCInst &Inst) const;
+  bool validateMAIAccWrite(const MCInst &Inst);
   unsigned getConstantBusLimit(unsigned Opcode) const;
   bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
   bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -3147,6 +3148,30 @@ bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst) {
   return !isSGPR(mc2PseudoReg(Reg), TRI);
 }
 
+bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst) {
+  // Diagnoses an SGPR used as src0 of V_ACCVGPR_WRITE_B32; false on error.
+  const unsigned Opc = Inst.getOpcode();
+
+  if (Opc != AMDGPU::V_ACCVGPR_WRITE_B32_vi)
+    return true;
+
+  const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  assert(Src0Idx != -1);
+
+  const MCOperand &Src0 = Inst.getOperand(Src0Idx);
+  if (!Src0.isReg())
+    return true; // Non-register operands are not checked here.
+
+  auto Reg = Src0.getReg();
+  const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+  if (isSGPR(mc2PseudoReg(Reg), TRI)) {
+    Error(getLoc(), "source operand must be either a VGPR or an inline constant");
+    return false;
+  }
+
+  return true;
+}
+
 bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
 
   const unsigned Opc = Inst.getOpcode();
@@ -3617,6 +3642,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
   if (!validateSMEMOffset(Inst, Operands)) {
     return false;
   }
+  if (!validateMAIAccWrite(Inst)) {
+    return false;
+  }
 
   return true;
 }
diff --git a/llvm/test/MC/AMDGPU/mai.s b/llvm/test/MC/AMDGPU/mai.s
index 76aa534bdef65..09eddb0d258c8 100644
--- a/llvm/test/MC/AMDGPU/mai.s
+++ b/llvm/test/MC/AMDGPU/mai.s
@@ -1,4 +1,5 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck -check-prefix=GFX908 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck -check-prefix=GFX908 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 -show-encoding %s 2>&1 | FileCheck -check-prefix=NOGFX908 %s
 
 v_accvgpr_read_b32 v2, a0
 // GFX908: v_accvgpr_read_b32 v2, a0       ; encoding: [0x02,0x00,0xd8,0xd3,0x00,0x01,0x00,0x08]
@@ -24,6 +25,24 @@ v_accvgpr_write_b32 a2, v1
 v_accvgpr_write a2, v255
 // GFX908: v_accvgpr_write_b32 a2, v255    ; encoding: [0x02,0x00,0xd9,0xd3,0xff,0x01,0x00,0x00]
 
+v_accvgpr_write a2, 100
+// NOGFX908: error: invalid operand for instruction
+
+v_accvgpr_write a2, execz
+// NOGFX908: error: source operand must be either a VGPR or an inline constant
+
+v_accvgpr_write a2, vccz
+// NOGFX908: error: source operand must be either a VGPR or an inline constant
+
+v_accvgpr_write a2, scc
+// NOGFX908: error: source operand must be either a VGPR or an inline constant
+
+v_accvgpr_write a2, shared_base
+// NOGFX908: error: source operand must be either a VGPR or an inline constant
+
+v_accvgpr_write a2, pops_exiting_wave_id
+// NOGFX908: error: source operand must be either a VGPR or an inline constant
+
 v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[1:32]
 // GFX908: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[1:32] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x06,0x04]
 

From 7716681cfd0ea2dadbddae6f1983e130c2fa4247 Mon Sep 17 00:00:00 2001
From: Alok Kumar Sharma <AlokKumar.Sharma@amd.com>
Date: Thu, 28 May 2020 15:46:06 +0530
Subject: [PATCH 344/770] Fixed bot failure after d20bf5a7258d4b6a7

There was a failure on the Windows bot due to a format mismatch (hex vs.
decimal) between platforms, even though the meaning of the output is the same.

For example on X86 linux =>
DW_OP_plus_uconst 0x70, DW_OP_deref, DW_OP_lit4, DW_OP_mul
              ^
on X86 Windows-gnu =>
DW_AT_location (DW_OP_fbreg +112, DW_OP_deref, DW_OP_lit4, DW_OP_mul)

: error: CHECK-SAME: expected string not found in input
; CHECK-SAME: DW_OP_plus_uconst 0x70, DW_OP_deref, DW_OP_lit4, DW_OP_mul
              ^
<stdin>:28:17: note: scanning from here
 DW_AT_location (DW_OP_fbreg +112, DW_OP_deref, DW_OP_lit4, DW_OP_mul)
                ^
<stdin>:28:18: note: possible intended match here
 DW_AT_location (DW_OP_fbreg +112, DW_OP_deref, DW_OP_lit4, DW_OP_mul)

Now the test is limited to x86 using REQUIRES and -mtriple.

http://45.33.8.238/win/16214/step_11.txt
---
 llvm/test/DebugInfo/fortranSubrangeVar.ll | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/test/DebugInfo/fortranSubrangeVar.ll b/llvm/test/DebugInfo/fortranSubrangeVar.ll
index 5ee283e4b21e5..5cc5f6075ff68 100644
--- a/llvm/test/DebugInfo/fortranSubrangeVar.ll
+++ b/llvm/test/DebugInfo/fortranSubrangeVar.ll
@@ -1,6 +1,7 @@
 ;; This test checks DISubrange bounds for DIVariable
+; REQUIRES: x86_64-linux
 
-; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
 
 ;; Test whether bounds are generated correctly.
 ; CHECK: [[DIE1:0x.+]]:       DW_TAG_variable

From 1ddac9563d7f2414e6c4302f9902ac1294966161 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 13:34:09 +0100
Subject: [PATCH 345/770] [X86][SSE] Peek though MOVMSK source sign bits using
 SimplifyMultipleUseDemandedBits

Allows SimplifyDemandedBitsForTargetNode to peek through multi-use ops where MOVMSK only demands the signbit of each vector element.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++-
 llvm/test/CodeGen/X86/vec_saddo.ll      | 17 ++---
 llvm/test/CodeGen/X86/vec_smulo.ll      | 83 ++++++++++++-------------
 llvm/test/CodeGen/X86/vec_ssubo.ll      | 17 ++---
 4 files changed, 66 insertions(+), 61 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d70b5a7f3a227..8ec958338c024 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37363,14 +37363,20 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 
     // MOVMSK only uses the MSB from each vector element.
     KnownBits KnownSrc;
-    if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
-                             KnownSrc, TLO, Depth + 1))
+    APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+    if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+                             Depth + 1))
       return true;
 
     if (KnownSrc.One[SrcBits - 1])
       Known.One.setLowBits(NumElts);
     else if (KnownSrc.Zero[SrcBits - 1])
       Known.Zero.setLowBits(NumElts);
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+            Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
     return false;
   }
   case X86ISD::BEXTR: {
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 5223187eee7db..dd3a733ab2178 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1078,12 +1078,13 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; SSE-NEXT:    paddd %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    movmskps %xmm1, %eax
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE-NEXT:    pxor %xmm0, %xmm1
 ; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: saddo_v4i1:
@@ -1094,8 +1095,8 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
@@ -1110,8 +1111,8 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index a3e28ae8d7baa..5fde07d1269df 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3422,21 +3422,20 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; SSE2-NEXT:    psubd %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pslld $31, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    movmskps %xmm3, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    movmskps %xmm0, %eax
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    retq
 ;
@@ -3461,21 +3460,20 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; SSSE3-NEXT:    psubd %xmm2, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    psrad $31, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pslld $31, %xmm3
-; SSSE3-NEXT:    psrad $31, %xmm3
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    por %xmm1, %xmm0
-; SSSE3-NEXT:    movmskps %xmm3, %eax
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    movmskps %xmm0, %eax
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm2
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    movb %al, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
@@ -3494,17 +3492,16 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 ; SSE41-NEXT:    pmulld %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    pslld $31, %xmm3
+; SSE41-NEXT:    movmskps %xmm3, %eax
 ; SSE41-NEXT:    psrad $31, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    pslld $31, %xmm2
-; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    movmskps %xmm2, %eax
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movb %al, (%rdi)
 ; SSE41-NEXT:    retq
 ;
@@ -3526,8 +3523,8 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpslld $31, %xmm0, %xmm3
-; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovmskps %xmm3, %eax
@@ -3552,8 +3549,8 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpslld $31, %xmm0, %xmm3
-; AVX2-NEXT:    vpsrad $31, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm4, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovmskps %xmm3, %eax
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 21a5e71036d78..8ab9367c32f86 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1088,12 +1088,13 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; SSE-NEXT:    psubd %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    movmskps %xmm1, %eax
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE-NEXT:    pxor %xmm0, %xmm1
 ; SSE-NEXT:    movb %al, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: ssubo_v4i1:
@@ -1104,8 +1105,8 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
@@ -1120,8 +1121,8 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax

From e73bb4fba7092f7e1ef807812063a0f655a185af Mon Sep 17 00:00:00 2001
From: Frederik Gossen <frgossen@google.com>
Date: Thu, 28 May 2020 13:35:51 +0000
Subject: [PATCH 346/770] [MLIR] Move `ConcatOp` to its lexicographic position

Purely cosmetic change.
The operation implementations in `Shape.cpp` are now in lexicographic order.

Differential Revision: https://reviews.llvm.org/D80277
---
 mlir/lib/Dialect/Shape/IR/Shape.cpp | 56 ++++++++++++++---------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index c4a8b15298171..fc8f9b23e1e46 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -180,6 +180,34 @@ OpFoldResult BroadcastOp::fold(ArrayRef<Attribute> operands) {
   return builder.getIndexTensorAttr(resultShape);
 }
 
+//===----------------------------------------------------------------------===//
+// ConcatOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+ConcatOp::inferReturnTypes(MLIRContext *context, Optional<Location> location,
+                           ValueRange operands, DictionaryAttr attributes,
+                           RegionRange regions,
+                           SmallVectorImpl<Type> &inferredReturnTypes) {
+  auto shapeType = ShapeType::get(context);
+  inferredReturnTypes.push_back(shapeType);
+  return success();
+}
+
+OpFoldResult ConcatOp::fold(ArrayRef<Attribute> operands) {
+  if (!operands[0] || !operands[1])
+    return nullptr;
+  auto lhsShape = llvm::to_vector<6>(
+      operands[0].cast<DenseIntElementsAttr>().getValues<int64_t>());
+  auto rhsShape = llvm::to_vector<6>(
+      operands[1].cast<DenseIntElementsAttr>().getValues<int64_t>());
+  SmallVector<int64_t, 6> resultShape;
+  resultShape.append(lhsShape.begin(), lhsShape.end());
+  resultShape.append(rhsShape.begin(), rhsShape.end());
+  Builder builder(getContext());
+  return builder.getIndexTensorAttr(resultShape);
+}
+
 //===----------------------------------------------------------------------===//
 // ConstShapeOp
 //===----------------------------------------------------------------------===//
@@ -341,34 +369,6 @@ LogicalResult SplitAtOp::fold(ArrayRef<Attribute> operands,
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// ConcatOp
-//===----------------------------------------------------------------------===//
-
-LogicalResult
-ConcatOp::inferReturnTypes(MLIRContext *context, Optional<Location> location,
-                           ValueRange operands, DictionaryAttr attributes,
-                           RegionRange regions,
-                           SmallVectorImpl<Type> &inferredReturnTypes) {
-  auto shapeType = ShapeType::get(context);
-  inferredReturnTypes.push_back(shapeType);
-  return success();
-}
-
-OpFoldResult ConcatOp::fold(ArrayRef<Attribute> operands) {
-  if (!operands[0] || !operands[1])
-    return nullptr;
-  auto lhsShape = llvm::to_vector<6>(
-      operands[0].cast<DenseIntElementsAttr>().getValues<int64_t>());
-  auto rhsShape = llvm::to_vector<6>(
-      operands[1].cast<DenseIntElementsAttr>().getValues<int64_t>());
-  SmallVector<int64_t, 6> resultShape;
-  resultShape.append(lhsShape.begin(), lhsShape.end());
-  resultShape.append(rhsShape.begin(), rhsShape.end());
-  Builder builder(getContext());
-  return builder.getIndexTensorAttr(resultShape);
-}
-
 //===----------------------------------------------------------------------===//
 // ToExtentTensorOp
 //===----------------------------------------------------------------------===//

From 1a945757ac7debd9ad14497fa404e3900869cad5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 13:58:35 +0100
Subject: [PATCH 347/770] WithColor.h - reduce unnecessary includes to forward
 declarations. NFC.

---
 llvm/include/llvm/Support/WithColor.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Support/WithColor.h b/llvm/include/llvm/Support/WithColor.h
index b5758b19be6da..411a92071fc72 100644
--- a/llvm/include/llvm/Support/WithColor.h
+++ b/llvm/include/llvm/Support/WithColor.h
@@ -9,15 +9,15 @@
 #ifndef LLVM_SUPPORT_WITHCOLOR_H
 #define LLVM_SUPPORT_WITHCOLOR_H
 
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Error.h"
 
 namespace llvm {
 
-extern cl::OptionCategory ColorCategory;
-
+class Error;
 class raw_ostream;
+class StringRef;
+
+extern cl::OptionCategory ColorCategory;
 
 // Symbolic names for various syntax elements.
 enum class HighlightColor {

From f6417f5db8c16286904d074d1e40e9c1eb083951 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 14:37:17 +0100
Subject: [PATCH 348/770] FileOutputBuffer.h - remove unused includes. NFC.

Move dependent includes down to source files where necessary.
---
 lld/Common/Strings.cpp                           | 1 +
 llvm/include/llvm/Support/FileOutputBuffer.h     | 2 --
 llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp | 3 +--
 llvm/lib/Support/FileOutputBuffer.cpp            | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/lld/Common/Strings.cpp b/lld/Common/Strings.cpp
index 605d9de685eda..17c2c207491ff 100644
--- a/lld/Common/Strings.cpp
+++ b/lld/Common/Strings.cpp
@@ -10,6 +10,7 @@
 #include "lld/Common/ErrorHandler.h"
 #include "lld/Common/LLVM.h"
 #include "llvm/Demangle/Demangle.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/GlobPattern.h"
 #include <algorithm>
 #include <mutex>
diff --git a/llvm/include/llvm/Support/FileOutputBuffer.h b/llvm/include/llvm/Support/FileOutputBuffer.h
index bdc1425d43617..8eb36d0034ad4 100644
--- a/llvm/include/llvm/Support/FileOutputBuffer.h
+++ b/llvm/include/llvm/Support/FileOutputBuffer.h
@@ -13,11 +13,9 @@
 #ifndef LLVM_SUPPORT_FILEOUTPUTBUFFER_H
 #define LLVM_SUPPORT_FILEOUTPUTBUFFER_H
 
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 
 namespace llvm {
 /// FileOutputBuffer - This interface provides simple way to create an in-memory
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index d6a187ebb9f6a..deb0f201a71ed 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -7,9 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
-
 #include "llvm/ADT/BitVector.h"
-
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
@@ -23,6 +21,7 @@
 #include "llvm/Support/BinaryStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/CRC.h"
+#include "llvm/Support/Chrono.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/xxhash.h"
 
diff --git a/llvm/lib/Support/FileOutputBuffer.cpp b/llvm/lib/Support/FileOutputBuffer.cpp
index ec12820e9692c..3342682270dcd 100644
--- a/llvm/lib/Support/FileOutputBuffer.cpp
+++ b/llvm/lib/Support/FileOutputBuffer.cpp
@@ -12,8 +12,8 @@
 
 #include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Memory.h"
 #include "llvm/Support/Path.h"
 #include <system_error>

From dd484baffdf4a92e564c38a17d35a742e633b0e0 Mon Sep 17 00:00:00 2001
From: Frederik Gossen <frgossen@google.com>
Date: Thu, 28 May 2020 13:40:34 +0000
Subject: [PATCH 349/770] [MLIR] Tidy up documentation for `Shape_JoinOp`,
 `Shape_ReduceOp`, and `Shape_ConstSizeOp`

Fix places that refer to `shape.type` instead of `shape.shape`.

Differential Revision: https://reviews.llvm.org/D80278
---
 mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 406aac2db99a2..dddc4c3ea08cb 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -115,9 +115,9 @@ def Shape_ConstSizeOp : Shape_Op<"const_size",
     [ConstantLike,
      NoSideEffect,
      DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
-  let summary = "Creates a constant of !shape.size type.";
+  let summary = "Creates a constant of type `shape.size`";
   let description = [{
-    Creates a !shape.size type representing the constant size given by `value`.
+    Creates a `shape.size` type representing the constant size given by `value`.
 
     ```mlir
     %x = shape.const_size 10
@@ -217,10 +217,10 @@ def Shape_GetExtentOp : Shape_Op<"get_extent",
 def Shape_JoinOp : Shape_Op<"join", []> {
   let summary = "Returns the least general shape.size of its operands";
   let description = [{
-    An operation that computes the least general shape of input operands. This
-    effectively asserts that corresponding static dimensions are equal. The
-    behavior is to match each element of the `shape.type` and propagate the most
-    restrictive information, returning an invalid shape if there are
+    An operation that computes the least general shape of input operands.
+    This effectively asserts that corresponding static dimensions are equal.
+    The behavior is to match each element of the `shape.shape` and propagate the
+    most restrictive information, returning an invalid shape if there are
     contradictory requirements. E.g., using pseudo code
 
     ```
@@ -238,7 +238,7 @@ def Shape_JoinOp : Shape_Op<"join", []> {
     used to return an error to the user upon mismatch of dimensions.
 
     ```mlir
-    %c = shape.join %a, %b, error="<reason>" : !shape.type
+    %c = shape.join %a, %b, error="<reason>" : !shape.shape
     ```
   }];
 
@@ -279,14 +279,14 @@ def Shape_ReduceOp : Shape_Op<"reduce", []> {
     number of elements
 
     ```mlir
-    func @shape_num_elements(%shape : !shape.type) -> !shape.size {
+    func @shape_num_elements(%shape : !shape.shape) -> !shape.size {
       %0 = "shape.constant_dim"() {value = 1 : i32} : () -> !shape.size
       %1 = "shape.reduce"(%shape, %0) ( {
         ^bb0(%index: i32, %dim: !shape.size, %lci: !shape.size):
           %acc = "shape.mul"(%lci, %dim) :
             (!shape.size, !shape.size) -> !shape.size
           shape.yield %acc : !shape.size
-        }) : (!shape.type, !shape.size) -> (!shape.size)
+        }) : (!shape.shape, !shape.size) -> (!shape.size)
       return %1 : !shape.size
     }
     ```

From c3098e4f4036e96dbd3de0e61c5e114b0eb7bbb4 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Thu, 28 May 2020 13:36:40 +0200
Subject: [PATCH 350/770] [MLIR] Add TensorFromElementsOp to Standard ops.

Differential Revision: https://reviews.llvm.org/D80705
---
 .../mlir/Dialect/StandardOps/IR/Ops.td        | 33 ++++++++
 mlir/lib/Dialect/StandardOps/IR/Ops.cpp       | 80 +++++++++++++++++++
 mlir/test/IR/core-ops.mlir                    | 18 +++++
 mlir/test/IR/invalid-ops.mlir                 | 19 ++++-
 mlir/test/Transforms/canonicalize.mlir        | 12 +++
 5 files changed, 161 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
index 0304703172365..eae71b0263c16 100644
--- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
+++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
@@ -1524,6 +1524,39 @@ def ExtractElementOp : Std_Op<"extract_element",
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// TensorFromElementsOp
+//===----------------------------------------------------------------------===//
+
+def TensorFromElementsOp : Std_Op<"tensor_from_elements",
+    [NoSideEffect, SameOperandsAndResultElementType]> {
+  string summary = "tensor from elements operation.";
+  string description = [{
+    Create a 1D tensor from a range of same-type arguments.
+
+    Example:
+
+    ```mlir
+    tensor_from_elements(i_1, ..., i_N) :  tensor<Nxindex>
+    ```
+  }];
+
+  let arguments = (ins Variadic<AnyType>:$elements);
+  let results = (outs AnyTensor:$result);
+
+  let skipDefaultBuilders = 1;
+  let builders = [OpBuilder<
+    "OpBuilder &builder, OperationState &result, ValueRange elements", [{
+      assert(!elements.empty() && "expected at least one element");
+      result.addOperands(elements);
+      result.addTypes(
+          RankedTensorType::get({static_cast<int64_t>(elements.size())},
+                                *elements.getTypes().begin()));
+    }]>];
+
+  let hasCanonicalizer = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // FPExtOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
index 3d493a8a57a59..118a1119833c6 100644
--- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -1640,6 +1640,86 @@ OpFoldResult ExtractElementOp::fold(ArrayRef<Attribute> operands) {
   return {};
 }
 
+//===----------------------------------------------------------------------===//
+// TensorFromElementsOp
+//===----------------------------------------------------------------------===//
+
+static ParseResult parseTensorFromElementsOp(OpAsmParser &parser,
+                                             OperationState &result) {
+  SmallVector<OpAsmParser::OperandType, 4> elementsOperands;
+  Type resultType;
+  if (parser.parseLParen() || parser.parseOperandList(elementsOperands) ||
+      parser.parseRParen() || parser.parseOptionalAttrDict(result.attributes) ||
+      parser.parseColon() || parser.parseType(resultType))
+    return failure();
+
+  if (parser.resolveOperands(elementsOperands,
+                             resultType.cast<ShapedType>().getElementType(),
+                             result.operands))
+    return failure();
+
+  result.addTypes(resultType);
+  return success();
+}
+
+static void print(OpAsmPrinter &p, TensorFromElementsOp op) {
+  p << "tensor_from_elements(" << op.elements() << ')';
+  p.printOptionalAttrDict(op.getAttrs());
+  p << " : " << op.result().getType();
+}
+
+static LogicalResult verify(TensorFromElementsOp op) {
+  auto resultTensorType = op.result().getType().dyn_cast<RankedTensorType>();
+  if (!resultTensorType)
+    return op.emitOpError("expected result type to be a ranked tensor");
+
+  int64_t elementsCount = static_cast<int64_t>(op.elements().size());
+  if (resultTensorType.getRank() != 1 ||
+      resultTensorType.getShape().front() != elementsCount)
+    return op.emitOpError()
+           << "expected result type to be a 1D tensor with " << elementsCount
+           << (elementsCount == 1 ? " element" : " elements");
+  return success();
+}
+
+namespace {
+
+// Canonicalizes the pattern of the form
+//
+// %tensor = "tensor_from_elements(%element) : (i32) -> tensor<1xi32>
+// %extracted_element = extract_element %tensor[%c0] : tensor<1xi32>
+//
+// to just %element.
+struct ExtractElementFromTensorFromElements
+    : public OpRewritePattern<ExtractElementOp> {
+  using OpRewritePattern<ExtractElementOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ExtractElementOp extract,
+                                PatternRewriter &rewriter) const final {
+    if (extract.indices().size() != 1)
+      return failure();
+
+    auto tensor_from_elements =
+        dyn_cast<TensorFromElementsOp>(extract.aggregate().getDefiningOp());
+    if (tensor_from_elements == nullptr)
+      return failure();
+
+    APInt index;
+    if (!matchPattern(*extract.indices().begin(), m_ConstantInt(&index)))
+      return failure();
+    rewriter.replaceOp(extract,
+                       tensor_from_elements.getOperand(index.getZExtValue()));
+    return success();
+  }
+};
+
+} // namespace
+
+void TensorFromElementsOp::getCanonicalizationPatterns(
+    OwningRewritePatternList &results, MLIRContext *context) {
+  results.insert<ExtractElementFromTensorFromElements>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // FPExtOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir
index 41172aa22527b..7727fa5e0363d 100644
--- a/mlir/test/IR/core-ops.mlir
+++ b/mlir/test/IR/core-ops.mlir
@@ -644,6 +644,24 @@ func @extract_element(%arg0: tensor<*xi32>, %arg1 : tensor<4x4xf32>) -> i32 {
   return %0 : i32
 }
 
+// CHECK-LABEL: func @tensor_from_elements() {
+func @tensor_from_elements() {
+  %c0 = "std.constant"() {value = 0: index} : () -> index
+  // CHECK: %0 = tensor_from_elements(%c0) : tensor<1xindex>
+  %0 = tensor_from_elements(%c0) : tensor<1xindex>
+
+  %c1 = "std.constant"() {value = 1: index} : () -> index
+  // CHECK: %1 = tensor_from_elements(%c0, %c1) : tensor<2xindex>
+  %1 = tensor_from_elements(%c0, %c1) : tensor<2xindex>
+
+  %c0_f32 = "std.constant"() {value = 0.0: f32} : () -> f32
+  // CHECK: [[C0_F32:%.*]] = constant
+  // CHECK: %2 = tensor_from_elements([[C0_F32]]) : tensor<1xf32>
+  %2 = tensor_from_elements(%c0_f32) : tensor<1xf32>
+
+  return
+}
+
 // CHECK-LABEL: func @tensor_cast(%arg0
 func @tensor_cast(%arg0: tensor<*xf32>, %arg1 : tensor<4x4xf32>, %arg2: tensor<?x?xf32>) {
   // CHECK: %0 = tensor_cast %arg0 : tensor<*xf32> to tensor<?x?xf32>
diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir
index b0535047874fd..c8e40c520139d 100644
--- a/mlir/test/IR/invalid-ops.mlir
+++ b/mlir/test/IR/invalid-ops.mlir
@@ -605,7 +605,24 @@ func @extract_element_tensor_too_many_indices(%t : tensor<2x3xf32>, %i : index)
 
 func @extract_element_tensor_too_few_indices(%t : tensor<2x3xf32>, %i : index) {
   // expected-error@+1 {{incorrect number of indices for extract_element}}
-  %0 = "std.extract_element"(%t, %i) : (tensor<2x3xf32>, index) -> f32
+  %0 = "std.extract_element"(%t, %i) : (tensor<2x3xf32>, index) -> f32 return
+}
+
+// -----
+
+func @tensor_from_elements_wrong_result_type() {
+  // expected-error@+2 {{expected result type to be a ranked tensor}}
+  %c0 = constant 0 : i32
+  %0 = tensor_from_elements(%c0) : tensor<*xi32>
+  return
+}
+
+// -----
+
+func @tensor_from_elements_wrong_elements_count() {
+  // expected-error@+2 {{expected result type to be a 1D tensor with 1 element}}
+  %c0 = constant 0 : index
+  %0 = tensor_from_elements(%c0) : tensor<2xindex>
   return
 }
 
diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir
index b17cade291a5a..6e24bb3b2d832 100644
--- a/mlir/test/Transforms/canonicalize.mlir
+++ b/mlir/test/Transforms/canonicalize.mlir
@@ -971,3 +971,15 @@ func @memref_cast_folding_subview_static(%V: memref<16x16xf32>, %a: index, %b: i
   // CHECK:  memref_cast{{.*}}: memref<3x4xf32, #[[map0]]> to memref<3x4xf32, #[[map1]]>
   return %1: memref<3x4xf32, offset:?, strides:[?, 1]>
 }
+
+// -----
+
+// CHECK-LABEL: func @extract_element_from_tensor_from_elements
+func @extract_element_from_tensor_from_elements(%element : index) -> index {
+  // CHECK-SAME: ([[ARG:%.*]]: index)
+  %c0 = constant 0 : index
+  %tensor = tensor_from_elements(%element) : tensor<1xindex>
+  %extracted_element = extract_element %tensor[%c0] : tensor<1xindex>
+  // CHECK: [[ARG]] : index
+  return %extracted_element : index
+}

From 6594d54571ee5887f031555a7660b8d8e74194d3 Mon Sep 17 00:00:00 2001
From: Frederik Gossen <frgossen@google.com>
Date: Thu, 28 May 2020 13:55:02 +0000
Subject: [PATCH 351/770] [MLIR] Add `index_to_size` and `size_to_index` to the
 shape dialect

Add the two conversion operations `index_to_size` and `size_to_index` to the
shape dialect.
This facilitates the conversion of index types between the shape and the
standard dialect.

Differential Revision: https://reviews.llvm.org/D80280
---
 .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 39 +++++++++++++-
 mlir/lib/Dialect/Shape/IR/Shape.cpp           | 42 ++++++++++++++-
 mlir/test/Dialect/Shape/canonicalize.mlir     | 54 +++++++++++++++++++
 3 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index dddc4c3ea08cb..57d1954a31995 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -214,6 +214,25 @@ def Shape_GetExtentOp : Shape_Op<"get_extent",
   let hasFolder = 1;
 }
 
+def Shape_IndexToSizeOp : Shape_Op<"index_to_size", [
+    NoSideEffect,
+    DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let summary = "Converts a standard index to a shape size";
+  let description = [{
+    Converts a standard index to a `shape.size`.
+    This operation and its inverse, `size_to_index`, facilitate index conversion
+    between the standard and the shape dialect.
+    The behavior is undefined for negative indices.
+  }];
+
+  let arguments = (ins Index:$arg);
+  let results = (outs Shape_SizeType:$result);
+
+  let assemblyFormat = "attr-dict $arg";
+
+  let hasFolder = 1;
+}
+
 def Shape_JoinOp : Shape_Op<"join", []> {
   let summary = "Returns the least general shape.size of its operands";
   let description = [{
@@ -312,6 +331,25 @@ def Shape_ShapeOfOp : Shape_Op<"shape_of",
   let hasFolder = 1;
 }
 
+def Shape_SizeToIndexOp : Shape_Op<"size_to_index", [
+    NoSideEffect,
+    DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let summary = "Casts between index types of the shape and standard dialect";
+  let description = [{
+    Converts a `shape.size` to a standard index.
+    This operation and its inverse, `index_to_size`, facilitate index conversion
+    between the standard and the shape dialect.
+    The behavior is undefined for unknown and invalid arguments.
+  }];
+
+  let arguments = (ins Shape_SizeType:$arg);
+  let results = (outs Index:$result);
+
+  let assemblyFormat = "attr-dict $arg";
+
+  let hasFolder = 1;
+}
+
 def Shape_YieldOp : Shape_Op<"yield", [NoSideEffect, Terminator]> {
   let summary = "Returns the value to parent op";
 
@@ -523,7 +561,6 @@ def Shape_CstrEqOp : Shape_Op<"cstr_eq", []> {
   let assemblyFormat = "$inputs attr-dict";
 }
 
-
 // Canonicalization patterns.
 
 #endif // SHAPE_OPS
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index fc8f9b23e1e46..a077948fdd31c 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -249,7 +249,7 @@ static ParseResult parseConstShapeOp(OpAsmParser &parser,
   return success();
 }
 
-OpFoldResult ConstShapeOp::fold(ArrayRef<Attribute>) { return shape(); }
+OpFoldResult ConstShapeOp::fold(ArrayRef<Attribute>) { return shapeAttr(); }
 
 //===----------------------------------------------------------------------===//
 // ConstSizeOp
@@ -266,6 +266,26 @@ ConstSizeOp::inferReturnTypes(MLIRContext *context, Optional<Location> location,
 
 OpFoldResult ConstSizeOp::fold(ArrayRef<Attribute>) { return valueAttr(); }
 
+//===----------------------------------------------------------------------===//
+// IndexToSizeOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult IndexToSizeOp::fold(ArrayRef<Attribute> operands) {
+  // Constant values of both types, `shape.size` and `index`, are represented as
+  // `IntegerAttr`s which makes constant folding simple.
+  if (Attribute arg = operands[0])
+    return arg;
+  return {};
+}
+
+LogicalResult IndexToSizeOp::inferReturnTypes(
+    MLIRContext *context, Optional<Location> location, ValueRange operands,
+    DictionaryAttr attributes, RegionRange regions,
+    SmallVectorImpl<Type> &inferredReturnTypes) {
+  inferredReturnTypes.push_back(SizeType::get(context));
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // FromExtentsOp
 //===----------------------------------------------------------------------===//
@@ -333,6 +353,26 @@ OpFoldResult ShapeOfOp::fold(ArrayRef<Attribute>) {
   return builder.getIndexTensorAttr(type.getShape());
 }
 
+//===----------------------------------------------------------------------===//
+// SizeToIndexOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult SizeToIndexOp::fold(ArrayRef<Attribute> operands) {
+  // Constant values of both types, `shape.size` and `index`, are represented as
+  // `IntegerAttr`s which makes constant folding simple.
+  if (Attribute arg = operands[0])
+    return arg;
+  return {};
+}
+
+LogicalResult SizeToIndexOp::inferReturnTypes(
+    MLIRContext *context, Optional<Location> location, ValueRange operands,
+    DictionaryAttr attributes, RegionRange regions,
+    SmallVectorImpl<Type> &inferredReturnTypes) {
+  inferredReturnTypes.push_back(IndexType::get(context));
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // SplitAtOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir
index 23147e557a151..106171de60878 100644
--- a/mlir/test/Dialect/Shape/canonicalize.mlir
+++ b/mlir/test/Dialect/Shape/canonicalize.mlir
@@ -108,6 +108,60 @@ func @no_fold(%arg0: index) -> !shape.shape {
 }
 
 // -----
+// Cast constant size to index and fold it away.
+// CHECK-LABEL: func @const_size_to_index
+func @const_size_to_index() -> index {
+  // CHECK-NOT: shape.index_cast
+  %cs = shape.const_size 123
+  // CHECK: constant 123 : index
+  %ci = shape.size_to_index %cs
+  return %ci : index
+}
+
+// -----
+// Cast constant index to size and fold it away.
+// CHECK-LABEL: func @const_index_to_size
+func @const_index_to_size() -> !shape.size {
+  // CHECK-NOT: index_cast
+  %ci = constant 123 : index
+  // CHECK: shape.const_size 123
+  %cs = shape.index_to_size %ci
+  return %cs : !shape.size
+}
+
+// -----
+// Cast constant index to size, then back, and fold it away.
+// CHECK-LABEL: func @const_index_to_size_to_index
+func @const_index_to_size_to_index() -> index {
+  // CHECK-NOT: shape.index_cast
+  %ci0 = constant 123 : index
+  %cs0 = shape.index_to_size %ci0
+  // CHECK: %[[CI:.*]] = constant 123 : index
+  // CHECK-NEXT: return %[[CI]] : index
+  %ci1 = shape.size_to_index %cs0
+  return %ci1 : index
+}
+
+// -----
+// No folding.
+// CHECK-LABEL: func @nonfoldable_size_to_index
+func @nonfoldable_size_to_index(%cs : !shape.size) -> index {
+  // CHECK: shape.size_to_index
+  %ci = shape.size_to_index %cs
+  return %ci : index
+}
+
+// -----
+// No folding.
+// CHECK-LABEL: func @nonfoldable_index_to_size
+func @nonfoldable_index_to_size(%ci : index) -> !shape.size {
+  // CHECK: shape.index_to_size
+  %cs = shape.index_to_size %ci
+  return %cs : !shape.size
+}
+
+// -----
+
 // Canonicalization of shape.get_extent
 
 // Basic folding.

From 0da4353938368c1f2473cd24553989f84b964279 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 15 May 2020 17:29:40 -0400
Subject: [PATCH 352/770] AMDGPU: Add baseline test for ptrmask infer address
 space

---
 .../InferAddressSpaces/AMDGPU/ptrmask.ll      | 190 ++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll

diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
new file mode 100644
index 0000000000000..40daa4877dbf4
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces -instsimplify %s | FileCheck %s
+
+define i8 @ptrmask_cast_local_to_flat(i8 addrspace(3)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_private_to_flat(i8 addrspace(5)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_private_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(5)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(5)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_region_to_flat(i8 addrspace(2)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_region_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(2)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(2)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_global_to_flat(i8 addrspace(1)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_global_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(1)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(1)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_999_to_flat(i8 addrspace(999)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_999_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(999)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(999)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_flat_to_local(i8* %ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_flat_to_local(
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[PTR:%.*]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8* [[MASKED]] to i8 addrspace(3)*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[CAST]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %ptr, i64 %mask)
+  %cast = addrspacecast i8* %masked to i8 addrspace(3)*
+  %load = load i8, i8 addrspace(3)* %cast
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_flat_to_private(i8* %ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_flat_to_private(
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[PTR:%.*]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8* [[MASKED]] to i8 addrspace(5)*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(5)* [[CAST]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %ptr, i64 %mask)
+  %cast = addrspacecast i8* %masked to i8 addrspace(5)*
+  %load = load i8, i8 addrspace(5)* %cast
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_flat_to_global(i8* %ptr, i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_flat_to_global(
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[PTR:%.*]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8* [[MASKED]] to i8 addrspace(1)*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(1)* [[CAST]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %ptr, i64 %mask)
+  %cast = addrspacecast i8* %masked to i8 addrspace(1)*
+  %load = load i8, i8 addrspace(1)* %cast
+  ret i8 %load
+}
+
+@lds0 = internal addrspace(3) global i8 123, align 4
+@gv = internal addrspace(1) global i8 123, align 4
+
+define i8 @ptrmask_cast_local_to_flat_global(i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_global(
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* addrspacecast (i8 addrspace(3)* @lds0 to i8*), i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* addrspacecast (i8 addrspace(3)* @lds0 to i8*), i64 %mask)
+  %load = load i8, i8* %masked, align 1
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_global_to_flat_global(i64 %mask) {
+; CHECK-LABEL: @ptrmask_cast_global_to_flat_global(
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* addrspacecast (i8 addrspace(1)* @gv to i8*), i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* addrspacecast (i8 addrspace(1)* @gv to i8*), i64 %mask)
+  %load = load i8, i8* %masked, align 1
+  ret i8 %load
+}
+
+define i8 @multi_ptrmask_cast_global_to_flat(i8 addrspace(1)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @multi_ptrmask_cast_global_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(1)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, i8 addrspace(1)* [[SRC_PTR]], align 1
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %cast = addrspacecast i8 addrspace(1)* %src.ptr to i8*
+  %load0 = load i8, i8* %cast
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load1 = load i8, i8* %masked
+  %add = add i8 %load0, %load1
+  ret i8 %add
+}
+
+; Can't rewrite the ptrmask, but can rewrite other use instructions
+define i8 @multi_ptrmask_cast_local_to_flat(i8 addrspace(3)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @multi_ptrmask_cast_local_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, i8 addrspace(3)* [[SRC_PTR]], align 1
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %load0 = load i8, i8* %cast
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load1 = load i8, i8* %masked
+  %add = add i8 %load0, %load1
+  ret i8 %add
+}
+
+define i8 @multi_ptrmask_cast_region_to_flat(i8 addrspace(2)* %src.ptr, i64 %mask) {
+; CHECK-LABEL: @multi_ptrmask_cast_region_to_flat(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(2)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, i8 addrspace(2)* [[SRC_PTR]], align 1
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
+  %cast = addrspacecast i8 addrspace(2)* %src.ptr to i8*
+  %load0 = load i8, i8* %cast
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %mask)
+  %load1 = load i8, i8* %masked
+  %add = add i8 %load0, %load1
+  ret i8 %add
+}
+
+declare i8* @llvm.ptrmask.p0i8.i64(i8*, i64) #0
+declare i8 addrspace(5)* @llvm.ptrmask.p5i8.i32(i8 addrspace(5)*, i32) #0
+declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)*, i32) #0
+declare i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)*, i64) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }

From d6671ee90c1423eb18c6fab11819df850ae2200d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 15 May 2020 14:54:51 -0400
Subject: [PATCH 353/770] InferAddressSpaces: Handle ptrmask intrinsic

This one is slightly odd since it counts as an address expression,
and rewriting an address expression previously could never fail. Allow
the existing TTI hook to return the value to use, and re-use it to
handle ptrmask.

Handles the no-op addrspacecasts for AMDGPU. We could probably do
something better by analyzing the mask value with respect to the
address space, but leave that for now.
---
 .../llvm/Analysis/TargetTransformInfo.h       | 16 ++--
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  6 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  6 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  5 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 30 ++++++--
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  4 +-
 .../Transforms/Scalar/InferAddressSpaces.cpp  | 77 +++++++++++++++----
 .../InferAddressSpaces/AMDGPU/ptrmask.ll      | 21 +++--
 8 files changed, 113 insertions(+), 52 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c50c696741b17..51aa1cb1cb1ec 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -379,9 +379,10 @@ class TargetTransformInfo {
   /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
   /// NewV, which has a different address space. This should happen for every
   /// operand index that collectFlatAddressOperands returned for the intrinsic.
-  /// \returns true if the intrinsic /// was handled.
-  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
-                                        Value *NewV) const;
+  /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the
+  /// new value (which may be the original \p II with modified operands).
+  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
+                                          Value *NewV) const;
 
   /// Test whether calls to a function lower to actual program function
   /// calls.
@@ -1236,8 +1237,9 @@ class TargetTransformInfo::Concept {
   virtual unsigned getFlatAddressSpace() = 0;
   virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                           Intrinsic::ID IID) const = 0;
-  virtual bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
-                                                Value *NewV) const = 0;
+  virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
+                                                  Value *OldV,
+                                                  Value *NewV) const = 0;
   virtual bool isLoweredToCall(const Function *F) = 0;
   virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
                                        UnrollingPreferences &UP) = 0;
@@ -1505,8 +1507,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.collectFlatAddressOperands(OpIndexes, IID);
   }
 
-  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
-                                        Value *NewV) const override {
+  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
+                                          Value *NewV) const override {
     return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
   }
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index f98b8bf7da2c9..0e8fc5dd6cfa2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -86,9 +86,9 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
-  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
-                                        Value *NewV) const {
-    return false;
+  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
+                                          Value *NewV) const {
+    return nullptr;
   }
 
   bool isLoweredToCall(const Function *F) {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index cc751a5b47898..c751c3703ba7f 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -222,9 +222,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return false;
   }
 
-  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
-                                        Value *OldV, Value *NewV) const {
-    return false;
+  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
+                                          Value *NewV) const {
+    return nullptr;
   }
 
   bool isLegalAddImmediate(int64_t imm) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 9f319c40ae6a7..0c34050a66288 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -290,9 +290,8 @@ bool TargetTransformInfo::collectFlatAddressOperands(
   return TTIImpl->collectFlatAddressOperands(OpIndexes, IID);
 }
 
-bool TargetTransformInfo::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
-                                                           Value *OldV,
-                                                           Value *NewV) const {
+Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
+    IntrinsicInst *II, Value *OldV, Value *NewV) const {
   return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2405a24dd14f8..324dcba86c2ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -851,8 +851,9 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
   }
 }
 
-bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
-  IntrinsicInst *II, Value *OldV, Value *NewV) const {
+Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
+                                                    Value *OldV,
+                                                    Value *NewV) const {
   auto IntrID = II->getIntrinsicID();
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
@@ -862,7 +863,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
   case Intrinsic::amdgcn_ds_fmax: {
     const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
     if (!IsVolatile->isZero())
-      return false;
+      return nullptr;
     Module *M = II->getParent()->getParent()->getParent();
     Type *DestTy = II->getType();
     Type *SrcTy = NewV->getType();
@@ -870,7 +871,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
         Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
     II->setArgOperand(0, NewV);
     II->setCalledFunction(NewDecl);
-    return true;
+    return II;
   }
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private: {
@@ -880,12 +881,25 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
     LLVMContext &Ctx = NewV->getType()->getContext();
     ConstantInt *NewVal = (TrueAS == NewAS) ?
       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
-    II->replaceAllUsesWith(NewVal);
-    II->eraseFromParent();
-    return true;
+    return NewVal;
+  }
+  case Intrinsic::ptrmask: {
+    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
+    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+    if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS))
+      return nullptr;
+
+    Module *M = II->getParent()->getParent()->getParent();
+    Value *MaskOp = II->getArgOperand(1);
+    Type *MaskTy = MaskOp->getType();
+    Function *NewDecl = Intrinsic::getDeclaration(M, Intrinsic::ptrmask,
+                                                  {NewV->getType(), MaskTy});
+    CallInst *NewCall = CallInst::Create(NewDecl->getFunctionType(), NewDecl,
+                                         {NewV, MaskOp}, "", II);
+    return NewCall;
   }
   default:
-    return false;
+    return nullptr;
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index bc74965c1780c..72c040fa4d91f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -211,8 +211,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                   Intrinsic::ID IID) const;
-  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
-                                        Value *OldV, Value *NewV) const;
+  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
+                                          Value *NewV) const;
 
   unsigned getVectorSplitCost() { return 0; }
 
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 6fd6b84178548..d407d0439cd8d 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -175,6 +175,11 @@ class InferAddressSpaces : public FunctionPass {
 
   bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
 
+  Value *cloneInstructionWithNewAddressSpace(
+      Instruction *I, unsigned NewAddrSpace,
+      const ValueToValueMapTy &ValueWithNewAddrSpace,
+      SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+
   // Changes the flat address expressions in function F to point to specific
   // address spaces if InferredAddrSpace says so. Postorder is the postorder of
   // all flat expressions in the use-def graph of function F.
@@ -218,20 +223,24 @@ INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
 // TODO: Currently, we consider only phi, bitcast, addrspacecast, and
 // getelementptr operators.
 static bool isAddressExpression(const Value &V) {
-  if (!isa<Operator>(V))
+  const Operator *Op = dyn_cast<Operator>(&V);
+  if (!Op)
     return false;
 
-  const Operator &Op = cast<Operator>(V);
-  switch (Op.getOpcode()) {
+  switch (Op->getOpcode()) {
   case Instruction::PHI:
-    assert(Op.getType()->isPointerTy());
+    assert(Op->getType()->isPointerTy());
     return true;
   case Instruction::BitCast:
   case Instruction::AddrSpaceCast:
   case Instruction::GetElementPtr:
     return true;
   case Instruction::Select:
-    return Op.getType()->isPointerTy();
+    return Op->getType()->isPointerTy();
+  case Instruction::Call: {
+    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
+    return II && II->getIntrinsicID() == Intrinsic::ptrmask;
+  }
   default:
     return false;
   }
@@ -254,12 +263,17 @@ static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
     return {Op.getOperand(0)};
   case Instruction::Select:
     return {Op.getOperand(1), Op.getOperand(2)};
+  case Instruction::Call: {
+    const IntrinsicInst &II = cast<IntrinsicInst>(Op);
+    assert(II.getIntrinsicID() == Intrinsic::ptrmask &&
+           "unexpected intrinsic call");
+    return {II.getArgOperand(0)};
+  }
   default:
     llvm_unreachable("Unexpected instruction type.");
   }
 }
 
-// TODO: Move logic to TTI?
 bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
                                                   Value *OldV,
                                                   Value *NewV) const {
@@ -275,8 +289,17 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
     II->setCalledFunction(NewDecl);
     return true;
   }
-  default:
-    return TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
+  case Intrinsic::ptrmask:
+    // This is handled as an address expression, not as a use memory operation.
+    return false;
+  default: {
+    Value *Rewrite = TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
+    if (!Rewrite)
+      return false;
+    if (Rewrite != II)
+      II->replaceAllUsesWith(Rewrite);
+    return true;
+  }
   }
 }
 
@@ -285,6 +308,7 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
     DenseSet<Value *> &Visited) const {
   auto IID = II->getIntrinsicID();
   switch (IID) {
+  case Intrinsic::ptrmask:
   case Intrinsic::objectsize:
     appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
                                                  PostorderStack, Visited);
@@ -438,10 +462,13 @@ static Value *operandWithNewAddressSpaceOrCreateUndef(
 // Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
 // from a pointer whose type already matches. Therefore, this function returns a
 // Value* instead of an Instruction*.
-static Value *cloneInstructionWithNewAddressSpace(
+//
+// This may also return nullptr in the case the instruction could not be
+// rewritten.
+Value *InferAddressSpaces::cloneInstructionWithNewAddressSpace(
     Instruction *I, unsigned NewAddrSpace,
     const ValueToValueMapTy &ValueWithNewAddrSpace,
-    SmallVectorImpl<const Use *> *UndefUsesToFix) {
+    SmallVectorImpl<const Use *> *UndefUsesToFix) const {
   Type *NewPtrType =
       I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
 
@@ -456,6 +483,23 @@ static Value *cloneInstructionWithNewAddressSpace(
     return Src;
   }
 
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    // Technically the intrinsic ID is a pointer typed argument, so specially
+    // handle calls early.
+    assert(II->getIntrinsicID() == Intrinsic::ptrmask);
+    Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef(
+        II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
+        UndefUsesToFix);
+    Value *Rewrite =
+        TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
+    if (Rewrite) {
+      assert(Rewrite != II && "cannot modify this pointer operation in place");
+      return Rewrite;
+    }
+
+    return nullptr;
+  }
+
   // Computes the converted pointer operands.
   SmallVector<Value *, 4> NewPointerOperands;
   for (const Use &OperandUse : I->operands()) {
@@ -591,7 +635,7 @@ Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
   if (Instruction *I = dyn_cast<Instruction>(V)) {
     Value *NewV = cloneInstructionWithNewAddressSpace(
       I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
-    if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+    if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) {
       if (NewI->getParent() == nullptr) {
         NewI->insertBefore(I);
         NewI->takeName(I);
@@ -879,8 +923,10 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
   for (Value* V : Postorder) {
     unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
     if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
-      ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
-        V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+      Value *New = cloneValueWithNewAddressSpace(
+          V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+      if (New)
+        ValueWithNewAddrSpace[V] = New;
     }
   }
 
@@ -890,7 +936,10 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
   // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
   for (const Use *UndefUse : UndefUsesToFix) {
     User *V = UndefUse->getUser();
-    User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+    User *NewV = cast_or_null<User>(ValueWithNewAddrSpace.lookup(V));
+    if (!NewV)
+      continue;
+
     unsigned OperandNo = UndefUse->getOperandNo();
     assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
     NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
index 40daa4877dbf4..6f25a3ec2adbe 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
@@ -42,9 +42,8 @@ define i8 @ptrmask_cast_region_to_flat(i8 addrspace(2)* %src.ptr, i64 %mask) {
 
 define i8 @ptrmask_cast_global_to_flat(i8 addrspace(1)* %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_global_to_flat(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(1)* [[SRC_PTR:%.*]] to i8*
-; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* [[SRC_PTR:%.*]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %cast = addrspacecast i8 addrspace(1)* %src.ptr to i8*
@@ -55,9 +54,8 @@ define i8 @ptrmask_cast_global_to_flat(i8 addrspace(1)* %src.ptr, i64 %mask) {
 
 define i8 @ptrmask_cast_999_to_flat(i8 addrspace(999)* %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_999_to_flat(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(999)* [[SRC_PTR:%.*]] to i8*
-; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(999)* @llvm.ptrmask.p999i8.i64(i8 addrspace(999)* [[SRC_PTR:%.*]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(999)* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %cast = addrspacecast i8 addrspace(999)* %src.ptr to i8*
@@ -121,8 +119,8 @@ define i8 @ptrmask_cast_local_to_flat_global(i64 %mask) {
 
 define i8 @ptrmask_cast_global_to_flat_global(i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_global_to_flat_global(
-; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* addrspacecast (i8 addrspace(1)* @gv to i8*), i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* @gv, i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* addrspacecast (i8 addrspace(1)* @gv to i8*), i64 %mask)
@@ -132,10 +130,9 @@ define i8 @ptrmask_cast_global_to_flat_global(i64 %mask) {
 
 define i8 @multi_ptrmask_cast_global_to_flat(i8 addrspace(1)* %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @multi_ptrmask_cast_global_to_flat(
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(1)* [[SRC_PTR:%.*]] to i8*
-; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, i8 addrspace(1)* [[SRC_PTR]], align 1
-; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 [[MASK:%.*]])
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i8, i8 addrspace(1)* [[SRC_PTR:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* [[SRC_PTR]], i64 [[MASK:%.*]])
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 1
 ; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD0]], [[LOAD1]]
 ; CHECK-NEXT:    ret i8 [[ADD]]
 ;

From fdaa391e3df3c3a555d933122b0ef85eaf5eb63c Mon Sep 17 00:00:00 2001
From: Frederik Gossen <frgossen@google.com>
Date: Thu, 28 May 2020 14:04:39 +0000
Subject: [PATCH 354/770] [MLIR] Add `num_elements` to the shape dialect

The operation `num_elements` determines the number of elements for a given
shape, that is, the product of its dimensions.

Differential Revision: https://reviews.llvm.org/D80281
---
 .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 18 +++++++++++++
 mlir/lib/Dialect/Shape/IR/Shape.cpp           | 26 +++++++++++++++++++
 mlir/test/Dialect/Shape/canonicalize.mlir     | 22 ++++++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 57d1954a31995..0d300d3c64c8c 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -278,6 +278,24 @@ def Shape_MulOp : Shape_Op<"mul", [SameOperandsAndResultType]> {
   let results = (outs Shape_SizeType:$result);
 }
 
+def Shape_NumElementsOp : Shape_Op<"num_elements", [
+    NoSideEffect,
+    DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+
+  let summary = "Returns the number of elements for a given shape";
+  let description = [{
+    Returns the number of elements for a given shape which is the product of its
+    dimensions.
+  }];
+
+  let arguments = (ins Shape_ShapeType:$shape);
+  let results = (outs Shape_SizeType:$result);
+
+  let assemblyFormat = "attr-dict $shape";
+
+  let hasFolder = 1;
+}
+
 def Shape_ReduceOp : Shape_Op<"reduce", []> {
   let summary = "Returns an expression reduced over a shape";
   let description = [{
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index a077948fdd31c..b0103e15fa35a 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -332,6 +332,32 @@ OpFoldResult GetExtentOp::fold(ArrayRef<Attribute> operands) {
   return elements.getValue({dimToGet});
 }
 
+//===----------------------------------------------------------------------===//
+// NumElementsOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult NumElementsOp::fold(ArrayRef<Attribute> operands) {
+
+  // Fold only when argument constant.
+  Attribute shape = operands[0];
+  if (!shape)
+    return {};
+
+  APInt product(64, 1);
+  for (auto value : shape.cast<DenseIntElementsAttr>())
+    product *= value;
+  Builder builder(getContext());
+  return builder.getIndexAttr(product.getLimitedValue());
+}
+
+LogicalResult NumElementsOp::inferReturnTypes(
+    MLIRContext *context, Optional<Location> location, ValueRange operands,
+    DictionaryAttr attributes, RegionRange regions,
+    SmallVectorImpl<Type> &inferredReturnTypes) {
+  inferredReturnTypes.push_back(SizeType::get(context));
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // ShapeOfOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir
index 106171de60878..69c312e6dad78 100644
--- a/mlir/test/Dialect/Shape/canonicalize.mlir
+++ b/mlir/test/Dialect/Shape/canonicalize.mlir
@@ -160,6 +160,28 @@ func @nonfoldable_index_to_size(%ci : index) -> !shape.size {
   return %cs : !shape.size
 }
 
+// -----
+// Fold number of elements computation.
+// CHECK-LABEL: func @num_elements
+func @num_elements() -> !shape.size {
+  // CHECK-NOT: shape.const_shape
+  %shape = shape.const_shape [4, 5, 6]
+  // CHECK-NOT: shape.num_elements
+  %num_elements = shape.num_elements %shape
+  // CHECK: %[[NUM:.*]] = shape.const_size 120
+  // CHECK-NEXT: return %[[NUM]] : !shape.size
+  return %num_elements : !shape.size
+}
+
+// -----
+// No folding.
+// CHECK-LABEL: func @nonfoldable_num_elements
+func @nonfoldable_num_elements(%shape : !shape.shape) -> !shape.size {
+  // CHECK-NOT: shape.const_{{.*}}
+  %num_elements = shape.num_elements %shape
+  return %num_elements : !shape.size
+}
+
 // -----
 
 // Canonicalization of shape.get_extent

From 061fb8eb2d9f6ffa05f2b57670c918c477ca7f36 Mon Sep 17 00:00:00 2001
From: "Wen-Heng (Jack) Chung" <whchung@gmail.com>
Date: Fri, 22 May 2020 16:25:00 -0500
Subject: [PATCH 355/770] [mlir][gpu][mlir-cuda-runner] Refactor
 ConvertKernelFuncToCubin to be generic.

Make the ConvertKernelFuncToCubin pass generic:

- Rename to ConvertKernelFuncToBlob.
- Allow specifying triple, target chip, target features.
- Initializing LLVM backend is supplied by a callback function.
- Lowering process from MLIR module to LLVM module is via another callback.
- Change mlir-cuda-runner to adopt the revised pass.
- Add new tests for lowering to ROCm HSA code object (HSACO).
- Tests for CUDA and ROCm are kept in separate directories.

Differential Revision: https://reviews.llvm.org/D80142
---
 mlir/CMakeLists.txt                           |   9 +
 .../mlir/Conversion/GPUCommon/GPUCommonPass.h |  48 ++++-
 .../mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h |  50 ------
 mlir/include/mlir/InitAllPasses.h             |   1 -
 mlir/lib/Conversion/CMakeLists.txt            |   1 -
 mlir/lib/Conversion/GPUCommon/CMakeLists.txt  |   7 +-
 .../GPUCommon/ConvertKernelFuncToBlob.cpp     | 168 ++++++++++++++++++
 mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt  |  35 ----
 .../GPUToCUDA/ConvertKernelFuncToCubin.cpp    | 165 -----------------
 mlir/test/Conversion/GPUToROCm/lit.local.cfg  |   2 +
 .../lower-rocdl-kernel-to-hsaco.mlir          |  26 +++
 mlir/test/lib/Transforms/CMakeLists.txt       |  29 ++-
 .../TestConvertGPUKernelToCubin.cpp           |  28 ++-
 .../TestConvertGPUKernelToHsaco.cpp           |  41 +++++
 mlir/test/lit.site.cfg.py.in                  |   1 +
 .../mlir-cuda-runner/mlir-cuda-runner.cpp     |  19 +-
 mlir/tools/mlir-opt/mlir-opt.cpp              |   4 +
 17 files changed, 359 insertions(+), 275 deletions(-)
 delete mode 100644 mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
 create mode 100644 mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp
 delete mode 100644 mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
 delete mode 100644 mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
 create mode 100644 mlir/test/Conversion/GPUToROCm/lit.local.cfg
 create mode 100644 mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
 create mode 100644 mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp

diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 7c2c5978c44e6..0cf1e8d445168 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -31,6 +31,15 @@ endif()
 # TODO: we should use a config.h file like LLVM does
 add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_CUDA_CONVERSIONS_ENABLED})
 
+# Build the ROCm conversions and run the corresponding tests if the AMDGPU
+# backend is available
+if ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(MLIR_ROCM_CONVERSIONS_ENABLED 1)
+else()
+  set(MLIR_ROCM_CONVERSIONS_ENABLED 0)
+endif()
+add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED})
+
 set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner")
 set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner")
 
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 791d859f64147..2c4b3dc6ac88b 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -9,19 +9,33 @@
 #define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
 
 #include "mlir/Support/LLVM.h"
-#include <functional>
-#include <memory>
-#include <string>
+#include "llvm/IR/Module.h"
 #include <vector>
 
 namespace mlir {
 
 class Location;
+class LogicalResult;
 class ModuleOp;
+class Operation;
 
 template <typename T>
 class OperationPass;
 
+namespace gpu {
+class GPUModuleOp;
+} // namespace gpu
+
+namespace LLVM {
+class LLVMDialect;
+} // namespace LLVM
+
+using OwnedBlob = std::unique_ptr<std::vector<char>>;
+using BlobGenerator =
+    std::function<OwnedBlob(const std::string &, Location, StringRef)>;
+using LoweringCallback =
+    std::function<std::unique_ptr<llvm::Module>(Operation *)>;
+
 /// Creates a pass to convert a gpu.launch_func operation into a sequence of
 /// GPU runtime calls.
 ///
@@ -31,6 +45,34 @@ class OperationPass;
 std::unique_ptr<OperationPass<ModuleOp>>
 createConvertGpuLaunchFuncToGpuRuntimeCallsPass();
 
+/// Creates a pass to convert a gpu.module into a GPU target object blob.
+///
+/// This transformation lowers the gpu.module to an LLVM module with
+/// loweringCallback, compiles that module with help of the LLVM backend
+/// selected by triple to target assembly and then invokes the provided
+/// blobGenerator to produce a binary blob. Such blob is then attached as a
+/// string attribute to the gpu.module.
+///
+/// The following callbacks are to be provided by the user:
+/// - loweringCallback : lower the module to an LLVM module.
+/// - blobGenerator : build a blob executable on target GPU.
+///
+/// Information about the LLVM backend is to be supplied by the user:
+/// - triple : target triple to be used.
+/// - targetChip : mcpu to be used.
+/// - features : target-specific features to be used.
+///
+/// Information about the result attribute is to be specified by the user:
+/// - gpuBinaryAnnotation : the name of the attribute which contains the blob.
+///
+/// The pass does not initialize the LLVM backend itself; callers must do so
+/// before running it.
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback,
+                                 BlobGenerator blobGenerator, StringRef triple,
+                                 StringRef targetChip, StringRef features,
+                                 StringRef gpuBinaryAnnotation);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
deleted file mode 100644
index bac13d6d7ccbc..0000000000000
--- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
+++ /dev/null
@@ -1,50 +0,0 @@
-//===- GPUToCUDAPass.h - MLIR CUDA runtime support --------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_
-#define MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_
-
-#include "mlir/Support/LLVM.h"
-#include <functional>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace mlir {
-
-class Location;
-class ModuleOp;
-
-template <typename T> class OperationPass;
-
-namespace gpu {
-class GPUModuleOp;
-} // namespace gpu
-
-namespace LLVM {
-class LLVMDialect;
-} // namespace LLVM
-
-using OwnedCubin = std::unique_ptr<std::vector<char>>;
-using CubinGenerator =
-    std::function<OwnedCubin(const std::string &, Location, StringRef)>;
-
-/// Creates a pass to convert kernel functions into CUBIN blobs.
-///
-/// This transformation takes the body of each function that is annotated with
-/// the 'nvvm.kernel' attribute, copies it to a new LLVM module, compiles the
-/// module with help of the nvptx backend to PTX and then invokes the provided
-/// cubinGenerator to produce a binary blob (the cubin). Such blob is then
-/// attached as a string attribute named 'nvvm.cubin' to the kernel function.
-/// After the transformation, the body of the kernel function is removed (i.e.,
-/// it is turned into a declaration).
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
-
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 66083f671cde0..fb2ac1ee086f2 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -16,7 +16,6 @@
 
 #include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
 #include "mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 248f5f5a0e6c3..8b70e6523106c 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_subdirectory(AffineToStandard)
 add_subdirectory(AVX512ToLLVM)
 add_subdirectory(GPUCommon)
-add_subdirectory(GPUToCUDA)
 add_subdirectory(GPUToNVVM)
 add_subdirectory(GPUToROCDL)
 add_subdirectory(GPUToSPIRV)
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index a01fb7676b105..eb7d21f66f73d 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -1,9 +1,6 @@
-set(SOURCES
-  ConvertLaunchFuncToRuntimeCalls.cpp
-)
-
 add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms
-  ${SOURCES}
+  ConvertLaunchFuncToRuntimeCalls.cpp
+  ConvertKernelFuncToBlob.cpp
 
   DEPENDS
   MLIRConversionPassIncGen
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp
new file mode 100644
index 0000000000000..cf41523d3b293
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp
@@ -0,0 +1,168 @@
+//===- ConvertKernelFuncToBlob.cpp - MLIR GPU lowering passes -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to convert gpu kernel functions into a
+// corresponding binary blob that can be executed on a GPU. Currently
+// only translates the function itself but no dependencies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/Module.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Support/LogicalResult.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace mlir;
+
+namespace {
+
+/// A pass converting a gpu.module to a blob with target instructions.
+///
+/// The module is lowered to an LLVM module by the user-provided
+/// LoweringCallback and compiled to target ISA by the LLVM backend. A user
+/// provided BlobGenerator then turns the ISA into GPU binary code, which is
+/// attached to the gpu.module as a string attribute.
+class GpuKernelToBlobPass
+    : public PassWrapper<GpuKernelToBlobPass, OperationPass<gpu::GPUModuleOp>> {
+public:
+  GpuKernelToBlobPass(LoweringCallback loweringCallback,
+                      BlobGenerator blobGenerator, StringRef triple,
+                      StringRef targetChip, StringRef features,
+                      StringRef gpuBinaryAnnotation)
+      : loweringCallback(loweringCallback), blobGenerator(blobGenerator),
+        triple(triple), targetChip(targetChip.str()),
+        features(features.str()), blobAnnotation(gpuBinaryAnnotation.str()) {}
+
+  void runOnOperation() override {
+    gpu::GPUModuleOp module = getOperation();
+
+    // Lock access to the llvm context.
+    llvm::sys::SmartScopedLock<true> scopedLock(
+        module.getContext()
+            ->getRegisteredDialect<LLVM::LLVMDialect>()
+            ->getLLVMContextMutex());
+
+    // Lower the module to a llvm module.
+    std::unique_ptr<llvm::Module> llvmModule = loweringCallback(module);
+    if (!llvmModule)
+      return signalPassFailure();
+
+    // Translate the llvm module to a blob and attach it to the gpu.module.
+    if (auto blobAttr = translateGPUModuleToBinaryAnnotation(
+            *llvmModule, module.getLoc(), module.getName()))
+      module.setAttr(blobAnnotation, blobAttr);
+    else
+      signalPassFailure();
+  }
+
+private:
+  /// Emits target assembly for a clone of module using targetMachine.
+  std::string translateModuleToISA(llvm::Module &module,
+                                   llvm::TargetMachine &targetMachine);
+
+  /// Converts llvmModule to a blob using the user-provided generator. loc is
+  /// used for error reporting; name is forwarded to the generator for logging.
+  OwnedBlob convertModuleToBlob(llvm::Module &llvmModule, Location loc,
+                                StringRef name);
+
+  /// Translates llvmModule to a blob with target instructions and returns the
+  /// result as attribute (null on failure).
+  StringAttr translateGPUModuleToBinaryAnnotation(llvm::Module &llvmModule,
+                                                  Location loc, StringRef name);
+
+  LoweringCallback loweringCallback;
+  BlobGenerator blobGenerator;
+  llvm::Triple triple;
+  // Stored by value: the pass can outlive the strings given to the factory.
+  std::string targetChip;
+  std::string features;
+  std::string blobAnnotation;
+};
+
+} // anonymous namespace
+
+std::string
+GpuKernelToBlobPass::translateModuleToISA(llvm::Module &module,
+                                          llvm::TargetMachine &targetMachine) {
+  std::string assembly;
+  {
+    // Compile a copy of the module living in its own LLVMContext so that
+    // several threads can run code generation at the same time.
+    llvm::LLVMContext privateContext;
+    auto clonedModule =
+        LLVM::cloneModuleIntoNewContext(&privateContext, &module);
+    llvm::raw_string_ostream assemblyStream(assembly);
+    llvm::buffer_ostream bufferedStream(assemblyStream);
+    llvm::legacy::PassManager passManager;
+    targetMachine.addPassesToEmitFile(passManager, bufferedStream, nullptr,
+                                      llvm::CGFT_AssemblyFile);
+    passManager.run(*clonedModule);
+  }
+  // The streams were destroyed with the scope above, flushing into assembly.
+  return assembly;
+}
+
+OwnedBlob GpuKernelToBlobPass::convertModuleToBlob(llvm::Module &llvmModule,
+                                                   Location loc,
+                                                   StringRef name) {
+  std::string error;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget("", triple, error);
+  if (target == nullptr) {
+    emitError(loc, "cannot initialize target triple");
+    return {};
+  }
+  // createTargetMachine may return null; fail instead of dereferencing it.
+  std::unique_ptr<llvm::TargetMachine> targetMachine(
+      target->createTargetMachine(triple.str(), targetChip, features, {}, {}));
+  if (!targetMachine) {
+    emitError(loc, "cannot create target machine");
+    return {};
+  }
+
+  llvmModule.setDataLayout(targetMachine->createDataLayout());
+  return blobGenerator(translateModuleToISA(llvmModule, *targetMachine), loc,
+                       name);
+}
+
+StringAttr GpuKernelToBlobPass::translateGPUModuleToBinaryAnnotation(
+    llvm::Module &llvmModule, Location loc, StringRef name) {
+  // A null blob means conversion failed; report it as a null attribute.
+  if (OwnedBlob owned = convertModuleToBlob(llvmModule, loc, name))
+    return StringAttr::get({owned->data(), owned->size()}, loc->getContext());
+  return {};
+}
+
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+mlir::createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback,
+                                       BlobGenerator blobGenerator,
+                                       StringRef triple, StringRef targetChip,
+                                       StringRef features,
+                                       StringRef gpuBinaryAnnotation) {
+  return std::make_unique<GpuKernelToBlobPass>(loweringCallback, blobGenerator,
+                                               triple, targetChip, features,
+                                               gpuBinaryAnnotation);
+}
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
deleted file mode 100644
index 90cc8d573ff34..0000000000000
--- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-set(LLVM_OPTIONAL_SOURCES
-  ConvertKernelFuncToCubin.cpp
-)
-
-if (MLIR_CUDA_CONVERSIONS_ENABLED)
-  set(NVPTX_LIBS
-    MC
-    NVPTXCodeGen
-    NVPTXDesc
-    NVPTXInfo
-  )
-
-  add_mlir_conversion_library(MLIRGPUtoCUDATransforms
-    ConvertKernelFuncToCubin.cpp
-
-    DEPENDS
-    MLIRConversionPassIncGen
-    intrinsics_gen
-
-    LINK_COMPONENTS
-    Core
-    ${NVPTX_LIBS}
-
-    LINK_LIBS PUBLIC
-    MLIRGPU
-    MLIRIR
-    MLIRLLVMIR
-    MLIRNVVMIR
-    MLIRPass
-    MLIRSupport
-    MLIRTargetNVVMIR
-  )
-else()
-  add_library(MLIRGPUtoCUDATransforms INTERFACE IMPORTED GLOBAL)
-endif()
diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
deleted file mode 100644
index 3f99c56c47169..0000000000000
--- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a pass to convert gpu kernel functions into a
-// corresponding binary blob that can be executed on a CUDA GPU. Currently
-// only translates the function itself but no dependencies.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
-
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Function.h"
-#include "mlir/IR/Module.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassRegistry.h"
-#include "mlir/Support/LogicalResult.h"
-#include "mlir/Target/NVVMIR.h"
-
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace mlir;
-
-namespace {
-// TODO(herhut): Move to shared location.
-static constexpr const char *kCubinAnnotation = "nvvm.cubin";
-
-/// A pass converting tagged kernel modules to cubin blobs.
-///
-/// If tagged as a kernel module, each contained function is translated to NVVM
-/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
-/// GPU binary code, which is then attached as an attribute to the function. The
-/// function body is erased.
-class GpuKernelToCubinPass
-    : public PassWrapper<GpuKernelToCubinPass,
-                         OperationPass<gpu::GPUModuleOp>> {
-public:
-  GpuKernelToCubinPass(CubinGenerator cubinGenerator)
-      : cubinGenerator(cubinGenerator) {}
-
-  void runOnOperation() override {
-    gpu::GPUModuleOp module = getOperation();
-
-    // Lock access to the llvm context.
-    llvm::sys::SmartScopedLock<true> scopedLock(
-        module.getContext()
-            ->getRegisteredDialect<LLVM::LLVMDialect>()
-            ->getLLVMContextMutex());
-
-    // Make sure the NVPTX target is initialized.
-    LLVMInitializeNVPTXTarget();
-    LLVMInitializeNVPTXTargetInfo();
-    LLVMInitializeNVPTXTargetMC();
-    LLVMInitializeNVPTXAsmPrinter();
-
-    auto llvmModule = translateModuleToNVVMIR(module);
-    if (!llvmModule)
-      return signalPassFailure();
-
-    // Translate the module to CUBIN and attach the result as attribute to the
-    // module.
-    if (auto cubinAttr = translateGPUModuleToCubinAnnotation(
-            *llvmModule, module.getLoc(), module.getName()))
-      module.setAttr(kCubinAnnotation, cubinAttr);
-    else
-      signalPassFailure();
-  }
-
-private:
-  std::string translateModuleToPtx(llvm::Module &module,
-                                   llvm::TargetMachine &target_machine);
-
-  /// Converts llvmModule to cubin using the user-provided generator. Location
-  /// is used for error reporting and name is forwarded to the CUBIN generator
-  /// to use in its logging mechanisms.
-  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
-                                  StringRef name);
-
-  /// Translates llvmModule to cubin and returns the result as attribute.
-  StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule,
-                                                 Location loc, StringRef name);
-
-  CubinGenerator cubinGenerator;
-};
-
-} // anonymous namespace
-
-std::string GpuKernelToCubinPass::translateModuleToPtx(
-    llvm::Module &module, llvm::TargetMachine &target_machine) {
-  std::string ptx;
-  {
-    // Clone the llvm module into a new context to enable concurrent compilation
-    // with multiple threads.
-    // TODO(zinenko): Reevaluate model of ownership of LLVMContext in
-    //                LLVMDialect.
-    llvm::LLVMContext llvmContext;
-    auto clone = LLVM::cloneModuleIntoNewContext(&llvmContext, &module);
-
-    llvm::raw_string_ostream stream(ptx);
-    llvm::buffer_ostream pstream(stream);
-    llvm::legacy::PassManager codegen_passes;
-    target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr,
-                                       llvm::CGFT_AssemblyFile);
-    codegen_passes.run(*clone);
-  }
-
-  return ptx;
-}
-
-OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
-                                                      Location loc,
-                                                      StringRef name) {
-  std::unique_ptr<llvm::TargetMachine> targetMachine;
-  {
-    std::string error;
-    // TODO(herhut): Make triple configurable.
-    constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
-    llvm::Triple triple(cudaTriple);
-    const llvm::Target *target =
-        llvm::TargetRegistry::lookupTarget("", triple, error);
-    if (target == nullptr) {
-      emitError(loc, "cannot initialize target triple");
-      return {};
-    }
-    targetMachine.reset(
-        target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
-  }
-
-  // Set the data layout of the llvm module to match what the ptx target needs.
-  llvmModule.setDataLayout(targetMachine->createDataLayout());
-
-  auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
-
-  return cubinGenerator(ptx, loc, name);
-}
-
-StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation(
-    llvm::Module &llvmModule, Location loc, StringRef name) {
-  auto cubin = convertModuleToCubin(llvmModule, loc, name);
-  if (!cubin)
-    return {};
-  return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
-}
-
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
-  return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
-}
diff --git a/mlir/test/Conversion/GPUToROCm/lit.local.cfg b/mlir/test/Conversion/GPUToROCm/lit.local.cfg
new file mode 100644
index 0000000000000..6eb561783b3fb
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCm/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.run_rocm_tests:
+  config.unsupported = True
diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
new file mode 100644
index 0000000000000..5ee3bb21aa916
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir
@@ -0,0 +1,26 @@
+// RUN: mlir-opt %s --test-kernel-to-hsaco -split-input-file | FileCheck %s
+
+// CHECK: attributes {rocdl.hsaco = "HSACO"}
+gpu.module @foo {
+  llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
+    // CHECK: attributes  {gpu.kernel}
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+}
+
+// -----
+
+gpu.module @bar {
+  // CHECK: func @kernel_a
+  llvm.func @kernel_a()
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+
+  // CHECK: func @kernel_b
+  llvm.func @kernel_b()
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+}
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index d040cdf97abb0..55bf84cb16373 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -1,3 +1,21 @@
+if (MLIR_CUDA_CONVERSIONS_ENABLED)
+  set(NVPTX_LIBS
+    MC
+    NVPTXCodeGen
+    NVPTXDesc
+    NVPTXInfo
+  )
+endif()
+
+if (MLIR_ROCM_CONVERSIONS_ENABLED)
+  set(AMDGPU_LIBS
+    MC
+    AMDGPUCodeGen
+    AMDGPUDesc
+    AMDGPUInfo
+  )
+endif()
+
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRTestTransforms
   TestAllReduceLowering.cpp
@@ -5,6 +23,7 @@ add_mlir_library(MLIRTestTransforms
   TestCallGraph.cpp
   TestConstantFold.cpp
   TestConvertGPUKernelToCubin.cpp
+  TestConvertGPUKernelToHsaco.cpp
   TestDominance.cpp
   TestLoopFusion.cpp
   TestGpuMemoryPromotion.cpp
@@ -31,18 +50,26 @@ add_mlir_library(MLIRTestTransforms
   MLIRStandardOpsIncGen
   MLIRTestVectorTransformPatternsIncGen
 
+  LINK_COMPONENTS
+  ${AMDGPU_LIBS}
+  ${NVPTX_LIBS}
+
   LINK_LIBS PUBLIC
   MLIRAffineOps
   MLIRAnalysis
   MLIREDSC
   MLIRGPU
-  MLIRGPUtoCUDATransforms
+  MLIRGPUtoGPURuntimeTransforms
   MLIRLinalgOps
   MLIRLinalgTransforms
+  MLIRNVVMIR
   MLIRSCF
   MLIRGPU
   MLIRPass
+  MLIRROCDLIR
   MLIRStandardOpsTransforms
+  MLIRTargetNVVMIR
+  MLIRTargetROCDLIR
   MLIRTestDialect
   MLIRTransformUtils
   MLIRVectorToSCF
diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp
index e0c4c1907c4f5..a347b2c280316 100644
--- a/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp
+++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp
@@ -6,26 +6,36 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Target/NVVMIR.h"
+#include "llvm/Support/TargetSelect.h"
 using namespace mlir;
 
 #if MLIR_CUDA_CONVERSIONS_ENABLED
-static OwnedCubin compilePtxToCubinForTesting(const std::string &, Location,
-                                              StringRef) {
+static OwnedBlob compilePtxToCubinForTesting(const std::string &, Location,
+                                             StringRef) {
   const char data[] = "CUBIN";
   return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
 }
 
 namespace mlir {
 void registerTestConvertGPUKernelToCubinPass() {
-  PassPipelineRegistration<>("test-kernel-to-cubin",
-                             "Convert all kernel functions to CUDA cubin blobs",
-                             [](OpPassManager &pm) {
-                               pm.addPass(createConvertGPUKernelToCubinPass(
-                                   compilePtxToCubinForTesting));
-                             });
+  PassPipelineRegistration<>(
+      "test-kernel-to-cubin",
+      "Convert all kernel functions to CUDA cubin blobs",
+      [](OpPassManager &pm) {
+        // Initialize LLVM NVPTX backend.
+        LLVMInitializeNVPTXTarget();
+        LLVMInitializeNVPTXTargetInfo();
+        LLVMInitializeNVPTXTargetMC();
+        LLVMInitializeNVPTXAsmPrinter();
+
+        pm.addPass(createConvertGPUKernelToBlobPass(
+            translateModuleToNVVMIR, compilePtxToCubinForTesting,
+            "nvptx64-nvidia-cuda", "sm_35", "+ptx60", "nvvm.cubin"));
+      });
 }
 } // namespace mlir
 #endif
diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
new file mode 100644
index 0000000000000..54293a8099b40
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
@@ -0,0 +1,41 @@
+//===- TestConvertGPUKernelToHsaco.cpp - Test gpu kernel hsaco lowering ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Target/ROCDLIR.h"
+#include "llvm/Support/TargetSelect.h"
+using namespace mlir;
+
+#if MLIR_ROCM_CONVERSIONS_ENABLED
+static OwnedBlob compileIsaToHsacoForTesting(const std::string &, Location,
+                                             StringRef) {
+  const char data[] = "HSACO";
+  return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
+}
+
+namespace mlir {
+void registerTestConvertGPUKernelToHsacoPass() {
+  PassPipelineRegistration<>(
+      "test-kernel-to-hsaco",
+      "Convert all kernel functions to ROCm hsaco blobs",
+      [](OpPassManager &pm) {
+        // Initialize LLVM AMDGPU backend.
+        LLVMInitializeAMDGPUTarget();
+        LLVMInitializeAMDGPUTargetInfo();
+        LLVMInitializeAMDGPUTargetMC();
+        LLVMInitializeAMDGPUAsmPrinter();
+
+        pm.addPass(createConvertGPUKernelToBlobPass(
+            translateModuleToROCDLIR, compileIsaToHsacoForTesting,
+            "amdgcn-amd-amdhsa", "gfx900", "-code-object-v3", "rocdl.hsaco"));
+      });
+}
+} // namespace mlir
+#endif
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index dc6286a827bb7..e07acf4d21a82 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -38,6 +38,7 @@ config.build_examples = @LLVM_BUILD_EXAMPLES@
 config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
 config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
+config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
 config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@"
 config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@
 
diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
index 6a404221744b3..cdd8ec3fe5f38 100644
--- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
+++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -15,7 +15,6 @@
 #include "llvm/ADT/STLExtras.h"
 
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
@@ -30,6 +29,7 @@
 #include "mlir/InitAllDialects.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Target/NVVMIR.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/Support/InitLLVM.h"
@@ -57,8 +57,8 @@ inline void emit_cuda_error(const llvm::Twine &message, const char *buffer,
     }                                                                          \
   }
 
-OwnedCubin compilePtxToCubin(const std::string ptx, Location loc,
-                             StringRef name) {
+OwnedBlob compilePtxToCubin(const std::string ptx, Location loc,
+                            StringRef name) {
   char jitErrorBuffer[4096] = {0};
 
   RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit");
@@ -97,7 +97,7 @@ OwnedCubin compilePtxToCubin(const std::string ptx, Location loc,
                        "cuLinkComplete");
 
   char *cubinAsChar = static_cast<char *>(cubinData);
-  OwnedCubin result =
+  OwnedBlob result =
       std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
 
   // This will also destroy the cubin data.
@@ -114,7 +114,9 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
   auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
   kernelPm.addPass(createStripDebugInfoPass());
   kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
-  kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
+  kernelPm.addPass(createConvertGPUKernelToBlobPass(
+      translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
+      "sm_35", "+ptx60", "nvvm.cubin"));
   pm.addPass(createLowerToLLVMPass());
   pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass());
 
@@ -127,6 +129,13 @@ int main(int argc, char **argv) {
   llvm::InitLLVM y(argc, argv);
   llvm::InitializeNativeTarget();
   llvm::InitializeNativeTargetAsmPrinter();
+
+  // Initialize LLVM NVPTX backend.
+  LLVMInitializeNVPTXTarget();
+  LLVMInitializeNVPTXTargetInfo();
+  LLVMInitializeNVPTXTargetMC();
+  LLVMInitializeNVPTXAsmPrinter();
+
   mlir::initializeLLVMPasses();
   return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
 }
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 69b1d8d57bc56..159a7fd4bca54 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -46,6 +46,7 @@ void registerTestLoopPermutationPass();
 void registerTestCallGraphPass();
 void registerTestConstantFold();
 void registerTestConvertGPUKernelToCubinPass();
+void registerTestConvertGPUKernelToHsacoPass();
 void registerTestDominancePass();
 void registerTestFunc();
 void registerTestGpuMemoryPromotionPass();
@@ -112,6 +113,9 @@ void registerTestPasses() {
   registerTestConstantFold();
 #if MLIR_CUDA_CONVERSIONS_ENABLED
   registerTestConvertGPUKernelToCubinPass();
+#endif
+#if MLIR_ROCM_CONVERSIONS_ENABLED
+  registerTestConvertGPUKernelToHsacoPass();
 #endif
   registerTestBufferPlacementPreparationPass();
   registerTestDominancePass();

From f1ab7550bcd51c353a1cac0303df9bbe960b7eab Mon Sep 17 00:00:00 2001
From: Frederik Gossen <frgossen@google.com>
Date: Thu, 28 May 2020 14:10:11 +0000
Subject: [PATCH 356/770] [MLIR] Fix operand type in `from_extent_tensor` in
 the shape dialect

The operand of `from_extent_tensor` is now of the same index type as the result
type of the inverse operation `to_extent_tensor`.

Differential Revision: https://reviews.llvm.org/D80283
---
 mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 0d300d3c64c8c..460f5becc1f9e 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -163,7 +163,7 @@ def Shape_FromExtentTensorOp : Shape_Op<"from_extent_tensor", []> {
     extents match the values of the elements.
   }];
 
-  let arguments = (ins I32Tensor:$input);
+  let arguments = (ins IndexTensor:$input);
   let results = (outs Shape_ShapeType:$result);
 }
 

From 6c2b7ee2f7fac7b683e343c2c383b7e67fadf9f8 Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Thu, 28 May 2020 16:14:49 +0200
Subject: [PATCH 357/770] Prevent test from failing in my home directory

---
 clang/test/Headers/nvptx_device_math_macro.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/Headers/nvptx_device_math_macro.cpp b/clang/test/Headers/nvptx_device_math_macro.cpp
index e21aa2b072b91..02bdc1f35b0b6 100644
--- a/clang/test/Headers/nvptx_device_math_macro.cpp
+++ b/clang/test/Headers/nvptx_device_math_macro.cpp
@@ -8,9 +8,9 @@
 #pragma omp declare target
 int use_macro() {
   double a(0);
-// CHECK-NOT:  call
+// CHECK-NOT:  call {{.*}}
 // CHECK:  call double @llvm.fabs.f64(double
-// CHECK-NOT:  call
+// CHECK-NOT:  call {{.*}}
 // CHECK:      ret i32 %conv
   return (std::fpclassify(a) != FP_ZERO);
 }

From 0ea52537feae4bc64b5cefc3a4c168dc3ad90463 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 15:26:15 +0100
Subject: [PATCH 358/770] SymbolicFile.h - removed unused FileSystem.h include.
 NFC.

Exposes a number of implicit dependencies that need fixing in source files and XCOFFObjectFile.h.
---
 llvm/include/llvm/Object/SymbolicFile.h    | 1 -
 llvm/include/llvm/Object/XCOFFObjectFile.h | 1 +
 llvm/lib/Object/COFFObjectFile.cpp         | 1 +
 llvm/lib/Object/WasmObjectFile.cpp         | 1 +
 llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp | 2 +-
 llvm/tools/llvm-objcopy/wasm/Writer.cpp    | 2 +-
 llvm/tools/llvm-rc/ResourceFileWriter.cpp  | 2 +-
 7 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Object/SymbolicFile.h b/llvm/include/llvm/Object/SymbolicFile.h
index 442eeddc58376..a0d8b7225598b 100644
--- a/llvm/include/llvm/Object/SymbolicFile.h
+++ b/llvm/include/llvm/Object/SymbolicFile.h
@@ -18,7 +18,6 @@
 #include "llvm/BinaryFormat/Magic.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <cinttypes>
diff --git a/llvm/include/llvm/Object/XCOFFObjectFile.h b/llvm/include/llvm/Object/XCOFFObjectFile.h
index 47aa861b3793e..9c24707360230 100644
--- a/llvm/include/llvm/Object/XCOFFObjectFile.h
+++ b/llvm/include/llvm/Object/XCOFFObjectFile.h
@@ -15,6 +15,7 @@
 
 #include "llvm/BinaryFormat/XCOFF.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Endian.h"
 #include <limits>
 
 namespace llvm {
diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp
index 9efc708c09473..78bcfb177ee5d 100644
--- a/llvm/lib/Object/COFFObjectFile.cpp
+++ b/llvm/lib/Object/COFFObjectFile.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/BinaryFormat/COFF.h"
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index 3608d95a2d27f..573055b665910 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/BinaryFormat/Wasm.h"
 #include "llvm/MC/SubtargetFeature.h"
diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index 8e14c887170d9..aaf3b73cf541e 100644
--- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -11,7 +11,6 @@
 #include "CopyConfig.h"
 #include "Object.h"
 #include "llvm-objcopy.h"
-
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Optional.h"
@@ -32,6 +31,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Memory.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/tools/llvm-objcopy/wasm/Writer.cpp b/llvm/tools/llvm-objcopy/wasm/Writer.cpp
index 996cfa76ad3ca..50d26507b4983 100644
--- a/llvm/tools/llvm-objcopy/wasm/Writer.cpp
+++ b/llvm/tools/llvm-objcopy/wasm/Writer.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "Writer.h"
-
 #include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/raw_ostream.h"
 
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
index 615474b4a8646..d8d4014124860 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
@@ -11,11 +11,11 @@
 //===---------------------------------------------------------------------===//
 
 #include "ResourceFileWriter.h"
-
 #include "llvm/Object/WindowsResource.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"

From 1a9e0d7092145e33175f628f4cdd28acf0d17100 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 27 May 2020 14:48:14 -0400
Subject: [PATCH 359/770] AMDGPU: Make S_DENORM_MODE not be a scheduling
 boundary

Now that the mode register uses/defs should be properly modeled, we
don't need to treat the FP mode switch as an arbitrary side effect.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp    | 4 +++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td     | 2 +-
 llvm/lib/Target/AMDGPU/SOPInstructions.td | 3 +--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e68f8a95efed5..0300de69caea8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2944,10 +2944,12 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
   // Target-independent instructions do not have an implicit-use of EXEC, even
   // when they operate on VGPRs. Treating EXEC modifications as scheduling
   // boundaries prevents incorrect movements of such instructions.
+
+  // TODO: Don't treat setreg with known constant that only changes MODE as
+  // barrier.
   return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
-         MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
          changesVGPRIndexingMode(MI);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b988de596c648..62b7f8318fd02 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -264,7 +264,7 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
 
 def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
   SDTypeProfile<0 ,1, [SDTCisInt<0>]>,
-  [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]
+  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
 >;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 4f9aaa1bc604f..7b8c2c27b8063 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1218,8 +1218,7 @@ let SubtargetPredicate = isGFX10Plus in {
   def S_WAITCNT_DEPCTR :
     SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">;
 
-  let hasSideEffects = 1, Uses = [MODE], Defs = [MODE] in {
-    // FIXME: Should remove hasSideEffects
+  let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in {
     def S_ROUND_MODE :
       SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
     def S_DENORM_MODE :

From cf4d4e366a2165f0e93948f166d76ae650aecc98 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 26 May 2020 17:03:20 -0400
Subject: [PATCH 360/770] libclc: Compile with -nostdlib

This fixes a build error when compiling for amdgcn-amd-amdhsa, which
defaults to trying to link bitcode libraries.
---
 libclc/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 7b981110f6fdf..9472f191fbde9 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -262,7 +262,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
 		target_compile_definitions( builtins.link.${arch_suffix} PRIVATE
 			"__CLC_INTERNAL" )
 		target_compile_options( builtins.link.${arch_suffix} PRIVATE  -target
-			${t} ${mcpu} -fno-builtin )
+			${t} ${mcpu} -fno-builtin -nostdlib )
 		set_target_properties( builtins.link.${arch_suffix} PROPERTIES
 			LINKER_LANGUAGE CLC )
 

From 06019e312571c886494d3287e8962e5d5943dab8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 28 May 2020 10:46:23 -0400
Subject: [PATCH 361/770] AMDGPU: Add missing test for s_denorm_mode scheduling

Forgot to add this file to 1a9e0d7092145e33175f628f4cdd28acf0d17100
---
 .../AMDGPU/schedule-barrier-fpmode.mir        | 99 +++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
new file mode 100644
index 0000000000000..117e6f588162c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=machine-scheduler -o - %s | FileCheck %s
+# Make sure FP mode is not a hard scheduling boundary
+
+---
+name: denorm_mode_not_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: denorm_mode_not_barrier
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD1]], implicit $exec
+    ; CHECK: S_DENORM_MODE 0, implicit-def $mode, implicit $mode
+    ; CHECK: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %3:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: round_mode_not_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: round_mode_not_barrier
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD1]], implicit $exec
+    ; CHECK: S_ROUND_MODE 0, implicit-def $mode, implicit $mode
+    ; CHECK: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    S_ROUND_MODE 0, implicit-def $mode, implicit $mode
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %3:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: denorm_mode_mode_def_use
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: denorm_mode_mode_def_use
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: S_DENORM_MODE 0, implicit-def $mode, implicit $mode
+    ; CHECK: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
+    ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_ADD_F32_e32_]], implicit [[V_ADD_U32_e32_]]
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    S_DENORM_MODE 0, implicit-def $mode, implicit $mode
+    %2:vgpr_32 = V_ADD_F32_e32 0, %1, implicit $mode, implicit $exec
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %4:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
+    S_ENDPGM 0, implicit %2, implicit %4
+...
+
+---
+name: round_mode_mode_def_use
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: round_mode_mode_def_use
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4)
+    ; CHECK: S_ROUND_MODE 0, implicit-def $mode, implicit $mode
+    ; CHECK: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
+    ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec
+    ; CHECK: S_ENDPGM 0, implicit [[V_ADD_F32_e32_]], implicit [[V_ADD_U32_e32_]]
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    S_ROUND_MODE 0, implicit-def $mode, implicit $mode
+    %2:vgpr_32 = V_ADD_F32_e32 0, %1, implicit $mode, implicit $exec
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4)
+    %4:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec
+    S_ENDPGM 0, implicit %2, implicit %4
+...

From 8e325cfc1456820e2253909e4aa0c3014f1e050c Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Thu, 28 May 2020 17:12:43 +0200
Subject: [PATCH 362/770] [clangd] Work around PS4 -fno-exceptions, easier than
 disabling tests?

---
 clang-tools-extra/clangd/unittests/XRefsTests.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
index b73a310e95fb2..e260285a179cf 100644
--- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
@@ -245,7 +245,9 @@ TEST(HighlightsTest, ControlFlow) {
   };
   for (const char *Test : Tests) {
     Annotations T(Test);
-    auto AST = TestTU::withCode(T.code()).build();
+    auto TU = TestTU::withCode(T.code());
+    TU.ExtraArgs.push_back("-fexceptions"); // FIXME: stop testing on PS4.
+    auto AST = TU.build();
     EXPECT_THAT(findDocumentHighlights(AST, T.point()), HighlightsFrom(T))
         << Test;
   }

From 04a96aa3e430a66767732f44acea00c6e13c9f78 Mon Sep 17 00:00:00 2001
From: Yitzhak Mandelbaum <yitzhakm@google.com>
Date: Wed, 27 May 2020 18:04:50 -0400
Subject: [PATCH 363/770] [ASTMatchers] Add traversal-kind support to
 `DynTypedMatcher`

Summary:
This patch exposes `TraversalKind` support in the `DynTypedMatcher` API. While
previously, the `match` method supported traversal logic, it was not possible to
set or get the traversal kind.

Reviewers: gribozavr, steveire

Subscribers: hokein, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80685
---
 .../clang/ASTMatchers/ASTMatchersInternal.h   | 14 ++++++++
 clang/lib/ASTMatchers/ASTMatchersInternal.cpp | 33 +++++++++++++++++++
 .../ASTMatchers/ASTMatchersInternalTest.cpp   | 22 +++++++++++++
 clang/unittests/ASTMatchers/CMakeLists.txt    |  5 +++
 4 files changed, 74 insertions(+)

diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
index e363bdd9ae9cf..ac8469bded538 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
@@ -395,6 +395,12 @@ class DynTypedMatcher {
   ///   restricts the node types for \p Kind.
   DynTypedMatcher dynCastTo(const ASTNodeKind Kind) const;
 
+  /// Return a matcher that points to the same implementation, but sets the
+  ///   traversal kind.
+  ///
+  /// If the traversal kind is already set, then \c TK overrides it.
+  DynTypedMatcher withTraversalKind(TraversalKind TK);
+
   /// Returns true if the matcher matches the given \c DynNode.
   bool matches(const DynTypedNode &DynNode, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const;
@@ -458,6 +464,14 @@ class DynTypedMatcher {
   /// If it is not compatible, then this matcher will never match anything.
   template <typename T> Matcher<T> unconditionalConvertTo() const;
 
+  /// Returns the \c TraversalKind respected by calls to `matches()`, if any.
+  ///
+  /// Most matchers will not have a traversal kind set, instead relying on the
+  /// surrounding context. For those, \c llvm::None is returned.
+  llvm::Optional<clang::TraversalKind> getTraversalKind() const {
+    return Implementation->TraversalKind();
+  }
+
 private:
   DynTypedMatcher(ASTNodeKind SupportedKind, ASTNodeKind RestrictKind,
                   IntrusiveRefCntPtr<DynMatcherInterface> Implementation)
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index 40bd439f79fae..1ee89ccd3c115 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -136,6 +136,31 @@ class TrueMatcherImpl : public DynMatcherInterface {
   }
 };
 
+/// A matcher that specifies a particular \c TraversalKind.
+///
+/// The kind provided to the constructor overrides any kind that may be
+/// specified by the `InnerMatcher`.
+class DynTraversalMatcherImpl : public DynMatcherInterface {
+public:
+  explicit DynTraversalMatcherImpl(
+      clang::TraversalKind TK,
+      IntrusiveRefCntPtr<DynMatcherInterface> InnerMatcher)
+      : TK(TK), InnerMatcher(std::move(InnerMatcher)) {}
+
+  bool dynMatches(const DynTypedNode &DynNode, ASTMatchFinder *Finder,
+                  BoundNodesTreeBuilder *Builder) const override {
+    return this->InnerMatcher->dynMatches(DynNode, Finder, Builder);
+  }
+
+  llvm::Optional<clang::TraversalKind> TraversalKind() const override {
+    return TK;
+  }
+
+private:
+  clang::TraversalKind TK;
+  IntrusiveRefCntPtr<DynMatcherInterface> InnerMatcher;
+};
+
 } // namespace
 
 static llvm::ManagedStatic<TrueMatcherImpl> TrueMatcherInstance;
@@ -204,6 +229,14 @@ DynTypedMatcher::constructRestrictedWrapper(const DynTypedMatcher &InnerMatcher,
   return Copy;
 }
 
+DynTypedMatcher
+DynTypedMatcher::withTraversalKind(ast_type_traits::TraversalKind TK) {
+  auto Copy = *this;
+  Copy.Implementation =
+      new DynTraversalMatcherImpl(TK, std::move(Copy.Implementation));
+  return Copy;
+}
+
 DynTypedMatcher DynTypedMatcher::trueMatcher(ASTNodeKind NodeKind) {
   return DynTypedMatcher(NodeKind, NodeKind, &*TrueMatcherInstance);
 }
diff --git a/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
index 0d0f0307e7f16..2886481ea262b 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
@@ -13,10 +13,12 @@
 #include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
 #include "gtest/gtest.h"
 
 namespace clang {
 namespace ast_matchers {
+using internal::DynTypedMatcher;
 
 #if GTEST_HAS_DEATH_TEST
 TEST(HasNameDeathTest, DiesOnEmptyName) {
@@ -171,6 +173,26 @@ TEST(Matcher, matchOverEntireASTContext) {
   EXPECT_NE(nullptr, PT);
 }
 
+TEST(DynTypedMatcherTest, TraversalKindForwardsToImpl) {
+  auto M = DynTypedMatcher(decl());
+  EXPECT_FALSE(M.getTraversalKind().hasValue());
+
+  M = DynTypedMatcher(traverse(TK_AsIs, decl()));
+  EXPECT_THAT(M.getTraversalKind(), llvm::ValueIs(TK_AsIs));
+}
+
+TEST(DynTypedMatcherTest, ConstructWithTraversalKindSetsTK) {
+  auto M = DynTypedMatcher(decl()).withTraversalKind(TK_AsIs);
+  EXPECT_THAT(M.getTraversalKind(), llvm::ValueIs(TK_AsIs));
+}
+
+TEST(DynTypedMatcherTest, ConstructWithTraversalKindOverridesNestedTK) {
+  auto M = DynTypedMatcher(decl()).withTraversalKind(TK_AsIs).withTraversalKind(
+      TK_IgnoreUnlessSpelledInSource);
+  EXPECT_THAT(M.getTraversalKind(),
+              llvm::ValueIs(TK_IgnoreUnlessSpelledInSource));
+}
+
 TEST(IsInlineMatcher, IsInline) {
   EXPECT_TRUE(matches("void g(); inline void f();",
                       functionDecl(isInline(), hasName("f"))));
diff --git a/clang/unittests/ASTMatchers/CMakeLists.txt b/clang/unittests/ASTMatchers/CMakeLists.txt
index e128cfe695a68..bcc829ac64662 100644
--- a/clang/unittests/ASTMatchers/CMakeLists.txt
+++ b/clang/unittests/ASTMatchers/CMakeLists.txt
@@ -30,4 +30,9 @@ clang_target_link_libraries(ASTMatchersTests
   clangTooling
   )
 
+target_link_libraries(ASTMatchersTests
+  PRIVATE
+  LLVMTestingSupport
+)
+
 add_subdirectory(Dynamic)

From db52a4901096f035b6cda832c4bf4c6ce2ede2f9 Mon Sep 17 00:00:00 2001
From: Jean-Michel Gorius <jean-michel.gorius@ens-rennes.fr>
Date: Thu, 28 May 2020 17:38:00 +0200
Subject: [PATCH 364/770] [mlir] Make translation libraries available through
 MLIRConfig.cmake

---
 mlir/cmake/modules/CMakeLists.txt      | 1 +
 mlir/cmake/modules/MLIRConfig.cmake.in | 1 +
 2 files changed, 2 insertions(+)

diff --git a/mlir/cmake/modules/CMakeLists.txt b/mlir/cmake/modules/CMakeLists.txt
index 588de5495db64..b9fb9adce2ea5 100644
--- a/mlir/cmake/modules/CMakeLists.txt
+++ b/mlir/cmake/modules/CMakeLists.txt
@@ -14,6 +14,7 @@ export(TARGETS ${MLIR_EXPORTS} FILE ${mlir_cmake_builddir}/MLIRTargets.cmake)
 get_property(MLIR_ALL_LIBS GLOBAL PROPERTY MLIR_ALL_LIBS)
 get_property(MLIR_DIALECT_LIBS GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(MLIR_CONVERSION_LIBS GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+get_property(MLIR_TRANSLATION_LIBS GLOBAL PROPERTY MLIR_TRANSLATION_LIBS)
 
 # Generate MlirConfig.cmake for the build tree.
 set(MLIR_CONFIG_CMAKE_DIR "${mlir_cmake_builddir}")
diff --git a/mlir/cmake/modules/MLIRConfig.cmake.in b/mlir/cmake/modules/MLIRConfig.cmake.in
index de38f94add75e..6095362d2b36d 100644
--- a/mlir/cmake/modules/MLIRConfig.cmake.in
+++ b/mlir/cmake/modules/MLIRConfig.cmake.in
@@ -17,6 +17,7 @@ set(MLIR_MAIN_SRC_DIR "@MLIR_MAIN_SRC_DIR@")
 set_property(GLOBAL PROPERTY MLIR_ALL_LIBS "@MLIR_ALL_LIBS@")
 set_property(GLOBAL PROPERTY MLIR_DIALECT_LIBS "@MLIR_DIALECT_LIBS@")
 set_property(GLOBAL PROPERTY MLIR_CONVERSION_LIBS "@MLIR_CONVERSION_LIBS@")
+set_property(GLOBAL PROPERTY MLIR_TRANSLATION_LIBS "@MLIR_TRANSLATION_LIBS@")
 
 # Provide all our library targets to users.
 include("@MLIR_CONFIG_EXPORTS_FILE@")

From ce5780b88c6e2f3303afd266e5e29c1badd9eb3b Mon Sep 17 00:00:00 2001
From: Yitzhak Mandelbaum <yitzhakm@google.com>
Date: Tue, 26 May 2020 22:59:08 -0400
Subject: [PATCH 365/770] [libTooling] Fix Transformer to work with ambient
 traversal kinds.

Summary:
`RewriteRule`'s `applyFirst` was brittle with respect to the default setting of the
`TraversalKind`. This patch builds awareness of traversal kinds directly into
rewrite rules so that they are insensitive to any changes in defaults.

Reviewers: steveire, gribozavr

Subscribers: hokein, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80606
---
 .../clang/Tooling/Transformer/RewriteRule.h   | 12 +++--
 clang/lib/Tooling/Transformer/RewriteRule.cpp | 24 ++++++---
 clang/unittests/Tooling/TransformerTest.cpp   | 53 +++++++++++++++++++
 3 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/clang/include/clang/Tooling/Transformer/RewriteRule.h b/clang/include/clang/Tooling/Transformer/RewriteRule.h
index 4a5e5556cdff0..0a961ccc475df 100644
--- a/clang/include/clang/Tooling/Transformer/RewriteRule.h
+++ b/clang/include/clang/Tooling/Transformer/RewriteRule.h
@@ -273,11 +273,13 @@ namespace detail {
 /// supports mixing matchers of different kinds.
 ast_matchers::internal::DynTypedMatcher buildMatcher(const RewriteRule &Rule);
 
-/// Builds a set of matchers that cover the rule (one for each distinct node
-/// matcher base kind: Stmt, Decl, etc.). Node-matchers for `QualType` and
-/// `Type` are not permitted, since such nodes carry no source location
-/// information and are therefore not relevant for rewriting. If any such
-/// matchers are included, will return an empty vector.
+/// Builds a set of matchers that cover the rule.
+///
+/// One matcher is built for each distinct node matcher base kind: Stmt, Decl,
+/// etc. Node-matchers for `QualType` and `Type` are not permitted, since such
+/// nodes carry no source location information and are therefore not relevant
+/// for rewriting. If any such matchers are included, will return an empty
+/// vector.
 std::vector<ast_matchers::internal::DynTypedMatcher>
 buildMatchers(const RewriteRule &Rule);
 
diff --git a/clang/lib/Tooling/Transformer/RewriteRule.cpp b/clang/lib/Tooling/Transformer/RewriteRule.cpp
index 968f7fa6cd325..ddce6ce591851 100644
--- a/clang/lib/Tooling/Transformer/RewriteRule.cpp
+++ b/clang/lib/Tooling/Transformer/RewriteRule.cpp
@@ -116,10 +116,13 @@ static bool hasValidKind(const DynTypedMatcher &M) {
 #endif
 
 // Binds each rule's matcher to a unique (and deterministic) tag based on
-// `TagBase` and the id paired with the case.
+// `TagBase` and the id paired with the case. All of the returned matchers have
+// their traversal kind explicitly set, either based on a pre-set kind or to the
+// provided `DefaultTraversalKind`.
 static std::vector<DynTypedMatcher> taggedMatchers(
     StringRef TagBase,
-    const SmallVectorImpl<std::pair<size_t, RewriteRule::Case>> &Cases) {
+    const SmallVectorImpl<std::pair<size_t, RewriteRule::Case>> &Cases,
+    ast_type_traits::TraversalKind DefaultTraversalKind) {
   std::vector<DynTypedMatcher> Matchers;
   Matchers.reserve(Cases.size());
   for (const auto &Case : Cases) {
@@ -127,8 +130,10 @@ static std::vector<DynTypedMatcher> taggedMatchers(
     // HACK: Many matchers are not bindable, so ensure that tryBind will work.
     DynTypedMatcher BoundMatcher(Case.second.Matcher);
     BoundMatcher.setAllowBind(true);
-    auto M = BoundMatcher.tryBind(Tag);
-    Matchers.push_back(*std::move(M));
+    auto M = *BoundMatcher.tryBind(Tag);
+    Matchers.push_back(!M.getTraversalKind()
+                           ? M.withTraversalKind(DefaultTraversalKind)
+                           : std::move(M));
   }
   return Matchers;
 }
@@ -158,14 +163,21 @@ transformer::detail::buildMatchers(const RewriteRule &Rule) {
     Buckets[Cases[I].Matcher.getSupportedKind()].emplace_back(I, Cases[I]);
   }
 
+  // Each anyOf explicitly controls the traversal kind. The anyOf itself is set
+  // to `TK_AsIs` to ensure no nodes are skipped, thereby deferring to the kind
+  // of the branches. Then, each branch is either left as is, if the kind is
+  // already set, or explicitly set to `TK_IgnoreUnlessSpelledInSource`. We
+  // choose this setting, because we think it is the one most friendly to
+  // beginners, who are (largely) the target audience of Transformer.
   std::vector<DynTypedMatcher> Matchers;
   for (const auto &Bucket : Buckets) {
     DynTypedMatcher M = DynTypedMatcher::constructVariadic(
         DynTypedMatcher::VO_AnyOf, Bucket.first,
-        taggedMatchers("Tag", Bucket.second));
+        taggedMatchers("Tag", Bucket.second, TK_IgnoreUnlessSpelledInSource));
     M.setAllowBind(true);
     // `tryBind` is guaranteed to succeed, because `AllowBind` was set to true.
-    Matchers.push_back(*M.tryBind(RewriteRule::RootID));
+    Matchers.push_back(
+        M.tryBind(RewriteRule::RootID)->withTraversalKind(TK_AsIs));
   }
   return Matchers;
 }
diff --git a/clang/unittests/Tooling/TransformerTest.cpp b/clang/unittests/Tooling/TransformerTest.cpp
index c8c6db059fedf..d19f747a69b57 100644
--- a/clang/unittests/Tooling/TransformerTest.cpp
+++ b/clang/unittests/Tooling/TransformerTest.cpp
@@ -571,6 +571,59 @@ TEST_F(TransformerTest, OrderedRuleMultipleKinds) {
   testRule(Rule, Input, Expected);
 }
 
+// Verifies that a rule with a top-level matcher for an implicit node (like
+// `implicitCastExpr`) does not change the code, when the AST traversal skips
+// implicit nodes. In this test, only the rule with the explicit-node matcher
+// will fire.
+TEST_F(TransformerTest, OrderedRuleImplicitIgnored) {
+  std::string Input = R"cc(
+    void f1();
+    int f2();
+    void call_f1() { f1(); }
+    float call_f2() { return f2(); }
+  )cc";
+  std::string Expected = R"cc(
+    void f1();
+    int f2();
+    void call_f1() { REPLACE_F1; }
+    float call_f2() { return f2(); }
+  )cc";
+
+  RewriteRule ReplaceF1 =
+      makeRule(callExpr(callee(functionDecl(hasName("f1")))),
+               changeTo(cat("REPLACE_F1")));
+  RewriteRule ReplaceF2 =
+      makeRule(implicitCastExpr(hasSourceExpression(callExpr())),
+               changeTo(cat("REPLACE_F2")));
+  testRule(applyFirst({ReplaceF1, ReplaceF2}), Input, Expected);
+}
+
+// Verifies that explicitly setting the traversal kind fixes the problem in the
+// previous test.
+TEST_F(TransformerTest, OrderedRuleImplicitMatched) {
+  std::string Input = R"cc(
+    void f1();
+    int f2();
+    void call_f1() { f1(); }
+    float call_f2() { return f2(); }
+  )cc";
+  std::string Expected = R"cc(
+    void f1();
+    int f2();
+    void call_f1() { REPLACE_F1; }
+    float call_f2() { return REPLACE_F2; }
+  )cc";
+
+  RewriteRule ReplaceF1 = makeRule(
+      traverse(clang::TK_AsIs, callExpr(callee(functionDecl(hasName("f1"))))),
+      changeTo(cat("REPLACE_F1")));
+  RewriteRule ReplaceF2 =
+      makeRule(traverse(clang::TK_AsIs,
+                        implicitCastExpr(hasSourceExpression(callExpr()))),
+               changeTo(cat("REPLACE_F2")));
+  testRule(applyFirst({ReplaceF1, ReplaceF2}), Input, Expected);
+}
+
 //
 // Negative tests (where we expect no transformation to occur).
 //

From f5192d7fb7564c45ff4ec42f359408974b7c8fa2 Mon Sep 17 00:00:00 2001
From: Jean-Michel Gorius <jean-michel.gorius@ens-rennes.fr>
Date: Thu, 28 May 2020 17:43:59 +0200
Subject: [PATCH 366/770] [x86] Propagate memory operands during call frame
 optimization

Summary:
Propagate memory operands when folding load instructions into instructions that directly operate on memory.

The original revision has been split. See D80140 for the other part of the changes.

Reviewers: craig.topper, rnk, lebedev.ri, efriedma

Reviewed By: craig.topper

Subscribers: lebedev.ri, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80062
---
 .../Target/X86/X86CallFrameOptimization.cpp   |   3 +
 llvm/test/CodeGen/X86/cf-opt-memops.mir       | 109 ++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/cf-opt-memops.mir

diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index 8144e85d9e55e..caa1f79524750 100644
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -531,6 +531,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
           PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
       }
       Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
+      Push->cloneMemRefs(MF, *Store);
       break;
     case X86::MOV32mr:
     case X86::MOV64mr: {
@@ -562,6 +563,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
         unsigned NumOps = DefMov->getDesc().getNumOperands();
         for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
           Push->addOperand(DefMov->getOperand(i));
+        Push->cloneMergedMemRefs(MF, {&*DefMov, &*Store});
 
         DefMov->eraseFromParent();
       } else {
@@ -569,6 +571,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
         Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
                    .addReg(Reg)
                    .getInstr();
+        Push->cloneMemRefs(MF, *Store);
       }
       break;
     }
diff --git a/llvm/test/CodeGen/X86/cf-opt-memops.mir b/llvm/test/CodeGen/X86/cf-opt-memops.mir
new file mode 100644
index 0000000000000..40737a92d570e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/cf-opt-memops.mir
@@ -0,0 +1,109 @@
+# RUN: llc -o - -mtriple=x86_64-- -run-pass=x86-cf-opt %s | FileCheck %s
+
+--- |
+  ; ModuleID = 'test.ll'
+  source_filename = "code_io.c"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+  
+  @.str.8 = private unnamed_addr constant [34 x i8] c"%10s%10s%10s%10s%10s%10s%10s%10s\0A\00", align 1
+  @.str.9 = private unnamed_addr constant [6 x i8] c"nbody\00", align 1
+  @.str.10 = private unnamed_addr constant [6 x i8] c"dtime\00", align 1
+  @.str.11 = private unnamed_addr constant [4 x i8] c"eps\00", align 1
+  @.str.12 = private unnamed_addr constant [4 x i8] c"tol\00", align 1
+  @.str.13 = private unnamed_addr constant [6 x i8] c"dtout\00", align 1
+  @.str.14 = private unnamed_addr constant [6 x i8] c"tstop\00", align 1
+  @.str.15 = private unnamed_addr constant [7 x i8] c"fcells\00", align 1
+  @.str.16 = private unnamed_addr constant [6 x i8] c"NPROC\00", align 1
+  
+  define dso_local void @initoutput() local_unnamed_addr {
+  entry:
+    %call1 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([34 x i8], [34 x i8]* @.str.8, i64 0, i64 0), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.9, i64 0, i64 0), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.10, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.11, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.12, i64 0, i64 0), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.13, i64 0, i64 0), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.14, i64 0, i64 0), i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.15, i64 0, i64 0), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str.16, i64 0, i64 0))
+    ret void
+  }
+  
+  declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #0
+  
+  attributes #0 = { nounwind }
+
+...
+---
+name:            initoutput
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+  - { id: 0, class: gr64, preferred-register: '' }
+  - { id: 1, class: gr64, preferred-register: '' }
+  - { id: 2, class: gr64, preferred-register: '' }
+  - { id: 3, class: gr64, preferred-register: '' }
+  - { id: 4, class: gr64, preferred-register: '' }
+  - { id: 5, class: gr64, preferred-register: '' }
+  - { id: 6, class: gr64, preferred-register: '' }
+  - { id: 7, class: gr32, preferred-register: '' }
+  - { id: 8, class: gr8, preferred-register: '' }
+  - { id: 9, class: gr32, preferred-register: '' }
+liveins:         []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    ADJCALLSTACKDOWN64 24, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %0:gr64 = COPY $rsp
+    MOV64mi32 %0, 1, $noreg, 16, $noreg, @.str.16 :: (store 8 into stack + 16)
+    MOV64mi32 %0, 1, $noreg, 8, $noreg, @.str.15 :: (store 8 into stack + 8)
+    MOV64mi32 %0, 1, $noreg, 0, $noreg, @.str.14 :: (store 8 into stack)
+    %1:gr64 = MOV32ri64 @.str.8
+    %2:gr64 = MOV32ri64 @.str.9
+    %3:gr64 = MOV32ri64 @.str.10
+    %4:gr64 = MOV32ri64 @.str.11
+    %5:gr64 = MOV32ri64 @.str.12
+    %6:gr64 = MOV32ri64 @.str.13
+    %7:gr32 = MOV32r0 implicit-def dead $eflags
+    %8:gr8 = COPY %7.sub_8bit
+    $rdi = COPY %1
+    $rsi = COPY %2
+    $rdx = COPY %3
+    $rcx = COPY %4
+    $r8 = COPY %5
+    $r9 = COPY %6
+    $al = COPY %8
+    CALL64pcrel32 @printf, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit $rcx, implicit $r8, implicit $r9, implicit $al, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 24, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    RET 0
+
+# Call frame optimization should propagate memory operands
+# CHECK: PUSH64i32 @{{.*}} :: (store 8 into stack + 16)
+# CHECK: PUSH64i32 @{{.*}} :: (store 8 into stack + 8)
+# CHECK: PUSH64i32 @{{.*}} :: (store 8 into stack)
+
+...

From bb2ae74717a25ba268e7bd17a2a572d931ed094e Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Thu, 30 Apr 2020 11:09:52 +0300
Subject: [PATCH 367/770] [analyzer] Merge implementations of SymInt, IntSym,
 and SymSym exprs

Summary:
SymIntExpr, IntSymExpr, and SymSymExpr share a big portion of logic
that used to be duplicated across all three classes.  The new
implementation also makes it easy to introduce other operand types
into the mix.

Differential Revision: https://reviews.llvm.org/D79156
---
 .../Core/PathSensitive/SymbolManager.h        | 145 ++++++------------
 .../lib/StaticAnalyzer/Core/SymbolManager.cpp |  48 ++----
 2 files changed, 61 insertions(+), 132 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
index 2c505995bee0a..390ced8c29f8f 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
@@ -326,136 +326,83 @@ class BinarySymExpr : public SymExpr {
     Kind k = SE->getKind();
     return k >= BEGIN_BINARYSYMEXPRS && k <= END_BINARYSYMEXPRS;
   }
-};
-
-/// Represents a symbolic expression like 'x' + 3.
-class SymIntExpr : public BinarySymExpr {
-  const SymExpr *LHS;
-  const llvm::APSInt& RHS;
 
-public:
-  SymIntExpr(const SymExpr *lhs, BinaryOperator::Opcode op,
-             const llvm::APSInt &rhs, QualType t)
-      : BinarySymExpr(SymIntExprKind, op, t), LHS(lhs), RHS(rhs) {
-    assert(lhs);
+protected:
+  static unsigned computeOperandComplexity(const SymExpr *Value) {
+    return Value->computeComplexity();
   }
-
-  void dumpToStream(raw_ostream &os) const override;
-
-  const SymExpr *getLHS() const { return LHS; }
-  const llvm::APSInt &getRHS() const { return RHS; }
-
-  unsigned computeComplexity() const override {
-    if (Complexity == 0)
-      Complexity = 1 + LHS->computeComplexity();
-    return Complexity;
+  static unsigned computeOperandComplexity(const llvm::APSInt &Value) {
+    return 1;
   }
 
-  static void Profile(llvm::FoldingSetNodeID& ID, const SymExpr *lhs,
-                      BinaryOperator::Opcode op, const llvm::APSInt& rhs,
-                      QualType t) {
-    ID.AddInteger((unsigned) SymIntExprKind);
-    ID.AddPointer(lhs);
-    ID.AddInteger(op);
-    ID.AddPointer(&rhs);
-    ID.Add(t);
+  static const llvm::APSInt *getPointer(const llvm::APSInt &Value) {
+    return &Value;
   }
+  static const SymExpr *getPointer(const SymExpr *Value) { return Value; }
 
-  void Profile(llvm::FoldingSetNodeID& ID) override {
-    Profile(ID, LHS, getOpcode(), RHS, getType());
-  }
-
-  // Implement isa<T> support.
-  static bool classof(const SymExpr *SE) {
-    return SE->getKind() == SymIntExprKind;
-  }
+  static void dumpToStreamImpl(raw_ostream &os, const SymExpr *Value);
+  static void dumpToStreamImpl(raw_ostream &os, const llvm::APSInt &Value);
+  static void dumpToStreamImpl(raw_ostream &os, BinaryOperator::Opcode op);
 };
 
-/// Represents a symbolic expression like 3 - 'x'.
-class IntSymExpr : public BinarySymExpr {
-  const llvm::APSInt& LHS;
-  const SymExpr *RHS;
+/// Template implementation for all binary symbolic expressions
+template <class LHSTYPE, class RHSTYPE, SymExpr::Kind ClassKind>
+class BinarySymExprImpl : public BinarySymExpr {
+  LHSTYPE LHS;
+  RHSTYPE RHS;
 
 public:
-  IntSymExpr(const llvm::APSInt &lhs, BinaryOperator::Opcode op,
-             const SymExpr *rhs, QualType t)
-      : BinarySymExpr(IntSymExprKind, op, t), LHS(lhs), RHS(rhs) {
-    assert(rhs);
+  BinarySymExprImpl(LHSTYPE lhs, BinaryOperator::Opcode op, RHSTYPE rhs,
+                    QualType t)
+      : BinarySymExpr(ClassKind, op, t), LHS(lhs), RHS(rhs) {
+    assert(getPointer(lhs));
+    assert(getPointer(rhs));
   }
 
-  void dumpToStream(raw_ostream &os) const override;
+  void dumpToStream(raw_ostream &os) const override {
+    dumpToStreamImpl(os, LHS);
+    dumpToStreamImpl(os, getOpcode());
+    dumpToStreamImpl(os, RHS);
+  }
 
-  const SymExpr *getRHS() const { return RHS; }
-  const llvm::APSInt &getLHS() const { return LHS; }
+  LHSTYPE getLHS() const { return LHS; }
+  RHSTYPE getRHS() const { return RHS; }
 
   unsigned computeComplexity() const override {
     if (Complexity == 0)
-      Complexity = 1 + RHS->computeComplexity();
+      Complexity =
+          computeOperandComplexity(RHS) + computeOperandComplexity(LHS);
     return Complexity;
   }
 
-  static void Profile(llvm::FoldingSetNodeID& ID, const llvm::APSInt& lhs,
-                      BinaryOperator::Opcode op, const SymExpr *rhs,
-                      QualType t) {
-    ID.AddInteger((unsigned) IntSymExprKind);
-    ID.AddPointer(&lhs);
+  static void Profile(llvm::FoldingSetNodeID &ID, LHSTYPE lhs,
+                      BinaryOperator::Opcode op, RHSTYPE rhs, QualType t) {
+    ID.AddInteger((unsigned)ClassKind);
+    ID.AddPointer(getPointer(lhs));
     ID.AddInteger(op);
-    ID.AddPointer(rhs);
+    ID.AddPointer(getPointer(rhs));
     ID.Add(t);
   }
 
-  void Profile(llvm::FoldingSetNodeID& ID) override {
+  void Profile(llvm::FoldingSetNodeID &ID) override {
     Profile(ID, LHS, getOpcode(), RHS, getType());
   }
 
   // Implement isa<T> support.
-  static bool classof(const SymExpr *SE) {
-    return SE->getKind() == IntSymExprKind;
-  }
+  static bool classof(const SymExpr *SE) { return SE->getKind() == ClassKind; }
 };
 
-/// Represents a symbolic expression like 'x' + 'y'.
-class SymSymExpr : public BinarySymExpr {
-  const SymExpr *LHS;
-  const SymExpr *RHS;
-
-public:
-  SymSymExpr(const SymExpr *lhs, BinaryOperator::Opcode op, const SymExpr *rhs,
-             QualType t)
-      : BinarySymExpr(SymSymExprKind, op, t), LHS(lhs), RHS(rhs) {
-    assert(lhs);
-    assert(rhs);
-  }
-
-  const SymExpr *getLHS() const { return LHS; }
-  const SymExpr *getRHS() const { return RHS; }
-
-  void dumpToStream(raw_ostream &os) const override;
-
-  unsigned computeComplexity() const override {
-    if (Complexity == 0)
-      Complexity = RHS->computeComplexity() + LHS->computeComplexity();
-    return Complexity;
-  }
-
-  static void Profile(llvm::FoldingSetNodeID& ID, const SymExpr *lhs,
-                    BinaryOperator::Opcode op, const SymExpr *rhs, QualType t) {
-    ID.AddInteger((unsigned) SymSymExprKind);
-    ID.AddPointer(lhs);
-    ID.AddInteger(op);
-    ID.AddPointer(rhs);
-    ID.Add(t);
-  }
+/// Represents a symbolic expression like 'x' + 3.
+using SymIntExpr = BinarySymExprImpl<const SymExpr *, const llvm::APSInt &,
+                                     SymExpr::Kind::SymIntExprKind>;
 
-  void Profile(llvm::FoldingSetNodeID& ID) override {
-    Profile(ID, LHS, getOpcode(), RHS, getType());
-  }
+/// Represents a symbolic expression like 3 - 'x'.
+using IntSymExpr = BinarySymExprImpl<const llvm::APSInt &, const SymExpr *,
+                                     SymExpr::Kind::IntSymExprKind>;
 
-  // Implement isa<T> support.
-  static bool classof(const SymExpr *SE) {
-    return SE->getKind() == SymSymExprKind;
-  }
-};
+/// Represents a symbolic expression like 'x' + 'y'.
+using SymSymExpr = BinarySymExprImpl<const SymExpr *, const SymExpr *,
+                                     SymExpr::Kind::SymSymExprKind>;
 
 class SymbolManager {
   using DataSetTy = llvm::FoldingSet<SymExpr>;
diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
index ff6cd21b09d63..6ca7aec9caeca 100644
--- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
@@ -34,45 +34,27 @@ using namespace ento;
 
 void SymExpr::anchor() {}
 
-LLVM_DUMP_METHOD void SymExpr::dump() const {
-  dumpToStream(llvm::errs());
-}
+LLVM_DUMP_METHOD void SymExpr::dump() const { dumpToStream(llvm::errs()); }
 
-void SymIntExpr::dumpToStream(raw_ostream &os) const {
-  os << '(';
-  getLHS()->dumpToStream(os);
-  os << ") "
-     << BinaryOperator::getOpcodeStr(getOpcode()) << ' ';
-  if (getRHS().isUnsigned())
-    os << getRHS().getZExtValue();
-  else
-    os << getRHS().getSExtValue();
-  if (getRHS().isUnsigned())
-    os << 'U';
+void BinarySymExpr::dumpToStreamImpl(raw_ostream &OS, const SymExpr *Sym) {
+  OS << '(';
+  Sym->dumpToStream(OS);
+  OS << ')';
 }
 
-void IntSymExpr::dumpToStream(raw_ostream &os) const {
-  if (getLHS().isUnsigned())
-    os << getLHS().getZExtValue();
+void BinarySymExpr::dumpToStreamImpl(raw_ostream &OS,
+                                     const llvm::APSInt &Value) {
+  if (Value.isUnsigned())
+    OS << Value.getZExtValue();
   else
-    os << getLHS().getSExtValue();
-  if (getLHS().isUnsigned())
-    os << 'U';
-  os << ' '
-     << BinaryOperator::getOpcodeStr(getOpcode())
-     << " (";
-  getRHS()->dumpToStream(os);
-  os << ')';
+    OS << Value.getSExtValue();
+  if (Value.isUnsigned())
+    OS << 'U';
 }
 
-void SymSymExpr::dumpToStream(raw_ostream &os) const {
-  os << '(';
-  getLHS()->dumpToStream(os);
-  os << ") "
-     << BinaryOperator::getOpcodeStr(getOpcode())
-     << " (";
-  getRHS()->dumpToStream(os);
-  os << ')';
+void BinarySymExpr::dumpToStreamImpl(raw_ostream &OS,
+                                     BinaryOperator::Opcode Op) {
+  OS << ' ' << BinaryOperator::getOpcodeStr(Op) << ' ';
 }
 
 void SymbolCast::dumpToStream(raw_ostream &os) const {

From 1f57d76a8dd00611aaa4b33048be195ea9a2dc44 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Fri, 1 May 2020 11:49:23 +0300
Subject: [PATCH 368/770] [analyzer] Refactor range inference for symbolic
 expressions

Summary:
This change introduces a new component to unite all of the reasoning
we have about operations on ranges in the analyzer's solver.
In many cases, we might conclude that the range for a symbolic operation
is much more narrow than the type implies.  While reasoning about
runtime conditions (especially in loops), we need to support more and
more of those little pieces of logic.  The new component mostly plays
a role of an organizer for those, and allows us to focus on the actual
reasoning about ranges and not dispatching manually on the types of the
nested symbolic expressions.

Differential Revision: https://reviews.llvm.org/D79232
---
 .../PathSensitive/RangedConstraintManager.h   |   7 +
 .../Core/RangeConstraintManager.cpp           | 303 +++++++++++++-----
 clang/test/Analysis/constant-folding.c        |  19 +-
 clang/test/Analysis/double-ranges-bug.c       |  22 ++
 4 files changed, 264 insertions(+), 87 deletions(-)
 create mode 100644 clang/test/Analysis/double-ranges-bug.c

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
index c72f8292647dc..97c9c6d63eb2f 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
@@ -30,6 +30,10 @@ class Range : public std::pair<const llvm::APSInt *, const llvm::APSInt *> {
       : std::pair<const llvm::APSInt *, const llvm::APSInt *>(&from, &to) {
     assert(from <= to);
   }
+
+  Range(const llvm::APSInt &point)
+      : std::pair<const llvm::APSInt *, const llvm::APSInt *>(&point, &point) {}
+
   bool Includes(const llvm::APSInt &v) const {
     return *first <= v && v <= *second;
   }
@@ -89,6 +93,9 @@ class RangeSet {
   RangeSet(Factory &F, const llvm::APSInt &from, const llvm::APSInt &to)
       : ranges(F.add(F.getEmptySet(), Range(from, to))) {}
 
+  /// Construct a new RangeSet representing the given point as a range.
+  RangeSet(Factory &F, const llvm::APSInt &point) : RangeSet(F, point, point) {}
+
   /// Profile - Generates a hash profile of this RangeSet for use
   ///  by FoldingSet.
   void Profile(llvm::FoldingSetNodeID &ID) const { ranges.Profile(ID); }
diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index a3ea7d4c013b9..368324d3d34fd 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -16,6 +16,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/SValVisitor.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/ImmutableSet.h"
 #include "llvm/Support/raw_ostream.h"
@@ -23,10 +24,16 @@
 using namespace clang;
 using namespace ento;
 
+//===----------------------------------------------------------------------===//
+//                           RangeSet implementation
+//===----------------------------------------------------------------------===//
+
 void RangeSet::IntersectInRange(BasicValueFactory &BV, Factory &F,
-                      const llvm::APSInt &Lower, const llvm::APSInt &Upper,
-                      PrimRangeSet &newRanges, PrimRangeSet::iterator &i,
-                      PrimRangeSet::iterator &e) const {
+                                const llvm::APSInt &Lower,
+                                const llvm::APSInt &Upper,
+                                PrimRangeSet &newRanges,
+                                PrimRangeSet::iterator &i,
+                                PrimRangeSet::iterator &e) const {
   // There are six cases for each range R in the set:
   //   1. R is entirely before the intersection range.
   //   2. R is entirely after the intersection range.
@@ -66,6 +73,11 @@ const llvm::APSInt &RangeSet::getMinValue() const {
 }
 
 bool RangeSet::pin(llvm::APSInt &Lower, llvm::APSInt &Upper) const {
+  if (isEmpty()) {
+    // This range is already infeasible.
+    return false;
+  }
+
   // This function has nine cases, the cartesian product of range-testing
   // both the upper and lower bounds against the symbol's type.
   // Each case requires a different pinning operation.
@@ -283,6 +295,207 @@ void RangeSet::print(raw_ostream &os) const {
 }
 
 namespace {
+
+/// A little component aggregating all of the reasoning we have about
+/// the ranges of symbolic expressions.
+///
+/// Even when we don't know the exact values of the operands, we still
+/// can get a pretty good estimate of the result's range.
+class SymbolicRangeInferrer
+    : public SymExprVisitor<SymbolicRangeInferrer, RangeSet> {
+public:
+  static RangeSet inferRange(BasicValueFactory &BV, RangeSet::Factory &F,
+                             ProgramStateRef State, SymbolRef Sym) {
+    SymbolicRangeInferrer Inferrer(BV, F, State);
+    return Inferrer.infer(Sym);
+  }
+
+  RangeSet VisitSymExpr(SymbolRef Sym) {
+    // If we got to this function, the actual type of the symbolic
+    // expression is not supported for advanced inference.
+    // In this case, we simply fall back to the default "let's simply
+    // infer the range from the expression's type".
+    return infer(Sym->getType());
+  }
+
+  RangeSet VisitSymIntExpr(const SymIntExpr *Sym) {
+    return VisitBinaryOperator(Sym);
+  }
+
+  RangeSet VisitIntSymExpr(const IntSymExpr *Sym) {
+    return VisitBinaryOperator(Sym);
+  }
+
+  RangeSet VisitSymSymExpr(const SymSymExpr *Sym) {
+    return VisitBinaryOperator(Sym);
+  }
+
+private:
+  SymbolicRangeInferrer(BasicValueFactory &BV, RangeSet::Factory &F,
+                        ProgramStateRef S)
+      : ValueFactory(BV), RangeFactory(F), State(S) {}
+
+  /// Infer range information from the given integer constant.
+  ///
+  /// It's not a real "inference", but is here for operating with
+  /// sub-expressions in a more polymorphic manner.
+  RangeSet inferAs(const llvm::APSInt &Val, QualType) {
+    return {RangeFactory, Val};
+  }
+
+  /// Infer range information from symbol in the context of the given type.
+  RangeSet inferAs(SymbolRef Sym, QualType DestType) {
+    QualType ActualType = Sym->getType();
+    // Check that we can reason about the symbol at all.
+    if (ActualType->isIntegralOrEnumerationType() ||
+        Loc::isLocType(ActualType)) {
+      return infer(Sym);
+    }
+    // Otherwise, let's simply infer from the destination type.
+    // We couldn't figure out anything else about that expression.
+    return infer(DestType);
+  }
+
+  RangeSet infer(SymbolRef Sym) {
+    const RangeSet *AssociatedRange = State->get<ConstraintRange>(Sym);
+
+    // If Sym is a difference of symbols A - B, then maybe we have range set
+    // stored for B - A.
+    const RangeSet *RangeAssociatedWithNegatedSym =
+        getRangeForMinusSymbol(State, Sym);
+
+    // If we have range set stored for both A - B and B - A then calculate the
+    // effective range set by intersecting the range set for A - B and the
+    // negated range set of B - A.
+    if (AssociatedRange && RangeAssociatedWithNegatedSym)
+      return AssociatedRange->Intersect(
+          ValueFactory, RangeFactory,
+          RangeAssociatedWithNegatedSym->Negate(ValueFactory, RangeFactory));
+
+    if (AssociatedRange)
+      return *AssociatedRange;
+
+    if (RangeAssociatedWithNegatedSym)
+      return RangeAssociatedWithNegatedSym->Negate(ValueFactory, RangeFactory);
+
+    return Visit(Sym);
+  }
+
+  /// Infer range information solely from the type.
+  RangeSet infer(QualType T) {
+    // Lazily generate a new RangeSet representing all possible values for the
+    // given symbol type.
+    RangeSet Result(RangeFactory, ValueFactory.getMinValue(T),
+                    ValueFactory.getMaxValue(T));
+
+    // References are known to be non-zero.
+    if (T->isReferenceType())
+      return assumeNonZero(Result, T);
+
+    return Result;
+  }
+
+  template <class BinarySymExprTy>
+  RangeSet VisitBinaryOperator(const BinarySymExprTy *Sym) {
+    // TODO #1: VisitBinaryOperator implementation might not make a good
+    // use of the inferred ranges.  In this case, we might be calculating
+    // everything for nothing.  This being said, we should introduce some
+    // sort of laziness mechanism here.
+    //
+    // TODO #2: We didn't go into the nested expressions before, so it
+    // might cause us to spend much more time doing the inference.
+    // This can be a problem for deeply nested expressions that are
+    // involved in conditions and get tested continuously.  We definitely
+    // need to address this issue and introduce some sort of caching
+    // in here.
+    QualType ResultType = Sym->getType();
+    return VisitBinaryOperator(inferAs(Sym->getLHS(), ResultType),
+                               Sym->getOpcode(),
+                               inferAs(Sym->getRHS(), ResultType), ResultType);
+  }
+
+  RangeSet VisitBinaryOperator(RangeSet LHS, BinaryOperator::Opcode Op,
+                               RangeSet RHS, QualType T) {
+    switch (Op) {
+    case BO_Or:
+      return VisitOrOperator(LHS, RHS, T);
+    case BO_And:
+      return VisitAndOperator(LHS, RHS, T);
+    default:
+      return infer(T);
+    }
+  }
+
+  RangeSet VisitOrOperator(RangeSet LHS, RangeSet RHS, QualType T) {
+    // TODO: generalize for the ranged RHS.
+    if (const llvm::APSInt *RHSConstant = RHS.getConcreteValue()) {
+      // For unsigned types, the output is greater than or equal to RHS.
+      if (T->isUnsignedIntegerType()) {
+        return LHS.Intersect(ValueFactory, RangeFactory, *RHSConstant,
+                             ValueFactory.getMaxValue(T));
+      }
+
+      // Bitwise-or with a non-zero constant is always non-zero.
+      const llvm::APSInt &Zero = ValueFactory.getAPSIntType(T).getZeroValue();
+      if (*RHSConstant != Zero) {
+        return assumeNonZero(LHS, T);
+      }
+    }
+    return infer(T);
+  }
+
+  RangeSet VisitAndOperator(RangeSet LHS, RangeSet RHS, QualType T) {
+    // TODO: generalize for the ranged RHS.
+    if (const llvm::APSInt *RHSConstant = RHS.getConcreteValue()) {
+      const llvm::APSInt &Zero = ValueFactory.getAPSIntType(T).getZeroValue();
+
+      // For unsigned types, or non-negative RHS, the bitwise-and output is
+      // always less than or equal to RHS (assuming two's complement
+      // representation of signed types).
+      if (T->isUnsignedIntegerType() || *RHSConstant >= Zero) {
+        return LHS.Intersect(ValueFactory, RangeFactory,
+                             ValueFactory.getMinValue(T), *RHSConstant);
+      }
+    }
+    return infer(T);
+  }
+
+  /// Return a range set subtracting zero from \p Domain.
+  RangeSet assumeNonZero(RangeSet Domain, QualType T) {
+    APSIntType IntType = ValueFactory.getAPSIntType(T);
+    return Domain.Intersect(ValueFactory, RangeFactory,
+                            ++IntType.getZeroValue(), --IntType.getZeroValue());
+  }
+
+  // FIXME: Once SValBuilder supports unary minus, we should use SValBuilder to
+  //        obtain the negated symbolic expression instead of constructing the
+  //        symbol manually. This will allow us to support finding ranges of not
+  //        only negated SymSymExpr-type expressions, but also of other, simpler
+  //        expressions which we currently do not know how to negate.
+  const RangeSet *getRangeForMinusSymbol(ProgramStateRef State, SymbolRef Sym) {
+    if (const SymSymExpr *SSE = dyn_cast<SymSymExpr>(Sym)) {
+      if (SSE->getOpcode() == BO_Sub) {
+        QualType T = Sym->getType();
+        SymbolManager &SymMgr = State->getSymbolManager();
+        SymbolRef negSym =
+            SymMgr.getSymSymExpr(SSE->getRHS(), BO_Sub, SSE->getLHS(), T);
+
+        if (const RangeSet *negV = State->get<ConstraintRange>(negSym)) {
+          // Negation is only done for integer and enumeration types.
+          if (T->isUnsignedIntegerOrEnumerationType() ||
+              T->isSignedIntegerOrEnumerationType())
+            return negV;
+        }
+      }
+    }
+    return nullptr;
+  }
+
+  BasicValueFactory &ValueFactory;
+  RangeSet::Factory &RangeFactory;
+  ProgramStateRef State;
+};
+
 class RangeConstraintManager : public RangedConstraintManager {
 public:
   RangeConstraintManager(ExprEngine *EE, SValBuilder &SVB)
@@ -350,8 +563,7 @@ class RangeConstraintManager : public RangedConstraintManager {
   RangeSet::Factory F;
 
   RangeSet getRange(ProgramStateRef State, SymbolRef Sym);
-  const RangeSet* getRangeForMinusSymbol(ProgramStateRef State,
-                                         SymbolRef Sym);
+  const RangeSet *getRangeForMinusSymbol(ProgramStateRef State, SymbolRef Sym);
 
   RangeSet getSymLTRange(ProgramStateRef St, SymbolRef Sym,
                          const llvm::APSInt &Int,
@@ -368,7 +580,6 @@ class RangeConstraintManager : public RangedConstraintManager {
   RangeSet getSymGERange(ProgramStateRef St, SymbolRef Sym,
                          const llvm::APSInt &Int,
                          const llvm::APSInt &Adjustment);
-
 };
 
 } // end anonymous namespace
@@ -475,87 +686,9 @@ RangeConstraintManager::removeDeadBindings(ProgramStateRef State,
   return Changed ? State->set<ConstraintRange>(CR) : State;
 }
 
-/// Return a range set subtracting zero from \p Domain.
-static RangeSet assumeNonZero(
-    BasicValueFactory &BV,
-    RangeSet::Factory &F,
-    SymbolRef Sym,
-    RangeSet Domain) {
-  APSIntType IntType = BV.getAPSIntType(Sym->getType());
-  return Domain.Intersect(BV, F, ++IntType.getZeroValue(),
-      --IntType.getZeroValue());
-}
-
-/// Apply implicit constraints for bitwise OR- and AND-.
-/// For unsigned types, bitwise OR with a constant always returns
-/// a value greater-or-equal than the constant, and bitwise AND
-/// returns a value less-or-equal then the constant.
-///
-/// Pattern matches the expression \p Sym against those rule,
-/// and applies the required constraints.
-/// \p Input Previously established expression range set
-static RangeSet applyBitwiseConstraints(
-    BasicValueFactory &BV,
-    RangeSet::Factory &F,
-    RangeSet Input,
-    const SymIntExpr* SIE) {
-  QualType T = SIE->getType();
-  bool IsUnsigned = T->isUnsignedIntegerType();
-  const llvm::APSInt &RHS = SIE->getRHS();
-  const llvm::APSInt &Zero = BV.getAPSIntType(T).getZeroValue();
-  BinaryOperator::Opcode Operator = SIE->getOpcode();
-
-  // For unsigned types, the output of bitwise-or is bigger-or-equal than RHS.
-  if (Operator == BO_Or && IsUnsigned)
-    return Input.Intersect(BV, F, RHS, BV.getMaxValue(T));
-
-  // Bitwise-or with a non-zero constant is always non-zero.
-  if (Operator == BO_Or && RHS != Zero)
-    return assumeNonZero(BV, F, SIE, Input);
-
-  // For unsigned types, or positive RHS,
-  // bitwise-and output is always smaller-or-equal than RHS (assuming two's
-  // complement representation of signed types).
-  if (Operator == BO_And && (IsUnsigned || RHS >= Zero))
-    return Input.Intersect(BV, F, BV.getMinValue(T), RHS);
-
-  return Input;
-}
-
 RangeSet RangeConstraintManager::getRange(ProgramStateRef State,
                                           SymbolRef Sym) {
-  ConstraintRangeTy::data_type *V = State->get<ConstraintRange>(Sym);
-
-  // If Sym is a difference of symbols A - B, then maybe we have range set
-  // stored for B - A.
-  BasicValueFactory &BV = getBasicVals();
-  const RangeSet *R = getRangeForMinusSymbol(State, Sym);
-
-  // If we have range set stored for both A - B and B - A then calculate the
-  // effective range set by intersecting the range set for A - B and the
-  // negated range set of B - A.
-  if (V && R)
-    return V->Intersect(BV, F, R->Negate(BV, F));
-  if (V)
-    return *V;
-  if (R)
-    return R->Negate(BV, F);
-
-  // Lazily generate a new RangeSet representing all possible values for the
-  // given symbol type.
-  QualType T = Sym->getType();
-
-  RangeSet Result(F, BV.getMinValue(T), BV.getMaxValue(T));
-
-  // References are known to be non-zero.
-  if (T->isReferenceType())
-    return assumeNonZero(BV, F, Sym, Result);
-
-  // Known constraints on ranges of bitwise expressions.
-  if (const SymIntExpr* SIE = dyn_cast<SymIntExpr>(Sym))
-    return applyBitwiseConstraints(BV, F, Result, SIE);
-
-  return Result;
+  return SymbolicRangeInferrer::inferRange(getBasicVals(), F, State, Sym);
 }
 
 // FIXME: Once SValBuilder supports unary minus, we should use SValBuilder to
diff --git a/clang/test/Analysis/constant-folding.c b/clang/test/Analysis/constant-folding.c
index fc04e608f7ef6..5429f3c740e1c 100644
--- a/clang/test/Analysis/constant-folding.c
+++ b/clang/test/Analysis/constant-folding.c
@@ -115,7 +115,22 @@ void testBitwiseRules(unsigned int a, int b) {
 #endif
 
   // Check that dynamically computed constants also work.
-  int constant = 1 << 3;
+  unsigned int constant = 1 << 3;
   unsigned int d = a | constant;
-  clang_analyzer_eval(constant > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(d >= constant); // expected-warning{{TRUE}}
+
+  // Check that nested expressions also work.
+  clang_analyzer_eval(((a | 10) | 5) >= 10); // expected-warning{{TRUE}}
+
+  // TODO: We misuse intersection of ranges for bitwise AND and OR operators.
+  //       Resulting ranges for the following cases are infeasible.
+  //       This is what causes paradoxical results below.
+  if (a > 10) {
+    clang_analyzer_eval((a & 1) <= 1); // expected-warning{{FALSE}}
+    clang_analyzer_eval((a & 1) > 1);  // expected-warning{{FALSE}}
+  }
+  if (a < 10) {
+    clang_analyzer_eval((a | 20) >= 20); // expected-warning{{FALSE}}
+    clang_analyzer_eval((a | 20) < 20);  // expected-warning{{FALSE}}
+  }
 }
diff --git a/clang/test/Analysis/double-ranges-bug.c b/clang/test/Analysis/double-ranges-bug.c
new file mode 100644
index 0000000000000..aa3dde2930348
--- /dev/null
+++ b/clang/test/Analysis/double-ranges-bug.c
@@ -0,0 +1,22 @@
+// RUN: %clang_analyze_cc1 -verify %s -analyzer-checker=core
+
+// expected-no-diagnostics
+
+typedef unsigned long int A;
+
+extern int fill(A **values, int *nvalues);
+
+void foo() {
+  A *values;
+  int nvalues;
+  fill(&values, &nvalues);
+
+  int i = 1;
+  double x, y;
+
+  y = values[i - 1];
+  x = values[i];
+
+  if (x <= y) {
+  }
+}

From 47c4b8bd68698b1827f39c3056783ed042faf718 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Mon, 4 May 2020 19:44:43 +0300
Subject: [PATCH 369/770] [analyzer] Generalize bitwise OR rules for ranges

Summary:
Previously, the solver reasoned about bitwise OR expressions only when
one of the operands was a constant.  However, very similar logic can be
applied to operands that are ranges.  This commit addresses this
shortcoming.  Additionally, it refines how we deal with negative
operands.

Differential Revision: https://reviews.llvm.org/D79336
---
 .../Core/PathSensitive/BasicValueFactory.h    |  16 ++-
 .../PathSensitive/RangedConstraintManager.h   |   8 +-
 .../Core/RangeConstraintManager.cpp           | 124 ++++++++++++++++--
 clang/test/Analysis/constant-folding.c        |  51 +++++--
 4 files changed, 166 insertions(+), 33 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h
index ac218bc070e9a..a001c0dc70308 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h
@@ -157,6 +157,10 @@ class BasicValueFactory {
 
   const llvm::APSInt &Convert(QualType T, const llvm::APSInt &From) {
     APSIntType TargetType = getAPSIntType(T);
+    return Convert(TargetType, From);
+  }
+
+  const llvm::APSInt &Convert(APSIntType TargetType, const llvm::APSInt &From) {
     if (TargetType == APSIntType(From))
       return From;
 
@@ -177,11 +181,19 @@ class BasicValueFactory {
   }
 
   const llvm::APSInt &getMaxValue(QualType T) {
-    return getValue(getAPSIntType(T).getMaxValue());
+    return getMaxValue(getAPSIntType(T));
   }
 
   const llvm::APSInt &getMinValue(QualType T) {
-    return getValue(getAPSIntType(T).getMinValue());
+    return getMinValue(getAPSIntType(T));
+  }
+
+  const llvm::APSInt &getMaxValue(APSIntType T) {
+    return getValue(T.getMaxValue());
+  }
+
+  const llvm::APSInt &getMinValue(APSIntType T) {
+    return getValue(T.getMinValue());
   }
 
   const llvm::APSInt &Add1(const llvm::APSInt &V) {
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
index 97c9c6d63eb2f..a42eebd7d4e81 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/RangedConstraintManager.h
@@ -107,14 +107,17 @@ class RangeSet {
     return ranges.isSingleton() ? ranges.begin()->getConcreteValue() : nullptr;
   }
 
+  /// Get a minimal value covered by the ranges in the set
+  const llvm::APSInt &getMinValue() const;
+  /// Get a maximal value covered by the ranges in the set
+  const llvm::APSInt &getMaxValue() const;
+
 private:
   void IntersectInRange(BasicValueFactory &BV, Factory &F,
                         const llvm::APSInt &Lower, const llvm::APSInt &Upper,
                         PrimRangeSet &newRanges, PrimRangeSet::iterator &i,
                         PrimRangeSet::iterator &e) const;
 
-  const llvm::APSInt &getMinValue() const;
-
   bool pin(llvm::APSInt &Lower, llvm::APSInt &Upper) const;
 
 public:
@@ -131,7 +134,6 @@ class RangeSet {
   }
 };
 
-
 class ConstraintRange {};
 using ConstraintRangeTy = llvm::ImmutableMap<SymbolRef, RangeSet>;
 
diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 368324d3d34fd..570161543805f 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -69,7 +69,19 @@ void RangeSet::IntersectInRange(BasicValueFactory &BV, Factory &F,
 
 const llvm::APSInt &RangeSet::getMinValue() const {
   assert(!isEmpty());
-  return ranges.begin()->From();
+  return begin()->From();
+}
+
+const llvm::APSInt &RangeSet::getMaxValue() const {
+  assert(!isEmpty());
+  // NOTE: It's a shame that we can't implement 'getMaxValue' without scanning
+  //       the whole tree to get to the last element.
+  //       llvm::ImmutableSet should support decrement for 'end' iterators
+  //       or reverse order iteration.
+  auto It = begin();
+  for (auto End = end(); std::next(It) != End; ++It) {
+  }
+  return It->To();
 }
 
 bool RangeSet::pin(llvm::APSInt &Lower, llvm::APSInt &Upper) const {
@@ -426,22 +438,106 @@ class SymbolicRangeInferrer
     }
   }
 
+  //===----------------------------------------------------------------------===//
+  //                         Ranges and operators
+  //===----------------------------------------------------------------------===//
+
+  /// Return a rough approximation of the given range set.
+  ///
+  /// For the range set:
+  ///   { [x_0, y_0], [x_1, y_1], ... , [x_N, y_N] }
+  /// it will return the range [x_0, y_N].
+  static Range fillGaps(RangeSet Origin) {
+    assert(!Origin.isEmpty());
+    return {Origin.getMinValue(), Origin.getMaxValue()};
+  }
+
+  /// Try to convert given range into the given type.
+  ///
+  /// It will return llvm::None when the trivial conversion is not possible.
+  llvm::Optional<Range> convert(const Range &Origin, APSIntType To) {
+    if (To.testInRange(Origin.From(), false) != APSIntType::RTR_Within ||
+        To.testInRange(Origin.To(), false) != APSIntType::RTR_Within) {
+      return llvm::None;
+    }
+    return Range(ValueFactory.Convert(To, Origin.From()),
+                 ValueFactory.Convert(To, Origin.To()));
+  }
+
   RangeSet VisitOrOperator(RangeSet LHS, RangeSet RHS, QualType T) {
-    // TODO: generalize for the ranged RHS.
-    if (const llvm::APSInt *RHSConstant = RHS.getConcreteValue()) {
-      // For unsigned types, the output is greater-or-equal than RHS.
-      if (T->isUnsignedIntegerType()) {
-        return LHS.Intersect(ValueFactory, RangeFactory, *RHSConstant,
-                             ValueFactory.getMaxValue(T));
-      }
+    // We should propagate information about unfeasbility of one of the
+    // operands to the resulting range.
+    if (LHS.isEmpty() || RHS.isEmpty()) {
+      return RangeFactory.getEmptySet();
+    }
 
-      // Bitwise-or with a non-zero constant is always non-zero.
-      const llvm::APSInt &Zero = ValueFactory.getAPSIntType(T).getZeroValue();
-      if (*RHSConstant != Zero) {
-        return assumeNonZero(LHS, T);
-      }
+    APSIntType ResultType = ValueFactory.getAPSIntType(T);
+    RangeSet DefaultRange = infer(T);
+
+    Range CoarseLHS = fillGaps(LHS);
+    Range CoarseRHS = fillGaps(RHS);
+
+    // We need to convert ranges to the resulting type, so we can compare values
+    // and combine them in a meaningful (in terms of the given operation) way.
+    auto ConvertedCoarseLHS = convert(CoarseLHS, ResultType);
+    auto ConvertedCoarseRHS = convert(CoarseRHS, ResultType);
+
+    // It is hard to reason about ranges when conversion changes
+    // borders of the ranges.
+    if (!ConvertedCoarseLHS || !ConvertedCoarseRHS) {
+      return DefaultRange;
     }
-    return infer(T);
+
+    llvm::APSInt Zero = ResultType.getZeroValue();
+
+    bool IsLHSPositiveOrZero = ConvertedCoarseLHS->From() >= Zero;
+    bool IsRHSPositiveOrZero = ConvertedCoarseRHS->From() >= Zero;
+
+    bool IsLHSNegative = ConvertedCoarseLHS->To() < Zero;
+    bool IsRHSNegative = ConvertedCoarseRHS->To() < Zero;
+
+    // Check if both ranges have the same sign.
+    if ((IsLHSPositiveOrZero && IsRHSPositiveOrZero) ||
+        (IsLHSNegative && IsRHSNegative)) {
+      // The result is definitely greater or equal than any of the operands.
+      const llvm::APSInt &Min =
+          std::max(ConvertedCoarseLHS->From(), ConvertedCoarseRHS->From());
+
+      // We estimate maximal value for positives as the maximal value for the
+      // given type.  For negatives, we estimate it with -1 (e.g. 0x11111111).
+      //
+      // TODO: We basically, limit the resulting range from below (in absolute
+      //       numbers), but don't do anything with the upper bound.
+      //       For positive operands, it can be done as follows: for the upper
+      //       bound of LHS and RHS we calculate the most significant bit set.
+      //       Let's call it the N-th bit.  Then we can estimate the maximal
+      //       number to be 2^(N+1)-1, i.e. the number with all the bits up to
+      //       the N-th bit set.
+      const llvm::APSInt &Max = IsLHSNegative
+                                    ? ValueFactory.getValue(--Zero)
+                                    : ValueFactory.getMaxValue(ResultType);
+
+      return {RangeFactory, ValueFactory.getValue(Min), Max};
+    }
+
+    // Otherwise, let's check if at least one of the operands is negative.
+    if (IsLHSNegative || IsRHSNegative) {
+      // This means that the result is definitely negative as well.
+      return {RangeFactory, ValueFactory.getMinValue(ResultType),
+              ValueFactory.getValue(--Zero)};
+    }
+
+    // It is pretty hard to reason about operands with different signs
+    // (and especially with possibly different signs).  We simply check if it
+    // can be zero.  In order to conclude that the result could not be zero,
+    // at least one of the operands should be definitely not zero itself.
+    if (!ConvertedCoarseLHS->Includes(Zero) ||
+        !ConvertedCoarseRHS->Includes(Zero)) {
+      return assumeNonZero(DefaultRange, T);
+    }
+
+    // Nothing much else to do here.
+    return DefaultRange;
   }
 
   RangeSet VisitAndOperator(RangeSet LHS, RangeSet RHS, QualType T) {
diff --git a/clang/test/Analysis/constant-folding.c b/clang/test/Analysis/constant-folding.c
index 5429f3c740e1c..1fdd474dc90ad 100644
--- a/clang/test/Analysis/constant-folding.c
+++ b/clang/test/Analysis/constant-folding.c
@@ -77,7 +77,7 @@ void testMixedTypeComparisons (char a, unsigned long b) {
   clang_analyzer_eval(a != b); // expected-warning{{TRUE}}
 }
 
-void testBitwiseRules(unsigned int a, int b) {
+void testBitwiseRules(unsigned int a, int b, int c) {
   clang_analyzer_eval((a | 1) >= 1); // expected-warning{{TRUE}}
   clang_analyzer_eval((a | -1) >= -1); // expected-warning{{TRUE}}
   clang_analyzer_eval((a | 2) >= 2); // expected-warning{{TRUE}}
@@ -96,9 +96,9 @@ void testBitwiseRules(unsigned int a, int b) {
   // Again, check for different argument order.
   clang_analyzer_eval((1 & a) <= 1); // expected-warning{{TRUE}}
 
-  unsigned int c = a;
-  c |= 1;
-  clang_analyzer_eval((c | 0) == 0); // expected-warning{{FALSE}}
+  unsigned int d = a;
+  d |= 1;
+  clang_analyzer_eval((d | 0) == 0); // expected-warning{{FALSE}}
 
   // Rules don't apply to signed typed, as the values might be negative.
   clang_analyzer_eval((b | 1) > 0); // expected-warning{{UNKNOWN}}
@@ -108,20 +108,47 @@ void testBitwiseRules(unsigned int a, int b) {
   clang_analyzer_eval((b | -2) == 0); // expected-warning{{FALSE}}
   clang_analyzer_eval((b | 10) == 0); // expected-warning{{FALSE}}
   clang_analyzer_eval((b | 0) == 0); // expected-warning{{UNKNOWN}}
-#ifdef ANALYZER_CM_Z3
   clang_analyzer_eval((b | -2) >= 0); // expected-warning{{FALSE}}
-#else
-  clang_analyzer_eval((b | -2) >= 0); // expected-warning{{UNKNOWN}}
-#endif
+
+  // Check that we can operate with negative ranges
+  if (b < 0) {
+    clang_analyzer_eval((b | -1) == -1);   // expected-warning{{TRUE}}
+    clang_analyzer_eval((b | -10) >= -10); // expected-warning{{TRUE}}
+
+    int e = (b | -5);
+    clang_analyzer_eval(e >= -5 && e <= -1); // expected-warning{{TRUE}}
+
+    if (b < -20) {
+      clang_analyzer_eval((b | e) >= -5); // expected-warning{{TRUE}}
+    }
+
+    // Check that we can reason about the result even if know nothing
+    // about one of the operands.
+    clang_analyzer_eval((b | c) != 0); // expected-warning{{TRUE}}
+  }
+
+  if (a <= 30 && b >= 10 && c >= 20) {
+    // Check that we can reason about non-constant operands.
+    clang_analyzer_eval((b | c) >= 20); // expected-warning{{TRUE}}
+
+    // Check that we can reason about the resulting range even if
+    // the types are not the same, but we still can convert operand
+    // ranges.
+    clang_analyzer_eval((a | b) >= 10); // expected-warning{{TRUE}}
+  }
 
   // Check that dynamically computed constants also work.
   unsigned int constant = 1 << 3;
-  unsigned int d = a | constant;
-  clang_analyzer_eval(d >= constant); // expected-warning{{TRUE}}
+  unsigned int f = a | constant;
+  clang_analyzer_eval(f >= constant); // expected-warning{{TRUE}}
 
   // Check that nested expressions also work.
   clang_analyzer_eval(((a | 10) | 5) >= 10); // expected-warning{{TRUE}}
 
+  if (a < 10) {
+    clang_analyzer_eval((a | 20) >= 20); // expected-warning{{TRUE}}
+  }
+
   // TODO: We misuse intersection of ranges for bitwise AND and OR operators.
   //       Resulting ranges for the following cases are infeasible.
   //       This is what causes paradoxical results below.
@@ -129,8 +156,4 @@ void testBitwiseRules(unsigned int a, int b) {
     clang_analyzer_eval((a & 1) <= 1); // expected-warning{{FALSE}}
     clang_analyzer_eval((a & 1) > 1);  // expected-warning{{FALSE}}
   }
-  if (a < 10) {
-    clang_analyzer_eval((a | 20) >= 20); // expected-warning{{FALSE}}
-    clang_analyzer_eval((a | 20) < 20);  // expected-warning{{FALSE}}
-  }
 }

From 2a09daff0f902e70a08f2b30f3461fb8848f5ab1 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Tue, 5 May 2020 19:42:33 +0300
Subject: [PATCH 370/770] [analyzer] Generalize bitwise AND rules for ranges

Summary:
Previously, the solver reasoned about bitwise AND expressions only when
one of the operands was a constant.  However, very similar logic can be
applied to operands that are ranges.  This commit addresses this
shortcoming.  Additionally, it refines how we deal with negative
operands.

rdar://problem/54359410

Differential Revision: https://reviews.llvm.org/D79434
---
 .../Core/RangeConstraintManager.cpp           | 183 +++++++++++-------
 clang/test/Analysis/constant-folding.c        |  47 +++--
 clang/test/Analysis/switch-case.c             |  11 ++
 .../Analysis/uninit-exhaustive-switch-bug.c   |  20 ++
 4 files changed, 176 insertions(+), 85 deletions(-)
 create mode 100644 clang/test/Analysis/uninit-exhaustive-switch-bug.c

diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 570161543805f..b73c395d80fa7 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -430,9 +430,9 @@ class SymbolicRangeInferrer
                                RangeSet RHS, QualType T) {
     switch (Op) {
     case BO_Or:
-      return VisitOrOperator(LHS, RHS, T);
+      return VisitBinaryOperator<BO_Or>(LHS, RHS, T);
     case BO_And:
-      return VisitAndOperator(LHS, RHS, T);
+      return VisitBinaryOperator<BO_And>(LHS, RHS, T);
     default:
       return infer(T);
     }
@@ -464,19 +464,19 @@ class SymbolicRangeInferrer
                  ValueFactory.Convert(To, Origin.To()));
   }
 
-  RangeSet VisitOrOperator(RangeSet LHS, RangeSet RHS, QualType T) {
+  template <BinaryOperator::Opcode Op>
+  RangeSet VisitBinaryOperator(RangeSet LHS, RangeSet RHS, QualType T) {
     // We should propagate information about unfeasbility of one of the
     // operands to the resulting range.
     if (LHS.isEmpty() || RHS.isEmpty()) {
       return RangeFactory.getEmptySet();
     }
 
-    APSIntType ResultType = ValueFactory.getAPSIntType(T);
-    RangeSet DefaultRange = infer(T);
-
     Range CoarseLHS = fillGaps(LHS);
     Range CoarseRHS = fillGaps(RHS);
 
+    APSIntType ResultType = ValueFactory.getAPSIntType(T);
+
     // We need to convert ranges to the resulting type, so we can compare values
     // and combine them in a meaningful (in terms of the given operation) way.
     auto ConvertedCoarseLHS = convert(CoarseLHS, ResultType);
@@ -485,74 +485,14 @@ class SymbolicRangeInferrer
     // It is hard to reason about ranges when conversion changes
     // borders of the ranges.
     if (!ConvertedCoarseLHS || !ConvertedCoarseRHS) {
-      return DefaultRange;
-    }
-
-    llvm::APSInt Zero = ResultType.getZeroValue();
-
-    bool IsLHSPositiveOrZero = ConvertedCoarseLHS->From() >= Zero;
-    bool IsRHSPositiveOrZero = ConvertedCoarseRHS->From() >= Zero;
-
-    bool IsLHSNegative = ConvertedCoarseLHS->To() < Zero;
-    bool IsRHSNegative = ConvertedCoarseRHS->To() < Zero;
-
-    // Check if both ranges have the same sign.
-    if ((IsLHSPositiveOrZero && IsRHSPositiveOrZero) ||
-        (IsLHSNegative && IsRHSNegative)) {
-      // The result is definitely greater or equal than any of the operands.
-      const llvm::APSInt &Min =
-          std::max(ConvertedCoarseLHS->From(), ConvertedCoarseRHS->From());
-
-      // We estimate maximal value for positives as the maximal value for the
-      // given type.  For negatives, we estimate it with -1 (e.g. 0x11111111).
-      //
-      // TODO: We basically, limit the resulting range from below (in absolute
-      //       numbers), but don't do anything with the upper bound.
-      //       For positive operands, it can be done as follows: for the upper
-      //       bound of LHS and RHS we calculate the most significant bit set.
-      //       Let's call it the N-th bit.  Then we can estimate the maximal
-      //       number to be 2^(N+1)-1, i.e. the number with all the bits up to
-      //       the N-th bit set.
-      const llvm::APSInt &Max = IsLHSNegative
-                                    ? ValueFactory.getValue(--Zero)
-                                    : ValueFactory.getMaxValue(ResultType);
-
-      return {RangeFactory, ValueFactory.getValue(Min), Max};
-    }
-
-    // Otherwise, let's check if at least one of the operands is negative.
-    if (IsLHSNegative || IsRHSNegative) {
-      // This means that the result is definitely negative as well.
-      return {RangeFactory, ValueFactory.getMinValue(ResultType),
-              ValueFactory.getValue(--Zero)};
-    }
-
-    // It is pretty hard to reason about operands with different signs
-    // (and especially with possibly different signs).  We simply check if it
-    // can be zero.  In order to conclude that the result could not be zero,
-    // at least one of the operands should be definitely not zero itself.
-    if (!ConvertedCoarseLHS->Includes(Zero) ||
-        !ConvertedCoarseRHS->Includes(Zero)) {
-      return assumeNonZero(DefaultRange, T);
+      return infer(T);
     }
 
-    // Nothing much else to do here.
-    return DefaultRange;
+    return VisitBinaryOperator<Op>(*ConvertedCoarseLHS, *ConvertedCoarseRHS, T);
   }
 
-  RangeSet VisitAndOperator(RangeSet LHS, RangeSet RHS, QualType T) {
-    // TODO: generalize for the ranged RHS.
-    if (const llvm::APSInt *RHSConstant = RHS.getConcreteValue()) {
-      const llvm::APSInt &Zero = ValueFactory.getAPSIntType(T).getZeroValue();
-
-      // For unsigned types, or positive RHS,
-      // bitwise-and output is always smaller-or-equal than RHS (assuming two's
-      // complement representation of signed types).
-      if (T->isUnsignedIntegerType() || *RHSConstant >= Zero) {
-        return LHS.Intersect(ValueFactory, RangeFactory,
-                             ValueFactory.getMinValue(T), *RHSConstant);
-      }
-    }
+  template <BinaryOperator::Opcode Op>
+  RangeSet VisitBinaryOperator(Range LHS, Range RHS, QualType T) {
     return infer(T);
   }
 
@@ -592,6 +532,109 @@ class SymbolicRangeInferrer
   ProgramStateRef State;
 };
 
+template <>
+RangeSet SymbolicRangeInferrer::VisitBinaryOperator<BO_Or>(Range LHS, Range RHS,
+                                                           QualType T) {
+  APSIntType ResultType = ValueFactory.getAPSIntType(T);
+  llvm::APSInt Zero = ResultType.getZeroValue();
+
+  bool IsLHSPositiveOrZero = LHS.From() >= Zero;
+  bool IsRHSPositiveOrZero = RHS.From() >= Zero;
+
+  bool IsLHSNegative = LHS.To() < Zero;
+  bool IsRHSNegative = RHS.To() < Zero;
+
+  // Check if both ranges have the same sign.
+  if ((IsLHSPositiveOrZero && IsRHSPositiveOrZero) ||
+      (IsLHSNegative && IsRHSNegative)) {
+    // The result is definitely greater or equal than any of the operands.
+    const llvm::APSInt &Min = std::max(LHS.From(), RHS.From());
+
+    // We estimate maximal value for positives as the maximal value for the
+    // given type.  For negatives, we estimate it with -1 (e.g. 0xFFFFFFFF).
+    //
+    // TODO: We basically, limit the resulting range from below, but don't do
+    //       anything with the upper bound.
+    //
+    //       For positive operands, it can be done as follows: for the upper
+    //       bound of LHS and RHS we calculate the most significant bit set.
+    //       Let's call it the N-th bit.  Then we can estimate the maximal
+    //       number to be 2^(N+1)-1, i.e. the number with all the bits up to
+    //       the N-th bit set.
+    const llvm::APSInt &Max = IsLHSNegative
+                                  ? ValueFactory.getValue(--Zero)
+                                  : ValueFactory.getMaxValue(ResultType);
+
+    return {RangeFactory, ValueFactory.getValue(Min), Max};
+  }
+
+  // Otherwise, let's check if at least one of the operands is negative.
+  if (IsLHSNegative || IsRHSNegative) {
+    // This means that the result is definitely negative as well.
+    return {RangeFactory, ValueFactory.getMinValue(ResultType),
+            ValueFactory.getValue(--Zero)};
+  }
+
+  RangeSet DefaultRange = infer(T);
+
+  // It is pretty hard to reason about operands with different signs
+  // (and especially with possibly different signs).  We simply check if it
+  // can be zero.  In order to conclude that the result could not be zero,
+  // at least one of the operands should be definitely not zero itself.
+  if (!LHS.Includes(Zero) || !RHS.Includes(Zero)) {
+    return assumeNonZero(DefaultRange, T);
+  }
+
+  // Nothing much else to do here.
+  return DefaultRange;
+}
+
+template <>
+RangeSet SymbolicRangeInferrer::VisitBinaryOperator<BO_And>(Range LHS,
+                                                            Range RHS,
+                                                            QualType T) {
+  APSIntType ResultType = ValueFactory.getAPSIntType(T);
+  llvm::APSInt Zero = ResultType.getZeroValue();
+
+  bool IsLHSPositiveOrZero = LHS.From() >= Zero;
+  bool IsRHSPositiveOrZero = RHS.From() >= Zero;
+
+  bool IsLHSNegative = LHS.To() < Zero;
+  bool IsRHSNegative = RHS.To() < Zero;
+
+  // Check if both ranges have the same sign.
+  if ((IsLHSPositiveOrZero && IsRHSPositiveOrZero) ||
+      (IsLHSNegative && IsRHSNegative)) {
+    // The result is definitely less or equal than any of the operands.
+    const llvm::APSInt &Max = std::min(LHS.To(), RHS.To());
+
+    // We conservatively estimate lower bound to be the smallest positive
+    // or negative value corresponding to the sign of the operands.
+    const llvm::APSInt &Min = IsLHSNegative
+                                  ? ValueFactory.getMinValue(ResultType)
+                                  : ValueFactory.getValue(Zero);
+
+    return {RangeFactory, Min, Max};
+  }
+
+  // Otherwise, let's check if at least one of the operands is positive.
+  if (IsLHSPositiveOrZero || IsRHSPositiveOrZero) {
+    // This makes result definitely positive.
+    //
+    // We can also reason about a maximal value by finding the maximal
+    // value of the positive operand.
+    const llvm::APSInt &Max = IsLHSPositiveOrZero ? LHS.To() : RHS.To();
+
+    // The minimal value on the other hand is much harder to reason about.
+    // The only thing we know for sure is that the result is positive.
+    return {RangeFactory, ValueFactory.getValue(Zero),
+            ValueFactory.getValue(Max)};
+  }
+
+  // Nothing much else to do here.
+  return infer(T);
+}
+
 class RangeConstraintManager : public RangedConstraintManager {
 public:
   RangeConstraintManager(ExprEngine *EE, SValBuilder &SVB)
diff --git a/clang/test/Analysis/constant-folding.c b/clang/test/Analysis/constant-folding.c
index 1fdd474dc90ad..b3320cc53636d 100644
--- a/clang/test/Analysis/constant-folding.c
+++ b/clang/test/Analysis/constant-folding.c
@@ -78,19 +78,20 @@ void testMixedTypeComparisons (char a, unsigned long b) {
 }
 
 void testBitwiseRules(unsigned int a, int b, int c) {
-  clang_analyzer_eval((a | 1) >= 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval((a | 1) >= 1);   // expected-warning{{TRUE}}
   clang_analyzer_eval((a | -1) >= -1); // expected-warning{{TRUE}}
-  clang_analyzer_eval((a | 2) >= 2); // expected-warning{{TRUE}}
-  clang_analyzer_eval((a | 5) >= 5); // expected-warning{{TRUE}}
+  clang_analyzer_eval((a | 2) >= 2);   // expected-warning{{TRUE}}
+  clang_analyzer_eval((a | 5) >= 5);   // expected-warning{{TRUE}}
   clang_analyzer_eval((a | 10) >= 10); // expected-warning{{TRUE}}
 
   // Argument order should not influence this
   clang_analyzer_eval((1 | a) >= 1); // expected-warning{{TRUE}}
 
-  clang_analyzer_eval((a & 1) <= 1); // expected-warning{{TRUE}}
-  clang_analyzer_eval((a & 2) <= 2); // expected-warning{{TRUE}}
-  clang_analyzer_eval((a & 5) <= 5); // expected-warning{{TRUE}}
-  clang_analyzer_eval((a & 10) <= 10); // expected-warning{{TRUE}}
+  clang_analyzer_eval((a & 1) <= 1);    // expected-warning{{TRUE}}
+  clang_analyzer_eval((a & 1) >= 0);    // expected-warning{{TRUE}}
+  clang_analyzer_eval((a & 2) <= 2);    // expected-warning{{TRUE}}
+  clang_analyzer_eval((a & 5) <= 5);    // expected-warning{{TRUE}}
+  clang_analyzer_eval((a & 10) <= 10);  // expected-warning{{TRUE}}
   clang_analyzer_eval((a & -10) <= 10); // expected-warning{{UNKNOWN}}
 
   // Again, check for different argument order.
@@ -104,22 +105,37 @@ void testBitwiseRules(unsigned int a, int b, int c) {
   clang_analyzer_eval((b | 1) > 0); // expected-warning{{UNKNOWN}}
 
   // Even for signed values, bitwise OR with a non-zero is always non-zero.
-  clang_analyzer_eval((b | 1) == 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval((b | 1) == 0);  // expected-warning{{FALSE}}
   clang_analyzer_eval((b | -2) == 0); // expected-warning{{FALSE}}
   clang_analyzer_eval((b | 10) == 0); // expected-warning{{FALSE}}
-  clang_analyzer_eval((b | 0) == 0); // expected-warning{{UNKNOWN}}
+  clang_analyzer_eval((b | 0) == 0);  // expected-warning{{UNKNOWN}}
   clang_analyzer_eval((b | -2) >= 0); // expected-warning{{FALSE}}
 
   // Check that we can operate with negative ranges
   if (b < 0) {
     clang_analyzer_eval((b | -1) == -1);   // expected-warning{{TRUE}}
     clang_analyzer_eval((b | -10) >= -10); // expected-warning{{TRUE}}
+    clang_analyzer_eval((b & 0) == 0);     // expected-warning{{TRUE}}
+    clang_analyzer_eval((b & -10) <= -10); // expected-warning{{TRUE}}
+    clang_analyzer_eval((b & 5) >= 0);     // expected-warning{{TRUE}}
 
     int e = (b | -5);
     clang_analyzer_eval(e >= -5 && e <= -1); // expected-warning{{TRUE}}
 
     if (b < -20) {
-      clang_analyzer_eval((b | e) >= -5); // expected-warning{{TRUE}}
+      clang_analyzer_eval((b | e) >= -5);    // expected-warning{{TRUE}}
+      clang_analyzer_eval((b & -10) < -20);  // expected-warning{{TRUE}}
+      clang_analyzer_eval((b & e) < -20);    // expected-warning{{TRUE}}
+      clang_analyzer_eval((b & -30) <= -30); // expected-warning{{TRUE}}
+
+      if (c >= -30 && c <= -10) {
+        clang_analyzer_eval((b & c) <= -20); // expected-warning{{TRUE}}
+      }
+    }
+
+    if (a <= 40) {
+      int g = (int)a & b;
+      clang_analyzer_eval(g <= 40 && g >= 0); // expected-warning{{TRUE}}
     }
 
     // Check that we can reason about the result even if know nothing
@@ -135,6 +151,11 @@ void testBitwiseRules(unsigned int a, int b, int c) {
     // the types are not the same, but we still can convert operand
     // ranges.
     clang_analyzer_eval((a | b) >= 10); // expected-warning{{TRUE}}
+    clang_analyzer_eval((a & b) <= 30); // expected-warning{{TRUE}}
+
+    if (b <= 20) {
+      clang_analyzer_eval((a & b) <= 20); // expected-warning{{TRUE}}
+    }
   }
 
   // Check that dynamically computed constants also work.
@@ -149,11 +170,7 @@ void testBitwiseRules(unsigned int a, int b, int c) {
     clang_analyzer_eval((a | 20) >= 20); // expected-warning{{TRUE}}
   }
 
-  // TODO: We misuse intersection of ranges for bitwise AND and OR operators.
-  //       Resulting ranges for the following cases are infeasible.
-  //       This is what causes paradoxical results below.
   if (a > 10) {
-    clang_analyzer_eval((a & 1) <= 1); // expected-warning{{FALSE}}
-    clang_analyzer_eval((a & 1) > 1);  // expected-warning{{FALSE}}
+    clang_analyzer_eval((a & 1) <= 1); // expected-warning{{TRUE}}
   }
 }
diff --git a/clang/test/Analysis/switch-case.c b/clang/test/Analysis/switch-case.c
index 1391f437f1f09..a9f566160f1ba 100644
--- a/clang/test/Analysis/switch-case.c
+++ b/clang/test/Analysis/switch-case.c
@@ -218,3 +218,14 @@ void testConstant() {
     break;
   }
 }
+
+void testExhaustiveSwitch(unsigned int a) {
+  switch (a & 5) {
+  case 0 ... 5:
+    clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+    break;
+  default:
+    clang_analyzer_warnIfReached(); // no-warning
+    break;
+  }
+}
diff --git a/clang/test/Analysis/uninit-exhaustive-switch-bug.c b/clang/test/Analysis/uninit-exhaustive-switch-bug.c
new file mode 100644
index 0000000000000..634b00ce30bc4
--- /dev/null
+++ b/clang/test/Analysis/uninit-exhaustive-switch-bug.c
@@ -0,0 +1,20 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify %s
+
+// rdar://problem/54359410
+// expected-no-diagnostics
+
+int rand();
+
+void test() {
+  int offset = 0;
+  int value;
+  int test = rand();
+  switch (test & 0x1) {
+  case 0:
+  case 1:
+    value = 0;
+    break;
+  }
+
+  offset += value; // no-warning
+}

From 73c120a9895a7e12e3c29a755d64096c8bd0220f Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@nitrogen.local>
Date: Thu, 14 May 2020 17:07:54 +0300
Subject: [PATCH 371/770] [analyzer] Introduce reasoning about symbolic
 remainder operator

Summary:
New logic tries to narrow possible result values of the remainder operation
based on its operands and their ranges.  It also tries to be conservative
with negative operands because according to the standard the sign of
the result is implementation-defined.

rdar://problem/44978988

Differential Revision: https://reviews.llvm.org/D80117
---
 .../Core/RangeConstraintManager.cpp           |  99 +++++++++
 .../StaticAnalyzer/Core/SimpleSValBuilder.cpp |   5 +
 clang/test/Analysis/PR35418.cpp               |  28 +++
 clang/test/Analysis/constant-folding.c        |  77 +++++++
 clang/test/Analysis/hangs.c                   | 196 +++++++++++++++++-
 .../uninit-bug-first-iteration-init.c         |  27 +++
 6 files changed, 429 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Analysis/PR35418.cpp
 create mode 100644 clang/test/Analysis/uninit-bug-first-iteration-init.c

diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index b73c395d80fa7..6f92b965ce5b3 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -433,6 +433,8 @@ class SymbolicRangeInferrer
       return VisitBinaryOperator<BO_Or>(LHS, RHS, T);
     case BO_And:
       return VisitBinaryOperator<BO_And>(LHS, RHS, T);
+    case BO_Rem:
+      return VisitBinaryOperator<BO_Rem>(LHS, RHS, T);
     default:
       return infer(T);
     }
@@ -496,6 +498,46 @@ class SymbolicRangeInferrer
     return infer(T);
   }
 
+  /// Return a symmetrical range for the given range and type.
+  ///
+  /// If T is signed, return the smallest range [-x..x] that covers the original
+  /// range, or [min(T)..max(T)] if the aforementioned symmetric range doesn't
+  /// exist due to the original range covering min(T).
+  ///
+  /// If T is unsigned, return the smallest range [0..x] that covers the
+  /// original range.
+  Range getSymmetricalRange(Range Origin, QualType T) {
+    APSIntType RangeType = ValueFactory.getAPSIntType(T);
+
+    if (RangeType.isUnsigned()) {
+      return Range(ValueFactory.getMinValue(RangeType), Origin.To());
+    }
+
+    if (Origin.From().isMinSignedValue()) {
+      // If the lower bound is the minimal signed value, its absolute value is
+      // greater than the maximal signed value.  In order to avoid these
+      // complications, we simply return the whole range.
+      return {ValueFactory.getMinValue(RangeType),
+              ValueFactory.getMaxValue(RangeType)};
+    }
+
+    // At this point, we are sure that the type is signed and we can safely
+    // use unary - operator.
+    //
+    // While calculating absolute maximum, we can use the following formula
+    // because of these reasons:
+    //   * If From >= 0 then To >= From and To >= -From.
+    //     AbsMax == To == max(To, -From)
+    //   * If To <= 0 then -From >= -To and -From >= From.
+    //     AbsMax == -From == max(-From, To)
+    //   * Otherwise, From <= 0, To >= 0, and
+    //     AbsMax == max(abs(From), abs(To))
+    llvm::APSInt AbsMax = std::max(-Origin.From(), Origin.To());
+
+    // Intersection is guaranteed to be non-empty.
+    return {ValueFactory.getValue(-AbsMax), ValueFactory.getValue(AbsMax)};
+  }
+
   /// Return a range set subtracting zero from \p Domain.
   RangeSet assumeNonZero(RangeSet Domain, QualType T) {
     APSIntType IntType = ValueFactory.getAPSIntType(T);
@@ -635,6 +677,63 @@ RangeSet SymbolicRangeInferrer::VisitBinaryOperator<BO_And>(Range LHS,
   return infer(T);
 }
 
+template <>
+RangeSet SymbolicRangeInferrer::VisitBinaryOperator<BO_Rem>(Range LHS,
+                                                            Range RHS,
+                                                            QualType T) {
+  llvm::APSInt Zero = ValueFactory.getAPSIntType(T).getZeroValue();
+
+  Range ConservativeRange = getSymmetricalRange(RHS, T);
+
+  llvm::APSInt Max = ConservativeRange.To();
+  llvm::APSInt Min = ConservativeRange.From();
+
+  if (Max == Zero) {
+    // It's undefined behaviour to divide by 0 and it seems like we know
+    // for sure that RHS is 0.  Let's say that the resulting range is
+    // simply infeasible in that case.
+    return RangeFactory.getEmptySet();
+  }
+
+  // At this point, our conservative range is closed.  The result, however,
+  // couldn't be greater than the RHS' maximal absolute value.  Because of
+  // this reason, we turn the range into open (or half-open in case of
+  // unsigned integers).
+  //
+  // While we operate on integer values, an open interval (a, b) can be easily
+  // represented by the closed interval [a + 1, b - 1].  And this is exactly
+  // what we do next.
+  //
+  // If we are dealing with unsigned case, we shouldn't move the lower bound.
+  if (Min.isSigned()) {
+    ++Min;
+  }
+  --Max;
+
+  bool IsLHSPositiveOrZero = LHS.From() >= Zero;
+  bool IsRHSPositiveOrZero = RHS.From() >= Zero;
+
+  // The result of remainder with negative operands is implementation
+  // defined.  Positive cases are much easier to reason about though.
+  if (IsLHSPositiveOrZero && IsRHSPositiveOrZero) {
+    // If maximal value of LHS is less than maximal value of RHS,
+    // the result won't get greater than LHS.To().
+    Max = std::min(LHS.To(), Max);
+    // We want to check if it is a situation similar to the following:
+    //
+    // <------------|---[  LHS  ]--------[  RHS  ]----->
+    //  -INF        0                              +INF
+    //
+    // In this situation, we can conclude that (LHS / RHS) == 0 and
+    // (LHS % RHS) == LHS.
+    Min = LHS.To() < RHS.From() ? LHS.From() : Zero;
+  }
+
+  // Nevertheless, the symmetrical range for RHS is a conservative estimate
+  // for any sign of either LHS, or RHS.
+  return {RangeFactory, ValueFactory.getValue(Min), ValueFactory.getValue(Max)};
+}
+
 class RangeConstraintManager : public RangedConstraintManager {
 public:
   RangeConstraintManager(ExprEngine *EE, SValBuilder &SVB)
diff --git a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
index d9fe3af3c0000..2e269f6a596e8 100644
--- a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
@@ -652,6 +652,11 @@ SVal SimpleSValBuilder::evalBinOpNN(ProgramStateRef state,
         if (LHSValue == 0)
           return evalCastFromNonLoc(lhs, resultTy);
         return makeSymExprValNN(op, InputLHS, InputRHS, resultTy);
+      case BO_Rem:
+        // 0 % x == 0
+        if (LHSValue == 0)
+          return makeZeroVal(resultTy);
+        LLVM_FALLTHROUGH;
       default:
         return makeSymExprValNN(op, InputLHS, InputRHS, resultTy);
       }
diff --git a/clang/test/Analysis/PR35418.cpp b/clang/test/Analysis/PR35418.cpp
new file mode 100644
index 0000000000000..658da72f1462d
--- /dev/null
+++ b/clang/test/Analysis/PR35418.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify %s
+
+// expected-no-diagnostics
+
+void halt() __attribute__((__noreturn__));
+void assert(int b) {
+  if (!b)
+    halt();
+}
+
+void decode(unsigned width) {
+  assert(width > 0);
+
+  int base;
+  bool inited = false;
+
+  int i = 0;
+
+  if (i % width == 0) {
+    base = 512;
+    inited = true;
+  }
+
+  base += 1; // no-warning
+
+  if (base >> 10)
+    assert(false);
+}
diff --git a/clang/test/Analysis/constant-folding.c b/clang/test/Analysis/constant-folding.c
index b3320cc53636d..08a7accfba641 100644
--- a/clang/test/Analysis/constant-folding.c
+++ b/clang/test/Analysis/constant-folding.c
@@ -1,5 +1,9 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s
 
+#define UINT_MAX (~0U)
+#define INT_MAX (int)(UINT_MAX & (UINT_MAX >> 1))
+#define INT_MIN (int)(UINT_MAX & ~(UINT_MAX >> 1))
+
 void clang_analyzer_eval(int);
 
 // There should be no warnings unless otherwise indicated.
@@ -174,3 +178,76 @@ void testBitwiseRules(unsigned int a, int b, int c) {
     clang_analyzer_eval((a & 1) <= 1); // expected-warning{{TRUE}}
   }
 }
+
+void testRemainderRules(unsigned int a, unsigned int b, int c, int d) {
+  // Check that we know that remainder of zero divided by any number is still 0.
+  clang_analyzer_eval((0 % c) == 0); // expected-warning{{TRUE}}
+
+  clang_analyzer_eval((10 % a) <= 10); // expected-warning{{TRUE}}
+
+  if (a <= 30 && b <= 50) {
+    clang_analyzer_eval((40 % a) < 30); // expected-warning{{TRUE}}
+    clang_analyzer_eval((a % b) < 50);  // expected-warning{{TRUE}}
+    clang_analyzer_eval((b % a) < 30);  // expected-warning{{TRUE}}
+
+    if (a >= 10) {
+      // Even though it seems like a valid assumption, it is not.
+      // Check that we are not making this mistake.
+      clang_analyzer_eval((a % b) >= 10); // expected-warning{{UNKNOWN}}
+
+      // Check that we can infer when the remainder is equal
+      // to the dividend.
+      clang_analyzer_eval((4 % a) == 4); // expected-warning{{TRUE}}
+      if (b < 7) {
+        clang_analyzer_eval((b % a) < 7); // expected-warning{{TRUE}}
+      }
+    }
+  }
+
+  if (c > -10) {
+    clang_analyzer_eval((d % c) < INT_MAX);     // expected-warning{{TRUE}}
+    clang_analyzer_eval((d % c) > INT_MIN + 1); // expected-warning{{TRUE}}
+  }
+
+  // Check that we can reason about signed integers when they are
+  // known to be positive.
+  if (c >= 10 && c <= 30 && d >= 20 && d <= 50) {
+    clang_analyzer_eval((5 % c) == 5);  // expected-warning{{TRUE}}
+    clang_analyzer_eval((c % d) <= 30); // expected-warning{{TRUE}}
+    clang_analyzer_eval((c % d) >= 0);  // expected-warning{{TRUE}}
+    clang_analyzer_eval((d % c) < 30);  // expected-warning{{TRUE}}
+    clang_analyzer_eval((d % c) >= 0);  // expected-warning{{TRUE}}
+  }
+
+  if (c >= -30 && c <= -10 && d >= -20 && d <= 50) {
+    // Test positive LHS with negative RHS.
+    clang_analyzer_eval((40 % c) < 30);  // expected-warning{{TRUE}}
+    clang_analyzer_eval((40 % c) > -30); // expected-warning{{TRUE}}
+
+    // Test negative LHS with possibly negative RHS.
+    clang_analyzer_eval((-10 % d) < 50);  // expected-warning{{TRUE}}
+    clang_analyzer_eval((-20 % d) > -50); // expected-warning{{TRUE}}
+
+    // Check that we don't make wrong assumptions
+    clang_analyzer_eval((-20 % d) > -20); // expected-warning{{UNKNOWN}}
+
+    // Check that we can reason about negative ranges...
+    clang_analyzer_eval((c % d) < 50); // expected-warning{{TRUE}}
+    /// ...both ways
+    clang_analyzer_eval((d % c) < 30); // expected-warning{{TRUE}}
+
+    if (a <= 10) {
+      // Result is unsigned.  This means that 'c' is cast to unsigned.
+      // We don't want to reason about ranges changing boundaries with
+      // conversions.
+      clang_analyzer_eval((a % c) < 30); // expected-warning{{UNKNOWN}}
+    }
+  }
+
+  // Check that we work correctly when minimal unsigned value from a range is
+  // equal to the signed minimum for the same bit width.
+  unsigned int x = INT_MIN;
+  if (a >= x && a <= x + 10) {
+    clang_analyzer_eval((b % a) < x + 10); // expected-warning{{TRUE}}
+  }
+}
diff --git a/clang/test/Analysis/hangs.c b/clang/test/Analysis/hangs.c
index b109bcb52fdb2..ce719a16d6c45 100644
--- a/clang/test/Analysis/hangs.c
+++ b/clang/test/Analysis/hangs.c
@@ -1,9 +1,16 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker core -verify %s
-
-// expected-no-diagnostics
+// RUN: %clang_analyze_cc1 -verify %s \
+// RUN:   -analyzer-checker core,debug.ExprInspection
 
 // Stuff that used to hang.
 
+extern void __assert_fail(__const char *__assertion, __const char *__file,
+                          unsigned int __line, __const char *__function)
+    __attribute__((__noreturn__));
+#define assert(expr) \
+  ((expr) ? (void)(0) : __assert_fail(#expr, __FILE__, __LINE__, __func__))
+
+void clang_analyzer_eval(int);
+
 int g();
 
 int f(int y) {
@@ -28,3 +35,186 @@ void produce_an_exponentially_exploding_symbol(int x, int y) {
   x += y; y += x + g();
   x += y; y += x + g();
 }
+
+void produce_an_exponentially_exploding_symbol_2(int x, int y) {
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  if (x > 1) {
+    if (x > 2) {
+      if (x > 3) {
+        if (x > 4) {
+          if (x > 5) {
+            if (x > 6) {
+              if (x > 7) {
+                if (x > 8) {
+                  if (x > 9) {
+                    if (x > 10) {
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void produce_an_exponentially_exploding_symbol_3(int x, int y) {
+  assert(0 < x && x < 10);
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  x &= y;
+  y &= x & g();
+  clang_analyzer_eval(0 < x && x < 10); // expected-warning{{TRUE}}
+                                        // expected-warning@-1{{FALSE}}
+}
diff --git a/clang/test/Analysis/uninit-bug-first-iteration-init.c b/clang/test/Analysis/uninit-bug-first-iteration-init.c
new file mode 100644
index 0000000000000..a0fae2950fed4
--- /dev/null
+++ b/clang/test/Analysis/uninit-bug-first-iteration-init.c
@@ -0,0 +1,27 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify %s
+
+// rdar://problem/44978988
+// expected-no-diagnostics
+
+int foo();
+
+int gTotal;
+
+double bar(int start, int end) {
+  int i, cnt, processed, size;
+  double result, inc;
+
+  result = 0;
+  processed = start;
+  size = gTotal * 2;
+  cnt = (end - start + 1) * size;
+
+  for (i = 0; i < cnt; i += 2) {
+    if ((i % size) == 0) {
+      inc = foo();
+      processed++;
+    }
+    result += inc * inc; // no-warning
+  }
+  return result;
+}

From 35492270ed705ea9ac98ba04c6fda1adafef613a Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr@gmail.com>
Date: Thu, 28 May 2020 17:36:48 +0200
Subject: [PATCH 372/770] Remove WrapperMatcherInterface

Summary:
WrapperMatcherInterface is an abstraction over a member variable -- in
other words, not much of an abstraction at all. I think it makes code
harder to read more than it helps with deduplication. Not to even
mention the questionable usage of the ~Interface suffix for a type with
state.

Reviewers: ymandel

Reviewed By: ymandel

Subscribers: arichardson, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80704
---
 .../clang/ASTMatchers/ASTMatchersInternal.h   | 116 ++++++++++--------
 1 file changed, 62 insertions(+), 54 deletions(-)

diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
index ac8469bded538..e064b28b84f91 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
@@ -489,19 +489,6 @@ class DynTypedMatcher {
   IntrusiveRefCntPtr<DynMatcherInterface> Implementation;
 };
 
-/// Wrapper base class for a wrapping matcher.
-///
-/// This is just a container for a DynTypedMatcher that can be used as a base
-/// class for another matcher.
-template <typename T>
-class WrapperMatcherInterface : public MatcherInterface<T> {
-protected:
-  explicit WrapperMatcherInterface(DynTypedMatcher &&InnerMatcher)
-      : InnerMatcher(std::move(InnerMatcher)) {}
-
-  const DynTypedMatcher InnerMatcher;
-};
-
 /// Wrapper of a MatcherInterface<T> *that allows copying.
 ///
 /// A Matcher<Base> can be used anywhere a Matcher<Derived> is
@@ -572,10 +559,12 @@ class Matcher {
   /// does only matches in the absence of qualifiers, or not, i.e. simply
   /// ignores any qualifiers.
   template <typename TypeT>
-  class TypeToQualType : public WrapperMatcherInterface<QualType> {
+  class TypeToQualType : public MatcherInterface<QualType> {
+    const DynTypedMatcher InnerMatcher;
+
   public:
     TypeToQualType(const Matcher<TypeT> &InnerMatcher)
-        : TypeToQualType::WrapperMatcherInterface(InnerMatcher) {}
+        : InnerMatcher(InnerMatcher) {}
 
     bool matches(const QualType &Node, ASTMatchFinder *Finder,
                  BoundNodesTreeBuilder *Builder) const override {
@@ -764,13 +753,15 @@ Matcher<ObjCMessageExpr> hasAnySelectorFunc(
 /// Type argument DeclMatcherT is required by PolymorphicMatcherWithParam1 but
 /// not actually used.
 template <typename T, typename DeclMatcherT>
-class HasDeclarationMatcher : public WrapperMatcherInterface<T> {
+class HasDeclarationMatcher : public MatcherInterface<T> {
   static_assert(std::is_same<DeclMatcherT, Matcher<Decl>>::value,
                 "instantiated with wrong types");
 
+  const DynTypedMatcher InnerMatcher;
+
 public:
   explicit HasDeclarationMatcher(const Matcher<Decl> &InnerMatcher)
-      : HasDeclarationMatcher::WrapperMatcherInterface(InnerMatcher) {}
+      : InnerMatcher(InnerMatcher) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
@@ -1181,14 +1172,14 @@ struct ArgumentAdaptingMatcherFunc {
   }
 };
 
-template <typename T>
-class TraversalMatcher : public WrapperMatcherInterface<T> {
+template <typename T> class TraversalMatcher : public MatcherInterface<T> {
+  const DynTypedMatcher InnerMatcher;
   clang::TraversalKind Traversal;
 
 public:
-  explicit TraversalMatcher(clang::TraversalKind TK, const Matcher<T> &ChildMatcher)
-      : TraversalMatcher::WrapperMatcherInterface(ChildMatcher), Traversal(TK) {
-  }
+  explicit TraversalMatcher(clang::TraversalKind TK,
+                            const Matcher<T> &InnerMatcher)
+      : InnerMatcher(InnerMatcher), Traversal(TK) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
@@ -1337,10 +1328,12 @@ class BindableMatcher : public Matcher<T> {
 ///
 /// ChildT must be an AST base type.
 template <typename T, typename ChildT>
-class HasMatcher : public WrapperMatcherInterface<T> {
+class HasMatcher : public MatcherInterface<T> {
+  const DynTypedMatcher InnerMatcher;
+
 public:
-  explicit HasMatcher(const Matcher<ChildT> &ChildMatcher)
-      : HasMatcher::WrapperMatcherInterface(ChildMatcher) {}
+  explicit HasMatcher(const Matcher<ChildT> &InnerMatcher)
+      : InnerMatcher(InnerMatcher) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
@@ -1356,16 +1349,18 @@ class HasMatcher : public WrapperMatcherInterface<T> {
 /// As opposed to the HasMatcher, the ForEachMatcher will produce a match
 /// for each child that matches.
 template <typename T, typename ChildT>
-class ForEachMatcher : public WrapperMatcherInterface<T> {
+class ForEachMatcher : public MatcherInterface<T> {
   static_assert(IsBaseType<ChildT>::value,
                 "for each only accepts base type matcher");
 
- public:
-   explicit ForEachMatcher(const Matcher<ChildT> &ChildMatcher)
-       : ForEachMatcher::WrapperMatcherInterface(ChildMatcher) {}
+  const DynTypedMatcher InnerMatcher;
 
-  bool matches(const T& Node, ASTMatchFinder* Finder,
-               BoundNodesTreeBuilder* Builder) const override {
+public:
+  explicit ForEachMatcher(const Matcher<ChildT> &InnerMatcher)
+      : InnerMatcher(InnerMatcher) {}
+
+  bool matches(const T &Node, ASTMatchFinder *Finder,
+               BoundNodesTreeBuilder *Builder) const override {
     return Finder->matchesChildOf(
         Node, this->InnerMatcher, Builder,
         TraversalKind::TK_IgnoreImplicitCastsAndParentheses,
@@ -1469,17 +1464,19 @@ BindableMatcher<T> makeDynCastAllOfComposite(
 ///
 /// DescendantT must be an AST base type.
 template <typename T, typename DescendantT>
-class HasDescendantMatcher : public WrapperMatcherInterface<T> {
+class HasDescendantMatcher : public MatcherInterface<T> {
   static_assert(IsBaseType<DescendantT>::value,
                 "has descendant only accepts base type matcher");
 
+  const DynTypedMatcher DescendantMatcher;
+
 public:
   explicit HasDescendantMatcher(const Matcher<DescendantT> &DescendantMatcher)
-      : HasDescendantMatcher::WrapperMatcherInterface(DescendantMatcher) {}
+      : DescendantMatcher(DescendantMatcher) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
-    return Finder->matchesDescendantOf(Node, this->InnerMatcher, Builder,
+    return Finder->matchesDescendantOf(Node, this->DescendantMatcher, Builder,
                                        ASTMatchFinder::BK_First);
   }
 };
@@ -1489,17 +1486,19 @@ class HasDescendantMatcher : public WrapperMatcherInterface<T> {
 ///
 /// \c ParentT must be an AST base type.
 template <typename T, typename ParentT>
-class HasParentMatcher : public WrapperMatcherInterface<T> {
+class HasParentMatcher : public MatcherInterface<T> {
   static_assert(IsBaseType<ParentT>::value,
                 "has parent only accepts base type matcher");
 
+  const DynTypedMatcher ParentMatcher;
+
 public:
   explicit HasParentMatcher(const Matcher<ParentT> &ParentMatcher)
-      : HasParentMatcher::WrapperMatcherInterface(ParentMatcher) {}
+      : ParentMatcher(ParentMatcher) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
-    return Finder->matchesAncestorOf(Node, this->InnerMatcher, Builder,
+    return Finder->matchesAncestorOf(Node, this->ParentMatcher, Builder,
                                      ASTMatchFinder::AMM_ParentOnly);
   }
 };
@@ -1509,17 +1508,19 @@ class HasParentMatcher : public WrapperMatcherInterface<T> {
 ///
 /// \c AncestorT must be an AST base type.
 template <typename T, typename AncestorT>
-class HasAncestorMatcher : public WrapperMatcherInterface<T> {
+class HasAncestorMatcher : public MatcherInterface<T> {
   static_assert(IsBaseType<AncestorT>::value,
                 "has ancestor only accepts base type matcher");
 
+  const DynTypedMatcher AncestorMatcher;
+
 public:
   explicit HasAncestorMatcher(const Matcher<AncestorT> &AncestorMatcher)
-      : HasAncestorMatcher::WrapperMatcherInterface(AncestorMatcher) {}
+      : AncestorMatcher(AncestorMatcher) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
-    return Finder->matchesAncestorOf(Node, this->InnerMatcher, Builder,
+    return Finder->matchesAncestorOf(Node, this->AncestorMatcher, Builder,
                                      ASTMatchFinder::AMM_All);
   }
 };
@@ -1531,18 +1532,20 @@ class HasAncestorMatcher : public WrapperMatcherInterface<T> {
 /// As opposed to HasDescendantMatcher, ForEachDescendantMatcher will match
 /// for each descendant node that matches instead of only for the first.
 template <typename T, typename DescendantT>
-class ForEachDescendantMatcher : public WrapperMatcherInterface<T> {
+class ForEachDescendantMatcher : public MatcherInterface<T> {
   static_assert(IsBaseType<DescendantT>::value,
                 "for each descendant only accepts base type matcher");
 
+  const DynTypedMatcher DescendantMatcher;
+
 public:
   explicit ForEachDescendantMatcher(
       const Matcher<DescendantT> &DescendantMatcher)
-      : ForEachDescendantMatcher::WrapperMatcherInterface(DescendantMatcher) {}
+      : DescendantMatcher(DescendantMatcher) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
-    return Finder->matchesDescendantOf(Node, this->InnerMatcher, Builder,
+    return Finder->matchesDescendantOf(Node, this->DescendantMatcher, Builder,
                                        ASTMatchFinder::BK_All);
   }
 };
@@ -1635,10 +1638,12 @@ class VariadicAllOfMatcher
 /// Matches nodes of type \c TLoc for which the inner
 /// \c Matcher<T> matches.
 template <typename TLoc, typename T>
-class LocMatcher : public WrapperMatcherInterface<TLoc> {
+class LocMatcher : public MatcherInterface<TLoc> {
+  const DynTypedMatcher InnerMatcher;
+
 public:
   explicit LocMatcher(const Matcher<T> &InnerMatcher)
-      : LocMatcher::WrapperMatcherInterface(InnerMatcher) {}
+      : InnerMatcher(InnerMatcher) {}
 
   bool matches(const TLoc &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
@@ -1657,10 +1662,12 @@ class LocMatcher : public WrapperMatcherInterface<TLoc> {
 /// \c QualType.
 ///
 /// Used to implement the \c loc() matcher.
-class TypeLocTypeMatcher : public WrapperMatcherInterface<TypeLoc> {
+class TypeLocTypeMatcher : public MatcherInterface<TypeLoc> {
+  const DynTypedMatcher InnerMatcher;
+
 public:
   explicit TypeLocTypeMatcher(const Matcher<QualType> &InnerMatcher)
-      : TypeLocTypeMatcher::WrapperMatcherInterface(InnerMatcher) {}
+      : InnerMatcher(InnerMatcher) {}
 
   bool matches(const TypeLoc &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
@@ -1674,13 +1681,13 @@ class TypeLocTypeMatcher : public WrapperMatcherInterface<TypeLoc> {
 /// Matches nodes of type \c T for which the inner matcher matches on a
 /// another node of type \c T that can be reached using a given traverse
 /// function.
-template <typename T>
-class TypeTraverseMatcher : public WrapperMatcherInterface<T> {
+template <typename T> class TypeTraverseMatcher : public MatcherInterface<T> {
+  const DynTypedMatcher InnerMatcher;
+
 public:
   explicit TypeTraverseMatcher(const Matcher<QualType> &InnerMatcher,
                                QualType (T::*TraverseFunction)() const)
-      : TypeTraverseMatcher::WrapperMatcherInterface(InnerMatcher),
-        TraverseFunction(TraverseFunction) {}
+      : InnerMatcher(InnerMatcher), TraverseFunction(TraverseFunction) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {
@@ -1699,12 +1706,13 @@ class TypeTraverseMatcher : public WrapperMatcherInterface<T> {
 /// matcher matches on a another node of type \c T that can be reached using a
 /// given traverse function.
 template <typename T>
-class TypeLocTraverseMatcher : public WrapperMatcherInterface<T> {
+class TypeLocTraverseMatcher : public MatcherInterface<T> {
+  const DynTypedMatcher InnerMatcher;
+
 public:
   explicit TypeLocTraverseMatcher(const Matcher<TypeLoc> &InnerMatcher,
                                   TypeLoc (T::*TraverseFunction)() const)
-      : TypeLocTraverseMatcher::WrapperMatcherInterface(InnerMatcher),
-        TraverseFunction(TraverseFunction) {}
+      : InnerMatcher(InnerMatcher), TraverseFunction(TraverseFunction) {}
 
   bool matches(const T &Node, ASTMatchFinder *Finder,
                BoundNodesTreeBuilder *Builder) const override {

From fefe4366c3bdd03552c448972930a0f7df328c24 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 28 May 2020 09:05:24 -0700
Subject: [PATCH 373/770] [mlir] Use ValueRange instead of ArrayRef<Value>

This allows constructing an operand adaptor from an existing op (useful for commonalizing verification, as I want to do in a follow-up).

I also add the ability to use member initializers for the generated adaptor constructors, for convenience.

Differential Revision: https://reviews.llvm.org/D80667
---
 .../StandardToLLVM/ConvertStandardToLLVM.h    |  4 +-
 .../mlir/Dialect/SPIRV/SPIRVLowering.h        |  2 +-
 mlir/include/mlir/TableGen/OpClass.h          | 30 ++++++++++++--
 mlir/include/mlir/TableGen/Operator.h         |  3 ++
 .../StandardToLLVM/StandardToLLVM.cpp         |  9 ++--
 mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp      |  5 ++-
 mlir/lib/TableGen/OpClass.cpp                 | 39 ++++++++++++++----
 mlir/lib/TableGen/Operator.cpp                |  4 ++
 mlir/test/mlir-tblgen/op-decl.td              | 16 ++++----
 mlir/test/mlir-tblgen/op-operand.td           |  6 +--
 mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp   | 41 ++++++++++++-------
 11 files changed, 112 insertions(+), 47 deletions(-)

diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h
index 2eae578fc966a..c241de6ff6fea 100644
--- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h
+++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h
@@ -438,12 +438,12 @@ class ConvertToLLVMPattern : public ConversionPattern {
   // This is a strided getElementPtr variant that linearizes subscripts as:
   //   `base_offset + index_0 * stride_0 + ... + index_n * stride_n`.
   Value getStridedElementPtr(Location loc, Type elementTypePtr,
-                             Value descriptor, ArrayRef<Value> indices,
+                             Value descriptor, ValueRange indices,
                              ArrayRef<int64_t> strides, int64_t offset,
                              ConversionPatternRewriter &rewriter) const;
 
   Value getDataPtr(Location loc, MemRefType type, Value memRefDesc,
-                   ArrayRef<Value> indices, ConversionPatternRewriter &rewriter,
+                   ValueRange indices, ConversionPatternRewriter &rewriter,
                    llvm::Module &module) const;
 
 protected:
diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
index 1fa668d7ddc0f..f0a429941fb35 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
@@ -124,7 +124,7 @@ Value getBuiltinVariableValue(Operation *op, BuiltIn builtin,
 // with AffineMap that has static strides. Extend to handle dynamic strides.
 spirv::AccessChainOp getElementPtr(SPIRVTypeConverter &typeConverter,
                                    MemRefType baseType, Value basePtr,
-                                   ArrayRef<Value> indices, Location loc,
+                                   ValueRange indices, Location loc,
                                    OpBuilder &builder);
 
 /// Sets the InterfaceVarABIAttr and EntryPointABIAttr for a function and its
diff --git a/mlir/include/mlir/TableGen/OpClass.h b/mlir/include/mlir/TableGen/OpClass.h
index e8f73c605dfdb..694fed767e330 100644
--- a/mlir/include/mlir/TableGen/OpClass.h
+++ b/mlir/include/mlir/TableGen/OpClass.h
@@ -86,6 +86,7 @@ class OpMethod {
 
   OpMethod(StringRef retType, StringRef name, StringRef params,
            Property property, bool declOnly);
+  virtual ~OpMethod() = default;
 
   OpMethodBody &body();
 
@@ -96,13 +97,13 @@ class OpMethod {
   bool isPrivate() const;
 
   // Writes the method as a declaration to the given `os`.
-  void writeDeclTo(raw_ostream &os) const;
+  virtual void writeDeclTo(raw_ostream &os) const;
   // Writes the method as a definition to the given `os`. `namePrefix` is the
   // prefix to be prepended to the method name (typically namespaces for
   // qualifying the method definition).
-  void writeDefTo(raw_ostream &os, StringRef namePrefix) const;
+  virtual void writeDefTo(raw_ostream &os, StringRef namePrefix) const;
 
-private:
+protected:
   Property properties;
   // Whether this method only contains a declaration.
   bool isDeclOnly;
@@ -110,6 +111,26 @@ class OpMethod {
   OpMethodBody methodBody;
 };
 
+// Class for holding an op's constructor method for C++ code emission.
+class OpConstructor : public OpMethod {
+public:
+  OpConstructor(StringRef retType, StringRef name, StringRef params,
+                Property property, bool declOnly)
+      : OpMethod(retType, name, params, property, declOnly){};
+
+  // Add member initializer to constructor initializing `name` with `value`.
+  void addMemberInitializer(StringRef name, StringRef value);
+
+  // Writes the method as a definition to the given `os`. `namePrefix` is the
+  // prefix to be prepended to the method name (typically namespaces for
+  // qualifying the method definition).
+  void writeDefTo(raw_ostream &os, StringRef namePrefix) const override;
+
+private:
+  // Member initializers.
+  std::string memberInitializers;
+};
+
 // A class used to emit C++ classes from Tablegen.  Contains a list of public
 // methods and a list of private fields to be emitted.
 class Class {
@@ -121,7 +142,7 @@ class Class {
                       OpMethod::Property = OpMethod::MP_None,
                       bool declOnly = false);
 
-  OpMethod &newConstructor(StringRef params = "", bool declOnly = false);
+  OpConstructor &newConstructor(StringRef params = "", bool declOnly = false);
 
   // Creates a new field in this class.
   void newField(StringRef type, StringRef name, StringRef defaultValue = "");
@@ -136,6 +157,7 @@ class Class {
 
 protected:
   std::string className;
+  SmallVector<OpConstructor, 2> constructors;
   SmallVector<OpMethod, 8> methods;
   SmallVector<std::string, 4> fields;
 };
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index 040f52314cea0..cce754dd34546 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -58,6 +58,9 @@ class Operator {
   // Returns this op's C++ class name prefixed with namespaces.
   std::string getQualCppClassName() const;
 
+  // Returns the name of op's adaptor C++ class.
+  std::string getAdaptorName() const;
+
   /// A class used to represent the decorators of an operator variable, i.e.
   /// argument or result.
   struct VariableDecorator {
diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
index cbe6da31addf2..8cc2315ddd15f 100644
--- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
+++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp
@@ -795,8 +795,8 @@ Value ConvertToLLVMPattern::linearizeSubscripts(
 }
 
 Value ConvertToLLVMPattern::getStridedElementPtr(
-    Location loc, Type elementTypePtr, Value descriptor,
-    ArrayRef<Value> indices, ArrayRef<int64_t> strides, int64_t offset,
+    Location loc, Type elementTypePtr, Value descriptor, ValueRange indices,
+    ArrayRef<int64_t> strides, int64_t offset,
     ConversionPatternRewriter &rewriter) const {
   MemRefDescriptor memRefDescriptor(descriptor);
 
@@ -818,8 +818,7 @@ Value ConvertToLLVMPattern::getStridedElementPtr(
 }
 
 Value ConvertToLLVMPattern::getDataPtr(Location loc, MemRefType type,
-                                       Value memRefDesc,
-                                       ArrayRef<Value> indices,
+                                       Value memRefDesc, ValueRange indices,
                                        ConversionPatternRewriter &rewriter,
                                        llvm::Module &module) const {
   LLVM::LLVMType ptrType = MemRefDescriptor(memRefDesc).getElementType();
@@ -2602,7 +2601,7 @@ struct ViewOpLowering : public ConvertOpToLLVMPattern<ViewOp> {
   // Build and return the value for the idx^th shape dimension, either by
   // returning the constant shape dimension or counting the proper dynamic size.
   Value getSize(ConversionPatternRewriter &rewriter, Location loc,
-                ArrayRef<int64_t> shape, ArrayRef<Value> dynamicSizes,
+                ArrayRef<int64_t> shape, ValueRange dynamicSizes,
                 unsigned idx) const {
     assert(idx < shape.size());
     if (!ShapedType::isDynamic(shape[idx]))
diff --git a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
index dfc2728ef7109..6458756dec69e 100644
--- a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
+++ b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
@@ -579,7 +579,7 @@ Value mlir::spirv::getBuiltinVariableValue(Operation *op,
 
 spirv::AccessChainOp mlir::spirv::getElementPtr(
     SPIRVTypeConverter &typeConverter, MemRefType baseType, Value basePtr,
-    ArrayRef<Value> indices, Location loc, OpBuilder &builder) {
+    ValueRange indices, Location loc, OpBuilder &builder) {
   // Get base and offset of the MemRefType and verify they are static.
 
   int64_t offset;
@@ -591,6 +591,7 @@ spirv::AccessChainOp mlir::spirv::getElementPtr(
   }
 
   auto indexType = typeConverter.getIndexType(builder.getContext());
+
   SmallVector<Value, 2> linearizedIndices;
   // Add a '0' at the start to index into the struct.
   auto zero = spirv::ConstantOp::getZero(indexType, loc, builder);
@@ -606,7 +607,7 @@ spirv::AccessChainOp mlir::spirv::getElementPtr(
         loc, indexType, IntegerAttr::get(indexType, offset));
     assert(indices.size() == strides.size() &&
            "must provide indices for all dimensions");
-    for (auto index : enumerate(indices)) {
+    for (auto index : llvm::enumerate(indices)) {
       Value strideVal = builder.create<spirv::ConstantOp>(
           loc, indexType, IntegerAttr::get(indexType, strides[index.index()]));
       Value update =
diff --git a/mlir/lib/TableGen/OpClass.cpp b/mlir/lib/TableGen/OpClass.cpp
index bfdcbdc344a3d..43bbe2420a9a6 100644
--- a/mlir/lib/TableGen/OpClass.cpp
+++ b/mlir/lib/TableGen/OpClass.cpp
@@ -119,6 +119,27 @@ void tblgen::OpMethod::writeDefTo(raw_ostream &os, StringRef namePrefix) const {
   os << "}";
 }
 
+//===----------------------------------------------------------------------===//
+// OpConstructor definitions
+//===----------------------------------------------------------------------===//
+
+void mlir::tblgen::OpConstructor::addMemberInitializer(StringRef name,
+                                                       StringRef value) {
+  memberInitializers.append(std::string(llvm::formatv(
+      "{0}{1}({2})", memberInitializers.empty() ? " : " : ", ", name, value)));
+}
+
+void mlir::tblgen::OpConstructor::writeDefTo(raw_ostream &os,
+                                             StringRef namePrefix) const {
+  if (isDeclOnly)
+    return;
+
+  methodSignature.writeDefTo(os, namePrefix);
+  os << " " << memberInitializers << " {\n";
+  methodBody.writeTo(os);
+  os << "}";
+}
+
 //===----------------------------------------------------------------------===//
 // Class definitions
 //===----------------------------------------------------------------------===//
@@ -133,10 +154,11 @@ tblgen::OpMethod &tblgen::Class::newMethod(StringRef retType, StringRef name,
   return methods.back();
 }
 
-tblgen::OpMethod &tblgen::Class::newConstructor(StringRef params,
-                                                bool declOnly) {
-  return newMethod("", getClassName(), params, OpMethod::MP_Constructor,
-                   declOnly);
+tblgen::OpConstructor &tblgen::Class::newConstructor(StringRef params,
+                                                     bool declOnly) {
+  constructors.emplace_back("", getClassName(), params,
+                            OpMethod::MP_Constructor, declOnly);
+  return constructors.back();
 }
 
 void tblgen::Class::newField(StringRef type, StringRef name,
@@ -152,7 +174,8 @@ void tblgen::Class::writeDeclTo(raw_ostream &os) const {
   bool hasPrivateMethod = false;
   os << "class " << className << " {\n";
   os << "public:\n";
-  for (const auto &method : methods) {
+  for (const auto &method :
+       llvm::concat<const OpMethod>(constructors, methods)) {
     if (!method.isPrivate()) {
       method.writeDeclTo(os);
       os << '\n';
@@ -163,7 +186,8 @@ void tblgen::Class::writeDeclTo(raw_ostream &os) const {
   os << '\n';
   os << "private:\n";
   if (hasPrivateMethod) {
-    for (const auto &method : methods) {
+    for (const auto &method :
+         llvm::concat<const OpMethod>(constructors, methods)) {
       if (method.isPrivate()) {
         method.writeDeclTo(os);
         os << '\n';
@@ -177,7 +201,8 @@ void tblgen::Class::writeDeclTo(raw_ostream &os) const {
 }
 
 void tblgen::Class::writeDefTo(raw_ostream &os) const {
-  for (const auto &method : methods) {
+  for (const auto &method :
+       llvm::concat<const OpMethod>(constructors, methods)) {
     method.writeDefTo(os, className);
     os << "\n\n";
   }
diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp
index 2f77184980e28..f575fedc1f24c 100644
--- a/mlir/lib/TableGen/Operator.cpp
+++ b/mlir/lib/TableGen/Operator.cpp
@@ -59,6 +59,10 @@ std::string tblgen::Operator::getOperationName() const {
   return std::string(llvm::formatv("{0}.{1}", prefix, opName));
 }
 
+std::string tblgen::Operator::getAdaptorName() const {
+  return std::string(llvm::formatv("{0}OperandAdaptor", getCppClassName()));
+}
+
 StringRef tblgen::Operator::getDialectName() const { return dialect.getName(); }
 
 StringRef tblgen::Operator::getCppClassName() const { return cppClassName; }
diff --git a/mlir/test/mlir-tblgen/op-decl.td b/mlir/test/mlir-tblgen/op-decl.td
index 565f1921125a3..a101103b08fc0 100644
--- a/mlir/test/mlir-tblgen/op-decl.td
+++ b/mlir/test/mlir-tblgen/op-decl.td
@@ -49,14 +49,14 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> {
 
 // CHECK: class AOpOperandAdaptor {
 // CHECK: public:
-// CHECK:   AOpOperandAdaptor(ArrayRef<Value> values
-// CHECK:   ArrayRef<Value> getODSOperands(unsigned index);
+// CHECK:   AOpOperandAdaptor(ValueRange values
+// CHECK:   ValueRange getODSOperands(unsigned index);
 // CHECK:   Value a();
-// CHECK:   ArrayRef<Value> b();
+// CHECK:   ValueRange b();
 // CHECK:   IntegerAttr attr1();
 // CHECL:   FloatAttr attr2();
 // CHECK: private:
-// CHECK:   ArrayRef<Value> odsOperands;
+// CHECK:   ValueRange odsOperands;
 // CHECK: };
 
 // CHECK: class AOp : public Op<AOp, OpTrait::AtLeastNRegions<1>::Impl, OpTrait::AtLeastNResults<1>::Impl, OpTrait::ZeroSuccessor, OpTrait::AtLeastNOperands<1>::Impl, OpTrait::IsIsolatedFromAbove
@@ -106,12 +106,12 @@ def NS_AttrSizedOperandOp : NS_Op<"attr_sized_operands",
 }
 
 // CHECK-LABEL: AttrSizedOperandOpOperandAdaptor(
-// CHECK-SAME:    ArrayRef<Value> values
+// CHECK-SAME:    ValueRange values
 // CHECK-SAME:    DictionaryAttr attrs
-// CHECK:  ArrayRef<Value> a();
-// CHECK:  ArrayRef<Value> b();
+// CHECK:  ValueRange a();
+// CHECK:  ValueRange b();
 // CHECK:  Value c();
-// CHECK:  ArrayRef<Value> d();
+// CHECK:  ValueRange d();
 // CHECK:  DenseIntElementsAttr operand_segment_sizes();
 
 // Check op trait for different number of operands
diff --git a/mlir/test/mlir-tblgen/op-operand.td b/mlir/test/mlir-tblgen/op-operand.td
index 5f0bfae928120..a9b61c179be0c 100644
--- a/mlir/test/mlir-tblgen/op-operand.td
+++ b/mlir/test/mlir-tblgen/op-operand.td
@@ -15,7 +15,7 @@ def OpA : NS_Op<"one_normal_operand_op", []> {
 // CHECK-LABEL: OpA definitions
 
 // CHECK:      OpAOperandAdaptor::OpAOperandAdaptor
-// CHECK-NEXT: odsOperands = values
+// CHECK-SAME: odsOperands(values), odsAttrs(attrs)
 
 // CHECK:      void OpA::build
 // CHECK:        Value input
@@ -39,13 +39,13 @@ def OpD : NS_Op<"mix_variadic_and_normal_inputs_op", [SameVariadicOperandSize]>
   let arguments = (ins Variadic<AnyTensor>:$input1, AnyTensor:$input2, Variadic<AnyTensor>:$input3);
 }
 
-// CHECK-LABEL: ArrayRef<Value> OpDOperandAdaptor::input1
+// CHECK-LABEL: ValueRange OpDOperandAdaptor::input1
 // CHECK-NEXT:    return getODSOperands(0);
 
 // CHECK-LABEL: Value OpDOperandAdaptor::input2
 // CHECK-NEXT:    return *getODSOperands(1).begin();
 
-// CHECK-LABEL: ArrayRef<Value> OpDOperandAdaptor::input3
+// CHECK-LABEL: ValueRange OpDOperandAdaptor::input3
 // CHECK-NEXT:    return getODSOperands(2);
 
 // CHECK-LABEL: Operation::operand_range OpD::input1
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 0b55825d1a46c..7b0cd9d7a4826 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -1890,27 +1890,38 @@ class OpOperandAdaptorEmitter {
 private:
   explicit OpOperandAdaptorEmitter(const Operator &op);
 
-  Class adapterClass;
+  Class adaptor;
 };
 } // end namespace
 
 OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(const Operator &op)
-    : adapterClass(op.getCppClassName().str() + "OperandAdaptor") {
-  adapterClass.newField("ArrayRef<Value>", "odsOperands");
-  adapterClass.newField("DictionaryAttr", "odsAttrs");
+    : adaptor(op.getAdaptorName()) {
+  adaptor.newField("ValueRange", "odsOperands");
+  adaptor.newField("DictionaryAttr", "odsAttrs");
   const auto *attrSizedOperands =
       op.getTrait("OpTrait::AttrSizedOperandSegments");
-  auto &constructor = adapterClass.newConstructor(
-      attrSizedOperands
-          ? "ArrayRef<Value> values, DictionaryAttr attrs"
-          : "ArrayRef<Value> values, DictionaryAttr attrs = nullptr");
-  constructor.body() << "  odsOperands = values;\n";
-  constructor.body() << "  odsAttrs = attrs;\n";
+  {
+    auto &constructor = adaptor.newConstructor(
+        attrSizedOperands
+            ? "ValueRange values, DictionaryAttr attrs"
+            : "ValueRange values, DictionaryAttr attrs = nullptr");
+    constructor.addMemberInitializer("odsOperands", "values");
+    constructor.addMemberInitializer("odsAttrs", "attrs");
+  }
+
+  {
+    auto &constructor = adaptor.newConstructor(
+        llvm::formatv("{0}& op", op.getCppClassName()).str());
+    constructor.addMemberInitializer("odsOperands",
+                                     "op.getOperation()->getOperands()");
+    constructor.addMemberInitializer("odsAttrs",
+                                     "op.getOperation()->getAttrDictionary()");
+  }
 
   std::string sizeAttrInit =
       formatv(adapterSegmentSizeAttrInitCode, "operand_segment_sizes");
-  generateNamedOperandGetters(op, adapterClass, sizeAttrInit,
-                              /*rangeType=*/"ArrayRef<Value>",
+  generateNamedOperandGetters(op, adaptor, sizeAttrInit,
+                              /*rangeType=*/"ValueRange",
                               /*rangeBeginCall=*/"odsOperands.begin()",
                               /*rangeSizeCall=*/"odsOperands.size()",
                               /*getOperandCallPattern=*/"odsOperands[{0}]");
@@ -1919,7 +1930,7 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(const Operator &op)
   fctx.withBuilder("mlir::Builder(odsAttrs.getContext())");
 
   auto emitAttr = [&](StringRef name, Attribute attr) {
-    auto &body = adapterClass.newMethod(attr.getStorageType(), name).body();
+    auto &body = adaptor.newMethod(attr.getStorageType(), name).body();
     body << "  assert(odsAttrs && \"no attributes when constructing adapter\");"
          << "\n  " << attr.getStorageType() << " attr = "
          << "odsAttrs.get(\"" << name << "\").";
@@ -1949,11 +1960,11 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(const Operator &op)
 }
 
 void OpOperandAdaptorEmitter::emitDecl(const Operator &op, raw_ostream &os) {
-  OpOperandAdaptorEmitter(op).adapterClass.writeDeclTo(os);
+  OpOperandAdaptorEmitter(op).adaptor.writeDeclTo(os);
 }
 
 void OpOperandAdaptorEmitter::emitDef(const Operator &op, raw_ostream &os) {
-  OpOperandAdaptorEmitter(op).adapterClass.writeDefTo(os);
+  OpOperandAdaptorEmitter(op).adaptor.writeDefTo(os);
 }
 
 // Emits the opcode enum and op classes.

From 2321ab9c69ad33944697cde68525fd7b2bf4b36a Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 28 May 2020 09:06:47 -0700
Subject: [PATCH 374/770] [mlir] Fix mismatched-tags warning

---
 mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 2c4b3dc6ac88b..b022ebc042c9b 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -15,7 +15,7 @@
 namespace mlir {
 
 class Location;
-class LogicalResult;
+struct LogicalResult;
 class ModuleOp;
 class Operation;
 

From b726d071b4aa46004228fc38ee5bfd167f999bfe Mon Sep 17 00:00:00 2001
From: alex-t <alexander.timofeev@amd.com>
Date: Fri, 22 May 2020 12:15:57 +0300
Subject: [PATCH 375/770] [AMDGPU] Reject moving PHI to VALU if the only VGPR
 input originated from move immediate

Summary:
A PHI's result register class is set to VGPR or SGPR depending on the cross-block value divergence.
         In some cases a uniform PHI needs to be converted to return a VGPR to prevent an odd number of value moves from VGPR to SGPR and back.
         A PHI should certainly return a VGPR if it has at least one VGPR input. This change adds an exception.
         We don't want to convert a uniform PHI to VGPRs when the only VGPR input is a VGPR-to-SGPR COPY and the definition of the
         source VGPR in this COPY is a move immediate.

  bb.0:

     %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %2:sreg_32 = .....

  bb.1:
     %3:sreg_32 = PHI %1, %bb.3, %2, %bb.1
     S_BRANCH %bb.3

  bb.3:
     %1:sreg_32 = COPY %0
     S_BRANCH %bb.2

Reviewers: rampitec

Reviewed By: rampitec

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80434
---
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp    | 18 +++-
 .../CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir | 97 +++++++++++++++++++
 2 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir

diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 7283c33fe9851..ef64c5674bd1c 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -835,8 +835,22 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
     }
     else if (Def->isCopy() &&
       TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
-      hasVGPRInput = true;
-      break;
+      Register SrcReg = Def->getOperand(1).getReg();
+      MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
+      unsigned SMovOp;
+      int64_t Imm;
+      if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
+        hasVGPRInput = true;
+        break;
+      } else {
+        // Formally, if we did not do this right away
+        // it would be done on the next iteration of the
+        // runOnMachineFunction main loop. But why not if we can?
+        MachineFunction *MF = MI.getParent()->getParent();
+        Def->getOperand(1).ChangeToImmediate(Imm);
+        Def->addImplicitDefUseOperands(*MF);
+        Def->setDesc(TII->get(SMovOp));
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
new file mode 100644
index 0000000000000..ff061dec039a8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
@@ -0,0 +1,97 @@
+# RUN: llc  -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
+---
+# GCN_LABEL: phi_moveimm_input
+# GCN-NOT: %{{[0-9]+}}:vgpr_32 = PHI %{{[0-9]+}}, %bb.3, %{{[0-9]+}}, %bb.1
+# GCN:     %{{[0-9]+}}:sreg_32 = PHI %{{[0-9]+}}, %bb.3, %{{[0-9]+}}, %bb.1
+
+name:            phi_moveimm_input
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr0, $sgpr1
+
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+    %4:sreg_32 = COPY $sgpr0
+    %5:sreg_32 = COPY $sgpr1
+
+  bb.1:
+    successors: %bb.2
+    %2:sreg_32 =  S_ADD_U32 %4, %5, implicit-def $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    %3:sreg_32 = PHI %1, %bb.3, %2, %bb.1
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.2
+    %1:sreg_32 = COPY %0
+    S_BRANCH %bb.2
+...
+
+---
+# GCN_LABEL: phi_moveimm_subreg_input
+# GCN-NOT: %{{[0-9]+}}:sreg_64 = PHI %{{[0-9]+}}, %bb.3, %{{[0-9]+}}, %bb.1
+# GCN: %{{[0-9]+}}:vreg_64 = PHI %{{[0-9]+}}, %bb.3, %{{[0-9]+}}, %bb.1
+name:            phi_moveimm_subreg_input
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr0, $sgpr1
+
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+    %4:sreg_32 = COPY $sgpr0
+    %5:sreg_32 = COPY $sgpr1
+
+  bb.1:
+    successors: %bb.2
+    undef %2.sub0:sreg_64 =  S_ADD_U32 %4, %5, implicit-def $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    %3:sreg_64 = PHI %1, %bb.3, %2, %bb.1
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.2
+    undef %1.sub0:sreg_64 = COPY %0
+    S_BRANCH %bb.2
+...
+
+
+---
+# GCN_LABEL: phi_moveimm_bad_opcode_input
+# GCN-NOT: %{{[0-9]+}}:sreg_32 = PHI %{{[0-9]+}}, %bb.3, %{{[0-9]+}}, %bb.1
+# GCN: %{{[0-9]+}}:vgpr_32 = PHI %{{[0-9]+}}, %bb.3, %{{[0-9]+}}, %bb.1
+name:            phi_moveimm_bad_opcode_input
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr0, $sgpr1, $vgpr0
+    %6:vgpr_32 = COPY $vgpr0
+    %0:vgpr_32 = V_MOV_B32_sdwa 0, %6:vgpr_32, 0, 5, 2, 4,  implicit $exec, implicit %6:vgpr_32(tied-def 0)
+
+    %4:sreg_32 = COPY $sgpr0
+    %5:sreg_32 = COPY $sgpr1
+
+  bb.1:
+
+    successors: %bb.2
+    %2:sreg_32 =  S_ADD_U32 %4, %5, implicit-def $scc
+    S_BRANCH %bb.2
+  bb.2:
+    successors: %bb.3
+    %3:sreg_32 = PHI %1, %bb.3, %2, %bb.1
+    S_BRANCH %bb.3
+  bb.3:
+    successors: %bb.2
+    %1:sreg_32 = COPY %0
+    S_BRANCH %bb.2
+...

From 77b9abfc8e89ca627e4f9a1cc206bea131db6db1 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne@apple.com>
Date: Fri, 22 May 2020 09:59:48 -0400
Subject: [PATCH 376/770] [libc++] Complete overhaul of constexpr support in
 std::array

This commit adds missing support for constexpr in std::array under all
standard modes up to and including C++20. It also transforms the <array>
tests to check for constexpr-friendliness under the right standard modes.

Fixes https://llvm.org/PR40124
Fixes rdar://57522096
Supersedes https://reviews.llvm.org/D60666

Differential Revision: https://reviews.llvm.org/D80452
---
 libcxx/docs/FeatureTestMacroTable.rst         |   2 +
 libcxx/include/array                          | 156 ++++++-----
 libcxx/include/version                        |   5 +-
 .../sequences/array/aggregate.pass.cpp        |  50 ++++
 .../array/array.cons/deduct.pass.cpp          |  57 ++--
 .../array/array.cons/default.pass.cpp         |  47 ++--
 .../array/array.cons/implicit_copy.pass.cpp   | 129 +++++----
 .../array.cons/initializer_list.pass.cpp      |  21 +-
 .../array/array.creation/to_array.pass.cpp    |  16 +-
 .../sequences/array/array.data/data.pass.cpp  |  68 +++--
 .../array/array.data/data_const.pass.cpp      |  63 +++--
 .../sequences/array/array.fill/fill.pass.cpp  |  13 +-
 .../sequences/array/array.size/size.pass.cpp  |   2 +-
 .../array/array.special/swap.pass.cpp         |  21 +-
 .../sequences/array/array.swap/swap.pass.cpp  |  16 +-
 .../sequences/array/array.tuple/get.pass.cpp  |  72 +++--
 .../array/array.tuple/get_const.pass.cpp      |  42 +--
 .../array/array.tuple/get_const_rv.pass.cpp   |  30 +-
 .../array/array.tuple/get_rv.pass.cpp         |   2 +-
 .../array/array.tuple/tuple_element.fail.cpp  |   1 -
 .../containers/sequences/array/at.pass.cpp    | 128 ++++-----
 .../sequences/array/at_const.pass.cpp         | 109 ++++++++
 .../containers/sequences/array/begin.pass.cpp |  53 ----
 .../sequences/array/compare.pass.cpp          |  69 +++--
 .../sequences/array/contiguous.pass.cpp       |  35 ++-
 .../containers/sequences/array/empty.pass.cpp |  41 ++-
 .../sequences/array/front_back.pass.cpp       | 114 +++-----
 .../sequences/array/front_back_const.pass.cpp |  73 +++++
 .../sequences/array/indexing.pass.cpp         | 107 +++----
 .../sequences/array/indexing_const.pass.cpp   |  73 +++++
 .../sequences/array/iterators.pass.cpp        | 261 ++++++++++--------
 .../sequences/array/max_size.pass.cpp         |  41 ++-
 .../array/size_and_alignment.pass.cpp         |   8 -
 .../array.version.pass.cpp                    |   5 +-
 .../iterator.version.pass.cpp                 |   5 +-
 .../version.version.pass.cpp                  |   5 +-
 libcxx/test/support/test_macros.h             |   6 +
 .../generate_feature_test_macro_components.py |   1 +
 38 files changed, 1159 insertions(+), 788 deletions(-)
 create mode 100644 libcxx/test/std/containers/sequences/array/aggregate.pass.cpp
 create mode 100644 libcxx/test/std/containers/sequences/array/at_const.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/array/begin.pass.cpp
 create mode 100644 libcxx/test/std/containers/sequences/array/front_back_const.pass.cpp
 create mode 100644 libcxx/test/std/containers/sequences/array/indexing_const.pass.cpp

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index e8628408afd09..c04a883de1797 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -168,6 +168,8 @@ Status
     ------------------------------------------------- -----------------
     **C++ 2a**                                                         
     -------------------------------------------------------------------
+    ``__cpp_lib_array_constexpr``                     ``201811L``      
+    ------------------------------------------------- -----------------
     ``__cpp_lib_atomic_ref``                          *unimplemented*  
     ------------------------------------------------- -----------------
     ``__cpp_lib_bind_front``                          *unimplemented*  
diff --git a/libcxx/include/array b/libcxx/include/array
index 7ffa825a9652c..215d4e89f0ea7 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -32,24 +32,24 @@ struct array
     typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
 
     // No explicit construct/copy/destroy for aggregate type
-    void fill(const T& u);
-    void swap(array& a) noexcept(is_nothrow_swappable_v<T>);
+    void fill(const T& u);                                      // constexpr in C++20
+    void swap(array& a) noexcept(is_nothrow_swappable_v<T>);    // constexpr in C++20
 
     // iterators:
-    iterator begin() noexcept;
-    const_iterator begin() const noexcept;
-    iterator end() noexcept;
-    const_iterator end() const noexcept;
+    iterator begin() noexcept;                                  // constexpr in C++17
+    const_iterator begin() const noexcept;                      // constexpr in C++17
+    iterator end() noexcept;                                    // constexpr in C++17
+    const_iterator end() const noexcept;                        // constexpr in C++17
 
-    reverse_iterator rbegin() noexcept;
-    const_reverse_iterator rbegin() const noexcept;
-    reverse_iterator rend() noexcept;
-    const_reverse_iterator rend() const noexcept;
+    reverse_iterator rbegin() noexcept;                         // constexpr in C++17
+    const_reverse_iterator rbegin() const noexcept;             // constexpr in C++17
+    reverse_iterator rend() noexcept;                           // constexpr in C++17
+    const_reverse_iterator rend() const noexcept;               // constexpr in C++17
 
-    const_iterator cbegin() const noexcept;
-    const_iterator cend() const noexcept;
-    const_reverse_iterator crbegin() const noexcept;
-    const_reverse_iterator crend() const noexcept;
+    const_iterator cbegin() const noexcept;                     // constexpr in C++17
+    const_iterator cend() const noexcept;                       // constexpr in C++17
+    const_reverse_iterator crbegin() const noexcept;            // constexpr in C++17
+    const_reverse_iterator crend() const noexcept;              // constexpr in C++17
 
     // capacity:
     constexpr size_type size() const noexcept;
@@ -57,46 +57,51 @@ struct array
     constexpr bool empty() const noexcept;
 
     // element access:
-    reference operator[](size_type n);
-    const_reference operator[](size_type n) const; // constexpr in C++14
-    const_reference at(size_type n) const; // constexpr in C++14
-    reference at(size_type n);
-
-    reference front();
-    const_reference front() const; // constexpr in C++14
-    reference back();
-    const_reference back() const; // constexpr in C++14
-
-    T* data() noexcept;
-    const T* data() const noexcept;
+    reference operator[](size_type n);                          // constexpr in C++17
+    const_reference operator[](size_type n) const;              // constexpr in C++14
+    reference at(size_type n);                                  // constexpr in C++17
+    const_reference at(size_type n) const;                      // constexpr in C++14
+
+    reference front();                                          // constexpr in C++17
+    const_reference front() const;                              // constexpr in C++14
+    reference back();                                           // constexpr in C++17
+    const_reference back() const;                               // constexpr in C++14
+
+    T* data() noexcept;                                         // constexpr in C++17
+    const T* data() const noexcept;                             // constexpr in C++17
 };
 
-  template <class T, class... U>
-    array(T, U...) -> array<T, 1 + sizeof...(U)>;
+template <class T, class... U>
+  array(T, U...) -> array<T, 1 + sizeof...(U)>;                 // C++17
 
 template <class T, size_t N>
-  bool operator==(const array<T,N>& x, const array<T,N>& y);
+  bool operator==(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20
 template <class T, size_t N>
-  bool operator!=(const array<T,N>& x, const array<T,N>& y);
+  bool operator!=(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20
 template <class T, size_t N>
-  bool operator<(const array<T,N>& x, const array<T,N>& y);
+  bool operator<(const array<T,N>& x, const array<T,N>& y);     // constexpr in C++20
 template <class T, size_t N>
-  bool operator>(const array<T,N>& x, const array<T,N>& y);
+  bool operator>(const array<T,N>& x, const array<T,N>& y);     // constexpr in C++20
 template <class T, size_t N>
-  bool operator<=(const array<T,N>& x, const array<T,N>& y);
+  bool operator<=(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20
 template <class T, size_t N>
-  bool operator>=(const array<T,N>& x, const array<T,N>& y);
+  bool operator>=(const array<T,N>& x, const array<T,N>& y);    // constexpr in C++20
 
 template <class T, size_t N >
-  void swap(array<T,N>& x, array<T,N>& y) noexcept(noexcept(x.swap(y))); // C++17
+  void swap(array<T,N>& x, array<T,N>& y) noexcept(noexcept(x.swap(y))); // constexpr in C++20
+
+template <class T, size_t N>
+  constexpr array<remove_cv_t<T>, N> to_array(T (&a)[N]);  // C++20
+template <class T, size_t N>
+  constexpr array<remove_cv_t<T>, N> to_array(T (&&a)[N]); // C++20
 
 template <class T> struct tuple_size;
 template <size_t I, class T> struct tuple_element;
 template <class T, size_t N> struct tuple_size<array<T, N>>;
 template <size_t I, class T, size_t N> struct tuple_element<I, array<T, N>>;
-template <size_t I, class T, size_t N> T& get(array<T, N>&) noexcept; // constexpr in C++14
-template <size_t I, class T, size_t N> const T& get(const array<T, N>&) noexcept; // constexpr in C++14
-template <size_t I, class T, size_t N> T&& get(array<T, N>&&) noexcept; // constexpr in C++14
+template <size_t I, class T, size_t N> T& get(array<T, N>&) noexcept;               // constexpr in C++14
+template <size_t I, class T, size_t N> const T& get(const array<T, N>&) noexcept;   // constexpr in C++14
+template <size_t I, class T, size_t N> T&& get(array<T, N>&&) noexcept;             // constexpr in C++14
 template <size_t I, class T, size_t N> const T&& get(const array<T, N>&&) noexcept; // constexpr in C++14
 
 }  // std
@@ -143,11 +148,12 @@ struct _LIBCPP_TEMPLATE_VIS array
     _Tp __elems_[_Size];
 
     // No explicit construct/copy/destroy for aggregate type
-    _LIBCPP_INLINE_VISIBILITY void fill(const value_type& __u) {
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
+    void fill(const value_type& __u) {
       _VSTD::fill_n(__elems_, _Size, __u);
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
     void swap(array& __a) _NOEXCEPT_(__is_nothrow_swappable<_Tp>::value) {
       std::swap_ranges(__elems_, __elems_ + _Size, __a.__elems_);
     }
@@ -236,50 +242,71 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
     typedef std::reverse_iterator<iterator>       reverse_iterator;
     typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
 
+#ifndef _LIBCPP_CXX03_LANG
+    union __wrapper { // Storage for the zero-sized array: holds either a dummy bool or a _Tp.
+        _LIBCPP_CONSTEXPR __wrapper() : __b() { } // Initializes __b only, so no _Tp is constructed here.
+        ~__wrapper() = default; // NOTE(review): a defaulted union destructor is deleted when _Tp has a non-trivial destructor -- confirm array<T, 0> stays destructible for such T.
+
+        bool __b;
+        _Tp __t; // Not initialized by the constructor above.
+    } __w;
+
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
+    value_type* data() _NOEXCEPT {return &__w.__t;} // Address of the union's _Tp member, which __wrapper's constructor does not initialize.
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
+    const value_type* data() const _NOEXCEPT {return &__w.__t;} // Const overload: same address, pointer to const.
+#else // C++03
     typedef typename conditional<is_const<_Tp>::value, const char,
                                 char>::type _CharType;
 
     struct  _ArrayInStructT { _Tp __data_[1]; };
     _ALIGNAS_TYPE(_ArrayInStructT) _CharType __elems_[sizeof(_ArrayInStructT)];
 
+    _LIBCPP_INLINE_VISIBILITY
+    value_type* data() _NOEXCEPT {return reinterpret_cast<value_type*>(__elems_);}
+    _LIBCPP_INLINE_VISIBILITY
+    const value_type* data() const _NOEXCEPT {return reinterpret_cast<const value_type*>(__elems_);}
+#endif
+
     // No explicit construct/copy/destroy for aggregate type
-    _LIBCPP_INLINE_VISIBILITY void fill(const value_type&) {
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
+    void fill(const value_type&) {
       static_assert(!is_const<_Tp>::value,
                     "cannot fill zero-sized array of type 'const T'");
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
     void swap(array&) _NOEXCEPT {
       static_assert(!is_const<_Tp>::value,
                     "cannot swap zero-sized array of type 'const T'");
     }
 
     // iterators:
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     iterator begin() _NOEXCEPT {return iterator(data());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_iterator begin() const _NOEXCEPT {return const_iterator(data());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     iterator end() _NOEXCEPT {return iterator(data());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_iterator end() const _NOEXCEPT {return const_iterator(data());}
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     reverse_iterator rbegin() _NOEXCEPT {return reverse_iterator(end());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_reverse_iterator rbegin() const _NOEXCEPT {return const_reverse_iterator(end());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     reverse_iterator rend() _NOEXCEPT {return reverse_iterator(begin());}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_reverse_iterator rend() const _NOEXCEPT {return const_reverse_iterator(begin());}
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_iterator cbegin() const _NOEXCEPT {return begin();}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_iterator cend() const _NOEXCEPT {return end();}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_reverse_iterator crbegin() const _NOEXCEPT {return rbegin();}
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     const_reverse_iterator crend() const _NOEXCEPT {return rend();}
 
     // capacity:
@@ -291,7 +318,7 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
     _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT {return true;}
 
     // element access:
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     reference operator[](size_type) _NOEXCEPT {
       _LIBCPP_ASSERT(false, "cannot call array<T, 0>::operator[] on a zero-sized array");
       _LIBCPP_UNREACHABLE();
@@ -303,46 +330,41 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
       _LIBCPP_UNREACHABLE();
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     reference at(size_type) {
       __throw_out_of_range("array<T, 0>::at");
       _LIBCPP_UNREACHABLE();
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11
     const_reference at(size_type) const {
       __throw_out_of_range("array<T, 0>::at");
       _LIBCPP_UNREACHABLE();
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     reference front() _NOEXCEPT {
       _LIBCPP_ASSERT(false, "cannot call array<T, 0>::front() on a zero-sized array");
       _LIBCPP_UNREACHABLE();
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11
     const_reference front() const _NOEXCEPT {
       _LIBCPP_ASSERT(false, "cannot call array<T, 0>::front() on a zero-sized array");
       _LIBCPP_UNREACHABLE();
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
     reference back() _NOEXCEPT {
       _LIBCPP_ASSERT(false, "cannot call array<T, 0>::back() on a zero-sized array");
       _LIBCPP_UNREACHABLE();
     }
 
-    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX11
     const_reference back() const _NOEXCEPT {
       _LIBCPP_ASSERT(false, "cannot call array<T, 0>::back() on a zero-sized array");
       _LIBCPP_UNREACHABLE();
     }
-
-    _LIBCPP_INLINE_VISIBILITY
-    value_type* data() _NOEXCEPT {return reinterpret_cast<value_type*>(__elems_);}
-    _LIBCPP_INLINE_VISIBILITY
-    const value_type* data() const _NOEXCEPT {return reinterpret_cast<const value_type*>(__elems_);}
 };
 
 
@@ -404,7 +426,7 @@ operator>=(const array<_Tp, _Size>& __x, const array<_Tp, _Size>& __y)
 }
 
 template <class _Tp, size_t _Size>
-inline _LIBCPP_INLINE_VISIBILITY
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
 typename enable_if
 <
     _Size == 0 ||
diff --git a/libcxx/include/version b/libcxx/include/version
index 3951693ea826f..5a250471a03e2 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -21,7 +21,8 @@ __cpp_lib_allocator_traits_is_always_equal              201411L <memory> <scoped
                                                                 <unordered_map> <unordered_set>
 __cpp_lib_any                                           201606L <any>
 __cpp_lib_apply                                         201603L <tuple>
-__cpp_lib_array_constexpr                               201603L <iterator> <array>
+__cpp_lib_array_constexpr                               201811L <iterator> <array>
+                                                        201603L // C++17
 __cpp_lib_as_const                                      201510L <utility>
 __cpp_lib_atomic_is_always_lock_free                    201603L <atomic>
 __cpp_lib_atomic_ref                                    201806L <atomic>
@@ -212,6 +213,8 @@ __cpp_lib_void_t                                        201411L <type_traits>
 #endif
 
 #if _LIBCPP_STD_VER > 17
+# undef  __cpp_lib_array_constexpr
+# define __cpp_lib_array_constexpr                      201811L
 # if !defined(_LIBCPP_HAS_NO_THREADS)
 // #   define __cpp_lib_atomic_ref                         201806L
 # endif
diff --git a/libcxx/test/std/containers/sequences/array/aggregate.pass.cpp b/libcxx/test/std/containers/sequences/array/aggregate.pass.cpp
new file mode 100644
index 0000000000000..dd4064bb2fe84
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/array/aggregate.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Make sure std::array is an aggregate type.
+
+#include <array>
+#include <type_traits>
+
+#include "test_macros.h"
+
+// Checks aggregate initialization of std::array<T, N> for N in [0, 3] and,
+// where the trait exists, that std::is_aggregate reports true for N in [0, 4].
+template <typename T>
+void tests()
+{
+    // Test aggregate initialization
+    {
+        std::array<T, 0> a0 = {}; (void)a0;
+        std::array<T, 1> a1 = {T()}; (void)a1;
+        std::array<T, 2> a2 = {T(), T()}; (void)a2;
+        std::array<T, 3> a3 = {T(), T(), T()}; (void)a3;
+    }
+
+    // Test the is_aggregate trait.
+    // TEST_STD_VER comes from test_macros.h; without that include the macro is
+    // undefined, evaluates to 0 in #if, and these checks are silently skipped.
+#if TEST_STD_VER >= 17 // The trait is only available in C++17 and above
+    static_assert(std::is_aggregate<std::array<T, 0> >::value, "");
+    static_assert(std::is_aggregate<std::array<T, 1> >::value, "");
+    static_assert(std::is_aggregate<std::array<T, 2> >::value, "");
+    static_assert(std::is_aggregate<std::array<T, 3> >::value, "");
+    static_assert(std::is_aggregate<std::array<T, 4> >::value, "");
+#endif
+}
+
+struct Empty { };
+struct NonEmpty { int i; int j; };
+
+int main(int, char**)
+{
+    tests<char>();
+    tests<int>();
+    tests<long>();
+    tests<float>();
+    tests<double>();
+    tests<long double>();
+    tests<NonEmpty>();
+    tests<Empty>();
+
+    return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/deduct.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/deduct.pass.cpp
index 141aafc2a318f..42ebef6ddc44d 100644
--- a/libcxx/test/std/containers/sequences/array/array.cons/deduct.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.cons/deduct.pass.cpp
@@ -30,37 +30,44 @@
 
 #include "test_macros.h"
 
-int main(int, char**)
+constexpr bool tests()
 {
-//  Test the explicit deduction guides
+    //  Test the explicit deduction guides
     {
-    std::array arr{1,2,3};  // array(T, U...)
-    static_assert(std::is_same_v<decltype(arr), std::array<int, 3>>, "");
-    assert(arr[0] == 1);
-    assert(arr[1] == 2);
-    assert(arr[2] == 3);
+        std::array arr{1,2,3};  // array(T, U...)
+        static_assert(std::is_same_v<decltype(arr), std::array<int, 3>>, "");
+        assert(arr[0] == 1);
+        assert(arr[1] == 2);
+        assert(arr[2] == 3);
     }
 
     {
-    const long l1 = 42;
-    std::array arr{1L, 4L, 9L, l1}; // array(T, U...)
-    static_assert(std::is_same_v<decltype(arr)::value_type, long>, "");
-    static_assert(arr.size() == 4, "");
-    assert(arr[0] == 1);
-    assert(arr[1] == 4);
-    assert(arr[2] == 9);
-    assert(arr[3] == l1);
+        const long l1 = 42;
+        std::array arr{1L, 4L, 9L, l1}; // array(T, U...)
+        static_assert(std::is_same_v<decltype(arr)::value_type, long>, "");
+        static_assert(arr.size() == 4, "");
+        assert(arr[0] == 1);
+        assert(arr[1] == 4);
+        assert(arr[2] == 9);
+        assert(arr[3] == l1);
     }
 
-//  Test the implicit deduction guides
-  {
-  std::array<double, 2> source = {4.0, 5.0};
-  std::array arr(source);   // array(array)
-    static_assert(std::is_same_v<decltype(arr), decltype(source)>, "");
-    static_assert(std::is_same_v<decltype(arr), std::array<double, 2>>, "");
-    assert(arr[0] == 4.0);
-    assert(arr[1] == 5.0);
-  }
+    //  Test the implicit deduction guides
+    {
+        std::array<double, 2> source = {4.0, 5.0};
+        std::array arr(source);   // array(array)
+        static_assert(std::is_same_v<decltype(arr), decltype(source)>, "");
+        static_assert(std::is_same_v<decltype(arr), std::array<double, 2>>, "");
+        assert(arr[0] == 4.0);
+        assert(arr[1] == 5.0);
+    }
 
-  return 0;
+    return true;
+}
+
+int main(int, char**)
+{
+    tests(); // Run-time pass: the asserts inside tests() execute normally.
+    static_assert(tests(), ""); // Compile-time pass: tests() must be usable in a constant expression and return true.
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp
index 16671e3b5d1b1..e73a9671f478a 100644
--- a/libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp
@@ -19,32 +19,43 @@
 #include "disable_missing_braces_warning.h"
 
 struct NoDefault {
-  NoDefault(int) {}
+    TEST_CONSTEXPR NoDefault(int) { }
 };
 
-int main(int, char**)
+struct Default {
+    TEST_CONSTEXPR Default() { }
+};
+
+TEST_CONSTEXPR_CXX14 bool tests()
 {
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        C c;
-        assert(c.size() == 3);
+        std::array<Default, 3> array;
+        assert(array.size() == 3);
     }
+
     {
-        typedef double T;
-        typedef std::array<T, 0> C;
-        C c;
-        assert(c.size() == 0);
+        std::array<Default, 0> array;
+        assert(array.size() == 0);
     }
+
     {
-      typedef std::array<NoDefault, 0> C;
-      C c;
-      assert(c.size() == 0);
-      C c1 = {};
-      assert(c1.size() == 0);
-      C c2 = {{}};
-      assert(c2.size() == 0);
+        typedef std::array<NoDefault, 0> C;
+        C c;
+        assert(c.size() == 0);
+        C c1 = {};
+        assert(c1.size() == 0);
+        C c2 = {{}};
+        assert(c2.size() == 0);
     }
 
-  return 0;
+    return true;
+}
+
+int main(int, char**)
+{
+    tests(); // Run-time pass.
+#if TEST_STD_VER >= 14 // tests() is declared TEST_CONSTEXPR_CXX14; presumably constexpr only from C++14 on.
+    static_assert(tests(), ""); // Compile-time pass over the same checks.
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp
index c0e205c8322a4..cb9a182980315 100644
--- a/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp
@@ -23,72 +23,81 @@
 // generated operator would be ill-formed; like in the case of a struct with a
 // const member.
 #if TEST_STD_VER < 11
-#define TEST_NOT_COPY_ASSIGNABLE(T) ((void)0)
+#   define TEST_NOT_COPY_ASSIGNABLE(T) ((void)0)
 #else
-#define TEST_NOT_COPY_ASSIGNABLE(T) static_assert(!std::is_copy_assignable<T>::value, "")
+#   define TEST_NOT_COPY_ASSIGNABLE(T) static_assert(!std::is_copy_assignable<T>::value, "")
 #endif
 
 struct NoDefault {
-  NoDefault(int) {}
+    TEST_CONSTEXPR NoDefault(int) { }
 };
 
-int main(int, char**) {
-  {
-    typedef double T;
-    typedef std::array<T, 3> C;
-    C c = {1.1, 2.2, 3.3};
-    C c2 = c;
-    c2 = c;
-    static_assert(std::is_copy_constructible<C>::value, "");
-    static_assert(std::is_copy_assignable<C>::value, "");
-  }
-  {
-    typedef double T;
-    typedef std::array<const T, 3> C;
-    C c = {1.1, 2.2, 3.3};
-    C c2 = c;
-    ((void)c2);
-    static_assert(std::is_copy_constructible<C>::value, "");
-    TEST_NOT_COPY_ASSIGNABLE(C);
-  }
-  {
-    typedef double T;
-    typedef std::array<T, 0> C;
-    C c = {};
-    C c2 = c;
-    c2 = c;
-    static_assert(std::is_copy_constructible<C>::value, "");
-    static_assert(std::is_copy_assignable<C>::value, "");
-  }
-  {
-    // const arrays of size 0 should disable the implicit copy assignment operator.
-    typedef double T;
-    typedef std::array<const T, 0> C;
-    C c = {{}};
-    C c2 = c;
-    ((void)c2);
-    static_assert(std::is_copy_constructible<C>::value, "");
-    TEST_NOT_COPY_ASSIGNABLE(C);
-  }
-  {
-    typedef NoDefault T;
-    typedef std::array<T, 0> C;
-    C c = {};
-    C c2 = c;
-    c2 = c;
-    static_assert(std::is_copy_constructible<C>::value, "");
-    static_assert(std::is_copy_assignable<C>::value, "");
-  }
-  {
-    typedef NoDefault T;
-    typedef std::array<const T, 0> C;
-    C c = {{}};
-    C c2 = c;
-    ((void)c2);
-    static_assert(std::is_copy_constructible<C>::value, "");
-    TEST_NOT_COPY_ASSIGNABLE(C);
-  }
+TEST_CONSTEXPR_CXX14 bool tests()
+{
+    {
+        typedef double T;
+        typedef std::array<T, 3> C;
+        C c = {1.1, 2.2, 3.3};
+        C c2 = c;
+        c2 = c;
+        static_assert(std::is_copy_constructible<C>::value, "");
+        static_assert(std::is_copy_assignable<C>::value, "");
+    }
+    {
+        typedef double T;
+        typedef std::array<const T, 3> C;
+        C c = {1.1, 2.2, 3.3};
+        C c2 = c;
+        ((void)c2);
+        static_assert(std::is_copy_constructible<C>::value, "");
+        TEST_NOT_COPY_ASSIGNABLE(C);
+    }
+    {
+        typedef double T;
+        typedef std::array<T, 0> C;
+        C c = {};
+        C c2 = c;
+        c2 = c;
+        static_assert(std::is_copy_constructible<C>::value, "");
+        static_assert(std::is_copy_assignable<C>::value, "");
+    }
+    {
+        // const arrays of size 0 should disable the implicit copy assignment operator.
+        typedef double T;
+        typedef std::array<const T, 0> C;
+        C c = {{}};
+        C c2 = c;
+        ((void)c2);
+        static_assert(std::is_copy_constructible<C>::value, "");
+        TEST_NOT_COPY_ASSIGNABLE(C);
+    }
+    {
+        typedef NoDefault T;
+        typedef std::array<T, 0> C;
+        C c = {};
+        C c2 = c;
+        c2 = c;
+        static_assert(std::is_copy_constructible<C>::value, "");
+        static_assert(std::is_copy_assignable<C>::value, "");
+    }
+    {
+        typedef NoDefault T;
+        typedef std::array<const T, 0> C;
+        C c = {{}};
+        C c2 = c;
+        ((void)c2);
+        static_assert(std::is_copy_constructible<C>::value, "");
+        TEST_NOT_COPY_ASSIGNABLE(C);
+    }
 
+    return true;
+}
 
-  return 0;
+int main(int, char**)
+{
+    tests(); // Run-time pass.
+#if TEST_STD_VER >= 14 // tests() is declared TEST_CONSTEXPR_CXX14; presumably constexpr only from C++14 on.
+    static_assert(tests(), ""); // Compile-time pass over the same copy/assignment checks.
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp
index cdc04b18d24f1..49437546a8f73 100644
--- a/libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp
@@ -18,12 +18,12 @@
 #include "test_macros.h"
 #include "disable_missing_braces_warning.h"
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX14 bool tests()
 {
     {
         typedef double T;
         typedef std::array<T, 3> C;
-        C c = {1, 2, 3.5};
+        C const c = {1, 2, 3.5};
         assert(c.size() == 3);
         assert(c[0] == 1);
         assert(c[1] == 2);
@@ -32,23 +32,32 @@ int main(int, char**)
     {
         typedef double T;
         typedef std::array<T, 0> C;
-        C c = {};
+        C const c = {};
         assert(c.size() == 0);
     }
 
     {
         typedef double T;
         typedef std::array<T, 3> C;
-        C c = {1};
+        C const c = {1}; // Only the first element is given explicitly.
-        assert(c.size() == 3.0);
+        assert(c.size() == 3); // size() is integral; compare against an integer literal.
         assert(c[0] == 1);
     }
     {
         typedef int T;
         typedef std::array<T, 1> C;
-        C c = {};
+        C const c = {};
         assert(c.size() == 1);
     }
 
-  return 0;
+    return true;
+}
+
+int main(int, char**)
+{
+    tests(); // Run-time pass.
+#if TEST_STD_VER >= 14 // tests() is declared TEST_CONSTEXPR_CXX14; presumably constexpr only from C++14 on.
+    static_assert(tests(), ""); // Compile-time pass over the same initializer checks.
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.creation/to_array.pass.cpp b/libcxx/test/std/containers/sequences/array/array.creation/to_array.pass.cpp
index d5df96a270053..87165498416b0 100644
--- a/libcxx/test/std/containers/sequences/array/array.creation/to_array.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.creation/to_array.pass.cpp
@@ -23,7 +23,8 @@
 #include "test_macros.h"
 #include "MoveOnly.h"
 
-int main(int, char**) {
+constexpr bool tests()
+{
   //  Test deduced type.
   {
     auto arr = std::to_array({1, 2, 3});
@@ -110,13 +111,12 @@ int main(int, char**) {
     assert(arr[0].b == .1);
   }
 
-  // Test constexpr.
-  {
-    constexpr std::array<int, 3> arr = std::to_array({1, 2, 3});
-    static_assert(arr[0] == 1);
-    static_assert(arr[1] == 2);
-    static_assert(arr[2] == 3);
-  }
+  return true;
+}
 
+int main(int, char**)
+{
+  tests(); // Run-time pass: the asserts inside tests() execute normally.
+  static_assert(tests(), ""); // Compile-time pass: tests() must be usable in a constant expression and return true.
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp b/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp
index 434cbc5fe4e90..a41409f8df1c0 100644
--- a/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp
@@ -21,7 +21,7 @@
 #include "disable_missing_braces_warning.h"
 
 struct NoDefault {
-  NoDefault(int) {}
+    TEST_CONSTEXPR NoDefault(int) { }
 };
 
 #if TEST_STD_VER < 11
@@ -33,7 +33,7 @@ struct natural_alignment {
 };
 #endif
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX17 bool tests()
 {
     {
         typedef double T;
@@ -52,33 +52,49 @@ int main(int, char**)
         LIBCPP_ASSERT(p != nullptr);
     }
     {
-      typedef double T;
-      typedef std::array<const T, 0> C;
-      C c = {{}};
-      const T* p = c.data();
-      static_assert((std::is_same<decltype(c.data()), const T*>::value), "");
-      LIBCPP_ASSERT(p != nullptr);
+        typedef double T;
+        typedef std::array<const T, 0> C;
+        C c = {{}};
+        const T* p = c.data();
+        LIBCPP_ASSERT(p != nullptr);
+        static_assert((std::is_same<decltype(c.data()), const T*>::value), "");
     }
-  {
-#if TEST_STD_VER < 11
-      typedef natural_alignment T;
-#else
-      typedef std::max_align_t T;
-#endif
-      typedef std::array<T, 0> C;
-      const C c = {};
-      const T* p = c.data();
-      LIBCPP_ASSERT(p != nullptr);
-      std::uintptr_t pint = reinterpret_cast<std::uintptr_t>(p);
-      assert(pint % TEST_ALIGNOF(T) == 0);
+    {
+        typedef NoDefault T;
+        typedef std::array<T, 0> C;
+        C c = {};
+        T* p = c.data();
+        LIBCPP_ASSERT(p != nullptr);
     }
     {
-      typedef NoDefault T;
-      typedef std::array<T, 0> C;
-      C c = {};
-      T* p = c.data();
-      LIBCPP_ASSERT(p != nullptr);
+        std::array<int, 5> c = {0, 1, 2, 3, 4};
+        assert(c.data() == &c[0]);
+        assert(*c.data() == c[0]);
     }
 
-  return 0;
+    return true;
+}
+
+int main(int, char**)
+{
+    tests(); // Run-time pass.
+#if TEST_STD_VER >= 17 // tests() is declared TEST_CONSTEXPR_CXX17; presumably constexpr only from C++17 on.
+    static_assert(tests(), ""); // Compile-time pass over the same checks.
+#endif
+
+    // Test the alignment of data(); run-time only, presumably because reinterpret_cast cannot appear in a constant expression.
+    {
+#if TEST_STD_VER < 11
+        typedef natural_alignment T;
+#else
+        typedef std::max_align_t T;
+#endif
+        typedef std::array<T, 0> C;
+        const C c = {};
+        const T* p = c.data();
+        LIBCPP_ASSERT(p != nullptr);
+        std::uintptr_t pint = reinterpret_cast<std::uintptr_t>(p); // Pointer value as an integer so it can be checked with %.
+        assert(pint % TEST_ALIGNOF(T) == 0); // data() of the empty array must be aligned for T.
+    }
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp b/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp
index 92828eef07115..0f79237b48a6e 100644
--- a/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp
@@ -21,7 +21,7 @@
 #include "disable_missing_braces_warning.h"
 
 struct NoDefault {
-  NoDefault(int) {}
+    TEST_CONSTEXPR NoDefault(int) { }
 };
 
 #if TEST_STD_VER < 11
@@ -33,7 +33,7 @@ struct natural_alignment {
 };
 #endif
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX17 bool tests()
 {
     {
         typedef double T;
@@ -49,40 +49,45 @@ int main(int, char**)
         typedef std::array<T, 0> C;
         const C c = {};
         const T* p = c.data();
-        (void)p; // to placate scan-build
+        LIBCPP_ASSERT(p != nullptr);
+    }
+    {
+        typedef NoDefault T;
+        typedef std::array<T, 0> C;
+        const C c = {};
+        const T* p = c.data();
+        LIBCPP_ASSERT(p != nullptr);
     }
     {
-      typedef NoDefault T;
-      typedef std::array<T, 0> C;
-      const C c = {};
-      const T* p = c.data();
-      LIBCPP_ASSERT(p != nullptr);
+        std::array<int, 5> const c = {0, 1, 2, 3, 4};
+        assert(c.data() == &c[0]);
+        assert(*c.data() == c[0]);
     }
+
+    return true;
+}
+
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 17
+    static_assert(tests(), "");
+#endif
+
+    // Test the alignment of data()
     {
 #if TEST_STD_VER < 11
-      typedef natural_alignment T;
+        typedef natural_alignment T;
 #else
-      typedef std::max_align_t T;
+        typedef std::max_align_t T;
 #endif
-      typedef std::array<T, 0> C;
-      const C c = {};
-      const T* p = c.data();
-      LIBCPP_ASSERT(p != nullptr);
-      std::uintptr_t pint = reinterpret_cast<std::uintptr_t>(p);
-      assert(pint % TEST_ALIGNOF(T) == 0);
-    }
-#if TEST_STD_VER > 14
-    {
-        typedef std::array<int, 5> C;
-        constexpr C c1{0,1,2,3,4};
-        constexpr const C c2{0,1,2,3,4};
-
-        static_assert (  c1.data()  == &c1[0], "");
-        static_assert ( *c1.data()  ==  c1[0], "");
-        static_assert (  c2.data()  == &c2[0], "");
-        static_assert ( *c2.data()  ==  c2[0], "");
+        typedef std::array<T, 0> C;
+        const C c = {};
+        const T* p = c.data();
+        LIBCPP_ASSERT(p != nullptr);
+        std::uintptr_t pint = reinterpret_cast<std::uintptr_t>(p);
+        assert(pint % TEST_ALIGNOF(T) == 0);
     }
-#endif
 
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.fill/fill.pass.cpp b/libcxx/test/std/containers/sequences/array/array.fill/fill.pass.cpp
index f480d17393d45..3a185110a91e5 100644
--- a/libcxx/test/std/containers/sequences/array/array.fill/fill.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.fill/fill.pass.cpp
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "disable_missing_braces_warning.h"
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX20 bool tests()
 {
     {
         typedef double T;
@@ -30,6 +30,7 @@ int main(int, char**)
         assert(c[1] == 5.5);
         assert(c[2] == 5.5);
     }
+
     {
         typedef double T;
         typedef std::array<T, 0> C;
@@ -37,6 +38,14 @@ int main(int, char**)
         c.fill(5.5);
         assert(c.size() == 0);
     }
+    return true;
+}
 
-  return 0;
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 20
+    static_assert(tests(), "");
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.size/size.pass.cpp b/libcxx/test/std/containers/sequences/array/array.size/size.pass.cpp
index f837bdcf8fb82..e55030015b03e 100644
--- a/libcxx/test/std/containers/sequences/array/array.size/size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.size/size.pass.cpp
@@ -56,5 +56,5 @@ int main(int, char**)
     }
 #endif
 
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.special/swap.pass.cpp b/libcxx/test/std/containers/sequences/array/array.special/swap.pass.cpp
index 6c9ed957b836b..788f0ed486b8c 100644
--- a/libcxx/test/std/containers/sequences/array/array.special/swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.special/swap.pass.cpp
@@ -19,10 +19,10 @@
 #include "disable_missing_braces_warning.h"
 
 struct NonSwappable {
-  NonSwappable() {}
+    TEST_CONSTEXPR NonSwappable() { }
 private:
-  NonSwappable(NonSwappable const&);
-  NonSwappable& operator=(NonSwappable const&);
+    NonSwappable(NonSwappable const&);
+    NonSwappable& operator=(NonSwappable const&);
 };
 
 template <class Tp>
@@ -33,9 +33,9 @@ template <class Tp>
 std::false_type can_swap_imp(...);
 
 template <class Tp>
-struct can_swap : std::is_same<decltype(can_swap_imp<Tp>(0)), void> {};
+struct can_swap : std::is_same<decltype(can_swap_imp<Tp>(0)), void> { };
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX20 bool tests()
 {
     {
         typedef double T;
@@ -82,5 +82,14 @@ int main(int, char**)
     }
 #endif
 
-  return 0;
+    return true;
+}
+
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 20
+    static_assert(tests(), "");
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.swap/swap.pass.cpp b/libcxx/test/std/containers/sequences/array/array.swap/swap.pass.cpp
index aac8a13b29a38..1f2d8e7de35fe 100644
--- a/libcxx/test/std/containers/sequences/array/array.swap/swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.swap/swap.pass.cpp
@@ -21,13 +21,13 @@
 #include "disable_missing_braces_warning.h"
 
 struct NonSwappable {
-  NonSwappable() {}
+    TEST_CONSTEXPR NonSwappable() { }
 private:
-  NonSwappable(NonSwappable const&);
-  NonSwappable& operator=(NonSwappable const&);
+    NonSwappable(NonSwappable const&);
+    NonSwappable& operator=(NonSwappable const&);
 };
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX20 bool tests()
 {
     {
         typedef double T;
@@ -89,6 +89,14 @@ int main(int, char**)
 #endif
     }
 
+    return true;
+}
 
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 20
+    static_assert(tests(), "");
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.tuple/get.pass.cpp b/libcxx/test/std/containers/sequences/array/array.tuple/get.pass.cpp
index 9e94417ac9733..1b13d773b0de4 100644
--- a/libcxx/test/std/containers/sequences/array/array.tuple/get.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.tuple/get.pass.cpp
@@ -20,41 +20,57 @@
 #include "disable_missing_braces_warning.h"
 
 
-#if TEST_STD_VER > 11
-struct S {
-   std::array<int, 3> a;
-   int k;
-   constexpr S() : a{1,2,3}, k(std::get<2>(a)) {}
-};
-
-constexpr std::array<int, 2> getArr () { return { 3, 4 }; }
-#endif
+template <typename ...T>
+TEST_CONSTEXPR std::array<int, sizeof...(T)> tempArray(T ...args)
+{
+    return {args...};
+}
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX14 bool tests()
 {
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        C c = {1, 2, 3.5};
-        std::get<1>(c) = 5.5;
-        assert(c[0] == 1);
-        assert(c[1] == 5.5);
-        assert(c[2] == 3.5);
+        std::array<double, 1> array = {3.3};
+        assert(std::get<0>(array) == 3.3);
+        std::get<0>(array) = 99.1;
+        assert(std::get<0>(array) == 99.1);
     }
-#if TEST_STD_VER > 11
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        constexpr C c = {1, 2, 3.5};
-        static_assert(std::get<0>(c) == 1, "");
-        static_assert(std::get<1>(c) == 2, "");
-        static_assert(std::get<2>(c) == 3.5, "");
+        std::array<double, 2> array = {3.3, 4.4};
+        assert(std::get<0>(array) == 3.3);
+        assert(std::get<1>(array) == 4.4);
+        std::get<0>(array) = 99.1;
+        std::get<1>(array) = 99.2;
+        assert(std::get<0>(array) == 99.1);
+        assert(std::get<1>(array) == 99.2);
     }
     {
-        static_assert(S().k == 3, "");
-        static_assert(std::get<1>(getArr()) == 4, "");
+        std::array<double, 3> array = {3.3, 4.4, 5.5};
+        assert(std::get<0>(array) == 3.3);
+        assert(std::get<1>(array) == 4.4);
+        assert(std::get<2>(array) == 5.5);
+        std::get<1>(array) = 99.2;
+        assert(std::get<0>(array) == 3.3);
+        assert(std::get<1>(array) == 99.2);
+        assert(std::get<2>(array) == 5.5);
     }
-#endif
+    {
+        std::array<double, 1> array = {3.3};
+        static_assert(std::is_same<double&, decltype(std::get<0>(array))>::value, "");
+    }
+    {
+        assert(std::get<0>(tempArray(1, 2, 3)) == 1);
+        assert(std::get<1>(tempArray(1, 2, 3)) == 2);
+        assert(std::get<2>(tempArray(1, 2, 3)) == 3);
+    }
+
+    return true;
+}
 
-  return 0;
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 14
+    static_assert(tests(), "");
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.tuple/get_const.pass.cpp b/libcxx/test/std/containers/sequences/array/array.tuple/get_const.pass.cpp
index b22a76185b6fc..3c1941c031fb3 100644
--- a/libcxx/test/std/containers/sequences/array/array.tuple/get_const.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.tuple/get_const.pass.cpp
@@ -19,26 +19,36 @@
 // Disable the missing braces warning for this reason.
 #include "disable_missing_braces_warning.h"
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX14 bool tests()
 {
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        const C c = {1, 2, 3.5};
-        assert(std::get<0>(c) == 1);
-        assert(std::get<1>(c) == 2);
-        assert(std::get<2>(c) == 3.5);
+        std::array<double, 1> const array = {3.3};
+        assert(std::get<0>(array) == 3.3);
     }
-#if TEST_STD_VER > 11
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        constexpr const C c = {1, 2, 3.5};
-        static_assert(std::get<0>(c) == 1, "");
-        static_assert(std::get<1>(c) == 2, "");
-        static_assert(std::get<2>(c) == 3.5, "");
+        std::array<double, 2> const array = {3.3, 4.4};
+        assert(std::get<0>(array) == 3.3);
+        assert(std::get<1>(array) == 4.4);
+    }
+    {
+        std::array<double, 3> const array = {3.3, 4.4, 5.5};
+        assert(std::get<0>(array) == 3.3);
+        assert(std::get<1>(array) == 4.4);
+        assert(std::get<2>(array) == 5.5);
+    }
+    {
+        std::array<double, 1> const array = {3.3};
+        static_assert(std::is_same<double const&, decltype(std::get<0>(array))>::value, "");
     }
-#endif
 
-  return 0;
+    return true;
+}
+
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 14
+    static_assert(tests(), "");
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.tuple/get_const_rv.pass.cpp b/libcxx/test/std/containers/sequences/array/array.tuple/get_const_rv.pass.cpp
index ce8fc4fd3651c..346978008c846 100644
--- a/libcxx/test/std/containers/sequences/array/array.tuple/get_const_rv.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.tuple/get_const_rv.pass.cpp
@@ -28,25 +28,25 @@ int main(int, char**)
 {
 
     {
-    typedef std::unique_ptr<double> T;
-    typedef std::array<T, 1> C;
-    const C c = {std::unique_ptr<double>(new double(3.5))};
-    static_assert(std::is_same<const T&&, decltype(std::get<0>(std::move(c)))>::value, "");
-    static_assert(noexcept(std::get<0>(std::move(c))), "");
-    const T&& t = std::get<0>(std::move(c));
-    assert(*t == 3.5);
+        typedef std::unique_ptr<double> T;
+        typedef std::array<T, 1> C;
+        const C c = {std::unique_ptr<double>(new double(3.5))};
+        static_assert(std::is_same<const T&&, decltype(std::get<0>(std::move(c)))>::value, "");
+        static_assert(noexcept(std::get<0>(std::move(c))), "");
+        const T&& t = std::get<0>(std::move(c));
+        assert(*t == 3.5);
     }
 
-#if TEST_STD_VER > 11
+#if TEST_STD_VER >= 14
     {
-    typedef double T;
-    typedef std::array<T, 3> C;
-    constexpr const C c = {1, 2, 3.5};
-    static_assert(std::get<0>(std::move(c)) == 1, "");
-    static_assert(std::get<1>(std::move(c)) == 2, "");
-    static_assert(std::get<2>(std::move(c)) == 3.5, "");
+        typedef double T;
+        typedef std::array<T, 3> C;
+        constexpr const C c = {1, 2, 3.5};
+        static_assert(std::get<0>(std::move(c)) == 1, "");
+        static_assert(std::get<1>(std::move(c)) == 2, "");
+        static_assert(std::get<2>(std::move(c)) == 3.5, "");
     }
 #endif
 
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.tuple/get_rv.pass.cpp b/libcxx/test/std/containers/sequences/array/array.tuple/get_rv.pass.cpp
index d1d8b28f4c9d7..e557c19ba2a44 100644
--- a/libcxx/test/std/containers/sequences/array/array.tuple/get_rv.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.tuple/get_rv.pass.cpp
@@ -33,5 +33,5 @@ int main(int, char**)
         assert(*t == 3.5);
     }
 
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.fail.cpp b/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.fail.cpp
index a4fbd3ab4c214..cde4ba7c759f0 100644
--- a/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.fail.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.fail.cpp
@@ -18,7 +18,6 @@
 #include <array>
 #include <cassert>
 
-
 // std::array is explicitly allowed to be initialized with A a = { init-list };.
 // Disable the missing braces warning for this reason.
 #include "disable_missing_braces_warning.h"
diff --git a/libcxx/test/std/containers/sequences/array/at.pass.cpp b/libcxx/test/std/containers/sequences/array/at.pass.cpp
index 0454643541c9d..ed4ab80fb0cd6 100644
--- a/libcxx/test/std/containers/sequences/array/at.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/at.pass.cpp
@@ -8,10 +8,7 @@
 
 // <array>
 
-// reference operator[] (size_type)
-// const_reference operator[] (size_type); // constexpr in C++14
-// reference at (size_type)
-// const_reference at (size_type); // constexpr in C++14
+// reference at (size_type); // constexpr in C++17
 
 #include <array>
 #include <cassert>
@@ -26,100 +23,91 @@
 // Disable the missing braces warning for this reason.
 #include "disable_missing_braces_warning.h"
 
-#if TEST_STD_VER > 14
-constexpr bool check_idx( size_t idx, double val )
-{
-    std::array<double, 3> arr = {1, 2, 3.5};
-    return arr.at(idx) == val;
-}
-#endif
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX17 bool tests()
 {
     {
         typedef double T;
         typedef std::array<T, 3> C;
         C c = {1, 2, 3.5};
-        C::reference r1 = c.at(0);
+        typename C::reference r1 = c.at(0);
         assert(r1 == 1);
         r1 = 5.5;
-        assert(c.front() == 5.5);
+        assert(c[0] == 5.5);
 
-        C::reference r2 = c.at(2);
+        typename C::reference r2 = c.at(2);
         assert(r2 == 3.5);
         r2 = 7.5;
-        assert(c.back() == 7.5);
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-        try
-        {
-            TEST_IGNORE_NODISCARD  c.at(3);
-            assert(false);
-        }
-        catch (const std::out_of_range &) {}
-#endif
+        assert(c[2] == 7.5);
     }
+    return true;
+}
+
+void test_exceptions()
+{
 #ifndef TEST_HAS_NO_EXCEPTIONS
     {
-        typedef double T;
-        typedef std::array<T, 0> C;
-        C c = {};
-        C const& cc = c;
-        try
-        {
-            TEST_IGNORE_NODISCARD  c.at(0);
+        std::array<int, 4> array = {1, 2, 3, 4};
+
+        try {
+            TEST_IGNORE_NODISCARD array.at(4);
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
             assert(false);
         }
-        catch (const std::out_of_range &) {}
-        try
-        {
-            TEST_IGNORE_NODISCARD  cc.at(0);
+
+        try {
+            TEST_IGNORE_NODISCARD array.at(5);
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
             assert(false);
         }
-        catch (const std::out_of_range &) {}
-    }
-#endif
-    {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        const C c = {1, 2, 3.5};
-        C::const_reference r1 = c.at(0);
-        assert(r1 == 1);
 
-        C::const_reference r2 = c.at(2);
-        assert(r2 == 3.5);
+        try {
+            TEST_IGNORE_NODISCARD array.at(6);
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
+            assert(false);
+        }
 
-#ifndef TEST_HAS_NO_EXCEPTIONS
-        try
-        {
-            TEST_IGNORE_NODISCARD  c.at(3);
+        try {
+            TEST_IGNORE_NODISCARD array.at(-1);
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
             assert(false);
         }
-        catch (const std::out_of_range &) {}
-#endif
     }
 
-#if TEST_STD_VER > 11
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        constexpr C c = {1, 2, 3.5};
-
-        constexpr T t1 = c.at(0);
-        static_assert (t1 == 1, "");
+        std::array<int, 0> array = {};
 
-        constexpr T t2 = c.at(2);
-        static_assert (t2 == 3.5, "");
+        try {
+            TEST_IGNORE_NODISCARD array.at(0);
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
+            assert(false);
+        }
     }
 #endif
+}
 
-#if TEST_STD_VER > 14
-    {
-        static_assert (check_idx(0, 1), "");
-        static_assert (check_idx(1, 2), "");
-        static_assert (check_idx(2, 3.5), "");
-    }
-#endif
+int main(int, char**)
+{
+    tests();
+    test_exceptions();
 
-  return 0;
+#if TEST_STD_VER >= 17
+    static_assert(tests(), "");
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/at_const.pass.cpp b/libcxx/test/std/containers/sequences/array/at_const.pass.cpp
new file mode 100644
index 0000000000000..d79c5054ae831
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/array/at_const.pass.cpp
@@ -0,0 +1,109 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <array>
+
+// const_reference at (size_type) const; // constexpr in C++14
+
+#include <array>
+#include <cassert>
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+#include <stdexcept>
+#endif
+
+#include "test_macros.h"
+
+// std::array is explicitly allowed to be initialized with A a = { init-list };.
+// Disable the missing braces warning for this reason.
+#include "disable_missing_braces_warning.h"
+
+
+TEST_CONSTEXPR_CXX14 bool tests() // returns true so main() can reuse it inside a static_assert
+{
+    {
+        typedef double T;
+        typedef std::array<T, 3> C;
+        C const c = {1, 2, 3.5};
+        typename C::const_reference r1 = c.at(0); // c is const, so this binds to at() const
+        assert(r1 == 1);
+
+        typename C::const_reference r2 = c.at(2); // index 2 is the last element of the 3-element array
+        assert(r2 == 3.5);
+    }
+    return true;
+}
+
+void test_exceptions() // run-time only: checks that at() const throws std::out_of_range
+{
+#ifndef TEST_HAS_NO_EXCEPTIONS
+    { // non-empty const array: every index >= size() must throw
+        std::array<int, 4> const array = {1, 2, 3, 4};
+
+        try {
+            TEST_IGNORE_NODISCARD array.at(4); // first index past the end
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
+            assert(false); // any other exception type is a failure
+        }
+
+        try {
+            TEST_IGNORE_NODISCARD array.at(5);
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
+            assert(false);
+        }
+
+        try {
+            TEST_IGNORE_NODISCARD array.at(6);
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
+            assert(false);
+        }
+
+        try {
+            TEST_IGNORE_NODISCARD array.at(-1); // -1 converts to a huge unsigned index
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
+            assert(false);
+        }
+    }
+
+    { // zero-sized array: must be const so that at() const is the overload under test
+        std::array<int, 0> const array = {};
+
+        try {
+            TEST_IGNORE_NODISCARD array.at(0); // no index is valid for an empty array
+            assert(false);
+        } catch (std::out_of_range const&) {
+            // pass
+        } catch (...) {
+            assert(false);
+        }
+    }
+#endif
+}
+
+int main(int, char**)
+{
+    tests(); // run-time evaluation of the checks
+    test_exceptions(); // not constexpr, so it is only ever executed at run time
+
+#if TEST_STD_VER >= 14 // matches the TEST_CONSTEXPR_CXX14 qualifier on tests()
+    static_assert(tests(), ""); // compile-time evaluation of the same checks
+#endif
+    return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/array/begin.pass.cpp b/libcxx/test/std/containers/sequences/array/begin.pass.cpp
deleted file mode 100644
index 7b26d231dbdd9..0000000000000
--- a/libcxx/test/std/containers/sequences/array/begin.pass.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <array>
-
-// iterator begin();
-
-#include <array>
-#include <cassert>
-
-#include "test_macros.h"
-
-// std::array is explicitly allowed to be initialized with A a = { init-list };.
-// Disable the missing braces warning for this reason.
-#include "disable_missing_braces_warning.h"
-
-struct NoDefault {
-  NoDefault(int) {}
-};
-
-
-int main(int, char**)
-{
-    {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        C c = {1, 2, 3.5};
-        C::iterator i;
-        i = c.begin();
-        assert(*i == 1);
-        assert(&*i == c.data());
-        *i = 5.5;
-        assert(c[0] == 5.5);
-    }
-    {
-      typedef NoDefault T;
-      typedef std::array<T, 0> C;
-      C c = {};
-      C::iterator ib, ie;
-      ib = c.begin();
-      ie = c.end();
-      assert(ib == ie);
-      LIBCPP_ASSERT(ib != nullptr);
-      LIBCPP_ASSERT(ie != nullptr);
-    }
-
-  return 0;
-}
diff --git a/libcxx/test/std/containers/sequences/array/compare.pass.cpp b/libcxx/test/std/containers/sequences/array/compare.pass.cpp
index c05dd19405bff..c89216c8bb87e 100644
--- a/libcxx/test/std/containers/sequences/array/compare.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/compare.pass.cpp
@@ -8,17 +8,15 @@
 
 // <array>
 
-//  These are all constexpr in C++20
-// bool operator==(array<T, N> const&, array<T, N> const&);
-// bool operator!=(array<T, N> const&, array<T, N> const&);
-// bool operator<(array<T, N> const&, array<T, N> const&);
-// bool operator<=(array<T, N> const&, array<T, N> const&);
-// bool operator>(array<T, N> const&, array<T, N> const&);
-// bool operator>=(array<T, N> const&, array<T, N> const&);
+// bool operator==(array<T, N> const&, array<T, N> const&);   // constexpr in C++20
+// bool operator!=(array<T, N> const&, array<T, N> const&);   // constexpr in C++20
+// bool operator<(array<T, N> const&, array<T, N> const&);    // constexpr in C++20
+// bool operator<=(array<T, N> const&, array<T, N> const&);   // constexpr in C++20
+// bool operator>(array<T, N> const&, array<T, N> const&);    // constexpr in C++20
+// bool operator>=(array<T, N> const&, array<T, N> const&);   // constexpr in C++20
 
 
 #include <array>
-#include <vector>
 #include <cassert>
 
 #include "test_macros.h"
@@ -28,36 +26,33 @@
 // Disable the missing braces warning for this reason.
 #include "disable_missing_braces_warning.h"
 
+TEST_CONSTEXPR_CXX20 bool tests()
+{
+    {
+        typedef std::array<int, 3> C;
+        C c1 = {1, 2, 3};
+        C c2 = {1, 2, 3};
+        C c3 = {3, 2, 1};
+        C c4 = {1, 2, 1};
+        assert(testComparisons6(c1, c2, true, false));
+        assert(testComparisons6(c1, c3, false, true));
+        assert(testComparisons6(c1, c4, false, false));
+    }
+    {
+        typedef std::array<int, 0> C;
+        C c1 = {};
+        C c2 = {};
+        assert(testComparisons6(c1, c2, true, false));
+    }
+
+    return true;
+}
+
 int main(int, char**)
 {
-  {
-    typedef int T;
-    typedef std::array<T, 3> C;
-    C c1 = {1, 2, 3};
-    C c2 = {1, 2, 3};
-    C c3 = {3, 2, 1};
-    C c4 = {1, 2, 1};
-    assert(testComparisons6(c1, c2, true, false));
-    assert(testComparisons6(c1, c3, false, true));
-    assert(testComparisons6(c1, c4, false, false));
-  }
-  {
-    typedef int T;
-    typedef std::array<T, 0> C;
-    C c1 = {};
-    C c2 = {};
-    assert(testComparisons6(c1, c2, true, false));
-  }
-
-#if TEST_STD_VER > 17
-  {
-  constexpr std::array<int, 3> a1 = {1, 2, 3};
-  constexpr std::array<int, 3> a2 = {2, 3, 4};
-  static_assert(testComparisons6(a1, a1, true, false), "");
-  static_assert(testComparisons6(a1, a2, false, true), "");
-  static_assert(testComparisons6(a2, a1, false, false), "");
-  }
+    tests();
+#if TEST_STD_VER >= 20
+    static_assert(tests(), "");
 #endif
-
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/contiguous.pass.cpp b/libcxx/test/std/containers/sequences/array/contiguous.pass.cpp
index 41a7153e88d46..9589e63643941 100644
--- a/libcxx/test/std/containers/sequences/array/contiguous.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/contiguous.pass.cpp
@@ -15,20 +15,33 @@
 
 #include "test_macros.h"
 
-template <class C>
-void test_contiguous ( const C &c )
+template <class Container>
+TEST_CONSTEXPR_CXX14 void assert_contiguous(Container const& c)
 {
-    for ( size_t i = 0; i < c.size(); ++i )
-        assert ( *(c.begin() + i) == *(std::addressof(*c.begin()) + i));
+    for (size_t i = 0; i < c.size(); ++i)
+        assert(*(c.begin() + i) == *(std::addressof(*c.begin()) + i));
 }
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX17 bool tests()
 {
-    {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        test_contiguous (C());
-    }
+    assert_contiguous(std::array<double, 0>());
+    assert_contiguous(std::array<double, 1>());
+    assert_contiguous(std::array<double, 2>());
+    assert_contiguous(std::array<double, 3>());
+
+    assert_contiguous(std::array<char, 0>());
+    assert_contiguous(std::array<char, 1>());
+    assert_contiguous(std::array<char, 2>());
+    assert_contiguous(std::array<char, 3>());
+
+    return true;
+}
 
-  return 0;
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 17 // begin() & friends are constexpr in >= C++17 only
+    static_assert(tests(), "");
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/empty.pass.cpp b/libcxx/test/std/containers/sequences/array/empty.pass.cpp
index a17aa50c5b219..8b61575c2e731 100644
--- a/libcxx/test/std/containers/sequences/array/empty.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/empty.pass.cpp
@@ -10,28 +10,45 @@
 
 // class array
 
-// bool empty() const noexcept;
+// constexpr bool empty() const noexcept;
 
 #include <array>
 #include <cassert>
 
 #include "test_macros.h"
-#include "min_allocator.h"
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX14 bool tests()
 {
     {
-    typedef std::array<int, 2> C;
-    C c;
-    ASSERT_NOEXCEPT(c.empty());
-    assert(!c.empty());
+        typedef std::array<int, 2> C;
+        C c = {};
+        ASSERT_NOEXCEPT(c.empty());
+        assert(!c.empty());
+    }
+    {
+        typedef std::array<int, 0> C;
+        C c = {};
+        ASSERT_NOEXCEPT(c.empty());
+        assert(c.empty());
     }
+
+    return true;
+}
+
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 14
+    static_assert(tests(), "");
+#endif
+
+#if TEST_STD_VER >= 11
+    // Sanity check for constexpr in C++11
     {
-    typedef std::array<int, 0> C;
-    C c;
-    ASSERT_NOEXCEPT(c.empty());
-    assert( c.empty());
+        constexpr std::array<int, 3> array = {};
+        static_assert(!array.empty(), "");
     }
+#endif
 
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/front_back.pass.cpp b/libcxx/test/std/containers/sequences/array/front_back.pass.cpp
index 5e0cb08ede077..18985e90a89e1 100644
--- a/libcxx/test/std/containers/sequences/array/front_back.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/front_back.pass.cpp
@@ -8,10 +8,8 @@
 
 // <array>
 
-// reference front();       // constexpr in C++17
-// reference back();        // constexpr in C++17
-// const_reference front(); // constexpr in C++14
-// const_reference back();  // constexpr in C++14
+// reference front();  // constexpr in C++17
+// reference back();   // constexpr in C++17
 
 #include <array>
 #include <cassert>
@@ -22,21 +20,8 @@
 // Disable the missing braces warning for this reason.
 #include "disable_missing_braces_warning.h"
 
-#if TEST_STD_VER > 14
-constexpr bool check_front( double val )
-{
-    std::array<double, 3> arr = {1, 2, 3.5};
-    return arr.front() == val;
-}
 
-constexpr bool check_back( double val )
-{
-    std::array<double, 3> arr = {1, 2, 3.5};
-    return arr.back() == val;
-}
-#endif
-
-int main(int, char**)
+TEST_CONSTEXPR_CXX17 bool tests()
 {
     {
         typedef double T;
@@ -55,74 +40,39 @@ int main(int, char**)
     }
     {
         typedef double T;
-        typedef std::array<T, 3> C;
-        const C c = {1, 2, 3.5};
-        C::const_reference r1 = c.front();
-        assert(r1 == 1);
-
-        C::const_reference r2 = c.back();
-        assert(r2 == 3.5);
-    }
-    {
-      typedef double T;
-      typedef std::array<T, 0> C;
-      C c = {};
-      C const& cc = c;
-      ASSERT_SAME_TYPE(decltype( c.back()), typename C::reference);
-      ASSERT_SAME_TYPE(decltype(cc.back()), typename C::const_reference);
-      LIBCPP_ASSERT_NOEXCEPT(    c.back());
-      LIBCPP_ASSERT_NOEXCEPT(   cc.back());
-      ASSERT_SAME_TYPE(decltype( c.front()), typename C::reference);
-      ASSERT_SAME_TYPE(decltype(cc.front()), typename C::const_reference);
-      LIBCPP_ASSERT_NOEXCEPT(    c.front());
-      LIBCPP_ASSERT_NOEXCEPT(   cc.front());
-      if (c.size() > (0)) { // always false
-        TEST_IGNORE_NODISCARD c.front();
-        TEST_IGNORE_NODISCARD c.back();
-        TEST_IGNORE_NODISCARD cc.front();
-        TEST_IGNORE_NODISCARD cc.back();
-      }
+        typedef std::array<T, 0> C;
+        C c = {};
+        ASSERT_SAME_TYPE(decltype(c.back()), C::reference);
+        LIBCPP_ASSERT_NOEXCEPT(c.back());
+        ASSERT_SAME_TYPE(decltype(c.front()), C::reference);
+        LIBCPP_ASSERT_NOEXCEPT(c.front());
+        if (c.size() > (0)) { // always false
+            TEST_IGNORE_NODISCARD c.front();
+            TEST_IGNORE_NODISCARD c.back();
+        }
     }
-    {
-      typedef double T;
-      typedef std::array<const T, 0> C;
-      C c = {{}};
-      C const& cc = c;
-      ASSERT_SAME_TYPE(decltype( c.back()), typename C::reference);
-      ASSERT_SAME_TYPE(decltype(cc.back()), typename C::const_reference);
-      LIBCPP_ASSERT_NOEXCEPT(    c.back());
-      LIBCPP_ASSERT_NOEXCEPT(   cc.back());
-      ASSERT_SAME_TYPE(decltype( c.front()), typename C::reference);
-      ASSERT_SAME_TYPE(decltype(cc.front()), typename C::const_reference);
-      LIBCPP_ASSERT_NOEXCEPT(    c.front());
-      LIBCPP_ASSERT_NOEXCEPT(   cc.front());
-      if (c.size() > (0)) {
-        TEST_IGNORE_NODISCARD c.front();
-        TEST_IGNORE_NODISCARD c.back();
-        TEST_IGNORE_NODISCARD cc.front();
-        TEST_IGNORE_NODISCARD cc.back();
-      }
-    }
-#if TEST_STD_VER > 11
     {
         typedef double T;
-        typedef std::array<T, 3> C;
-        constexpr C c = {1, 2, 3.5};
-
-        constexpr T t1 = c.front();
-        static_assert (t1 == 1, "");
-
-        constexpr T t2 = c.back();
-        static_assert (t2 == 3.5, "");
+        typedef std::array<const T, 0> C;
+        C c = {};
+        ASSERT_SAME_TYPE(decltype( c.back()), C::reference);
+        LIBCPP_ASSERT_NOEXCEPT(    c.back());
+        ASSERT_SAME_TYPE(decltype( c.front()), C::reference);
+        LIBCPP_ASSERT_NOEXCEPT(    c.front());
+        if (c.size() > (0)) {
+            TEST_IGNORE_NODISCARD c.front();
+            TEST_IGNORE_NODISCARD c.back();
+        }
     }
-#endif
 
-#if TEST_STD_VER > 14
-    {
-        static_assert (check_front(1),   "");
-        static_assert (check_back (3.5), "");
-    }
-#endif
+    return true;
+}
 
-  return 0;
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 17
+    static_assert(tests(), "");
+#endif
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/front_back_const.pass.cpp b/libcxx/test/std/containers/sequences/array/front_back_const.pass.cpp
new file mode 100644
index 0000000000000..af519127bed51
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/array/front_back_const.pass.cpp
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <array>
+
+// const_reference front() const; // constexpr in C++14
+// const_reference back() const;  // constexpr in C++14
+
+#include <array>
+#include <cassert>
+
+#include "test_macros.h"
+
+// std::array is explicitly allowed to be initialized with A a = { init-list };.
+// Disable the missing braces warning for this reason.
+#include "disable_missing_braces_warning.h"
+
+
+TEST_CONSTEXPR_CXX14 bool tests() // returns true so main() can reuse it inside a static_assert
+{
+    { // non-empty const array: front()/back() yield the first and last element
+        typedef double T;
+        typedef std::array<T, 3> C;
+        C const c = {1, 2, 3.5};
+        C::const_reference r1 = c.front();
+        assert(r1 == 1);
+
+        C::const_reference r2 = c.back();
+        assert(r2 == 3.5);
+    }
+    { // zero-sized const array: only the declared types (and noexcept-ness) are checked
+        typedef double T;
+        typedef std::array<T, 0> C;
+        C const c = {};
+        ASSERT_SAME_TYPE(decltype(c.back()), C::const_reference);
+        LIBCPP_ASSERT_NOEXCEPT(c.back());
+        ASSERT_SAME_TYPE(decltype(c.front()), C::const_reference);
+        LIBCPP_ASSERT_NOEXCEPT(c.front());
+        if (c.size() > (0)) { // always false
+            TEST_IGNORE_NODISCARD c.front(); // compiled but never executed
+            TEST_IGNORE_NODISCARD c.back();
+        }
+    }
+    { // same checks with a const element type
+        typedef double T;
+        typedef std::array<const T, 0> C;
+        C const c = {};
+        ASSERT_SAME_TYPE(decltype(c.back()), C::const_reference);
+        LIBCPP_ASSERT_NOEXCEPT(c.back());
+        ASSERT_SAME_TYPE(decltype(c.front()), C::const_reference);
+        LIBCPP_ASSERT_NOEXCEPT(c.front());
+        if (c.size() > (0)) { // always false: the array has zero elements
+            TEST_IGNORE_NODISCARD c.front();
+            TEST_IGNORE_NODISCARD c.back();
+        }
+    }
+
+    return true;
+}
+
+int main(int, char**)
+{
+    tests(); // run-time evaluation of the checks
+#if TEST_STD_VER >= 14 // matches the TEST_CONSTEXPR_CXX14 qualifier on tests()
+    static_assert(tests(), ""); // compile-time evaluation of the same checks
+#endif
+    return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/array/indexing.pass.cpp b/libcxx/test/std/containers/sequences/array/indexing.pass.cpp
index 0df672d8b0196..6b5d8486fda22 100644
--- a/libcxx/test/std/containers/sequences/array/indexing.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/indexing.pass.cpp
@@ -8,11 +8,8 @@
 
 // <array>
 
-// reference operator[] (size_type)
-// const_reference operator[] (size_type); // constexpr in C++14
-// reference at (size_type)
-// const_reference at (size_type); // constexpr in C++14
-// Libc++ marks these as noexcept
+// reference operator[](size_type); // constexpr in C++17
+// Libc++ marks it as noexcept
 
 #include <array>
 #include <cassert>
@@ -23,15 +20,8 @@
 // Disable the missing braces warning for this reason.
 #include "disable_missing_braces_warning.h"
 
-#if TEST_STD_VER > 14
-constexpr bool check_idx( size_t idx, double val )
-{
-    std::array<double, 3> arr = {1, 2, 3.5};
-    return arr[idx] == val;
-}
-#endif
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX17 bool tests()
 {
     {
         typedef double T;
@@ -49,72 +39,41 @@ int main(int, char**)
         r2 = 7.5;
         assert(c.back() == 7.5);
     }
+
+    // Test operator[] "works" on zero sized arrays
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        const C c = {1, 2, 3.5};
-        LIBCPP_ASSERT_NOEXCEPT(c[0]);
-        ASSERT_SAME_TYPE(C::const_reference, decltype(c[0]));
-        C::const_reference r1 = c[0];
-        assert(r1 == 1);
-        C::const_reference r2 = c[2];
-        assert(r2 == 3.5);
-    }
-    { // Test operator[] "works" on zero sized arrays
-        typedef double T;
-        typedef std::array<T, 0> C;
-        C c = {};
-        C const& cc = c;
-        LIBCPP_ASSERT_NOEXCEPT(c[0]);
-        LIBCPP_ASSERT_NOEXCEPT(cc[0]);
-        ASSERT_SAME_TYPE(C::reference, decltype(c[0]));
-        ASSERT_SAME_TYPE(C::const_reference, decltype(cc[0]));
-        if (c.size() > (0)) { // always false
-          C::reference r1 = c[0];
-          C::const_reference r2 = cc[0];
-          ((void)r1);
-          ((void)r2);
+        {
+            typedef double T;
+            typedef std::array<T, 0> C;
+            C c = {};
+            LIBCPP_ASSERT_NOEXCEPT(c[0]);
+            ASSERT_SAME_TYPE(C::reference, decltype(c[0]));
+            if (c.size() > (0)) { // always false
+              C::reference r = c[0];
+              (void)r;
+            }
         }
-    }
-    { // Test operator[] "works" on zero sized arrays
-        typedef double T;
-        typedef std::array<const T, 0> C;
-        C c = {{}};
-        C const& cc = c;
-        LIBCPP_ASSERT_NOEXCEPT(c[0]);
-        LIBCPP_ASSERT_NOEXCEPT(cc[0]);
-        ASSERT_SAME_TYPE(C::reference, decltype(c[0]));
-        ASSERT_SAME_TYPE(C::const_reference, decltype(cc[0]));
-        if (c.size() > (0)) { // always false
-          C::reference r1 = c[0];
-          C::const_reference r2 = cc[0];
-          ((void)r1);
-          ((void)r2);
+        {
+            typedef double T;
+            typedef std::array<const T, 0> C;
+            C c = {};
+            LIBCPP_ASSERT_NOEXCEPT(c[0]);
+            ASSERT_SAME_TYPE(C::reference, decltype(c[0]));
+            if (c.size() > (0)) { // always false
+              C::reference r = c[0];
+              (void)r;
+            }
         }
     }
-#if TEST_STD_VER > 11
-    {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        constexpr C c = {1, 2, 3.5};
-        LIBCPP_ASSERT_NOEXCEPT(c[0]);
-        ASSERT_SAME_TYPE(C::const_reference, decltype(c[0]));
 
-        constexpr T t1 = c[0];
-        static_assert (t1 == 1, "");
-
-        constexpr T t2 = c[2];
-        static_assert (t2 == 3.5, "");
-    }
-#endif
+    return true;
+}
 
-#if TEST_STD_VER > 14
-    {
-        static_assert (check_idx(0, 1), "");
-        static_assert (check_idx(1, 2), "");
-        static_assert (check_idx(2, 3.5), "");
-    }
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 17
+    static_assert(tests(), "");
 #endif
-
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/indexing_const.pass.cpp b/libcxx/test/std/containers/sequences/array/indexing_const.pass.cpp
new file mode 100644
index 0000000000000..3f34dfe7dd325
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/array/indexing_const.pass.cpp
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <array>
+
+// const_reference operator[](size_type) const; // constexpr in C++14
+// Libc++ marks it as noexcept
+
+#include <array>
+#include <cassert>
+
+#include "test_macros.h"
+
+// std::array is explicitly allowed to be initialized with A a = { init-list };.
+// Disable the missing braces warning for this reason.
+#include "disable_missing_braces_warning.h"
+
+
+TEST_CONSTEXPR_CXX14 bool tests()
+{
+    {
+        typedef double T;
+        typedef std::array<T, 3> C;
+        C const c = {1, 2, 3.5};
+        LIBCPP_ASSERT_NOEXCEPT(c[0]);
+        ASSERT_SAME_TYPE(C::const_reference, decltype(c[0]));
+        C::const_reference r1 = c[0];
+        assert(r1 == 1);
+        C::const_reference r2 = c[2];
+        assert(r2 == 3.5);
+    }
+    // Test operator[] "works" on zero sized arrays
+    {
+        {
+            typedef double T;
+            typedef std::array<T, 0> C;
+            C const c = {};
+            LIBCPP_ASSERT_NOEXCEPT(c[0]);
+            ASSERT_SAME_TYPE(C::const_reference, decltype(c[0]));
+            if (c.size() > (0)) { // always false
+                C::const_reference r = c[0];
+                (void)r;
+            }
+        }
+        {
+            typedef double T;
+            typedef std::array<T const, 0> C;
+            C const c = {};
+            LIBCPP_ASSERT_NOEXCEPT(c[0]);
+            ASSERT_SAME_TYPE(C::const_reference, decltype(c[0]));
+            if (c.size() > (0)) { // always false
+              C::const_reference r = c[0];
+              (void)r;
+            }
+        }
+    }
+
+    return true;
+}
+
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 14
+    static_assert(tests(), "");
+#endif
+  return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
index 71fad183ff730..39d8a1a5dfa4c 100644
--- a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
@@ -8,7 +8,20 @@
 
 // <array>
 
-// iterator, const_iterator
+// iterator begin() noexcept;                         // constexpr in C++17
+// const_iterator begin() const noexcept;             // constexpr in C++17
+// iterator end() noexcept;                           // constexpr in C++17
+// const_iterator end() const noexcept;               // constexpr in C++17
+//
+// reverse_iterator rbegin() noexcept;                // constexpr in C++17
+// const_reverse_iterator rbegin() const noexcept;    // constexpr in C++17
+// reverse_iterator rend() noexcept;                  // constexpr in C++17
+// const_reverse_iterator rend() const noexcept;      // constexpr in C++17
+//
+// const_iterator cbegin() const noexcept;            // constexpr in C++17
+// const_iterator cend() const noexcept;              // constexpr in C++17
+// const_reverse_iterator crbegin() const noexcept;   // constexpr in C++17
+// const_reverse_iterator crend() const noexcept;     // constexpr in C++17
 
 #include <array>
 #include <iterator>
@@ -20,127 +33,157 @@
 // Disable the missing braces warning for this reason.
 #include "disable_missing_braces_warning.h"
 
-int main(int, char**)
+struct NoDefault {
+    TEST_CONSTEXPR NoDefault(int) { }
+};
+
+TEST_CONSTEXPR_CXX17 bool tests()
 {
     {
-    typedef std::array<int, 5> C;
-    C c;
-    C::iterator i;
-    i = c.begin();
-    C::const_iterator j;
-    j = c.cbegin();
-    assert(i == j);
+        typedef std::array<int, 5> C;
+        C array = {};
+        typename C::iterator i = array.begin();
+        typename C::const_iterator j = array.cbegin();
+        assert(i == j);
+    }
+    {
+        typedef std::array<int, 0> C;
+        C array = {};
+        typename C::iterator i = array.begin();
+        typename C::const_iterator j = array.cbegin();
+        assert(i == j);
+        LIBCPP_ASSERT(i != nullptr);
+        LIBCPP_ASSERT(j != nullptr);
+    }
+
+    {
+        typedef std::array<int, 0> C;
+        C array = {};
+        typename C::iterator i = array.begin();
+        typename C::const_iterator j = array.cbegin();
+        assert(i == array.end());
+        assert(j == array.cend());
+        LIBCPP_ASSERT(i != nullptr);
+        LIBCPP_ASSERT(j != nullptr);
+    }
+    {
+        typedef std::array<int, 1> C;
+        C array = {1};
+        typename C::iterator i = array.begin();
+        assert(*i == 1);
+        assert(&*i == array.data());
+        *i = 99;
+        assert(array[0] == 99);
     }
     {
-    typedef std::array<int, 0> C;
-    C c;
-    C::iterator i;
-    i = c.begin();
-    C::const_iterator j;
-    j = c.cbegin();
-    assert(i == j);
+        typedef std::array<int, 2> C;
+        C array = {1, 2};
+        typename C::iterator i = array.begin();
+        assert(*i == 1);
+        assert(&*i == array.data());
+        *i = 99;
+        assert(array[0] == 99);
+        assert(array[1] == 2);
+    }
+    {
+        typedef std::array<double, 3> C;
+        C array = {1, 2, 3.5};
+        typename C::iterator i = array.begin();
+        assert(*i == 1);
+        assert(&*i == array.data());
+        *i = 5.5;
+        assert(array[0] == 5.5);
+        assert(array[1] == 2.0);
+    }
+    {
+        typedef std::array<NoDefault, 0> C;
+        C array = {};
+        typename C::iterator ib = array.begin();
+        typename C::iterator ie = array.end();
+        assert(ib == ie);
+        LIBCPP_ASSERT(ib != nullptr);
+        LIBCPP_ASSERT(ie != nullptr);
     }
 
-#if TEST_STD_VER > 11
+#if TEST_STD_VER >= 14
     { // N3644 testing
         {
-        typedef std::array<int, 5> C;
-        C::iterator ii1{}, ii2{};
-        C::iterator ii4 = ii1;
-        C::const_iterator cii{};
-        assert ( ii1 == ii2 );
-        assert ( ii1 == ii4 );
-        assert ( ii1 == cii );
-
-        assert ( !(ii1 != ii2 ));
-        assert ( !(ii1 != cii ));
-
-        C c;
-        assert ( c.begin()   == std::begin(c));
-        assert ( c.cbegin()  == std::cbegin(c));
-        assert ( c.rbegin()  == std::rbegin(c));
-        assert ( c.crbegin() == std::crbegin(c));
-        assert ( c.end()     == std::end(c));
-        assert ( c.cend()    == std::cend(c));
-        assert ( c.rend()    == std::rend(c));
-        assert ( c.crend()   == std::crend(c));
-
-        assert ( std::begin(c)   != std::end(c));
-        assert ( std::rbegin(c)  != std::rend(c));
-        assert ( std::cbegin(c)  != std::cend(c));
-        assert ( std::crbegin(c) != std::crend(c));
+            typedef std::array<int, 5> C;
+            C::iterator ii1{}, ii2{};
+            C::iterator ii4 = ii1;
+            C::const_iterator cii{};
+            assert(ii1 == ii2);
+            assert(ii1 == ii4);
+            assert(ii1 == cii);
+
+            assert(!(ii1 != ii2));
+            assert(!(ii1 != cii));
+
+            C c = {};
+            assert(c.begin()   == std::begin(c));
+            assert(c.cbegin()  == std::cbegin(c));
+            assert(c.rbegin()  == std::rbegin(c));
+            assert(c.crbegin() == std::crbegin(c));
+            assert(c.end()     == std::end(c));
+            assert(c.cend()    == std::cend(c));
+            assert(c.rend()    == std::rend(c));
+            assert(c.crend()   == std::crend(c));
+
+            assert(std::begin(c)   != std::end(c));
+            assert(std::rbegin(c)  != std::rend(c));
+            assert(std::cbegin(c)  != std::cend(c));
+            assert(std::crbegin(c) != std::crend(c));
         }
         {
-        typedef std::array<int, 0> C;
-        C::iterator ii1{}, ii2{};
-        C::iterator ii4 = ii1;
-        C::const_iterator cii{};
-        assert ( ii1 == ii2 );
-        assert ( ii1 == ii4 );
-
-        assert (!(ii1 != ii2 ));
-
-        assert ( (ii1 == cii ));
-        assert ( (cii == ii1 ));
-        assert (!(ii1 != cii ));
-        assert (!(cii != ii1 ));
-        assert (!(ii1 <  cii ));
-        assert (!(cii <  ii1 ));
-        assert ( (ii1 <= cii ));
-        assert ( (cii <= ii1 ));
-        assert (!(ii1 >  cii ));
-        assert (!(cii >  ii1 ));
-        assert ( (ii1 >= cii ));
-        assert ( (cii >= ii1 ));
-        assert (cii - ii1 == 0);
-        assert (ii1 - cii == 0);
-
-        C c;
-        assert ( c.begin()   == std::begin(c));
-        assert ( c.cbegin()  == std::cbegin(c));
-        assert ( c.rbegin()  == std::rbegin(c));
-        assert ( c.crbegin() == std::crbegin(c));
-        assert ( c.end()     == std::end(c));
-        assert ( c.cend()    == std::cend(c));
-        assert ( c.rend()    == std::rend(c));
-        assert ( c.crend()   == std::crend(c));
-
-        assert ( std::begin(c)   == std::end(c));
-        assert ( std::rbegin(c)  == std::rend(c));
-        assert ( std::cbegin(c)  == std::cend(c));
-        assert ( std::crbegin(c) == std::crend(c));
+            typedef std::array<int, 0> C;
+            C::iterator ii1{}, ii2{};
+            C::iterator ii4 = ii1;
+            C::const_iterator cii{};
+            assert(ii1 == ii2);
+            assert(ii1 == ii4);
+
+            assert(!(ii1 != ii2));
+
+            assert( (ii1 == cii));
+            assert( (cii == ii1));
+            assert(!(ii1 != cii));
+            assert(!(cii != ii1));
+            assert(!(ii1 <  cii));
+            assert(!(cii <  ii1));
+            assert( (ii1 <= cii));
+            assert( (cii <= ii1));
+            assert(!(ii1 >  cii));
+            assert(!(cii >  ii1));
+            assert( (ii1 >= cii));
+            assert( (cii >= ii1));
+            assert(cii - ii1 == 0);
+            assert(ii1 - cii == 0);
+
+            C c = {};
+            assert(c.begin()   == std::begin(c));
+            assert(c.cbegin()  == std::cbegin(c));
+            assert(c.rbegin()  == std::rbegin(c));
+            assert(c.crbegin() == std::crbegin(c));
+            assert(c.end()     == std::end(c));
+            assert(c.cend()    == std::cend(c));
+            assert(c.rend()    == std::rend(c));
+            assert(c.crend()   == std::crend(c));
+
+            assert(std::begin(c)   == std::end(c));
+            assert(std::rbegin(c)  == std::rend(c));
+            assert(std::cbegin(c)  == std::cend(c));
+            assert(std::crbegin(c) == std::crend(c));
         }
     }
 #endif
-#if TEST_STD_VER > 14
-    {
-        typedef std::array<int, 5> C;
-        constexpr C c{0,1,2,3,4};
-
-        static_assert ( c.begin()   == std::begin(c), "");
-        static_assert ( c.cbegin()  == std::cbegin(c), "");
-        static_assert ( c.end()     == std::end(c), "");
-        static_assert ( c.cend()    == std::cend(c), "");
-
-        static_assert ( c.rbegin()  == std::rbegin(c), "");
-        static_assert ( c.crbegin() == std::crbegin(c), "");
-        static_assert ( c.rend()    == std::rend(c), "");
-        static_assert ( c.crend()   == std::crend(c), "");
-
-        static_assert ( std::begin(c)   != std::end(c), "");
-        static_assert ( std::rbegin(c)  != std::rend(c), "");
-        static_assert ( std::cbegin(c)  != std::cend(c), "");
-        static_assert ( std::crbegin(c) != std::crend(c), "");
-
-        static_assert ( *c.begin()  == 0, "");
-        static_assert ( *c.rbegin()  == 4, "");
-
-        static_assert ( *std::begin(c)   == 0, "" );
-        static_assert ( *std::cbegin(c)  == 0, "" );
-        static_assert ( *std::rbegin(c)  == 4, "" );
-        static_assert ( *std::crbegin(c) == 4, "" );
-    }
-#endif
+    return true;
+}
 
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 17
+    static_assert(tests(), "");
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/max_size.pass.cpp b/libcxx/test/std/containers/sequences/array/max_size.pass.cpp
index a0b77392ee804..8ca8cd44725e2 100644
--- a/libcxx/test/std/containers/sequences/array/max_size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/max_size.pass.cpp
@@ -10,28 +10,45 @@
 
 // class array
 
-// bool max_size() const noexcept;
+// constexpr size_type max_size() const noexcept;
 
 #include <array>
 #include <cassert>
 
 #include "test_macros.h"
-#include "min_allocator.h"
 
-int main(int, char**)
+TEST_CONSTEXPR_CXX14 bool tests()
 {
     {
-    typedef std::array<int, 2> C;
-    C c;
-    ASSERT_NOEXCEPT(c.max_size());
-    assert(c.max_size() == 2);
+        typedef std::array<int, 2> C;
+        C c = {};
+        ASSERT_NOEXCEPT(c.max_size());
+        assert(c.max_size() == 2);
+    }
+    {
+        typedef std::array<int, 0> C;
+        C c = {};
+        ASSERT_NOEXCEPT(c.max_size());
+        assert(c.max_size() == 0);
     }
+
+    return true;
+}
+
+int main(int, char**)
+{
+    tests();
+#if TEST_STD_VER >= 14
+    static_assert(tests(), "");
+#endif
+
+#if TEST_STD_VER >= 11
+    // Sanity check for constexpr in C++11
     {
-    typedef std::array<int, 0> C;
-    C c;
-    ASSERT_NOEXCEPT(c.max_size());
-    assert(c.max_size() == 0);
+        constexpr std::array<int, 3> array = {};
+        static_assert(array.max_size() == 3, "");
     }
+#endif
 
-  return 0;
+    return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/size_and_alignment.pass.cpp b/libcxx/test/std/containers/sequences/array/size_and_alignment.pass.cpp
index 51982ea5cab1a..11eac5cc825bb 100644
--- a/libcxx/test/std/containers/sequences/array/size_and_alignment.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/size_and_alignment.pass.cpp
@@ -34,12 +34,6 @@ void test() {
   static_assert(sizeof(ArrayT) == sizeof(CArrayT), "");
   static_assert(sizeof(ArrayT) == sizeof(MyArrayT), "");
   static_assert(TEST_ALIGNOF(ArrayT) == TEST_ALIGNOF(MyArrayT), "");
-#if defined(_LIBCPP_VERSION)
-  ArrayT a;
-  ((void)a);
-  static_assert(sizeof(ArrayT) == sizeof(a.__elems_), "");
-  static_assert(TEST_ALIGNOF(ArrayT) == __alignof__(a.__elems_), "");
-#endif
 }
 
 template <class T>
@@ -67,8 +61,6 @@ struct TEST_ALIGNAS(TEST_ALIGNOF(std::max_align_t) * 2) TestType2 {
 };
 #endif
 
-//static_assert(sizeof(void*) == 4, "");
-
 int main(int, char**) {
   test_type<char>();
   test_type<int>();
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.pass.cpp
index 152458908e7d5..933420be50c96 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.pass.cpp
@@ -15,6 +15,7 @@
 
 /*  Constant                                Value
     __cpp_lib_array_constexpr               201603L [C++17]
+                                            201811L [C++2a]
     __cpp_lib_constexpr_misc                201811L [C++2a]
     __cpp_lib_nonmember_container_access    201411L [C++17]
     __cpp_lib_to_array                      201907L [C++2a]
@@ -88,8 +89,8 @@
 # ifndef __cpp_lib_array_constexpr
 #   error "__cpp_lib_array_constexpr should be defined in c++2a"
 # endif
-# if __cpp_lib_array_constexpr != 201603L
-#   error "__cpp_lib_array_constexpr should have the value 201603L in c++2a"
+# if __cpp_lib_array_constexpr != 201811L
+#   error "__cpp_lib_array_constexpr should have the value 201811L in c++2a"
 # endif
 
 # if !defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.pass.cpp
index 9c1719b53cfe6..4e68cf9a72323 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.pass.cpp
@@ -15,6 +15,7 @@
 
 /*  Constant                                Value
     __cpp_lib_array_constexpr               201603L [C++17]
+                                            201811L [C++2a]
     __cpp_lib_constexpr_misc                201811L [C++2a]
     __cpp_lib_make_reverse_iterator         201402L [C++14]
     __cpp_lib_nonmember_container_access    201411L [C++17]
@@ -126,8 +127,8 @@
 # ifndef __cpp_lib_array_constexpr
 #   error "__cpp_lib_array_constexpr should be defined in c++2a"
 # endif
-# if __cpp_lib_array_constexpr != 201603L
-#   error "__cpp_lib_array_constexpr should have the value 201603L in c++2a"
+# if __cpp_lib_array_constexpr != 201811L
+#   error "__cpp_lib_array_constexpr should have the value 201811L in c++2a"
 # endif
 
 # if !defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
index 0c2cd53757586..901cb2539635c 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
@@ -19,6 +19,7 @@
     __cpp_lib_any                                  201606L [C++17]
     __cpp_lib_apply                                201603L [C++17]
     __cpp_lib_array_constexpr                      201603L [C++17]
+                                                   201811L [C++2a]
     __cpp_lib_as_const                             201510L [C++17]
     __cpp_lib_atomic_is_always_lock_free           201603L [C++17]
     __cpp_lib_atomic_ref                           201806L [C++2a]
@@ -1537,8 +1538,8 @@
 # ifndef __cpp_lib_array_constexpr
 #   error "__cpp_lib_array_constexpr should be defined in c++2a"
 # endif
-# if __cpp_lib_array_constexpr != 201603L
-#   error "__cpp_lib_array_constexpr should have the value 201603L in c++2a"
+# if __cpp_lib_array_constexpr != 201811L
+#   error "__cpp_lib_array_constexpr should have the value 201811L in c++2a"
 # endif
 
 # ifndef __cpp_lib_as_const
diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index 79ee5ddace8c8..63aa4f47a70f0 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -149,6 +149,12 @@
 # define TEST_CONSTEXPR_CXX14
 #endif
 
+#if TEST_STD_VER >= 17
+# define TEST_CONSTEXPR_CXX17 constexpr
+#else
+# define TEST_CONSTEXPR_CXX17
+#endif
+
 #if TEST_STD_VER >= 20
 # define TEST_CONSTEXPR_CXX20 constexpr
 #else
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 4a9772cfc66d8..7a3f441c1cab8 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -409,6 +409,7 @@ def add_version_header(tc):
   {"name": "__cpp_lib_array_constexpr",
    "values": {
      "c++17": int(201603),
+     "c++2a": int(201811),
    },
    "headers": ["iterator", "array"],
    },

From 8cec5c35816d80852bef917ed8a3002fcc5961b0 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Thu, 28 May 2020 09:35:06 -0700
Subject: [PATCH 377/770] Make VE.def a textual header

---
 llvm/include/llvm/module.modulemap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/include/llvm/module.modulemap b/llvm/include/llvm/module.modulemap
index baab05c0928c3..93c30d6d102ec 100644
--- a/llvm/include/llvm/module.modulemap
+++ b/llvm/include/llvm/module.modulemap
@@ -75,6 +75,7 @@ module LLVM_BinaryFormat {
     textual header "BinaryFormat/ELFRelocs/RISCV.def"
     textual header "BinaryFormat/ELFRelocs/Sparc.def"
     textual header "BinaryFormat/ELFRelocs/SystemZ.def"
+    textual header "BinaryFormat/ELFRelocs/VE.def"
     textual header "BinaryFormat/ELFRelocs/x86_64.def"
     textual header "BinaryFormat/WasmRelocs.def"
     textual header "BinaryFormat/MsgPack.def"

From 7cfdff7b4a6704b8ef2a1b594e1ec19d2d89f385 Mon Sep 17 00:00:00 2001
From: Tom Lokovic <tdl@google.com>
Date: Thu, 28 May 2020 12:22:30 -0400
Subject: [PATCH 378/770] [clang-tidy] Add abseil-string-find-str-contains
 checker.

Summary: This adds a checker which suggests replacing string.find(...) == npos with absl::StrContains.

Reviewers: alexfh, hokein, aaron.ballman, njames93, ymandel

Reviewed By: ymandel

Subscribers: ymandel, Eugene.Zelenko, xazax.hun, mgorny, Charusso, phosek, cfe-commits

Tags: #clang, #clang-tools-extra

Differential Revision: https://reviews.llvm.org/D80023
---
 .../clang-tidy/abseil/AbseilTidyModule.cpp    |   5 +-
 .../clang-tidy/abseil/CMakeLists.txt          |   1 +
 .../abseil/StringFindStrContainsCheck.cpp     | 110 +++++++
 .../abseil/StringFindStrContainsCheck.h       |  39 +++
 clang-tools-extra/docs/ReleaseNotes.rst       |   7 +
 .../abseil-string-find-str-contains.rst       |  52 ++++
 .../docs/clang-tidy/checks/list.rst           |   1 +
 .../abseil-string-find-str-contains.cpp       | 290 ++++++++++++++++++
 8 files changed, 504 insertions(+), 1 deletion(-)
 create mode 100644 clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp
 create mode 100644 clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h
 create mode 100644 clang-tools-extra/docs/clang-tidy/checks/abseil-string-find-str-contains.rst
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/abseil-string-find-str-contains.cpp

diff --git a/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp b/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp
index c70ef9007fbd1..7d592d7e3e559 100644
--- a/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp
@@ -21,8 +21,9 @@
 #include "NoInternalDependenciesCheck.h"
 #include "NoNamespaceCheck.h"
 #include "RedundantStrcatCallsCheck.h"
-#include "StringFindStartswithCheck.h"
 #include "StrCatAppendCheck.h"
+#include "StringFindStartswithCheck.h"
+#include "StringFindStrContainsCheck.h"
 #include "TimeComparisonCheck.h"
 #include "TimeSubtractionCheck.h"
 #include "UpgradeDurationConversionsCheck.h"
@@ -61,6 +62,8 @@ class AbseilModule : public ClangTidyModule {
         "abseil-str-cat-append");
     CheckFactories.registerCheck<StringFindStartswithCheck>(
         "abseil-string-find-startswith");
+    CheckFactories.registerCheck<StringFindStrContainsCheck>(
+        "abseil-string-find-str-contains");
     CheckFactories.registerCheck<TimeComparisonCheck>(
         "abseil-time-comparison");
     CheckFactories.registerCheck<TimeSubtractionCheck>(
diff --git a/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt b/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt
index bd8865ef92690..c4efa0fe27437 100644
--- a/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt
@@ -20,6 +20,7 @@ add_clang_library(clangTidyAbseilModule
   RedundantStrcatCallsCheck.cpp
   StrCatAppendCheck.cpp
   StringFindStartswithCheck.cpp
+  StringFindStrContainsCheck.cpp
   TimeComparisonCheck.cpp
   TimeSubtractionCheck.cpp
   UpgradeDurationConversionsCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp
new file mode 100644
index 0000000000000..f60ce20007661
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp
@@ -0,0 +1,110 @@
+//===--- StringFindStrContainsCheck.cpp - clang-tidy-----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "StringFindStrContainsCheck.h"
+
+#include "../utils/OptionsUtils.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Tooling/Transformer/RewriteRule.h"
+#include "clang/Tooling/Transformer/Stencil.h"
+
+using namespace clang::ast_matchers;
+
+namespace clang {
+namespace tidy {
+namespace abseil {
+
+using ::clang::transformer::applyFirst;
+using ::clang::transformer::cat;
+using ::clang::transformer::change;
+using ::clang::transformer::makeRule;
+using ::clang::transformer::node;
+
+static const char DefaultStringLikeClasses[] = "::std::basic_string;"
+                                               "::std::basic_string_view;"
+                                               "::absl::string_view";
+static const char DefaultAbseilStringsMatchHeader[] = "absl/strings/match.h";
+
+static llvm::Optional<transformer::RewriteRule>
+MakeRule(const LangOptions &LangOpts,
+         const ClangTidyCheck::OptionsView &Options) {
+  // Parse options.
+  //
+  // FIXME(tdl-g): These options are being parsed redundantly with the
+  // constructor because TransformerClangTidyCheck forces us to provide MakeRule
+  // before "this" is fully constructed, but StoreOptions requires us to store
+  // the parsed options in "this".  We need to fix TransformerClangTidyCheck and
+  // then we can clean this up.
+  const std::vector<std::string> StringLikeClassNames =
+      utils::options::parseStringList(
+          Options.get("StringLikeClasses", DefaultStringLikeClasses));
+  const std::string AbseilStringsMatchHeader =
+      Options.get("AbseilStringsMatchHeader", DefaultAbseilStringsMatchHeader);
+
+  auto StringLikeClass = cxxRecordDecl(hasAnyName(SmallVector<StringRef, 4>(
+      StringLikeClassNames.begin(), StringLikeClassNames.end())));
+  auto StringType =
+      hasUnqualifiedDesugaredType(recordType(hasDeclaration(StringLikeClass)));
+  auto CharStarType =
+      hasUnqualifiedDesugaredType(pointerType(pointee(isAnyCharacter())));
+  auto StringNpos = declRefExpr(
+      to(varDecl(hasName("npos"), hasDeclContext(StringLikeClass))));
+  auto StringFind = cxxMemberCallExpr(
+      callee(cxxMethodDecl(
+          hasName("find"),
+          hasParameter(0, parmVarDecl(anyOf(hasType(StringType),
+                                            hasType(CharStarType)))))),
+      on(hasType(StringType)), hasArgument(0, expr().bind("parameter_to_find")),
+      anyOf(hasArgument(1, integerLiteral(equals(0))),
+            hasArgument(1, cxxDefaultArgExpr())),
+      onImplicitObjectArgument(expr().bind("string_being_searched")));
+
+  tooling::RewriteRule rule = applyFirst(
+      {makeRule(binaryOperator(hasOperatorName("=="),
+                               hasOperands(ignoringParenImpCasts(StringNpos),
+                                           ignoringParenImpCasts(StringFind))),
+                change(cat("!absl::StrContains(", node("string_being_searched"),
+                           ", ", node("parameter_to_find"), ")")),
+                cat("use !absl::StrContains instead of find() == npos")),
+       makeRule(binaryOperator(hasOperatorName("!="),
+                               hasOperands(ignoringParenImpCasts(StringNpos),
+                                           ignoringParenImpCasts(StringFind))),
+                change(cat("absl::StrContains(", node("string_being_searched"),
+                           ", ", node("parameter_to_find"), ")")),
+                cat("use absl::StrContains instead of find() != npos"))});
+  addInclude(rule, AbseilStringsMatchHeader);
+  return rule;
+}
+
+StringFindStrContainsCheck::StringFindStrContainsCheck(
+    StringRef Name, ClangTidyContext *Context)
+    : TransformerClangTidyCheck(&MakeRule, Name, Context),
+      StringLikeClassesOption(utils::options::parseStringList(
+          Options.get("StringLikeClasses", DefaultStringLikeClasses))),
+      AbseilStringsMatchHeaderOption(Options.get(
+          "AbseilStringsMatchHeader", DefaultAbseilStringsMatchHeader)) {}
+
+bool StringFindStrContainsCheck::isLanguageVersionSupported(
+    const LangOptions &LangOpts) const {
+  return LangOpts.CPlusPlus11;
+}
+
+void StringFindStrContainsCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  TransformerClangTidyCheck::storeOptions(Opts);
+  Options.store(Opts, "StringLikeClasses",
+                utils::options::serializeStringList(StringLikeClassesOption));
+  Options.store(Opts, "AbseilStringsMatchHeader",
+                AbseilStringsMatchHeaderOption);
+}
+
+} // namespace abseil
+} // namespace tidy
+} // namespace clang
diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h
new file mode 100644
index 0000000000000..351cc3784a96e
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h
@@ -0,0 +1,39 @@
+//===--- StringFindStrContainsCheck.h - clang-tidy---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_STRINGFINDSTRCONTAINSCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_STRINGFINDSTRCONTAINSCHECK_H
+
+#include "../ClangTidy.h"
+#include "../utils/TransformerClangTidyCheck.h"
+
+namespace clang {
+namespace tidy {
+namespace abseil {
+
+/// Finds s.find(...) == string::npos comparisons (for various string-like
+/// types) and suggests replacing with absl::StrContains.
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/abseil-string-find-str-contains.html
+class StringFindStrContainsCheck : public utils::TransformerClangTidyCheck {
+public:
+  StringFindStrContainsCheck(StringRef Name, ClangTidyContext *Context);
+  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override;
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+
+private:
+  const std::vector<std::string> StringLikeClassesOption;
+  const std::string AbseilStringsMatchHeaderOption;
+};
+
+} // namespace abseil
+} // namespace tidy
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ABSEIL_STRINGFINDSTRCONTAINSCHECK_H
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 1827dfe913380..e6583c17978b0 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -75,6 +75,13 @@ New module
 
 New checks
 ^^^^^^^^^^
+
+- New :doc:`abseil-string-find-str-contains
+  <clang-tidy/checks/abseil-string-find-str-contains>` check.
+
+  Finds ``s.find(...) == string::npos`` comparisons (for various string-like types)
+  and suggests replacing with ``absl::StrContains()``.
+
 - New :doc:`cppcoreguidelines-avoid-non-const-global-variables
   <clang-tidy/checks/cppcoreguidelines-avoid-non-const-global-variables>` check.
   Finds non-const global variables as described in check I.2 of C++ Core
diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil-string-find-str-contains.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil-string-find-str-contains.rst
new file mode 100644
index 0000000000000..4cf99d5877a95
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil-string-find-str-contains.rst
@@ -0,0 +1,52 @@
+.. title:: clang-tidy - abseil-string-find-str-contains
+
+abseil-string-find-str-contains
+===============================
+
+Finds ``s.find(...) == string::npos`` comparisons (for various string-like types)
+and suggests replacing with ``absl::StrContains()``.
+
+This improves readability and reduces the likelihood of accidentally mixing
+``find()`` and ``npos`` from different string-like types.
+
+By default, "string-like types" includes ``::std::basic_string``,
+``::std::basic_string_view``, and ``::absl::string_view``.  See the
+StringLikeClasses option to change this.
+
+.. code-block:: c++
+
+  std::string s = "...";
+  if (s.find("Hello World") == std::string::npos) { /* do something */ }
+
+  absl::string_view a = "...";
+  if (absl::string_view::npos != a.find("Hello World")) { /* do something */ }
+
+becomes
+
+.. code-block:: c++
+
+  std::string s = "...";
+  if (!absl::StrContains(s, "Hello World")) { /* do something */ }
+
+  absl::string_view a = "...";
+  if (absl::StrContains(a, "Hello World")) { /* do something */ }
+
+
+Options
+-------
+
+.. option:: StringLikeClasses
+
+   Semicolon-separated list of names of string-like classes. By default includes
+   ``::std::basic_string``, ``::std::basic_string_view``, and
+   ``::absl::string_view``.
+
+.. option:: IncludeStyle
+
+   A string specifying which include-style is used, `llvm` or `google`. Default
+   is `llvm`.
+
+.. option:: AbseilStringsMatchHeader
+
+   The location of Abseil's ``strings/match.h``. Defaults to
+   ``absl/strings/match.h``.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index b4d09d5267262..6d5f8fcbb05a5 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -26,6 +26,7 @@ Clang-Tidy Checks
    `abseil-redundant-strcat-calls <abseil-redundant-strcat-calls.html>`_, "Yes"
    `abseil-str-cat-append <abseil-str-cat-append.html>`_, "Yes"
    `abseil-string-find-startswith <abseil-string-find-startswith.html>`_, "Yes"
+   `abseil-string-find-str-contains <abseil-string-find-str-contains.html>`_, "Yes"
    `abseil-time-comparison <abseil-time-comparison.html>`_, "Yes"
    `abseil-time-subtraction <abseil-time-subtraction.html>`_, "Yes"
    `abseil-upgrade-duration-conversions <abseil-upgrade-duration-conversions.html>`_, "Yes"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil-string-find-str-contains.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil-string-find-str-contains.cpp
new file mode 100644
index 0000000000000..871c830b81cf8
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/abseil-string-find-str-contains.cpp
@@ -0,0 +1,290 @@
+// RUN: %check_clang_tidy %s abseil-string-find-str-contains %t -- \
+// RUN:   -config="{CheckOptions: []}"
+
+using size_t = decltype(sizeof(int));
+
+namespace std {
+
+// Lightweight standin for std::string.
+template <typename C>
+class basic_string {
+public:
+  basic_string();
+  basic_string(const basic_string &);
+  basic_string(const C *);
+  ~basic_string();
+  int find(basic_string s, int pos = 0);
+  int find(const C *s, int pos = 0);
+  int find(char c, int pos = 0);
+  static constexpr size_t npos = -1;
+};
+typedef basic_string<char> string;
+
+// Lightweight standin for std::string_view.
+template <typename C>
+class basic_string_view {
+public:
+  basic_string_view();
+  basic_string_view(const basic_string_view &);
+  basic_string_view(const C *);
+  ~basic_string_view();
+  int find(basic_string_view s, int pos = 0);
+  int find(const C *s, int pos = 0);
+  int find(char c, int pos = 0);
+  static constexpr size_t npos = -1;
+};
+typedef basic_string_view<char> string_view;
+
+} // namespace std
+
+namespace absl {
+
+// Lightweight standin for absl::string_view.
+class string_view {
+public:
+  string_view();
+  string_view(const string_view &);
+  string_view(const char *);
+  ~string_view();
+  int find(string_view s, int pos = 0);
+  int find(const char *s, int pos = 0);
+  int find(char c, int pos = 0);
+  static constexpr size_t npos = -1;
+};
+
+} // namespace absl
+
+// Functions that take and return our various string-like types.
+std::string foo_ss(std::string);
+std::string_view foo_ssv(std::string_view);
+absl::string_view foo_asv(absl::string_view);
+std::string bar_ss();
+std::string_view bar_ssv();
+absl::string_view bar_asv();
+
+// Confirms that find==npos and find!=npos work for each supported type, when
+// npos comes from the correct type.
+void basic_tests() {
+  std::string ss;
+  ss.find("a") == std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of find() == npos
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ss, "a");{{$}}
+
+  ss.find("a") != std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of find() != npos
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ss, "a");{{$}}
+
+  std::string::npos != ss.find("a");
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ss, "a");{{$}}
+
+  std::string_view ssv;
+  ssv.find("a") == std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ssv, "a");{{$}}
+
+  ssv.find("a") != std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ssv, "a");{{$}}
+
+  std::string_view::npos != ssv.find("a");
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ssv, "a");{{$}}
+
+  absl::string_view asv;
+  asv.find("a") == absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(asv, "a");{{$}}
+
+  asv.find("a") != absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(asv, "a");{{$}}
+
+  absl::string_view::npos != asv.find("a");
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(asv, "a");{{$}}
+}
+
+// Confirms that it works even if you mix-and-match the type for find and for
+// npos.  (One of the reasons for this checker is to clean up cases that
+// accidentally mix-and-match like this.  absl::StrContains is less
+// error-prone.)
+void mismatched_npos() {
+  std::string ss;
+  ss.find("a") == std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ss, "a");{{$}}
+
+  ss.find("a") != absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ss, "a");{{$}}
+
+  std::string_view ssv;
+  ssv.find("a") == absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ssv, "a");{{$}}
+
+  ssv.find("a") != std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ssv, "a");{{$}}
+
+  absl::string_view asv;
+  asv.find("a") == std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(asv, "a");{{$}}
+
+  asv.find("a") != std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(asv, "a");{{$}}
+}
+
+// Confirms that it works even when the needle or the haystack are more
+// complicated expressions.
+void subexpression_tests() {
+  std::string ss, ss2;
+  foo_ss(ss).find(ss2) == std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(foo_ss(ss), ss2);{{$}}
+
+  ss.find(foo_ss(ss2)) != std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ss, foo_ss(ss2));{{$}}
+
+  foo_ss(bar_ss()).find(foo_ss(ss2)) != std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(foo_ss(bar_ss()), foo_ss(ss2));{{$}}
+
+  std::string_view ssv, ssv2;
+  foo_ssv(ssv).find(ssv2) == std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(foo_ssv(ssv), ssv2);{{$}}
+
+  ssv.find(foo_ssv(ssv2)) != std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(ssv, foo_ssv(ssv2));{{$}}
+
+  foo_ssv(bar_ssv()).find(foo_ssv(ssv2)) != std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(foo_ssv(bar_ssv()), foo_ssv(ssv2));{{$}}
+
+  absl::string_view asv, asv2;
+  foo_asv(asv).find(asv2) == absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(foo_asv(asv), asv2);{{$}}
+
+  asv.find(foo_asv(asv2)) != absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(asv, foo_asv(asv2));{{$}}
+
+  foo_asv(bar_asv()).find(foo_asv(asv2)) != absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}absl::StrContains(foo_asv(bar_asv()), foo_asv(asv2));{{$}}
+}
+
+// Confirms that it works with string literal, char* and const char* parameters.
+void string_literal_and_char_ptr_tests() {
+  char *c = nullptr;
+  const char *cc = nullptr;
+
+  std::string ss;
+  ss.find("c") == std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ss, "c");{{$}}
+
+  ss.find(c) == std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ss, c);{{$}}
+
+  ss.find(cc) == std::string::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ss, cc);{{$}}
+
+  std::string_view ssv;
+  ssv.find("c") == std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ssv, "c");{{$}}
+
+  ssv.find(c) == std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ssv, c);{{$}}
+
+  ssv.find(cc) == std::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(ssv, cc);{{$}}
+
+  absl::string_view asv;
+  asv.find("c") == absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(asv, "c");{{$}}
+
+  asv.find(c) == absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(asv, c);{{$}}
+
+  asv.find(cc) == absl::string_view::npos;
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use !absl::StrContains instead of
+  // CHECK-FIXES: {{^[[:space:]]*}}!absl::StrContains(asv, cc);{{$}}
+}
+
+// Confirms that it does *not* match when the parameter to find() is a char,
+// because absl::StrContains is not implemented for char.
+void no_char_param_tests() {
+  std::string ss;
+  ss.find('c') == std::string::npos;
+
+  std::string_view ssv;
+  ssv.find('c') == std::string_view::npos;
+
+  absl::string_view asv;
+  asv.find('c') == absl::string_view::npos;
+}
+
+#define COMPARE_MACRO(x, y) ((x) == (y))
+#define FIND_MACRO(x, y) ((x).find(y))
+#define FIND_COMPARE_MACRO(x, y, z) ((x).find(y) == (z))
+
+// Confirms that it does not match when a macro is involved.
+void no_macros() {
+  std::string s;
+  COMPARE_MACRO(s.find("a"), std::string::npos);
+  FIND_MACRO(s, "a") == std::string::npos;
+  FIND_COMPARE_MACRO(s, "a", std::string::npos);
+}
+
+// Confirms that it does not match when the pos parameter is non-zero.
+void no_nonzero_pos() {
+  std::string ss;
+  ss.find("a", 1) == std::string::npos;
+
+  std::string_view ssv;
+  ssv.find("a", 2) == std::string_view::npos;
+
+  absl::string_view asv;
+  asv.find("a", 3) == std::string_view::npos;
+}
+
+// Confirms that it does not match when it's compared to something other than
+// npos, even if the value is the same as npos.
+void no_non_npos() {
+  std::string ss;
+  ss.find("a") == 0;
+  ss.find("a") == 1;
+  ss.find("a") == -1;
+
+  std::string_view ssv;
+  ssv.find("a") == 0;
+  ssv.find("a") == 1;
+  ssv.find("a") == -1;
+
+  absl::string_view asv;
+  asv.find("a") == 0;
+  asv.find("a") == 1;
+  asv.find("a") == -1;
+}
+
+// Confirms that it does not match if the two operands are the same.
+void no_symmetric_operands() {
+  std::string ss;
+  ss.find("a") == ss.find("a");
+  std::string::npos == std::string::npos;
+}

From cc8fafa2be8d5315cc55aec54b2a6d7e60f470c4 Mon Sep 17 00:00:00 2001
From: Vy Nguyen <vyng@google.com>
Date: Wed, 27 May 2020 18:06:40 -0400
Subject: [PATCH 379/770] [llvm-exegesis] Make a few counter methods virtual to
 allow targets to provide target-specific support. Misc: Also include errno in
 failure message.

Differential Revision: https://reviews.llvm.org/D80610
---
 .../llvm-exegesis/lib/BenchmarkRunner.cpp     | 26 ++++++++++------
 llvm/tools/llvm-exegesis/lib/PerfHelper.cpp   | 31 ++++++++++++++++---
 llvm/tools/llvm-exegesis/lib/PerfHelper.h     | 23 ++++++++++----
 llvm/tools/llvm-exegesis/lib/Target.cpp       | 15 +++++++++
 llvm/tools/llvm-exegesis/lib/Target.h         |  6 ++++
 5 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 9592fd8faa0ce..522d4210245c3 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <array>
+#include <memory>
 #include <string>
 
 #include "Assembler.h"
@@ -14,11 +15,13 @@
 #include "Error.h"
 #include "MCInstrDescView.h"
 #include "PerfHelper.h"
+#include "Target.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/CrashRecoveryContext.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
@@ -38,7 +41,7 @@ class FunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
   FunctionExecutorImpl(const LLVMState &State,
                        object::OwningBinary<object::ObjectFile> Obj,
                        BenchmarkRunner::ScratchSpace *Scratch)
-      : Function(State.createTargetMachine(), std::move(Obj)),
+      : State(State), Function(State.createTargetMachine(), std::move(Obj)),
         Scratch(Scratch) {}
 
 private:
@@ -51,30 +54,33 @@ class FunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
     char *const ScratchPtr = Scratch->ptr();
     for (auto &CounterName : CounterNames) {
       CounterName = CounterName.trim();
-      pfm::PerfEvent PerfEvent(CounterName);
-      if (!PerfEvent.valid())
-        return make_error<Failure>(
-            Twine("invalid perf event '").concat(CounterName).concat("'"));
-      pfm::Counter Counter(std::move(PerfEvent));
+      auto CounterOrError =
+          State.getExegesisTarget().createCounter(CounterName.data(), State);
+
+      if (!CounterOrError)
+        return CounterOrError.takeError();
+
+      pfm::Counter *Counter = CounterOrError.get().get();
       Scratch->clear();
       {
         CrashRecoveryContext CRC;
         CrashRecoveryContext::Enable();
-        const bool Crashed = !CRC.RunSafely([this, &Counter, ScratchPtr]() {
-          Counter.start();
+        const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
+          Counter->start();
           this->Function(ScratchPtr);
-          Counter.stop();
+          Counter->stop();
         });
         CrashRecoveryContext::Disable();
         // FIXME: Better diagnosis.
         if (Crashed)
           return make_error<SnippetCrash>("snippet crashed while running");
       }
-      CounterValue += Counter.read();
+      CounterValue += Counter->read();
     }
     return CounterValue;
   }
 
+  const LLVMState &State;
   const ExecutableFunction Function;
   BenchmarkRunner::ScratchSpace *const Scratch;
 };
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
index c36bd5c8b9c39..c372ac4f364e3 100644
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
@@ -8,13 +8,19 @@
 
 #include "PerfHelper.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
 #ifdef HAVE_LIBPFM
 #include "perfmon/perf_event.h"
 #include "perfmon/pfmlib.h"
 #include "perfmon/pfmlib_perf_event.h"
 #endif
+
 #include <cassert>
+#include <cstddef>
+#include <errno.h>  // for errno
+#include <string.h> // for strerror()
 
 namespace llvm {
 namespace exegesis {
@@ -97,7 +103,8 @@ Counter::Counter(PerfEvent &&E) : Event(std::move(E)){
   perf_event_attr AttrCopy = *Event.attribute();
   FileDescriptor = perf_event_open(&AttrCopy, Pid, Cpu, GroupFd, Flags);
   if (FileDescriptor == -1) {
-    errs() << "Unable to open event, make sure your kernel allows user "
+    errs() << "Unable to open event. ERRNO: " << strerror(errno)
+           << ". Make sure your kernel allows user "
               "space perf monitoring.\nYou may want to try:\n$ sudo sh "
               "-c 'echo -1 > /proc/sys/kernel/perf_event_paranoid'\n";
   }
@@ -111,12 +118,21 @@ void Counter::start() { ioctl(FileDescriptor, PERF_EVENT_IOC_RESET, 0); }
 void Counter::stop() { ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0); }
 
 int64_t Counter::read() const {
+  auto ValueOrError = readOrError();
+  if (ValueOrError)
+    return ValueOrError.get();
+
+  errs() << ValueOrError.takeError() << "\n";
+  return -1;
+}
+
+llvm::Expected<int64_t> Counter::readOrError() const {
   int64_t Count = 0;
   ssize_t ReadSize = ::read(FileDescriptor, &Count, sizeof(Count));
-  if (ReadSize != sizeof(Count)) {
-    Count = -1;
-    errs() << "Failed to read event counter\n";
-  }
+  if (ReadSize != sizeof(Count))
+    return llvm::make_error<llvm::StringError>("Failed to read event counter",
+                                               llvm::errc::io_error);
+
   return Count;
 }
 
@@ -132,6 +148,11 @@ void Counter::stop() {}
 
 int64_t Counter::read() const { return 42; }
 
+llvm::Expected<int64_t> Counter::readOrError() const {
+  return llvm::make_error<llvm::StringError>("Not implemented",
+                                             llvm::errc::io_error);
+}
+
 #endif
 
 } // namespace pfm
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
index 99c555587c538..7562af9c45242 100644
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
@@ -17,6 +17,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
 #include <functional>
 #include <memory>
 
@@ -36,7 +38,7 @@ class PerfEvent {
 public:
   // http://perfmon2.sourceforge.net/manv4/libpfm.html
   // Events are expressed as strings. e.g. "INSTRUCTION_RETIRED"
-  explicit PerfEvent(StringRef pfm_event_string);
+  explicit PerfEvent(StringRef PfmEventString);
 
   PerfEvent(const PerfEvent &) = delete;
   PerfEvent(PerfEvent &&other);
@@ -63,18 +65,27 @@ class PerfEvent {
 
 // Uses a valid PerfEvent to configure the Kernel so we can measure the
 // underlying event.
-struct Counter {
+class Counter {
+public:
   // event: the PerfEvent to measure.
   explicit Counter(PerfEvent &&event);
 
   Counter(const Counter &) = delete;
   Counter(Counter &&other) = default;
 
-  ~Counter();
+  virtual ~Counter();
+
+  /// Starts the measurement of the event.
+  virtual void start();
+
+  /// Stops the measurement of the event.
+  void stop();
+
+  /// Returns the current value of the counter or -1 if it cannot be read.
+  int64_t read() const;
 
-  void start();         // Starts the measurement of the event.
-  void stop();          // Stops the measurement of the event.
-  int64_t read() const; // Return the current value of the counter.
+  /// Returns the current value of the counter or error if it cannot be read.
+  virtual llvm::Expected<int64_t> readOrError() const;
 
 private:
   PerfEvent Event;
diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp
index 4e0de93756693..61821bf4bb4d7 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Target.cpp
@@ -11,6 +11,8 @@
 #include "ParallelSnippetGenerator.h"
 #include "SerialSnippetGenerator.h"
 #include "UopsBenchmarkRunner.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace exegesis {
@@ -27,6 +29,19 @@ const ExegesisTarget *ExegesisTarget::lookup(Triple TT) {
   return nullptr;
 }
 
+Expected<std::unique_ptr<pfm::Counter>>
+ExegesisTarget::createCounter(const char *CounterName,
+                              const LLVMState &) const {
+  pfm::PerfEvent Event(CounterName);
+  if (!Event.valid())
+    return llvm::make_error<Failure>(
+        llvm::Twine("Unable to create counter with name '")
+            .concat(CounterName)
+            .concat("'"));
+
+  return std::make_unique<pfm::Counter>(std::move(Event));
+}
+
 void ExegesisTarget::registerTarget(ExegesisTarget *Target) {
   if (FirstTarget == nullptr) {
     FirstTarget = Target;
diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
index 298cfac23e810..937b88ddb4cc0 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -20,6 +20,7 @@
 #include "BenchmarkRunner.h"
 #include "Error.h"
 #include "LlvmState.h"
+#include "PerfHelper.h"
 #include "SnippetGenerator.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -27,6 +28,7 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace exegesis {
@@ -65,6 +67,10 @@ class ExegesisTarget {
   explicit ExegesisTarget(ArrayRef<CpuAndPfmCounters> CpuPfmCounters)
       : CpuPfmCounters(CpuPfmCounters) {}
 
+  // Targets can use this to create target-specific perf counters.
+  virtual Expected<std::unique_ptr<pfm::Counter>>
+  createCounter(const char *CounterName, const LLVMState &State) const;
+
   // Targets can use this to add target-specific passes in assembleToStream();
   virtual void addTargetSpecificPasses(PassManagerBase &PM) const {}
 

From 15b6730f078329b3103a7a0476bc2227df214f4a Mon Sep 17 00:00:00 2001
From: Sidharth Baveja <Sidharth.Baveja@ibm.com>
Date: Thu, 28 May 2020 16:12:45 +0000
Subject: [PATCH 380/770] Create utility function to Merge Adjacent Basic
 Blocks

Summary: The following code from
/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp can be used by other
transformations:

while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() &&
L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      BasicBlock *Fold = Dest->getUniquePredecessor();
      if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }
Hence it should be separated into its own utility function.

Authored By: sidbav
Reviewer: Whitney, Meinersbur, asbirlea, dmgreen, etiotto
Reviewed By: asbirlea
Subscribers: hiraditya, zzheng, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D80583
---
 .../llvm/Transforms/Utils/BasicBlockUtils.h   | 12 +++++++++
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 25 +++++++++++++++++++
 .../lib/Transforms/Utils/LoopUnrollAndJam.cpp | 19 +++-----------
 3 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 905233b54c9cb..26ef08b5f2538 100644
--- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/InstrTypes.h"
@@ -96,6 +97,17 @@ bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU = nullptr,
                                MemoryDependenceResults *MemDep = nullptr,
                                bool PredecessorWithTwoSuccessors = false);
 
+/// Merge block(s) successors, if possible. Return true if at least two
+/// of the blocks were merged together.
+/// In order to merge, each block must be terminated by an unconditional
+/// branch. If L is provided, then the blocks merged into their predecessors
+/// must be in L. In addition, this utility calls on another utility:
+/// MergeBlockIntoPredecessor. Blocks are successfully merged when the call to
+/// MergeBlockIntoPredecessor returns true.
+bool MergeBlockSuccessorsIntoGivenBlocks(
+    SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L = nullptr,
+    DomTreeUpdater *DTU = nullptr, LoopInfo *LI = nullptr);
+
 /// Try to remove redundant dbg.value instructions from given basic block.
 /// Returns true if at least one instruction was removed.
 bool RemoveRedundantDbgInstrs(BasicBlock *BB);
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 95e2b26e98d54..085d91031cf90 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -315,6 +315,31 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
   return true;
 }
 
+bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
+    SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L, DomTreeUpdater *DTU,
+    LoopInfo *LI) {
+  assert(!MergeBlocks.empty() && "MergeBlocks should not be empty");
+
+  bool BlocksHaveBeenMerged = false;
+  while (!MergeBlocks.empty()) {
+    BasicBlock *BB = *MergeBlocks.begin();
+    BasicBlock *Dest = BB->getSingleSuccessor();
+    if (Dest && (!L || L->contains(Dest))) {
+      BasicBlock *Fold = Dest->getUniquePredecessor();
+      (void)Fold;
+      if (MergeBlockIntoPredecessor(Dest, DTU, LI)) {
+        assert(Fold == BB &&
+               "Expecting BB to be unique predecessor of the Dest block");
+        MergeBlocks.erase(Dest);
+        BlocksHaveBeenMerged = true;
+      } else
+        MergeBlocks.erase(BB);
+    } else
+      MergeBlocks.erase(BB);
+  }
+  return BlocksHaveBeenMerged;
+}
+
 /// Remove redundant instructions within sequences of consecutive dbg.value
 /// instructions. This is done using a backward scan to keep the last dbg.value
 /// describing a specific variable/fragment.
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 8ac6b0894d1c8..dd628f3e7e0ca 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -593,22 +593,9 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
   MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
   MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
   MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
-  while (!MergeBlocks.empty()) {
-    BasicBlock *BB = *MergeBlocks.begin();
-    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
-    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
-      BasicBlock *Dest = Term->getSuccessor(0);
-      BasicBlock *Fold = Dest->getUniquePredecessor();
-      if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
-        // Don't remove BB and add Fold as they are the same BB
-        assert(Fold == BB);
-        (void)Fold;
-        MergeBlocks.erase(Dest);
-      } else
-        MergeBlocks.erase(BB);
-    } else
-      MergeBlocks.erase(BB);
-  }
+
+  MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
+
   // Apply updates to the DomTree.
   DT = &DTU.getDomTree();
 

From e0e5c644607ad71888c1df0fdbf3331d0fc2559f Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Thu, 21 May 2020 16:48:05 +0200
Subject: [PATCH 381/770] [SDAG] Don't require LazyBlockFrequencyInfo at
 optnone

While LazyBlockFrequencyInfo itself is lazy, the dominator tree
and loop info analyses it requires are not. Drop the dependency
on this pass in SelectionDAGIsel at O0.
This makes for a ~0.6% O0 compile-time improvement.

Differential Revision: https://reviews.llvm.org/D80387
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 9 +++++----
 llvm/test/CodeGen/AArch64/O0-pipeline.ll           | 4 ----
 llvm/test/CodeGen/X86/O0-pipeline.ll               | 4 ----
 3 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index a0cfd3eb729f0..d2fac644d9024 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -337,7 +337,8 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
   if (UseMBPI && OptLevel != CodeGenOpt::None)
     AU.addRequired<BranchProbabilityInfoWrapperPass>();
   AU.addRequired<ProfileSummaryInfoWrapperPass>();
-  LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+  if (OptLevel != CodeGenOpt::None)
+    LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -441,9 +442,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
   LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
   auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
-              &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
-              nullptr;
+  BlockFrequencyInfo *BFI = nullptr;
+  if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOpt::None)
+    BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
 
   LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index 23d66af605cfe..a1141e2255e6e 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -42,10 +42,6 @@
 ; CHECK-NEXT:       Analysis for ComputingKnownBits
 ; CHECK-NEXT:       InstructionSelect
 ; CHECK-NEXT:       ResetMachineFunction
-; CHECK-NEXT:       Dominator Tree Construction
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       AArch64 Instruction Selection
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT:       Local Stack Slot Allocation
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index e2e3437e2cbd9..6d94bc163a0f7 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -33,10 +33,6 @@
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
-; CHECK-NEXT:       Dominator Tree Construction
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       X86 DAG->DAG Instruction Selection
 ; CHECK-NEXT:       X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions

From db923ce6123068aec22735ea21a4abf8f02135d2 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 28 May 2020 16:49:43 +0000
Subject: [PATCH 382/770] [gn build] Port 7cfdff7b4a6

---
 .../gn/secondary/clang-tools-extra/clang-tidy/abseil/BUILD.gn    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/abseil/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/abseil/BUILD.gn
index febcc0873282b..732dd0d45ead4 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/abseil/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/abseil/BUILD.gn
@@ -28,6 +28,7 @@ static_library("abseil") {
     "RedundantStrcatCallsCheck.cpp",
     "StrCatAppendCheck.cpp",
     "StringFindStartswithCheck.cpp",
+    "StringFindStrContainsCheck.cpp",
     "TimeComparisonCheck.cpp",
     "TimeSubtractionCheck.cpp",
     "UpgradeDurationConversionsCheck.cpp",

From 723a1caa377b898ff3dc0897156f544feab99ac8 Mon Sep 17 00:00:00 2001
From: Jim Ingham <jingham@apple.com>
Date: Wed, 27 May 2020 17:26:32 -0700
Subject: [PATCH 383/770] Fix the crashlog.py script's use of the load_address
 property.

This property is explicitly for use only in the interactive editor,
and NOT in commands.  Its use worked until we got more careful about
not leaving lldb.target lying around in the script interpreter.

I also added a quick sniff test for the save_crashlog command.

<rdar://problem/60350620>
Differential Revision: https://reviews.llvm.org/D80680
---
 lldb/examples/python/crashlog.py              |  6 +-
 lldb/test/API/macosx/save_crashlog/Makefile   |  4 ++
 .../macosx/save_crashlog/TestSaveCrashlog.py  | 68 +++++++++++++++++++
 lldb/test/API/macosx/save_crashlog/main.c     | 13 ++++
 4 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 lldb/test/API/macosx/save_crashlog/Makefile
 create mode 100644 lldb/test/API/macosx/save_crashlog/TestSaveCrashlog.py
 create mode 100644 lldb/test/API/macosx/save_crashlog/main.c

diff --git a/lldb/examples/python/crashlog.py b/lldb/examples/python/crashlog.py
index b7b62acc60ef1..1c1602b0131ea 100755
--- a/lldb/examples/python/crashlog.py
+++ b/lldb/examples/python/crashlog.py
@@ -791,11 +791,11 @@ def save_crashlog(debugger, command, exe_ctx, result, dict):
                     block_range = block.range[frame.addr]
                     if block_range:
                         block_start_addr = block_range[0]
-                        frame_offset = frame_pc - block_start_addr.load_addr
+                        frame_offset = frame_pc - block_start_addr.GetLoadAddress(target)
                     else:
-                        frame_offset = frame_pc - frame.function.addr.load_addr
+                        frame_offset = frame_pc - frame.function.addr.GetLoadAddress(target)
                 elif frame.symbol:
-                    frame_offset = frame_pc - frame.symbol.addr.load_addr
+                    frame_offset = frame_pc - frame.symbol.addr.GetLoadAddress(target)
                 out_file.write(
                     '%-3u %-32s 0x%16.16x %s' %
                     (frame_idx, frame.module.file.basename, frame_pc, frame.name))
diff --git a/lldb/test/API/macosx/save_crashlog/Makefile b/lldb/test/API/macosx/save_crashlog/Makefile
new file mode 100644
index 0000000000000..695335e068c0c
--- /dev/null
+++ b/lldb/test/API/macosx/save_crashlog/Makefile
@@ -0,0 +1,4 @@
+C_SOURCES := main.c
+CFLAGS_EXTRAS := -std=c99
+
+include Makefile.rules
diff --git a/lldb/test/API/macosx/save_crashlog/TestSaveCrashlog.py b/lldb/test/API/macosx/save_crashlog/TestSaveCrashlog.py
new file mode 100644
index 0000000000000..c86294490f61f
--- /dev/null
+++ b/lldb/test/API/macosx/save_crashlog/TestSaveCrashlog.py
@@ -0,0 +1,68 @@
+"""
+Test that the save_crashlog command functions
+"""
+
+
+import os
+import lldb
+import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+
+
+class TestSaveCrashlog(TestBase):
+
+    mydir = TestBase.compute_mydir(__file__)
+
+    # If your test case doesn't stress debug info, then
+    # set this to true.  That way it won't be run once for
+    # each debug info format.
+    NO_DEBUG_INFO_TESTCASE = True
+
+    @skipUnlessDarwin
+    def test_save_crashlog(self):
+        """There can be many tests in a test case - describe this test here."""
+        self.build()
+        self.main_source_file = lldb.SBFileSpec("main.c")
+        self.save_crashlog()
+
+    def save_crashlog(self):
+
+        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(self,
+                                   "I was called", self.main_source_file)
+
+        self.runCmd("command script import lldb.macosx.crashlog")
+        out_file = os.path.join(self.getBuildDir(), "crash.log")
+        self.runCmd("save_crashlog '%s'"%(out_file))
+
+        # Make sure we wrote the file:
+        self.assertTrue(os.path.exists(out_file), "We wrote our file")
+        
+        # Now scan the file to make sure it looks right:
+        # First get a few facts we'll use:
+        exe_module = target.FindModule(target.GetExecutable())
+        uuid_str = exe_module.GetUUIDString()
+
+        # We'll set these to true when we find the elements in the file
+        found_call_me = False
+        found_main_line = False
+        found_thread_header = False
+        found_uuid_str = False
+
+        with open(out_file, "r") as f:
+            # We want to see a line with each of the expected elements:
+            for line in f:
+                if "Thread 0:" in line:
+                    found_thread_header = True
+                if "call_me" in line and "main.c:" in line:
+                    found_call_me = True
+                if "main" in line and "main.c:" in line:
+                    found_main_line = True
+                if uuid_str in line and "a.out" in line:
+                    found_uuid_str = True
+        
+        self.assertTrue(found_thread_header, "Found thread header")
+        self.assertTrue(found_call_me, "Found call_me line in stack")
+        self.assertTrue(found_uuid_str, "Found main binary UUID")
+        self.assertTrue(found_main_line, "Found main line in call stack")
+                        
diff --git a/lldb/test/API/macosx/save_crashlog/main.c b/lldb/test/API/macosx/save_crashlog/main.c
new file mode 100644
index 0000000000000..ca94bc9708156
--- /dev/null
+++ b/lldb/test/API/macosx/save_crashlog/main.c
@@ -0,0 +1,13 @@
+#include <stdio.h>
+
+void
+call_me() {
+  printf("I was called");
+}
+
+int
+main()
+{
+  call_me();
+  return 0;
+}

From 69ede516c7ff97c208d4a2378bb56ccaa242bbc1 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 28 May 2020 08:28:12 -0700
Subject: [PATCH 384/770] [X86] Add 'avx512vp2intersect' to getHostCPUFeatures.

---
 llvm/lib/Support/Host.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index da68464c4a3d9..d80c7228f6e04 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -1494,6 +1494,8 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["movdir64b"]       = HasLeaf7 && ((ECX >> 28) & 1);
   Features["enqcmd"]          = HasLeaf7 && ((ECX >> 29) & 1);
 
+  Features["avx512vp2intersect"] =
+      HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save;
   Features["serialize"]       = HasLeaf7 && ((EDX >> 14) & 1);
   Features["tsxldtrk"]        = HasLeaf7 && ((EDX >> 16) & 1);
   // There are two CPUID leafs which information associated with the pconfig

From f0c2cfe4d044be26031acbf95471a4ac0bca5bf1 Mon Sep 17 00:00:00 2001
From: Hiroshi Yamauchi <yamauchi@google.com>
Date: Wed, 27 May 2020 10:13:33 -0700
Subject: [PATCH 385/770] [PGO] Guard the memcmp/bcmp size value profiling
 instrumentation behind flag.

Summary:
Follow up D79751 and put the instrumentation / value collection side (in
addition to the optimization side) behind the flag as well.

Reviewers: davidxl

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80646
---
 llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp   | 2 +-
 .../Transforms/Instrumentation/ValueProfilePlugins.inc    | 4 ++++
 llvm/test/Transforms/PGOProfile/memop_size_annotation.ll  | 8 ++++----
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index bef0e0257f029..604a20f3951a6 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -95,7 +95,7 @@ extern cl::opt<std::string> MemOPSizeRange;
 // This option sets the value that groups large memop sizes
 extern cl::opt<unsigned> MemOPSizeLarge;
 
-static cl::opt<bool>
+cl::opt<bool>
     MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(false),
                        cl::Hidden,
                        cl::desc("Size-specialize memcmp and bcmp calls"));
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index b5dd9fab24a54..8d0cf5843ebce 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -20,6 +20,8 @@
 using namespace llvm;
 using CandidateInfo = ValueProfileCollector::CandidateInfo;
 
+extern cl::opt<bool> MemOPOptMemcmpBcmp;
+
 ///--------------------------- MemIntrinsicPlugin ------------------------------
 class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
   Function &F;
@@ -48,6 +50,8 @@ public:
     Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
   }
   void visitCallInst(CallInst &CI) {
+    if (!MemOPOptMemcmpBcmp)
+      return;
     auto *F = CI.getCalledFunction();
     if (!F)
       return;
diff --git a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll
index 5884a6ebbb25d..f57f3d34f4ac5 100644
--- a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll
+++ b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll
@@ -1,8 +1,8 @@
 ; RUN: llvm-profdata merge %S/Inputs/memop_size_annotation.proftext -o %t.profdata
-; RUN: opt < %s -pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9
-; RUN: opt < %s -passes=pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9
-; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4
-; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4
+; RUN: opt < %s -pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9
+; RUN: opt < %s -passes=pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

From 587fa99cfdb7d2a97143ba20ed8e8face57aa01c Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 10:11:08 -0700
Subject: [PATCH 386/770] Default to generating statepoints with deopt and
 gc-transition bundles if needed

Continues from D80598.

The key point of the change is to default to using operand bundles instead of the inline length-prefixed argument lists for statepoint nodes. An important subtlety to note is that the presence of a bundle has semantic meaning, even if it is empty. As such, we need to make a somewhat deeper change to the interface than is at first obvious.

Existing code treats statepoint deopt arguments and the deopt bundle operands differently during inlining. The former is ignored (resulting in caller state being dropped), the latter is merged.

We can't preserve the old behaviour for calls with deopt fed to RS4GC and then inlining, but we can avoid the no-deopt case changing. At least in internal testing, that seems to be the important one. (I'd argue the "stop merging after RS4GC" behaviour for the former was always "unexpected", but that the behaviour for non-deopt calls actually makes sense.)

Differential Revision: https://reviews.llvm.org/D80674
---
 llvm/include/llvm/IR/IRBuilder.h              | 16 ++---
 llvm/lib/IR/IRBuilder.cpp                     | 68 +++++++++++++------
 .../Scalar/RewriteStatepointsForGC.cpp        | 12 ++--
 .../base-pointers-4.ll                        | 10 +--
 .../RewriteStatepointsForGC/basic.ll          | 24 +++----
 .../deopt-lowering-attrs.ll                   |  8 +--
 .../scalar-base-vector-2.ll                   | 12 ++--
 .../scalar-base-vector.ll                     | 14 ++--
 8 files changed, 96 insertions(+), 68 deletions(-)

diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index b6dca11527d6b..d6bb479fdf2d3 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -789,7 +789,7 @@ class IRBuilderBase {
   CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes,
                                    Value *ActualCallee,
                                    ArrayRef<Value *> CallArgs,
-                                   ArrayRef<Value *> DeoptArgs,
+                                   Optional<ArrayRef<Value *>> DeoptArgs,
                                    ArrayRef<Value *> GCArgs,
                                    const Twine &Name = "");
 
@@ -798,8 +798,8 @@ class IRBuilderBase {
   CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes,
                                    Value *ActualCallee, uint32_t Flags,
                                    ArrayRef<Use> CallArgs,
-                                   ArrayRef<Use> TransitionArgs,
-                                   ArrayRef<Use> DeoptArgs,
+                                   Optional<ArrayRef<Use>> TransitionArgs,
+                                   Optional<ArrayRef<Use>> DeoptArgs,
                                    ArrayRef<Value *> GCArgs,
                                    const Twine &Name = "");
 
@@ -808,7 +808,7 @@ class IRBuilderBase {
   /// .get()'ed to get the Value pointer.
   CallInst *CreateGCStatepointCall(uint64_t ID, uint32_t NumPatchBytes,
                                    Value *ActualCallee, ArrayRef<Use> CallArgs,
-                                   ArrayRef<Value *> DeoptArgs,
+                                   Optional<ArrayRef<Value *>> DeoptArgs,
                                    ArrayRef<Value *> GCArgs,
                                    const Twine &Name = "");
 
@@ -818,7 +818,7 @@ class IRBuilderBase {
   CreateGCStatepointInvoke(uint64_t ID, uint32_t NumPatchBytes,
                            Value *ActualInvokee, BasicBlock *NormalDest,
                            BasicBlock *UnwindDest, ArrayRef<Value *> InvokeArgs,
-                           ArrayRef<Value *> DeoptArgs,
+                           Optional<ArrayRef<Value *>> DeoptArgs,
                            ArrayRef<Value *> GCArgs, const Twine &Name = "");
 
   /// Create an invoke to the experimental.gc.statepoint intrinsic to
@@ -826,8 +826,8 @@ class IRBuilderBase {
   InvokeInst *CreateGCStatepointInvoke(
       uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
       BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags,
-      ArrayRef<Use> InvokeArgs, ArrayRef<Use> TransitionArgs,
-      ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs,
+      ArrayRef<Use> InvokeArgs, Optional<ArrayRef<Use>> TransitionArgs,
+      Optional<ArrayRef<Use>> DeoptArgs, ArrayRef<Value *> GCArgs,
       const Twine &Name = "");
 
   // Convenience function for the common case when CallArgs are filled in using
@@ -837,7 +837,7 @@ class IRBuilderBase {
   CreateGCStatepointInvoke(uint64_t ID, uint32_t NumPatchBytes,
                            Value *ActualInvokee, BasicBlock *NormalDest,
                            BasicBlock *UnwindDest, ArrayRef<Use> InvokeArgs,
-                           ArrayRef<Value *> DeoptArgs,
+                           Optional<ArrayRef<Value *>> DeoptArgs,
                            ArrayRef<Value *> GCArgs, const Twine &Name = "");
 
   /// Create a call to the experimental.gc.result intrinsic to extract
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index ea5032eb83d7a..ef75d5c732163 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -575,12 +575,11 @@ CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
   return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes);
 }
 
-template <typename T0, typename T1, typename T2, typename T3>
+template <typename T0, typename T1>
 static std::vector<Value *>
 getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
                   Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs,
-                  ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs,
-                  ArrayRef<T3> GCArgs) {
+                  ArrayRef<T1> GCArgs) {
   std::vector<Value *> Args;
   Args.push_back(B.getInt64(ID));
   Args.push_back(B.getInt32(NumPatchBytes));
@@ -588,20 +587,40 @@ getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
   Args.push_back(B.getInt32(CallArgs.size()));
   Args.push_back(B.getInt32(Flags));
   Args.insert(Args.end(), CallArgs.begin(), CallArgs.end());
-  Args.push_back(B.getInt32(TransitionArgs.size()));
-  Args.insert(Args.end(), TransitionArgs.begin(), TransitionArgs.end());
-  Args.push_back(B.getInt32(DeoptArgs.size()));
-  Args.insert(Args.end(), DeoptArgs.begin(), DeoptArgs.end());
+  // GC Transition and Deopt args are now always handled via operand bundle.
+  // They will be removed from the signature of gc.statepoint shortly.
+  Args.push_back(B.getInt32(0));
+  Args.push_back(B.getInt32(0));
   Args.insert(Args.end(), GCArgs.begin(), GCArgs.end());
 
   return Args;
 }
 
+template<typename T1, typename T2>
+static std::vector<OperandBundleDef>
+getStatepointBundles(Optional<ArrayRef<T1>> TransitionArgs,
+                     Optional<ArrayRef<T2>> DeoptArgs) {
+  std::vector<OperandBundleDef> Rval;
+  if (DeoptArgs) {
+    SmallVector<Value*, 16> DeoptValues;
+    DeoptValues.insert(DeoptValues.end(), DeoptArgs->begin(), DeoptArgs->end());
+    Rval.emplace_back("deopt", DeoptValues);
+  }
+  if (TransitionArgs) {
+    SmallVector<Value*, 16> TransitionValues;
+    TransitionValues.insert(TransitionValues.end(),
+                            TransitionArgs->begin(), TransitionArgs->end());
+    Rval.emplace_back("gc-transition", TransitionValues);
+  }
+  return Rval;
+}
+
 template <typename T0, typename T1, typename T2, typename T3>
 static CallInst *CreateGCStatepointCallCommon(
     IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes,
     Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs,
-    ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs,
+    Optional<ArrayRef<T1>> TransitionArgs,
+    Optional<ArrayRef<T2>> DeoptArgs, ArrayRef<T3> GCArgs,
     const Twine &Name) {
   // Extract out the type of the callee.
   auto *FuncPtrType = cast<PointerType>(ActualCallee->getType());
@@ -617,13 +636,16 @@ static CallInst *CreateGCStatepointCallCommon(
 
   std::vector<Value *> Args =
       getStatepointArgs(*Builder, ID, NumPatchBytes, ActualCallee, Flags,
-                        CallArgs, TransitionArgs, DeoptArgs, GCArgs);
-  return createCallHelper(FnStatepoint, Args, Builder, Name);
+                        CallArgs, GCArgs);
+
+  return Builder->CreateCall(FnStatepoint, Args,
+                             getStatepointBundles(TransitionArgs, DeoptArgs),
+                             Name);
 }
 
 CallInst *IRBuilderBase::CreateGCStatepointCall(
     uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee,
-    ArrayRef<Value *> CallArgs, ArrayRef<Value *> DeoptArgs,
+    ArrayRef<Value *> CallArgs, Optional<ArrayRef<Value *>> DeoptArgs,
     ArrayRef<Value *> GCArgs, const Twine &Name) {
   return CreateGCStatepointCallCommon<Value *, Value *, Value *, Value *>(
       this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None),
@@ -632,8 +654,9 @@ CallInst *IRBuilderBase::CreateGCStatepointCall(
 
 CallInst *IRBuilderBase::CreateGCStatepointCall(
     uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, uint32_t Flags,
-    ArrayRef<Use> CallArgs, ArrayRef<Use> TransitionArgs,
-    ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
+    ArrayRef<Use> CallArgs, Optional<ArrayRef<Use>> TransitionArgs,
+    Optional<ArrayRef<Use>> DeoptArgs, ArrayRef<Value *> GCArgs,
+    const Twine &Name) {
   return CreateGCStatepointCallCommon<Use, Use, Use, Value *>(
       this, ID, NumPatchBytes, ActualCallee, Flags, CallArgs, TransitionArgs,
       DeoptArgs, GCArgs, Name);
@@ -641,7 +664,7 @@ CallInst *IRBuilderBase::CreateGCStatepointCall(
 
 CallInst *IRBuilderBase::CreateGCStatepointCall(
     uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee,
-    ArrayRef<Use> CallArgs, ArrayRef<Value *> DeoptArgs,
+    ArrayRef<Use> CallArgs, Optional<ArrayRef<Value *>> DeoptArgs,
     ArrayRef<Value *> GCArgs, const Twine &Name) {
   return CreateGCStatepointCallCommon<Use, Value *, Value *, Value *>(
       this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None),
@@ -652,8 +675,9 @@ template <typename T0, typename T1, typename T2, typename T3>
 static InvokeInst *CreateGCStatepointInvokeCommon(
     IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes,
     Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest,
-    uint32_t Flags, ArrayRef<T0> InvokeArgs, ArrayRef<T1> TransitionArgs,
-    ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs, const Twine &Name) {
+    uint32_t Flags, ArrayRef<T0> InvokeArgs,
+    Optional<ArrayRef<T1>> TransitionArgs, Optional<ArrayRef<T2>> DeoptArgs,
+    ArrayRef<T3> GCArgs, const Twine &Name) {
   // Extract out the type of the callee.
   auto *FuncPtrType = cast<PointerType>(ActualInvokee->getType());
   assert(isa<FunctionType>(FuncPtrType->getElementType()) &&
@@ -666,15 +690,17 @@ static InvokeInst *CreateGCStatepointInvokeCommon(
 
   std::vector<Value *> Args =
       getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee, Flags,
-                        InvokeArgs, TransitionArgs, DeoptArgs, GCArgs);
+                        InvokeArgs, GCArgs);
+
   return Builder->CreateInvoke(FnStatepoint, NormalDest, UnwindDest, Args,
+                               getStatepointBundles(TransitionArgs, DeoptArgs),
                                Name);
 }
 
 InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
     uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
     BasicBlock *NormalDest, BasicBlock *UnwindDest,
-    ArrayRef<Value *> InvokeArgs, ArrayRef<Value *> DeoptArgs,
+    ArrayRef<Value *> InvokeArgs, Optional<ArrayRef<Value *>> DeoptArgs,
     ArrayRef<Value *> GCArgs, const Twine &Name) {
   return CreateGCStatepointInvokeCommon<Value *, Value *, Value *, Value *>(
       this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest,
@@ -685,8 +711,8 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
 InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
     uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
     BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags,
-    ArrayRef<Use> InvokeArgs, ArrayRef<Use> TransitionArgs,
-    ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
+    ArrayRef<Use> InvokeArgs, Optional<ArrayRef<Use>> TransitionArgs,
+    Optional<ArrayRef<Use>> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
   return CreateGCStatepointInvokeCommon<Use, Use, Use, Value *>(
       this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, Flags,
       InvokeArgs, TransitionArgs, DeoptArgs, GCArgs, Name);
@@ -695,7 +721,7 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
 InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
     uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
     BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef<Use> InvokeArgs,
-    ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
+    Optional<ArrayRef<Value *>> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
   return CreateGCStatepointInvokeCommon<Use, Value *, Value *, Value *>(
       this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest,
       uint32_t(StatepointFlags::None), InvokeArgs, None, DeoptArgs, GCArgs,
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index ab284b75ee2c2..ec14bca90801a 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1490,12 +1490,14 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
   uint32_t Flags = uint32_t(StatepointFlags::None);
 
   ArrayRef<Use> CallArgs(Call->arg_begin(), Call->arg_end());
-  ArrayRef<Use> DeoptArgs = GetDeoptBundleOperands(Call);
-  ArrayRef<Use> TransitionArgs;
-  if (auto TransitionBundle =
-          Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
+  Optional<ArrayRef<Use>> DeoptArgs;
+  if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_deopt))
+    DeoptArgs = Bundle->Inputs;
+  Optional<ArrayRef<Use>> TransitionArgs;
+  if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
+    TransitionArgs = Bundle->Inputs;
+    // TODO: This flag no longer serves a purpose and can be removed later
     Flags |= uint32_t(StatepointFlags::GCTransition);
-    TransitionArgs = TransitionBundle->Inputs;
   }
 
   // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll b/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll
index 7fe70b22eb100..d66a3b537404e 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll
@@ -15,7 +15,7 @@ define void @test(i32 %condition) gc "statepoint-example" {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i64 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* ()* @generate_obj, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i64 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* ()* @generate_obj, i32 0, i32 0, i32 0, i32 0) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 addrspace(1)* @llvm.experimental.gc.result.p1i64(token [[STATEPOINT_TOKEN]])
 ; CHECK-NEXT:    switch i32 [[CONDITION:%.*]], label [[DEST_A:%.*]] [
 ; CHECK-NEXT:    i32 0, label [[DEST_B:%.*]]
@@ -30,14 +30,14 @@ define void @test(i32 %condition) gc "statepoint-example" {
 ; CHECK:       merge:
 ; CHECK-NEXT:    [[OBJ_TO_CONSUME_BASE:%.*]] = phi i64 addrspace(1)* [ [[TMP0]], [[DEST_A]] ], [ null, [[DEST_B]] ], [ null, [[DEST_C]] ], !is_base_value !0
 ; CHECK-NEXT:    [[OBJ_TO_CONSUME:%.*]] = phi i64 addrspace(1)* [ [[TMP0]], [[DEST_A]] ], [ null, [[DEST_B]] ], [ null, [[DEST_C]] ]
-; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 2882400000, i32 0, void (i64 addrspace(1)*)* @consume_obj, i32 1, i32 0, i64 addrspace(1)* [[OBJ_TO_CONSUME]], i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* [[OBJ_TO_CONSUME_BASE]], i64 addrspace(1)* [[OBJ_TO_CONSUME]])
-; CHECK-NEXT:    [[OBJ_TO_CONSUME_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 13, i32 13)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 2882400000, i32 0, void (i64 addrspace(1)*)* @consume_obj, i32 1, i32 0, i64 addrspace(1)* [[OBJ_TO_CONSUME]], i32 0, i32 0, i64 addrspace(1)* [[OBJ_TO_CONSUME_BASE]], i64 addrspace(1)* [[OBJ_TO_CONSUME]]) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+; CHECK-NEXT:    [[OBJ_TO_CONSUME_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 8)
 ; CHECK-NEXT:    [[OBJ_TO_CONSUME_BASE_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[OBJ_TO_CONSUME_BASE_RELOCATED]] to i64 addrspace(1)*
-; CHECK-NEXT:    [[OBJ_TO_CONSUME_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 13, i32 14)
+; CHECK-NEXT:    [[OBJ_TO_CONSUME_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 9)
 ; CHECK-NEXT:    [[OBJ_TO_CONSUME_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[OBJ_TO_CONSUME_RELOCATED]] to i64 addrspace(1)*
 ; CHECK-NEXT:    br label [[MERGE_SPLIT:%.*]]
 ; CHECK:       merge.split:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll b/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll
index 8e052a61a4dc3..06b88b4457ef1 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/basic.ll
@@ -8,8 +8,8 @@ declare i32 @h()
 define i32 addrspace(1)* @f0(i32 addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-LABEL: @f0(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
-; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* [[ARG:%.*]]) [ "deopt"(i32 100) ]
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 7)
 ; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
 ; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
 ;
@@ -22,16 +22,16 @@ define i32 addrspace(1)* @f0(i32 addrspace(1)* %arg) gc "statepoint-example" {
 define i32 addrspace(1)* @f1(i32 addrspace(1)* %arg) gc "statepoint-example"  personality i32 8  {
 ; CHECK-LABEL: @f1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* [[ARG:%.*]]) [ "deopt"(i32 100) ]
 ; CHECK-NEXT:    to label [[NORMAL_DEST:%.*]] unwind label [[UNWIND_DEST:%.*]]
 ; CHECK:       normal_dest:
-; CHECK-NEXT:    [[ARG_RELOCATED1:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED1:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 7)
 ; CHECK-NEXT:    [[ARG_RELOCATED1_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED1]] to i32 addrspace(1)*
 ; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED1_CASTED]]
 ; CHECK:       unwind_dest:
 ; CHECK-NEXT:    [[LPAD:%.*]] = landingpad token
 ; CHECK-NEXT:    cleanup
-; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[LPAD]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[LPAD]], i32 7, i32 7)
 ; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
 ; CHECK-NEXT:    resume token undef
 ;
@@ -49,9 +49,9 @@ define i32 addrspace(1)* @f1(i32 addrspace(1)* %arg) gc "statepoint-example"  pe
 define i32 addrspace(1)* @f2(i32 addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-LABEL: @f2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* [[ARG:%.*]]) [ "deopt"(i32 100) ]
 ; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @llvm.experimental.gc.result.i32(token [[STATEPOINT_TOKEN]])
-; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 7)
 ; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
 ; CHECK-NEXT:    store i32 [[VAL1]], i32 addrspace(1)* [[ARG_RELOCATED_CASTED]], align 4
 ; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
@@ -67,18 +67,18 @@ define i32 addrspace(1)* @f2(i32 addrspace(1)* %arg) gc "statepoint-example" {
 define i32 addrspace(1)* @f3(i32 addrspace(1)* %arg) gc "statepoint-example"  personality i32 8  {
 ; CHECK-LABEL: @f3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = invoke token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* [[ARG:%.*]])
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = invoke token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* [[ARG:%.*]]) [ "deopt"(i32 100) ]
 ; CHECK-NEXT:    to label [[NORMAL_DEST:%.*]] unwind label [[UNWIND_DEST:%.*]]
 ; CHECK:       normal_dest:
 ; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @llvm.experimental.gc.result.i32(token [[STATEPOINT_TOKEN]])
-; CHECK-NEXT:    [[ARG_RELOCATED2:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED2:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 7)
 ; CHECK-NEXT:    [[ARG_RELOCATED2_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED2]] to i32 addrspace(1)*
 ; CHECK-NEXT:    store i32 [[VAL1]], i32 addrspace(1)* [[ARG_RELOCATED2_CASTED]], align 4
 ; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED2_CASTED]]
 ; CHECK:       unwind_dest:
 ; CHECK-NEXT:    [[LPAD:%.*]] = landingpad token
 ; CHECK-NEXT:    cleanup
-; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[LPAD]], i32 8, i32 8)
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[LPAD]], i32 7, i32 7)
 ; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
 ; CHECK-NEXT:    resume token undef
 ;
@@ -99,8 +99,8 @@ define i32 addrspace(1)* @f3(i32 addrspace(1)* %arg) gc "statepoint-example"  pe
 define i32 addrspace(1)* @f4(i32 addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-LABEL: @f4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 1, i32 2, i32 400, i8 90, i32 0, i32 addrspace(1)* [[ARG:%.*]])
-; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 9, i32 9)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 1, i32 0, i32 0, i32 addrspace(1)* [[ARG:%.*]]) [ "gc-transition"(i32 400, i8 90) ]
+; CHECK-NEXT:    [[ARG_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 7)
 ; CHECK-NEXT:    [[ARG_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
 ; CHECK-NEXT:    ret i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
 ;
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll b/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
index 65e38d9d37587..961b9a11246ea 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
@@ -14,9 +14,9 @@ declare void @baz() "deopt-lowering"="live-through"
 define void @test1() gc "statepoint-example" {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 1, i32 57)
-; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 42)
-; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 13)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0) [ "deopt"(i32 57) ]
+; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) [ "deopt"(i32 42) ]
+; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 0) [ "deopt"(i32 13) ]
 ; CHECK-NEXT:    ret void
 ;
 
@@ -31,7 +31,7 @@ entry:
 define void @test2() gc "statepoint-example" {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 2, i32 0, i32 1, i32 57) #0
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 2, i32 0, i32 0) #0 [ "deopt"(i32 57) ]
 ; CHECK-NEXT:    ret void
 ;
 
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll
index 1cfda09b2c1b0..ceef9ae8f6e49 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector-2.ll
@@ -21,18 +21,18 @@ define void @widget() gc "statepoint-example" {
 ; CHECK:       bb11:
 ; CHECK-NEXT:    [[TMP12_BASE:%.*]] = phi i8 addrspace(1)* [ [[BASE_EE]], [[BB7]] ], [ [[BASE_EE]], [[BB9]] ], !is_base_value !0
 ; CHECK-NEXT:    [[TMP12:%.*]] = phi i8 addrspace(1)* [ [[TMP8]], [[BB7]] ], [ [[TMP10]], [[BB9]] ]
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 1, i32 undef, i8 addrspace(1)* [[TMP12_BASE]], i8 addrspace(1)* [[TMP12]])
-; CHECK-NEXT:    [[TMP12_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
-; CHECK-NEXT:    [[TMP12_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 9)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* [[TMP12_BASE]], i8 addrspace(1)* [[TMP12]]) [ "deopt"(i32 undef) ]
+; CHECK-NEXT:    [[TMP12_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 7)
+; CHECK-NEXT:    [[TMP12_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 8)
 ; CHECK-NEXT:    br label [[BB15]]
 ; CHECK:       bb15:
 ; CHECK-NEXT:    [[TMP16_BASE:%.*]] = phi i8 addrspace(1)* [ [[BASE_EE]], [[BB9]] ], [ [[TMP12_BASE_RELOCATED]], [[BB11]] ], !is_base_value !0
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i8 addrspace(1)* [ [[TMP10]], [[BB9]] ], [ [[TMP12_RELOCATED]], [[BB11]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB17:%.*]], label [[BB20:%.*]]
 ; CHECK:       bb17:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 1, i32 undef, i8 addrspace(1)* [[TMP16_BASE]], i8 addrspace(1)* [[TMP16]])
-; CHECK-NEXT:    [[TMP16_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 8)
-; CHECK-NEXT:    [[TMP16_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 8, i32 9)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @snork, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* [[TMP16_BASE]], i8 addrspace(1)* [[TMP16]]) [ "deopt"(i32 undef) ]
+; CHECK-NEXT:    [[TMP16_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 7, i32 7)
+; CHECK-NEXT:    [[TMP16_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN1]], i32 7, i32 8)
 ; CHECK-NEXT:    br label [[BB20]]
 ; CHECK:       bb20:
 ; CHECK-DAG:    [[DOT05:%.*]] = phi i8 addrspace(1)* [ [[TMP16_BASE_RELOCATED]], [[BB17]] ], [ [[TMP16_BASE]], [[BB15]] ]
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll
index 34af81cd7337e..fad51e32ad2f1 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll
@@ -10,7 +10,7 @@ define i32 addrspace(1)* @test1(i8 addrspace(1)* %base1, <2 x i64> %offsets) gc
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 undef, label [[FIRST:%.*]], label [[SECOND:%.*]]
 ; CHECK:       first:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i8 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i8f(i64 2882400000, i32 0, i8 addrspace(1)* ()* @def_ptr, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i8 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i8f(i64 2882400000, i32 0, i8 addrspace(1)* ()* @def_ptr, i32 0, i32 0, i32 0, i32 0) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
 ; CHECK-NEXT:    [[BASE21:%.*]] = call i8 addrspace(1)* @llvm.experimental.gc.result.p1i8(token [[STATEPOINT_TOKEN]])
 ; CHECK-NEXT:    br label [[SECOND]]
 ; CHECK:       second:
@@ -25,10 +25,10 @@ define i32 addrspace(1)* @test1(i8 addrspace(1)* %base1, <2 x i64> %offsets) gc
 ; CHECK-NEXT:    [[VEC:%.*]] = getelementptr i32, <2 x i32 addrspace(1)*> [[DOTSPLAT]], <2 x i64> [[OFFSETS:%.*]]
 ; CHECK-NEXT:    [[PTR_BASE:%.*]] = extractelement <2 x i32 addrspace(1)*> [[DOTSPLAT_BASE]], i32 1, !is_base_value !0
 ; CHECK-NEXT:    [[PTR:%.*]] = extractelement <2 x i32 addrspace(1)*> [[VEC]], i32 1
-; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* [[PTR]], i32 addrspace(1)* [[PTR_BASE]])
-; CHECK-NEXT:    [[PTR_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 13, i32 12)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* [[PTR]], i32 addrspace(1)* [[PTR_BASE]]) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+; CHECK-NEXT:    [[PTR_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 8, i32 7)
 ; CHECK-NEXT:    [[PTR_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[PTR_RELOCATED]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[PTR_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 13, i32 13)
+; CHECK-NEXT:    [[PTR_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 8, i32 8)
 ; CHECK-NEXT:    [[PTR_BASE_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[PTR_BASE_RELOCATED]] to i32 addrspace(1)*
 ; CHECK-NEXT:    ret i32 addrspace(1)* [[PTR_RELOCATED_CASTED]]
 ;
@@ -151,9 +151,9 @@ define void @test6() gc "statepoint-example" {
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i8 addrspace(1)* [ [[TMP6:%.*]], [[LATCH]] ], [ undef, [[BB]] ]
 ; CHECK-NEXT:    br label [[BB10:%.*]]
 ; CHECK:       bb10:
-; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @spam, i32 0, i32 0, i32 0, i32 1, i8 addrspace(1)* [[TMP]], i8 addrspace(1)* [[TMP]], i8 addrspace(1)* [[TMP_BASE]])
-; CHECK-NEXT:    [[TMP_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 9, i32 8)
-; CHECK-NEXT:    [[TMP_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 9, i32 9)
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @spam, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* [[TMP]], i8 addrspace(1)* [[TMP_BASE]]) [ "deopt"(i8 addrspace(1)* [[TMP]]) ]
+; CHECK-NEXT:    [[TMP_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 7)
+; CHECK-NEXT:    [[TMP_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
 ; CHECK-NEXT:    br label [[BB25:%.*]]
 ; CHECK:       bb25:
 ; CHECK-NEXT:    [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, <2 x i8 addrspace(1)*> ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_v2p1i8f(i64 2882400000, i32 0, <2 x i8 addrspace(1)*> ()* @baz, i32 0, i32 0, i32 0, i32 0)

From db3b970a84325e326bbcec4bc3e5e663f148a481 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Thu, 28 May 2020 20:22:18 +0300
Subject: [PATCH 387/770] [analyzer] Remove unused function. NFC.

---
 .../Core/RangeConstraintManager.cpp           | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 6f92b965ce5b3..0822a9461fa7c 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -929,30 +929,6 @@ RangeSet RangeConstraintManager::getRange(ProgramStateRef State,
   return SymbolicRangeInferrer::inferRange(getBasicVals(), F, State, Sym);
 }
 
-// FIXME: Once SValBuilder supports unary minus, we should use SValBuilder to
-//        obtain the negated symbolic expression instead of constructing the
-//        symbol manually. This will allow us to support finding ranges of not
-//        only negated SymSymExpr-type expressions, but also of other, simpler
-//        expressions which we currently do not know how to negate.
-const RangeSet*
-RangeConstraintManager::getRangeForMinusSymbol(ProgramStateRef State,
-                                               SymbolRef Sym) {
-  if (const SymSymExpr *SSE = dyn_cast<SymSymExpr>(Sym)) {
-    if (SSE->getOpcode() == BO_Sub) {
-      QualType T = Sym->getType();
-      SymbolManager &SymMgr = State->getSymbolManager();
-      SymbolRef negSym = SymMgr.getSymSymExpr(SSE->getRHS(), BO_Sub,
-                                              SSE->getLHS(), T);
-      if (const RangeSet *negV = State->get<ConstraintRange>(negSym)) {
-        if (T->isUnsignedIntegerOrEnumerationType() ||
-            T->isSignedIntegerOrEnumerationType())
-          return negV;
-      }
-    }
-  }
-  return nullptr;
-}
-
 //===------------------------------------------------------------------------===
 // assumeSymX methods: protected interface for RangeConstraintManager.
 //===------------------------------------------------------------------------===/

From 116dcbebc6a1648b4acd1a1a391c1d66a3eb4b5f Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Thu, 28 May 2020 20:28:17 +0300
Subject: [PATCH 388/770] [analyzer] Remove unused function declaration. NFC.

---
 clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
index 0822a9461fa7c..a14b29c6face6 100644
--- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp
@@ -801,7 +801,6 @@ class RangeConstraintManager : public RangedConstraintManager {
   RangeSet::Factory F;
 
   RangeSet getRange(ProgramStateRef State, SymbolRef Sym);
-  const RangeSet *getRangeForMinusSymbol(ProgramStateRef State, SymbolRef Sym);
 
   RangeSet getSymLTRange(ProgramStateRef St, SymbolRef Sym,
                          const llvm::APSInt &Int,

From a7fa35a629e85a72b8cf07a8f95c7c09d9663808 Mon Sep 17 00:00:00 2001
From: Hiroshi Yamauchi <yamauchi@google.com>
Date: Thu, 21 May 2020 13:28:24 -0700
Subject: [PATCH 389/770] [ThinLTO] Compute the basic block count across
 modules.

Summary:
Count the number of basic blocks in each module when its module summary is
computed, and sum the per-module counts during ThinLTO indexing.

The total is used to estimate the working set size under partial sample PGO.

This is split off of D79831.

Reviewers: davidxl, espindola

Subscribers: emaste, inglorion, hiraditya, MaskRay, steven_wu, dexonsmith, arphaman, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80403
---
 lld/test/COFF/thinlto-index-only.ll            |  1 +
 lld/test/ELF/lto/thinlto-index-only.ll         |  1 +
 llvm/include/llvm/Bitcode/LLVMBitCodes.h       |  2 ++
 llvm/include/llvm/IR/ModuleSummaryIndex.h      | 12 ++++++++++--
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp    |  1 +
 llvm/lib/AsmParser/LLLexer.cpp                 |  1 +
 llvm/lib/AsmParser/LLParser.cpp                | 18 ++++++++++++++++++
 llvm/lib/AsmParser/LLParser.h                  |  1 +
 llvm/lib/AsmParser/LLToken.h                   |  1 +
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp    |  1 +
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp      |  3 +++
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp      |  6 ++++++
 llvm/lib/IR/AsmWriter.cpp                      |  7 ++++++-
 llvm/test/Assembler/thinlto-vtable-summary.ll  |  1 +
 llvm/test/Bitcode/thinlto-alias.ll             |  4 ++++
 llvm/test/Bitcode/thinlto-alias2.ll            |  1 +
 .../thinlto-function-summary-callgraph-cast.ll |  1 +
 .../thinlto-function-summary-callgraph-pgo.ll  |  2 ++
 ...nction-summary-callgraph-profile-summary.ll |  6 ++++--
 ...thinlto-function-summary-callgraph-relbf.ll |  1 +
 ...summary-callgraph-sample-profile-summary.ll |  6 ++++--
 .../thinlto-function-summary-callgraph.ll      |  2 ++
 .../thinlto-function-summary-originalnames.ll  |  1 +
 llvm/test/Bitcode/thinlto-function-summary.ll  |  1 +
 llvm/test/ThinLTO/X86/distributed_indexes.ll   |  2 ++
 llvm/test/tools/llvm-lto/thinlto.ll            |  1 +
 26 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/lld/test/COFF/thinlto-index-only.ll b/lld/test/COFF/thinlto-index-only.ll
index 996c888cbe22b..8abc7b5213c99 100644
--- a/lld/test/COFF/thinlto-index-only.ll
+++ b/lld/test/COFF/thinlto-index-only.ll
@@ -39,6 +39,7 @@
 ; BACKEND2-NEXT: <FLAGS
 ; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
+; BACKEND2-NEXT: <BLOCK_COUNT op0=2/>
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 ; Thin archive tests. Check that the module paths point to the original files.
diff --git a/lld/test/ELF/lto/thinlto-index-only.ll b/lld/test/ELF/lto/thinlto-index-only.ll
index 86510f6cd3dd7..cccab1d16eb91 100644
--- a/lld/test/ELF/lto/thinlto-index-only.ll
+++ b/lld/test/ELF/lto/thinlto-index-only.ll
@@ -71,6 +71,7 @@
 ; BACKEND2-NEXT: <FLAGS
 ; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
+; BACKEND2-NEXT: <BLOCK_COUNT op0=2/>
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index e0093a12a7fe1..4de7a152fa6b9 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -290,6 +290,8 @@ enum GlobalValueSummarySymtabCodes {
   //                                        numrefs, numrefs x valueid,
   //                                        n x (valueid, offset)]
   FS_PERMODULE_VTABLE_GLOBALVAR_INIT_REFS = 23,
+  // The total number of basic blocks in the module.
+  FS_BLOCK_COUNT = 24,
 };
 
 enum MetadataCodes {
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 8259df101178d..7a7ca7f76ae8e 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1007,6 +1007,10 @@ class ModuleSummaryIndex {
   StringSaver Saver;
   BumpPtrAllocator Alloc;
 
+  // The total number of basic blocks in the module in the per-module summary or
+  // the total number of basic blocks in the LTO unit in the combined index.
+  uint64_t BlockCount;
+
   // YAML I/O support.
   friend yaml::MappingTraits<ModuleSummaryIndex>;
 
@@ -1019,8 +1023,8 @@ class ModuleSummaryIndex {
 public:
   // See HaveGVs variable comment.
   ModuleSummaryIndex(bool HaveGVs, bool EnableSplitLTOUnit = false)
-      : HaveGVs(HaveGVs), EnableSplitLTOUnit(EnableSplitLTOUnit), Saver(Alloc) {
-  }
+      : HaveGVs(HaveGVs), EnableSplitLTOUnit(EnableSplitLTOUnit), Saver(Alloc),
+        BlockCount(0) {}
 
   // Current version for the module summary in bitcode files.
   // The BitcodeSummaryVersion should be bumped whenever we introduce changes
@@ -1039,6 +1043,10 @@ class ModuleSummaryIndex {
   uint64_t getFlags() const;
   void setFlags(uint64_t Flags);
 
+  uint64_t getBlockCount() const { return BlockCount; }
+  void addBlockCount(uint64_t C) { BlockCount += C; }
+  void setBlockCount(uint64_t C) { BlockCount = C; }
+
   gvsummary_iterator begin() { return GlobalValueMap.begin(); }
   const_gvsummary_iterator begin() const { return GlobalValueMap.begin(); }
   gvsummary_iterator end() { return GlobalValueMap.end(); }
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index b900a38fdc0af..c5e5e32b6d65c 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -390,6 +390,7 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
               .updateHotness(getHotness(Candidate.Count, PSI));
       }
     }
+  Index.addBlockCount(F.size());
 
   std::vector<ValueInfo> Refs;
   if (IsThinLTO) {
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 71cf22ca47536..eb1209ad9c6fe 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -741,6 +741,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(name);
   KEYWORD(summaries);
   KEYWORD(flags);
+  KEYWORD(blockcount);
   KEYWORD(linkage);
   KEYWORD(notEligibleToImport);
   KEYWORD(live);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index a2c1b3f632af8..fb6e61ffce4ca 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -853,6 +853,9 @@ bool LLParser::ParseSummaryEntry() {
   case lltok::kw_flags:
     result = ParseSummaryIndexFlags();
     break;
+  case lltok::kw_blockcount:
+    result = ParseBlockCount();
+    break;
   default:
     result = Error(Lex.getLoc(), "unexpected summary kind");
     break;
@@ -8111,6 +8114,21 @@ bool LLParser::ParseSummaryIndexFlags() {
   return false;
 }
 
+/// ParseBlockCount
+///   ::= 'blockcount' ':' UInt64
+bool LLParser::ParseBlockCount() {
+  assert(Lex.getKind() == lltok::kw_blockcount);
+  Lex.Lex();
+
+  if (ParseToken(lltok::colon, "expected ':' here"))
+    return true;
+  uint64_t BlockCount;
+  if (ParseUInt64(BlockCount))
+    return true;
+  Index->setBlockCount(BlockCount);
+  return false;
+}
+
 /// ParseGVEntry
 ///   ::= 'gv' ':' '(' ('name' ':' STRINGCONSTANT | 'guid' ':' UInt64)
 ///         [',' 'summaries' ':' Summary[',' Summary]* ]? ')'
diff --git a/llvm/lib/AsmParser/LLParser.h b/llvm/lib/AsmParser/LLParser.h
index 894e23b684033..be23ba436370d 100644
--- a/llvm/lib/AsmParser/LLParser.h
+++ b/llvm/lib/AsmParser/LLParser.h
@@ -341,6 +341,7 @@ namespace llvm {
     bool ParseModuleReference(StringRef &ModulePath);
     bool ParseGVReference(ValueInfo &VI, unsigned &GVId);
     bool ParseSummaryIndexFlags();
+    bool ParseBlockCount();
     bool ParseGVEntry(unsigned ID);
     bool ParseFunctionSummary(std::string Name, GlobalValue::GUID, unsigned ID);
     bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID);
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index 201ba04025a05..a8b4ad963f0a8 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -374,6 +374,7 @@ enum Kind {
   kw_name,
   kw_summaries,
   kw_flags,
+  kw_blockcount,
   kw_linkage,
   kw_notEligibleToImport,
   kw_live,
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 69ab822a74bb2..a72a33a19113d 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -305,6 +305,7 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, CFI_FUNCTION_DECLS)
       STRINGIFY_CODE(FS, TYPE_ID)
       STRINGIFY_CODE(FS, TYPE_ID_METADATA)
+      STRINGIFY_CODE(FS, BLOCK_COUNT)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 4bad2b55bf94e..615b6465e98fb 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -6239,6 +6239,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
     case bitc::FS_TYPE_ID_METADATA:
       parseTypeIdCompatibleVtableSummaryRecord(Record);
       break;
+
+    case bitc::FS_BLOCK_COUNT:
+      TheIndex.addBlockCount(Record[0]);
     }
   }
   llvm_unreachable("Exit infinite loop");
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index a46339a4ec1fd..992cdfc851fb2 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -3906,6 +3906,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     NameVals.clear();
   }
 
+  Stream.EmitRecord(bitc::FS_BLOCK_COUNT,
+                    ArrayRef<uint64_t>{Index->getBlockCount()});
+
   Stream.ExitBlock();
 }
 
@@ -4189,6 +4192,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     }
   }
 
+  Stream.EmitRecord(bitc::FS_BLOCK_COUNT,
+                    ArrayRef<uint64_t>{Index.getBlockCount()});
+
   Stream.ExitBlock();
 }
 
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 68edb6bad9396..0fc7d66d9fa61 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -2822,8 +2822,13 @@ void AssemblyWriter::printModuleSummaryIndex() {
   }
 
   // Don't emit flags when it's not really needed (value is zero by default).
-  if (TheIndex->getFlags())
+  if (TheIndex->getFlags()) {
     Out << "^" << NumSlots << " = flags: " << TheIndex->getFlags() << "\n";
+    ++NumSlots;
+  }
+
+  Out << "^" << NumSlots << " = blockcount: " << TheIndex->getBlockCount()
+      << "\n";
 }
 
 static const char *
diff --git a/llvm/test/Assembler/thinlto-vtable-summary.ll b/llvm/test/Assembler/thinlto-vtable-summary.ll
index 67e7a2f1c93c7..fff1ce7931bf9 100644
--- a/llvm/test/Assembler/thinlto-vtable-summary.ll
+++ b/llvm/test/Assembler/thinlto-vtable-summary.ll
@@ -36,3 +36,4 @@ declare i32 @_ZN1C1fEi(%struct.C*, i32)
 ^6 = typeidCompatibleVTable: (name: "_ZTS1A", summary: ((offset: 16, ^2), (offset: 16, ^4))) ; guid = 7004155349499253778
 ^7 = typeidCompatibleVTable: (name: "_ZTS1B", summary: ((offset: 16, ^2))) ; guid = 6203814149063363976
 ^8 = typeidCompatibleVTable: (name: "_ZTS1C", summary: ((offset: 16, ^4))) ; guid = 1884921850105019584
+^9 = blockcount: 0
diff --git a/llvm/test/Bitcode/thinlto-alias.ll b/llvm/test/Bitcode/thinlto-alias.ll
index a1412f5269ee5..7e8213ec0ca88 100644
--- a/llvm/test/Bitcode/thinlto-alias.ll
+++ b/llvm/test/Bitcode/thinlto-alias.ll
@@ -22,6 +22,7 @@
 ; See if the call to func is registered.
 ; The value id 1 matches the second FUNCTION record above.
 ; CHECK-NEXT:    <PERMODULE {{.*}} op7=1/>
+; CHECK-NEXT:    <BLOCK_COUNT op0=1/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
@@ -37,6 +38,7 @@
 ; COMBINED-NEXT:    <COMBINED {{.*}} op9=[[ALIASID]]/>
 ; COMBINED-NEXT:    <COMBINED {{.*}}
 ; COMBINED-NEXT:    <COMBINED_ALIAS  {{.*}} op3=[[ALIASEEID]]
+; COMBINED-NEXT:    <BLOCK_COUNT op0=2/>
 ; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK
 
 ; ModuleID = 'thinlto-function-summary-callgraph.ll'
@@ -55,9 +57,11 @@ declare void @analias(...)
 ; DIS: ^0 = module: (path: "{{.*}}", hash: (0, 0, 0, 0, 0))
 ; DIS: ^1 = gv: (name: "analias", summaries: (alias: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0), aliasee: ^2))) ; guid = 12695095382722328222
 ; DIS: ^2 = gv: (name: "aliasee", summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0), insts: 1))) ; guid = 17407585008595848568
+; DIS: ^3 = blockcount: 1
 
 ; COMBINED-DIS: ^0 = module: (path: "{{.*}}thinlto-alias.ll.tmp.o", hash: (0, 0, 0, 0, 0))
 ; COMBINED-DIS: ^1 = module: (path: "{{.*}}thinlto-alias.ll.tmp2.o", hash: (0, 0, 0, 0, 0))
 ; COMBINED-DIS: ^2 = gv: (guid: 12695095382722328222, summaries: (alias: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0), aliasee: ^4)))
 ; COMBINED-DIS: ^3 = gv: (guid: 15822663052811949562, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0), insts: 2, calls: ((callee: ^2)))))
 ; COMBINED-DIS: ^4 = gv: (guid: 17407585008595848568, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0), insts: 1)))
+; COMBINED-DIS: ^5 = blockcount: 2
diff --git a/llvm/test/Bitcode/thinlto-alias2.ll b/llvm/test/Bitcode/thinlto-alias2.ll
index d43870632743e..22815f8ec24c1 100644
--- a/llvm/test/Bitcode/thinlto-alias2.ll
+++ b/llvm/test/Bitcode/thinlto-alias2.ll
@@ -8,6 +8,7 @@
 ; CHECK-NEXT:    <PERMODULE {{.*}} op4=0 op5=0 op6=0 op7=[[ALIASID:[0-9]+]]/>
 ; CHECK-NEXT:    <PERMODULE {{.*}} op0=[[ALIASEEID:[0-9]+]]
 ; CHECK-NEXT:    <ALIAS {{.*}} op0=[[ALIASID]] {{.*}} op2=[[ALIASEEID]]/>
+; CHECK-NEXT:    <BLOCK_COUNT op0=2/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; ModuleID = 'thinlto-alias2.ll'
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-cast.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-cast.ll
index e6fc9392745b2..a0cd958eeaaa3 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-cast.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-cast.ll
@@ -12,6 +12,7 @@
 ; CHECK-NEXT:    <PERMODULE {{.*}} op4=3 {{.*}} op9={{[0-9]+}}/>
 ; CHECK-NEXT:    <PERMODULE {{.*}} op0=[[ALIASEEID:[0-9]+]]
 ; CHECK-NEXT:    <ALIAS {{.*}} op0=[[ALIASID]] {{.*}} op2=[[ALIASEEID]]/>
+; CHECK-NEXT:    <BLOCK_COUNT op0=3/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; ModuleID = 'thinlto-function-summary-callgraph-cast.ll'
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
index 2bbab0c6bb0d0..c353b0a8b836b 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
@@ -19,6 +19,7 @@
 ; CHECK-NEXT:    <FLAGS
 ; See if the call to func is registered, using the expected hotness type.
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=1 op8=2/>
+; CHECK-NEXT:    <BLOCK_COUNT op0=1/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 ; CHECK: <STRTAB_BLOCK
 ; CHECK-NEXT: blob data = 'mainfunc{{.*}}'
@@ -32,6 +33,7 @@
 ; See if the call to func is registered, using the expected hotness type.
 ; op6=2 which is hotnessType::None.
 ; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op9=[[FUNCID]] op10=2/>
+; COMBINED-NEXT:    <BLOCK_COUNT op0=2/>
 ; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; ModuleID = 'thinlto-function-summary-callgraph.ll'
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index d444ee7c01fbb..e13e42d8a5623 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -50,6 +50,7 @@
 ; CHECK-NEXT:    <VALUE_GUID op0=25 op1=123/>
 ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=1 op8=3 op9=5 op10=1 op11=2 op12=3 op13=4 op14=1 op15=6 op16=2 op17=3 op18=3 op19=7 op20=2 op21=8 op22=2 op23=25 op24=4/>
+; CHECK-NEXT:    <BLOCK_COUNT op0=6/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
@@ -73,8 +74,9 @@
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op9=[[HOT1:.*]] op10=3 op11=[[COLD:.*]] op12=1 op13=[[HOT2:.*]] op14=3 op15=[[NONE1:.*]] op16=2 op17=[[HOT3:.*]] op18=3 op19=[[NONE2:.*]] op20=2 op21=[[NONE3:.*]] op22=2/>
-; COMBINED_NEXT:    <COMBINED abbrevid=
-; COMBINED_NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
+; COMBINED-NEXT:    <COMBINED abbrevid=
+; COMBINED-NEXT:    <BLOCK_COUNT op0=13/>
+; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 
 ; ModuleID = 'thinlto-function-summary-callgraph.ll'
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll
index 7d13ae43b5065..94d409b1b815b 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-relbf.ll
@@ -15,6 +15,7 @@
 ; CHECK-NEXT:    <FLAGS
 ; See if the call to func is registered.
 ; CHECK-NEXT:    <PERMODULE_RELBF {{.*}} op4=1 {{.*}} op9=256
+; CHECK-NEXT:    <BLOCK_COUNT op0=1/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 ; CHECK: <STRTAB_BLOCK
 ; CHECK-NEXT: blob data = 'undefinedglobmainfunc{{.*}}'
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
index 1e23b10ddc4f0..a59bae3cd7572 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
@@ -33,6 +33,7 @@
 ; CHECK-NEXT:    <VALUE_GUID op0=26 op1=123/>
 ; op4=none1 op6=hot1 op8=cold1 op10=none2 op12=hot2 op14=cold2 op16=none3 op18=hot3 op20=cold3 op22=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=7 op8=0 op9=1 op10=3 op11=4 op12=1 op13=8 op14=0 op15=2 op16=3 op17=5 op18=1 op19=9 op20=0 op21=3 op22=3 op23=6 op24=1 op25=26 op26=4/>
+; CHECK-NEXT:    <BLOCK_COUNT op0=4/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
@@ -60,8 +61,9 @@
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED abbrevid=
 ; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op9=[[NONE1:.*]] op10=0 op11=[[HOT1:.*]] op12=3 op13=[[COLD1:.*]] op14=1 op15=[[NONE2:.*]] op16=0 op17=[[HOT2:.*]] op18=3 op19=[[COLD2:.*]] op20=1 op21=[[NONE3:.*]] op22=0 op23=[[HOT3:.*]] op24=3 op25=[[COLD3:.*]] op26=1/>
-; COMBINED_NEXT:    <COMBINED abbrevid=
-; COMBINED_NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
+; COMBINED-NEXT:    <COMBINED abbrevid=
+; COMBINED-NEXT:    <BLOCK_COUNT op0=13/>
+; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 
 ; ModuleID = 'thinlto-function-summary-callgraph.ll'
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
index 765efcf7b8226..19cfbf7931a31 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
@@ -20,6 +20,7 @@
 ; CHECK-NEXT:    <FLAGS
 ; See if the call to func is registered
 ; CHECK-NEXT:    <PERMODULE {{.*}} op4=1
+; CHECK-NEXT:    <BLOCK_COUNT op0=1/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 ; CHECK: <STRTAB_BLOCK
 ; CHECK-NEXT: blob data = 'undefinedglobmainfunc{{.*}}'
@@ -35,6 +36,7 @@
 ; COMBINED-NEXT:    <COMBINED
 ; See if the call to func is registered.
 ; COMBINED-NEXT:    <COMBINED {{.*}} op9=[[FUNCID]]/>
+; COMBINED-NEXT:    <BLOCK_COUNT op0=2/>
 ; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 ; ModuleID = 'thinlto-function-summary-callgraph.ll'
diff --git a/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll b/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
index 4d840d1f8ec8d..208ded58d683a 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
@@ -15,6 +15,7 @@
 ; COMBINED-DAG:    <COMBINED_ORIGINAL_NAME op0=-2012135647395072713/>
 ; COMBINED-DAG:    <COMBINED_ALIAS
 ; COMBINED-DAG:    <COMBINED_ORIGINAL_NAME op0=-4170563161550796836/>
+; COMBINED-NEXT:   <BLOCK_COUNT op0=1/>
 ; COMBINED-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 source_filename = "/path/to/source.c"
diff --git a/llvm/test/Bitcode/thinlto-function-summary.ll b/llvm/test/Bitcode/thinlto-function-summary.ll
index 67c50379e7abd..c9a64018f9d89 100644
--- a/llvm/test/Bitcode/thinlto-function-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary.ll
@@ -25,6 +25,7 @@
 ; BC-NEXT: <PERMODULE {{.*}} op0=3 op1=7
 ; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=0 op2=4 op3=0
 ; BC-NEXT: <ALIAS {{.*}} op0=6 op1=0 op2=3
+; BC-NEXT: <BLOCK_COUNT op0=5/>
 ; BC-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 ; BC: <STRTAB_BLOCK
 ; BC-NEXT: blob data = 'hfoobaranon.{{................................}}.0variadicllvm.va_startf{{.*}}'
diff --git a/llvm/test/ThinLTO/X86/distributed_indexes.ll b/llvm/test/ThinLTO/X86/distributed_indexes.ll
index 228744f638491..65f14a2cb5b86 100644
--- a/llvm/test/ThinLTO/X86/distributed_indexes.ll
+++ b/llvm/test/ThinLTO/X86/distributed_indexes.ll
@@ -23,6 +23,7 @@
 ; BACKEND1-DAG: <COMBINED {{.*}} op1=0
 ; BACKEND1-DAG: <COMBINED {{.*}} op1=1
 ; BACKEND1-DAG: <COMBINED_ALIAS {{.*}} op1=1
+; BACKEND1-NEXT: <BLOCK_COUNT op0=3/>
 ; BACKEND1-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 ; The backend index for Input/distributed_indexes.ll contains summaries from
@@ -38,6 +39,7 @@
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: <COMBINED_ALIAS
+; BACKEND2-NEXT: <BLOCK_COUNT op0=3/>
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 ; Make sure that when the alias is imported as a copy of the aliasee, but the
diff --git a/llvm/test/tools/llvm-lto/thinlto.ll b/llvm/test/tools/llvm-lto/thinlto.ll
index 23843e07d6a5b..542525039a0dd 100644
--- a/llvm/test/tools/llvm-lto/thinlto.ll
+++ b/llvm/test/tools/llvm-lto/thinlto.ll
@@ -16,6 +16,7 @@
 ; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: <COMBINED
+; COMBINED-NEXT: <BLOCK_COUNT op0=2/>
 ; COMBINED-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

From 9c52422cd83137a55e9d198bf123982c040b2e99 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Thu, 28 May 2020 17:15:21 +0200
Subject: [PATCH 390/770] [IR] Avoid linear scan in MDNode::intersect() (NFC)

00940fb8544767ba5217922c4ba96677aabe9eb3 changed this code to
construct a set for the B metadata. However, it still performs a
linear is_contained query, rather than making use of the set
structure.
---
 llvm/lib/IR/Metadata.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index cdd544040f528..ce89009e86eba 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -914,7 +914,7 @@ MDNode *MDNode::intersect(MDNode *A, MDNode *B) {
 
   SmallSetVector<Metadata *, 4> MDs(A->op_begin(), A->op_end());
   SmallPtrSet<Metadata *, 4> BSet(B->op_begin(), B->op_end());
-  MDs.remove_if([&](Metadata *MD) { return !is_contained(BSet, MD); });
+  MDs.remove_if([&](Metadata *MD) { return !BSet.count(MD); });
 
   // FIXME: This preserves long-standing behaviour, but is it really the right
   // behaviour?  Or was that an unintended side-effect of node uniquing?

From a1b88afe46d7a0f72d2acd8792951bd959b27545 Mon Sep 17 00:00:00 2001
From: Yitzhak Mandelbaum <yitzhakm@google.com>
Date: Thu, 28 May 2020 13:36:45 -0400
Subject: [PATCH 391/770] [clang-tidy] Fix build broken by commit
 7cfdff7b4a6704b8ef2a1b594e1ec19d2d89f385 (D80023)

---
 clang-tools-extra/clang-tidy/abseil/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt b/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt
index c4efa0fe27437..5926717c6c0a6 100644
--- a/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/abseil/CMakeLists.txt
@@ -33,4 +33,5 @@ add_clang_library(clangTidyAbseilModule
   clangTidy
   clangTidyUtils
   clangTooling
+  clangTransformer
   )

From 5395389475bcaba16966ab62125f2f54ea81c915 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <vsavchenko@apple.com>
Date: Wed, 27 May 2020 16:06:45 +0300
Subject: [PATCH 392/770] [analyzer] SATestBuild.py: Make verbosity level a cmd
 option

Reviewers: NoQ, dcoughlin

Subscribers: xazax.hun, baloghadamsoftware, szepet, a.sidorin, mikhail.ramalho, Szelethus, donat.nagy, dkrupp, Charusso, ASDenysPetrov, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80626
---
 clang/utils/analyzer/SATestBuild.py | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/clang/utils/analyzer/SATestBuild.py b/clang/utils/analyzer/SATestBuild.py
index e2fe6a95138b3..5ff430d5fcf3a 100755
--- a/clang/utils/analyzer/SATestBuild.py
+++ b/clang/utils/analyzer/SATestBuild.py
@@ -389,7 +389,7 @@ def test(self) -> bool:
         start_time = time.time()
 
         project_dir = self.get_project_dir()
-        if VERBOSE == 1:
+        if VERBOSE >= 1:
             stdout(f"  Build directory: {project_dir}.\n")
 
         # Set the build results directory.
@@ -431,7 +431,7 @@ def build(self, directory: str, output_dir: str):
 
         # Clean up scan build results.
         if os.path.exists(output_dir):
-            if VERBOSE == 1:
+            if VERBOSE >= 1:
                 stdout(f"  Removing old results: {output_dir}\n")
 
             shutil.rmtree(output_dir)
@@ -517,7 +517,7 @@ def scan_build(self, directory: str, output_dir: str, build_log_file: IO):
 
                 command_to_run = command_prefix + command
 
-                if VERBOSE == 1:
+                if VERBOSE >= 1:
                     stdout(f"  Executing: {command_to_run}\n")
 
                 check_call(command_to_run, cwd=cwd,
@@ -575,7 +575,7 @@ def analyze_preprocessed(self, directory: str, output_dir: str):
             log_path = os.path.join(fail_path, file_name + ".stderr.txt")
             with open(log_path, "w+") as log_file:
                 try:
-                    if VERBOSE == 1:
+                    if VERBOSE >= 1:
                         stdout(f"  Executing: {command}\n")
 
                     check_call(command, cwd=directory, stderr=log_file,
@@ -744,7 +744,7 @@ def run_cmp_results(directory: str, strictness: int = 0) -> bool:
     for ref_dir, new_dir in zip(ref_list, new_list):
         assert(ref_dir != new_dir)
 
-        if VERBOSE == 1:
+        if VERBOSE >= 1:
             stdout(f"  Comparing Results: {ref_dir} {new_dir}\n")
 
         patched_source = os.path.join(directory, PATCHED_SOURCE_DIR_NAME)
@@ -818,7 +818,7 @@ def remove_log_file(output_dir: str):
 
     # Clean up the log file.
     if os.path.exists(build_log_path):
-        if VERBOSE == 1:
+        if VERBOSE >= 1:
             stdout(f"  Removing log file: {build_log_path}\n")
 
         os.remove(build_log_path)
@@ -887,29 +887,31 @@ def validate_project_file(map_file: IO):
 
 if __name__ == "__main__":
     # Parse command line arguments.
-    Parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(
         description="Test the Clang Static Analyzer.")
 
-    Parser.add_argument("--strictness", dest="strictness", type=int, default=0,
+    parser.add_argument("--strictness", dest="strictness", type=int, default=0,
                         help="0 to fail on runtime errors, 1 to fail when the "
                         "number of found bugs are different from the "
                         "reference, 2 to fail on any difference from the "
                         "reference. Default is 0.")
-    Parser.add_argument("-r", dest="regenerate", action="store_true",
+    parser.add_argument("-r", dest="regenerate", action="store_true",
                         default=False, help="Regenerate reference output.")
-    Parser.add_argument("--override-compiler", action="store_true",
+    parser.add_argument("--override-compiler", action="store_true",
                         default=False, help="Call scan-build with "
                         "--override-compiler option.")
-    Parser.add_argument("-j", "--jobs", dest="jobs", type=int,
+    parser.add_argument("-j", "--jobs", dest="jobs", type=int,
                         default=0,
                         help="Number of projects to test concurrently")
-    Parser.add_argument("--extra-analyzer-config",
+    parser.add_argument("--extra-analyzer-config",
                         dest="extra_analyzer_config", type=str,
                         default="",
                         help="Arguments passed to to -analyzer-config")
+    parser.add_argument("-v", "--verbose", action="count", default=0)
 
-    args = Parser.parse_args()
+    args = parser.parse_args()
 
+    VERBOSE = args.verbose
     tester = RegressionTester(args.jobs, args.override_compiler,
                               args.extra_analyzer_config, args.regenerate,
                               args.strictness)

From b950c261d9c107d2c9366dd7e583f9f1c4f282f3 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 28 May 2020 10:39:45 -0700
Subject: [PATCH 393/770] [lldb/Reproducers] Add top-level-target
 check-lldb-reproducers

This adds a new target `check-lldb-reproducers` that replaces the old
`check-lldb-repro`. The latter would only run the shell tests, while
`check-lldb-reproducers` includes the API tests as well. The new target
will be used on GreenDragon.

It's still possible to run just the shell tests with reproducers,
although now that requires crafting the lit invocation yourself. The
parameters haven't changed and are the same for the API and shell
tests:

  --param lldb-run-with-repro=capture
  --param lldb-run-with-repro=replay

This patch also updates the reproducer documentation.
---
 lldb/docs/resources/reproducers.rst | 21 ++++++++++++---------
 lldb/test/CMakeLists.txt            | 21 +++++++++++++++++++++
 lldb/test/Shell/CMakeLists.txt      | 17 -----------------
 lldb/test/Shell/lit.cfg.py          |  2 +-
 4 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/lldb/docs/resources/reproducers.rst b/lldb/docs/resources/reproducers.rst
index 370a7925a9e90..1b109c06d9075 100644
--- a/lldb/docs/resources/reproducers.rst
+++ b/lldb/docs/resources/reproducers.rst
@@ -169,15 +169,18 @@ Reproducers are tested in the following ways:
    directory. These tests serve as integration and regression tests for the
    reproducers infrastructure, as well as doing some sanity checking for basic
    debugger functionality.
- - The shell tests can be run against a reproducer replay. The
-   ``check-lldb-repro`` target will run the shell test suite twice. First it
-   runs the test suite and captures a reproducer for every lldb invocation and
-   saves it to a known location based off lldb's arguments and  working
-   directory. Then it runs the test suite again, this time replaying the
-   reproducers. Certain tests do not fit this paradigm (for example test that
-   check the output of the binary being debugged) and are skipped by marking
-   them as unsupported by adding ``UNSUPPORTED: lldb-repro`` to the top of the
-   test.
+ - The API and shell tests can be run against a replayed reproducer. The
+   ``check-lldb-reproducers`` target will run the API and shell test suite
+   twice: first running the test normally while capturing a reproducer and then
+   a second time using the replayed session as the test input. For the shell
+   tests this uses a little shim (``lldb-repro``) that uses the arguments and
+   current working directory to transparently generate or replay a reproducer.
+   For the API tests an extra argument with the reproducer path is passed to
+   ``dotest.py`` which initializes the debugger in the appropriate mode.
+   Certain tests do not fit this paradigm (for example tests that check the
+   output of the binary being debugged) and are skipped by marking them as
+   unsupported by adding ``UNSUPPORTED: lldb-repro`` to the top of the shell
+   test or adding the ``skipIfReproducer`` decorator for the API tests.
 
 Additional testing is possible:
 
diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt
index 3cad416f923a7..96931cbaeaf99 100644
--- a/lldb/test/CMakeLists.txt
+++ b/lldb/test/CMakeLists.txt
@@ -192,6 +192,27 @@ add_dependencies(check-lldb lldb-test-deps)
 set_target_properties(check-lldb PROPERTIES FOLDER "lldb misc")
 add_dependencies(check-lldb check-lldb-lit)
 
+# Add a lit test suite that runs the API & shell test while capturing a
+# reproducer.
+add_lit_testsuite(check-lldb-reproducers-capture
+  "Running lldb test suite with reproducer capture"
+  ${CMAKE_CURRENT_BINARY_DIR}/API
+  ${CMAKE_CURRENT_BINARY_DIR}/Shell
+  PARAMS "lldb-run-with-repro=capture"
+  EXCLUDE_FROM_CHECK_ALL
+  DEPENDS lldb-test-deps)
+
+# Add a lit test suite that runs the API & shell test by replaying a
+# reproducer.
+add_lit_testsuite(check-lldb-reproducers
+  "Running lldb test suite with reproducer replay"
+  ${CMAKE_CURRENT_BINARY_DIR}/API
+  ${CMAKE_CURRENT_BINARY_DIR}/Shell
+  PARAMS "lldb-run-with-repro=replay"
+  EXCLUDE_FROM_CHECK_ALL
+  DEPENDS lldb-test-deps)
+add_dependencies(check-lldb-reproducers check-lldb-reproducers-capture)
+
 if(LLDB_BUILT_STANDALONE)
   # This has to happen *AFTER* add_lit_testsuite.
   if (EXISTS ${LLVM_MAIN_SRC_DIR}/utils/llvm-lit)
diff --git a/lldb/test/Shell/CMakeLists.txt b/lldb/test/Shell/CMakeLists.txt
index 0c78e25d9ceeb..d203f1e093c74 100644
--- a/lldb/test/Shell/CMakeLists.txt
+++ b/lldb/test/Shell/CMakeLists.txt
@@ -15,20 +15,3 @@ if (CMAKE_GENERATOR STREQUAL "Xcode")
     ${CMAKE_CURRENT_BINARY_DIR}
     DEPENDS lldb-test-deps)
 endif()
-
-# Add a lit test suite that runs the shell test while capturing a reproducer.
-add_lit_testsuite(check-lldb-repro-capture
-  "Running lldb shell test suite with reproducer capture"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  PARAMS "lldb-run-with-repro=capture"
-  EXCLUDE_FROM_CHECK_ALL
-  DEPENDS lldb-test-deps)
-
-# Add a lit test suite that runs the shell test by replaying a reproducer.
-add_lit_testsuite(check-lldb-repro
-  "Running lldb shell test suite with reproducer replay"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  PARAMS "lldb-run-with-repro=replay"
-  EXCLUDE_FROM_CHECK_ALL
-  DEPENDS lldb-test-deps)
-add_dependencies(check-lldb-repro check-lldb-repro-capture)
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index 2e23d9dd0eda3..2ee646e3fc7df 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -49,7 +49,7 @@
 lldb_repro_mode = lit_config.params.get('lldb-run-with-repro', None)
 if lldb_repro_mode:
   config.available_features.add('lldb-repro')
-  lit_config.note("Running Shell test with lldb-repro in {} mode.".format(lldb_repro_mode))
+  lit_config.note("Running Shell tests in {} mode.".format(lldb_repro_mode))
   toolchain.use_lldb_repro_substitutions(config, lldb_repro_mode)
 
 llvm_config.use_default_substitutions()

From 501aa47ab8fa62f5be1e41bee6d5b8fbd1fa6627 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 10:45:07 -0700
Subject: [PATCH 394/770] [Statepoint] Sink logic about actual callee into
 GCStatepointInst

Sinking logic around actual callee from Statepoint to GCStatepointInst.  While doing so, adjust naming to be consistent about referring to "actual" callee and follow precedent on naming from CallBase otherwise.

Use the result to simplify one consumer.  This is mostly just to ensure the new code is exercised, but is also a helpful cleanup on its own.
---
 llvm/include/llvm/IR/Statepoint.h             | 43 +++++++++++--------
 .../SelectionDAG/StatepointLowering.cpp       | 11 ++---
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h
index 34eb1126b373f..ce3d5a655df82 100644
--- a/llvm/include/llvm/IR/Statepoint.h
+++ b/llvm/include/llvm/IR/Statepoint.h
@@ -115,6 +115,25 @@ class GCStatepointInst : public CallBase {
   uint64_t getFlags() const {
     return cast<ConstantInt>(getArgOperand(FlagsPos))->getZExtValue();
   }
+
+  /// Return the value actually being called or invoked.
+  Value *getActualCalledOperand() const {
+    return getArgOperand(CalledFunctionPos);
+  }
+
+  /// Returns the function called if this is wrapping a direct call, and null
+  /// otherwise.
+  Function *getActualCalledFunction() const {
+    return dyn_cast_or_null<Function>(getActualCalledOperand());
+  }
+
+  /// Return the type of the value returned by the call underlying the
+  /// statepoint.
+  Type *getActualReturnType() const {
+    auto *CalleeTy =
+      cast<PointerType>(getActualCalledOperand()->getType())->getElementType();
+    return cast<FunctionType>(CalleeTy)->getReturnType();
+  }
 };
 
 /// A wrapper around a GC intrinsic call, this provides most of the actual
@@ -139,7 +158,6 @@ class StatepointBase {
   using arg_iterator = typename CallTy::const_op_iterator;
 
   enum {
-    CalledFunctionPos = GCStatepointInst::CalledFunctionPos,
     CallArgsBeginPos = GCStatepointInst::CallArgsBeginPos,
   };
 
@@ -162,22 +180,18 @@ class StatepointBase {
   uint64_t getID() const { return getCall()->getID(); }
   uint32_t getNumPatchBytes() const { return getCall()->getNumPatchBytes(); }
   int getNumCallArgs() const { return getCall()->getNumCallArgs(); }
-
-
-  /// Return the value actually being called or invoked.
   ValueTy *getCalledValue() const {
-    return getCall()->getArgOperand(CalledFunctionPos);
+    return getCall()->getActualCalledOperand();
+  }
+  Type *getActualReturnType() const { return getCall()->getActualReturnType(); }
+  FunTy *getCalledFunction() const {
+    return getCall()->getActualCalledFunction();
   }
 
+  
   // FIXME: Migrate users of this to `getCall` and remove it.
   InstructionTy *getInstruction() const { return getCall(); }
 
-  /// Return the function being called if this is a direct call, otherwise
-  /// return null (if it's an indirect call).
-  FunTy *getCalledFunction() const {
-    return dyn_cast<Function>(getCalledValue());
-  }
-
   /// Return the caller function for this statepoint.
   FunTy *getCaller() const { return getCall()->getCaller(); }
 
@@ -187,13 +201,6 @@ class StatepointBase {
     return getCall()->doesNotThrow() || (F ? F->doesNotThrow() : false);
   }
 
-  /// Return the type of the value returned by the call underlying the
-  /// statepoint.
-  Type *getActualReturnType() const {
-    auto *FTy = cast<FunctionType>(
-        cast<PointerType>(getCalledValue()->getType())->getElementType());
-    return FTy->getReturnType();
-  }
 
   size_t arg_size() const { return getNumCallArgs(); }
   arg_iterator arg_begin() const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index acb68405470ca..664f56523d9b1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -822,7 +822,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
 #endif
 
   SDValue ActualCallee;
-  SDValue Callee = getValue(ISP.getCalledValue());
+  SDValue Callee = getValue(I.getActualCalledOperand());
 
   if (I.getNumPatchBytes() > 0) {
     // If we've been asked to emit a nop sequence instead of a call instruction
@@ -838,7 +838,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
   StatepointLoweringInfo SI(DAG);
   populateCallLoweringInfo(SI.CLI, &I, GCStatepointInst::CallArgsBeginPos,
                            I.getNumCallArgs(), ActualCallee,
-                           ISP.getActualReturnType(), false /* IsPatchPoint */);
+                           I.getActualReturnType(), false /* IsPatchPoint */);
 
   // There may be duplication in the gc.relocate list; such as two copies of
   // each relocation on normal and exceptional path for an invoke.  We only
@@ -894,7 +894,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
 
   // Export the result value if needed
   const GCResultInst *GCResult = ISP.getGCResult();
-  Type *RetTy = ISP.getActualReturnType();
+  Type *RetTy = I.getActualReturnType();
   if (!RetTy->isVoidTy() && GCResult) {
     if (GCResult->getParent() != I.getParent()) {
       // Result value will be used in a different basic block so we need to
@@ -979,10 +979,7 @@ void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
     // register because statepoint and actual call return types can be
     // different, and getValue() will use CopyFromReg of the wrong type,
     // which is always i32 in our case.
-    PointerType *CalleeType = cast<PointerType>(
-        ImmutableStatepoint(I).getCalledValue()->getType());
-    Type *RetTy =
-        cast<FunctionType>(CalleeType->getElementType())->getReturnType();
+    Type *RetTy = cast<GCStatepointInst>(I)->getActualReturnType();
     SDValue CopyFromReg = getCopyFromRegs(I, RetTy);
 
     assert(CopyFromReg.getNode());

From 72ede60b75ee90b7f4e33d88e4df18bb6f6dc94a Mon Sep 17 00:00:00 2001
From: Alex Zinenko <zinenko@google.com>
Date: Thu, 28 May 2020 18:57:39 +0200
Subject: [PATCH 395/770] [mlir][GPU] Link relevant LLVM components in
 GPUCommon instead of test

D80142 restructured MLIR-to-GPU-binary conversion to support multiple
targets. It also modified cmake files to link relevant LLVM components
in test/lib, which broke shared-library builds, and likely made the
conversions unusable outside mlir-opt (or other tools that link in test
library targets). Link these components to GPUCommon instead.

Differential Revision: https://reviews.llvm.org/D80739
---
 mlir/lib/Conversion/GPUCommon/CMakeLists.txt | 20 ++++++++++++++++++
 mlir/test/lib/Transforms/CMakeLists.txt      | 22 --------------------
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index eb7d21f66f73d..91c281614214b 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -1,3 +1,21 @@
+if (MLIR_CUDA_CONVERSIONS_ENABLED)
+  set(NVPTX_LIBS
+    MC
+    NVPTXCodeGen
+    NVPTXDesc
+    NVPTXInfo
+  )
+endif()
+
+if (MLIR_ROCM_CONVERSIONS_ENABLED)
+  set(AMDGPU_LIBS
+    MC
+    AMDGPUCodeGen
+    AMDGPUDesc
+    AMDGPUInfo
+  )
+endif()
+
 add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms
   ConvertLaunchFuncToRuntimeCalls.cpp
   ConvertKernelFuncToBlob.cpp
@@ -8,6 +26,8 @@ add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms
 
   LINK_COMPONENTS
   Core
+  ${AMDGPU_LIBS}
+  ${NVPTX_LIBS}
 
   LINK_LIBS PUBLIC
   MLIRGPU
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 55bf84cb16373..4ea7498d34822 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -1,21 +1,3 @@
-if (MLIR_CUDA_CONVERSIONS_ENABLED)
-  set(NVPTX_LIBS
-    MC
-    NVPTXCodeGen
-    NVPTXDesc
-    NVPTXInfo
-  )
-endif()
-
-if (MLIR_ROCM_CONVERSIONS_ENABLED)
-  set(AMDGPU_LIBS
-    MC
-    AMDGPUCodeGen
-    AMDGPUDesc
-    AMDGPUInfo
-  )
-endif()
-
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRTestTransforms
   TestAllReduceLowering.cpp
@@ -50,10 +32,6 @@ add_mlir_library(MLIRTestTransforms
   MLIRStandardOpsIncGen
   MLIRTestVectorTransformPatternsIncGen
 
-  LINK_COMPONENTS
-  ${AMDGPU_LIBS}
-  ${NVPTX_LIBS}
-
   LINK_LIBS PUBLIC
   MLIRAffineOps
   MLIRAnalysis

From 2b0c8546ac9fb47e1bf9c5e54f1450420eadeab7 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <ravishankarm@google.com>
Date: Thu, 28 May 2020 10:19:47 -0700
Subject: [PATCH 396/770] [mlir][Linalg] Add pass to remove unit-extent dims
 from tensor operands of Generic ops.

Unit-extent dimensions are typically used for achieving broadcasting
behavior. The pattern added (along with canonicalization patterns
added previously) removes the use of unit-extent dimensions, and
instead uses a more canonical representation of the computation.  This
new pattern is not added as a canonicalization for now since it
entails adding additional reshape operations. A pass is added to
exercise these patterns, along with an API entry to populate a
patterns list with these patterns.

Differential Revision: https://reviews.llvm.org/D79766
---
 .../Linalg/IR/LinalgStructuredOpsInterface.td |   8 +
 .../mlir/Dialect/Linalg/IR/LinalgTraits.h     |  19 +
 mlir/include/mlir/Dialect/Linalg/Passes.h     |   7 +
 mlir/include/mlir/Dialect/Linalg/Passes.td    |  11 +
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      |   6 +-
 .../Dialect/Linalg/Transforms/CMakeLists.txt  |   1 +
 .../Linalg/Transforms/DropUnitDims.cpp        | 375 ++++++++++++++++++
 mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp |   4 +-
 .../Dialect/Linalg/drop-unit-extent-dims.mlir | 165 ++++++++
 .../Dialect/Linalg/fold-unit-trip-loops.mlir  | 110 +++++
 10 files changed, 700 insertions(+), 6 deletions(-)
 create mode 100644 mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
 create mode 100644 mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
 create mode 100644 mlir/test/Dialect/Linalg/fold-unit-trip-loops.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td
index beac1135a0bca..b03001c9b8e96 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td
@@ -123,6 +123,10 @@ def LinalgStructuredInterface : OpInterface<"LinalgOp"> {
       "Return the range over inputs (irrespective of type) and output buffers.",
       "Operation::operand_range", "getInputsAndOutputBuffers"
     >,
+    InterfaceMethod<
+      "Return the shaped types for all the inputs and outputs",
+      "SmallVector<ShapedType, 4>", "getInputOutputShapedTypes"
+    >,
 
     //===------------------------------------------------------------------===//
     // Other interface methods.
@@ -153,6 +157,10 @@ def LinalgStructuredInterface : OpInterface<"LinalgOp"> {
       "Return the indexing maps attribute within the current operation.",
       "ArrayAttr", "indexing_maps"
     >,
+    InterfaceMethod<
+      "Return the indexing maps within the current operation.",
+      "SmallVector<AffineMap, 4>", "getIndexingMaps"
+    >,
     InterfaceMethod<"Return the input or output indexing map at index `i`.",
       "AffineMap", "getIndexingMap", (ins "unsigned":$i)
     >,
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h
index b7bba5a310113..4ab547be20197 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTraits.h
@@ -217,6 +217,18 @@ class StructuredOpTraits
     return getOutputTensorTypes()[i - getNumInputsAndOutputBuffers()]
         .template cast<ShapedType>();
   }
+  /// Return the shaped types for all the inputs and outputs
+  SmallVector<ShapedType, 4> getInputOutputShapedTypes() {
+    SmallVector<Type, 4> inputOutputTypes(
+        this->getOperation()->operand_type_begin(),
+        this->getOperation()->operand_type_end());
+    inputOutputTypes.append(this->getOperation()->result_type_begin(),
+                            this->getOperation()->result_type_end());
+    return llvm::to_vector<4>(
+        llvm::map_range(inputOutputTypes, [](Type type) -> ShapedType {
+          return type.cast<ShapedType>();
+        }));
+  }
 
   //==========================================================================//
   // Other interface methods.
@@ -295,6 +307,13 @@ class StructuredOpTraits
     return attr;
   }
 
+  SmallVector<AffineMap, 4> getIndexingMaps() {
+    return llvm::to_vector<4>(
+        llvm::map_range(indexing_maps(), [](Attribute attr) -> AffineMap {
+          return attr.cast<AffineMapAttr>().getValue();
+        }));
+  }
+
   AffineMap getIndexingMap(unsigned i) {
     assert(i < getNumInputsAndOutputs());
     return indexing_maps()
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
index d3bfa90e6bdb5..8a274ed48dc55 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -24,6 +24,8 @@ template <typename T> class OperationPass;
 class OwningRewritePatternList;
 class Pass;
 
+std::unique_ptr<OperationPass<FuncOp>> createLinalgFoldUnitExtentDimsPass();
+
 std::unique_ptr<OperationPass<FuncOp>> createLinalgFusionPass();
 std::unique_ptr<Pass> createLinalgFusionOfTensorOpsPass();
 
@@ -59,6 +61,11 @@ createConvertLinalgOnTensorsToBuffersPass();
 void populateLinalgTensorOpsFusionPatterns(MLIRContext *context,
                                            OwningRewritePatternList &patterns);
 
+/// Patterns to fold unit-extent dimensions in operands/results of linalg ops on
+/// tensors.
+void populateLinalgFoldUnitExtentDimsPatterns(
+    MLIRContext *context, OwningRewritePatternList &patterns);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_LINALG_PASSES_H_
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 850f381dd4efa..1fc7fa5bf729e 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -11,6 +11,17 @@
 
 include "mlir/Pass/PassBase.td"
 
+def LinalgFoldUnitExtentDims : FunctionPass<"linalg-fold-unit-extent-dims"> {
+  let summary = "Remove unit-extent dimension in Linalg ops on tensors";
+  let constructor = "mlir::createLinalgFoldUnitExtentDimsPass()";
+  let options = [
+    Option<"foldOneTripLoopsOnly", "fold-one-trip-loops-only", "bool",
+            /*default=*/"false",
+	    "Only folds the one-trip loops from Linalg ops on tensors "
+	    "(for testing purposes only)">
+  ];
+}
+
 def LinalgFusion : FunctionPass<"linalg-fusion"> {
   let summary = "Fuse operations in the linalg dialect";
   let constructor = "mlir::createLinalgFusionPass()";
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index c7a0f9d3812d9..db4587fce014c 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -265,7 +265,7 @@ static LogicalResult verify(IndexedGenericOp op) { return verifyGenericOp(op); }
 static ArrayAttr collapseReassociationMaps(ArrayRef<AffineMap> mapsProducer,
                                            ArrayRef<AffineMap> mapsConsumer,
                                            MLIRContext *context) {
-  if (mapsProducer.size() == 0 || mapsConsumer.size() == 0 ||
+  if (mapsProducer.empty() || mapsConsumer.empty() ||
       mapsProducer[0].getNumDims() < mapsConsumer[0].getNumDims() ||
       mapsProducer.size() != mapsConsumer[0].getNumDims())
     return nullptr;
@@ -277,7 +277,7 @@ static ArrayAttr collapseReassociationMaps(ArrayRef<AffineMap> mapsProducer,
     for (AffineExpr rhsExpr : rhs.getResults()) {
       AffineDimExpr dimExpr = rhsExpr.cast<AffineDimExpr>();
       for (int i = 0, e = mapsProducer[dimExpr.getPosition()].getNumResults();
-           i != e; ++i) {
+           i < e; ++i) {
         reassociations.push_back(getAffineDimExpr(currDim++, context));
       }
     }
@@ -1129,8 +1129,6 @@ OpFoldResult SliceOp::fold(ArrayRef<Attribute>) {
   return {};
 }
 OpFoldResult TensorReshapeOp::fold(ArrayRef<Attribute>) {
-  if (succeeded(foldMemRefCast(*this)))
-    return getResult();
   return foldReshapeOp(*this);
 }
 OpFoldResult TransposeOp::fold(ArrayRef<Attribute>) {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
index 097fa355a131a..c87e3d4f15b66 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_dialect_library(MLIRLinalgTransforms
+  DropUnitDims.cpp
   Fusion.cpp
   Interchange.cpp
   Loops.cpp
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
new file mode 100644
index 0000000000000..e08c43d48ba04
--- /dev/null
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -0,0 +1,375 @@
+//===- DropUnitDims.cpp - Pass to drop use of unit-extent for broadcasting ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements patterns/pass to remove usage of unit-extent dimensions
+// to specify broadcasting in favor of a more canonical representation of the
+// computation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/FoldUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "linalg-drop-unit-dims"
+
+using namespace mlir;
+using namespace mlir::edsc;
+using namespace mlir::edsc::intrinsics;
+using namespace mlir::linalg;
+
+/// Implements a pass that canonicalizes the uses of unit-extent dimensions for
+/// broadcasting. For example,
+///
+/// ```mlir
+/// #accesses = [
+///   affine_map<(d0, d1) -> (0, d1)>,
+///   affine_map<(d0, d1) -> (d0, 0)>,
+///   affine_map<(d0, d1) -> (d0, d1)>
+/// ]
+///
+/// #trait = {
+///   args_in = 2,
+///   args_out = 1,
+///   indexing_maps = #accesses,
+///   iterator_types = ["parallel", "parallel"],
+///   library_call = "some_external_fn"
+/// }
+///
+/// func @broadcast_test(%arg0 : tensor<5xf32>, %arg1 : tensor<5xf32>) ->
+/// tensor<5x5xf32>
+/// {
+///   %0 = linalg.tensor_reshape %arg0 [affine_map<(d0, d1) -> (d0, d1)>] :
+///        tensor<5xf32> into tensor<1x5xf32>
+///   %1 = linalg.tensor_reshape %arg1 [affine_map<(d0, d1) -> (d0, d1)>] :
+///        tensor<5xf32> into tensor<5x1xf32>
+///   %2 = linalg.generic #trait %0, %1 {
+///        ^bb0(%arg2: f32, %arg3: f32):
+///          %3 = addf %arg2, %arg3 : f32
+///          linalg.yield %3 : f32
+///        } : tensor<1x5xf32>, tensor<5x1xf32> -> tensor<5x5xf32>
+///   return %2 : tensor<5x5xf32>
+/// }
+///
+/// would canonicalize to
+///
+/// ```mlir
+/// #accesses = [
+///   affine_map<(d0, d1) -> (d1)>,
+///   affine_map<(d0, d1) -> (d0)>,
+///   affine_map<(d0, d1) -> (d0, d1)>
+/// ]
+///
+/// #trait = {
+///   args_in = 2,
+///   args_out = 1,
+///   indexing_maps = #accesses,
+///   iterator_types = ["parallel", "parallel"],
+///   library_call = "some_external_fn"
+/// }
+///
+/// func @broadcast_test(%arg0 : tensor<5xf32>, %arg1 : tensor<5xf32>) ->
+/// tensor<5x5xf32>
+/// {
+///   %0 = linalg.generic #trait %arg0, %arg1 {
+///        ^bb0(%arg2: f32, %arg3: f32):
+///          %3 = addf %arg2, %arg3 : f32
+///          linalg.yield %3 : f32
+///        } : tensor<5xf32>, tensor<5xf32> -> tensor<5x5xf32>
+///   return %0 : tensor<5x5xf32>
+/// }
+
+/// Given dims of the iteration space of a structured op that are known to be
+/// single trip count (`unitDims`), return the indexing maps to use in the
+/// canonicalized op with these dims removed, given the original `indexingMaps`.
+static ArrayAttr replaceUnitDims(DenseSet<unsigned> &unitDims,
+                                 ArrayRef<AffineMap> indexingMaps,
+                                 MLIRContext *context) {
+  if (indexingMaps.empty())
+    return nullptr;
+  unsigned numIterationDims = indexingMaps.front().getNumDims();
+  unsigned numSymbols = indexingMaps.front().getNumSymbols();
+
+  // Compute the replacement for each dim expr.
+  SmallVector<AffineExpr, 4> dimReplacements;
+  dimReplacements.reserve(numIterationDims);
+  unsigned numKeptDims = 0;
+  for (unsigned dim : llvm::seq<unsigned>(0, numIterationDims)) {
+    if (unitDims.count(dim))
+      dimReplacements.push_back(getAffineConstantExpr(0, context));
+    else
+      dimReplacements.push_back(getAffineDimExpr(numKeptDims++, context));
+  }
+
+  // Symbols remain the same.
+  SmallVector<AffineExpr, 4> symReplacements;
+  symReplacements.reserve(numSymbols);
+  for (unsigned symbol : llvm::seq<unsigned>(0, numSymbols))
+    symReplacements.push_back(getAffineSymbolExpr(symbol, context));
+
+  SmallVector<AffineMap, 4> newIndexingMaps;
+  newIndexingMaps.reserve(indexingMaps.size());
+  for (AffineMap operandMap : indexingMaps) {
+    // Expected indexing maps to have no symbols.
+    if (operandMap.getNumSymbols())
+      return nullptr;
+    newIndexingMaps.push_back(simplifyAffineMap(
+        operandMap.replaceDimsAndSymbols(dimReplacements, symReplacements,
+                                         numIterationDims - unitDims.size(),
+                                         numSymbols)));
+  }
+
+  // Check that the new index maps are invertible. If not, something went
+  // wrong, so abort.
+  if (!inversePermutation(concatAffineMaps(newIndexingMaps)))
+    return nullptr;
+  return ArrayAttr::get(
+      llvm::to_vector<4>(llvm::map_range(
+          newIndexingMaps,
+          [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); })),
+      context);
+}
+
+namespace {
+/// Pattern to fold unit-trip count loops in GenericOps.
+// TODO: Generalize this to indexed-generic by also modifying the region
+// args.
+struct FoldUnitDimLoops : public OpRewritePattern<GenericOp> {
+  using OpRewritePattern<GenericOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(GenericOp genericOp,
+                                PatternRewriter &rewriter) const override {
+    SmallVector<AffineMap, 4> indexingMaps = genericOp.getIndexingMaps();
+    if (indexingMaps.empty())
+      return failure();
+
+    // Check if any of the iteration dimensions are unit-trip count. They will
+    // end up being unit-trip count if they are used to index into a unit-dim
+    // tensor/memref.
+    AffineMap invertedMap = inversePermutation(concatAffineMaps(indexingMaps));
+    if (!invertedMap)
+      return failure();
+    SmallVector<int64_t, 4> dims;
+    for (ShapedType shapedType : genericOp.getInputOutputShapedTypes())
+      dims.append(shapedType.getShape().begin(), shapedType.getShape().end());
+    DenseSet<unsigned> unitDims;
+    ArrayAttr iteratorTypes = genericOp.iterator_types();
+    for (auto expr : enumerate(invertedMap.getResults())) {
+      if (AffineDimExpr dimExpr = expr.value().dyn_cast<AffineDimExpr>())
+        if (dims[dimExpr.getPosition()] == 1 &&
+            iteratorTypes[expr.index()].dyn_cast<StringAttr>().getValue() ==
+                getParallelIteratorTypeName())
+          unitDims.insert(expr.index());
+    }
+    if (unitDims.empty())
+      return failure();
+
+    // Compute the modified indexing maps.
+    MLIRContext *context = rewriter.getContext();
+    ArrayAttr newIndexingMapAttr =
+        replaceUnitDims(unitDims, indexingMaps, context);
+    if (!newIndexingMapAttr)
+      return genericOp.emitError("unable to compute modified indexing_maps");
+
+    // Compute the iterator types of the modified op by dropping the one-trip
+    // count loops.
+    SmallVector<Attribute, 4> newIteratorTypes;
+    for (auto attr : llvm::enumerate(iteratorTypes)) {
+      if (!unitDims.count(attr.index()))
+        newIteratorTypes.push_back(attr.value());
+    }
+
+    rewriter.startRootUpdate(genericOp);
+    genericOp.indexing_mapsAttr(newIndexingMapAttr);
+    genericOp.iterator_typesAttr(ArrayAttr::get(newIteratorTypes, context));
+    rewriter.finalizeRootUpdate(genericOp);
+    return success();
+  }
+};
+
+struct UnitExtentReplacementInfo {
+  RankedTensorType type;
+  AffineMap indexMap;
+  ArrayAttr reassociation;
+};
+} // namespace
+
+/// Utility function for replacing operands/results to a linalg generic
+/// operation on tensors with unit-extent dimensions. These can be replaced with
+/// an operand/result with the unit-extent dimension removed. This is only done
+/// if the indexing map used to access that dimension has an
+/// AffineConstantExpr of value 0. Given the `type` of a result/operand of a
+/// Linalg op, and its `indexMap` the utility function returns:
+/// - the new type with dimensions of size 1 removed.
+/// - modified index map that can be used to access the replaced result/operand
+/// - the reassociation that converts from the original tensor type to the
+///   modified tensor type.
+static UnitExtentReplacementInfo replaceUnitExtents(AffineMap indexMap,
+                                                    RankedTensorType type,
+                                                    MLIRContext *context) {
+  ArrayRef<int64_t> shape = type.getShape();
+  ArrayRef<AffineExpr> exprs = indexMap.getResults();
+  SmallVector<AffineExpr, 2> reassociations;
+  SmallVector<Attribute, 4> reassociationMaps;
+  SmallVector<AffineExpr, 4> newIndexExprs;
+  SmallVector<int64_t, 4> newShape;
+
+  int64_t origRank = type.getRank();
+  AffineExpr zeroExpr = getAffineConstantExpr(0, context);
+  auto isUnitExtent = [&](int64_t dim) -> bool {
+    return shape[dim] == 1 && exprs[dim] == zeroExpr;
+  };
+
+  unsigned dim = 0;
+  // Fold dimensions that are unit-extent at the beginning of the tensor.
+  while (dim < origRank && isUnitExtent(dim))
+    reassociations.push_back(getAffineDimExpr(dim++, context));
+  while (dim < origRank) {
+    reassociations.push_back(getAffineDimExpr(dim, context));
+    newIndexExprs.push_back(exprs[dim]);
+    newShape.push_back(shape[dim]);
+    // Fold all following dimensions that are unit-extent.
+    while (dim + 1 < origRank && isUnitExtent(dim + 1)) {
+      ++dim;
+      reassociations.push_back(getAffineDimExpr(dim, context));
+    }
+    reassociationMaps.push_back(AffineMapAttr::get(AffineMap::get(
+        origRank, /*numSymbols = */ 0, reassociations, context)));
+    reassociations.clear();
+    ++dim;
+  }
+  UnitExtentReplacementInfo info = {
+      RankedTensorType::get(newShape, type.getElementType()),
+      AffineMap::get(indexMap.getNumDims(), indexMap.getNumSymbols(),
+                     newIndexExprs, context),
+      ArrayAttr::get(reassociationMaps, context)};
+  return info;
+}
+
+namespace {
+/// Pattern to replace tensor operands/results that have unit-extent dims.
+struct ReplaceUnitExtentTensors : public OpRewritePattern<GenericOp> {
+  using OpRewritePattern<GenericOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(GenericOp genericOp,
+                                PatternRewriter &rewriter) const override {
+    if (!genericOp.hasTensorSemantics())
+      return failure();
+
+    MLIRContext *context = rewriter.getContext();
+    Location loc = genericOp.getLoc();
+
+    SmallVector<AffineMap, 4> newIndexingMaps;
+    SmallVector<ArrayAttr, 4> reassociationMaps;
+    SmallVector<ShapedType, 4> newInputOutputTypes;
+    bool doCanonicalization = false;
+    for (auto it : llvm::zip(genericOp.getIndexingMaps(),
+                             genericOp.getInputOutputShapedTypes())) {
+      auto replacementInfo = replaceUnitExtents(
+          std::get<0>(it), std::get<1>(it).cast<RankedTensorType>(), context);
+      reassociationMaps.push_back(replacementInfo.reassociation);
+      newIndexingMaps.push_back(replacementInfo.indexMap);
+      newInputOutputTypes.push_back(replacementInfo.type);
+      doCanonicalization =
+          doCanonicalization || replacementInfo.type != std::get<1>(it);
+    }
+
+    // If the indexing maps of the result operation are not invertible (i.e. not
+    // legal), abort.
+    if (!doCanonicalization ||
+        !inversePermutation(concatAffineMaps(newIndexingMaps)))
+      return failure();
+
+    // If any operand type changes, insert a reshape to convert from the
+    // original type to the new type.
+    SmallVector<Value, 4> newOperands;
+    newOperands.reserve(genericOp.getNumOperands());
+    for (auto operand : llvm::enumerate(genericOp.getOperands())) {
+      if (operand.value().getType() == newInputOutputTypes[operand.index()]) {
+        newOperands.push_back(operand.value());
+      } else {
+        newOperands.push_back(rewriter.create<linalg::TensorReshapeOp>(
+            loc, newInputOutputTypes[operand.index()], operand.value(),
+            reassociationMaps[operand.index()]));
+      }
+    }
+
+    // Collect the result types of the replacement op; these follow the operand
+    // types in `newInputOutputTypes`.
+    SmallVector<Type, 4> resultTypes;
+    resultTypes.reserve(genericOp.getNumResults());
+    for (unsigned i : llvm::seq<unsigned>(0, genericOp.getNumResults()))
+      resultTypes.push_back(
+          newInputOutputTypes[i + genericOp.getNumOperands()]);
+    GenericOp replacementOp = rewriter.create<GenericOp>(
+        loc, resultTypes, newOperands, genericOp.args_in(),
+        genericOp.args_out(), rewriter.getAffineMapArrayAttr(newIndexingMaps),
+        genericOp.iterator_types(),
+        /*doc = */ nullptr,
+        /*library_call = */ nullptr);
+    rewriter.inlineRegionBefore(genericOp.region(), replacementOp.region(),
+                                replacementOp.region().begin());
+
+    // If any result tensor has a modified shape, then add reshape to recover
+    // the original shape.
+    SmallVector<Value, 4> resultReplacements;
+    for (auto result : llvm::enumerate(replacementOp.getResults())) {
+      unsigned index = result.index() + replacementOp.getNumOperands();
+      RankedTensorType origResultType = genericOp.getResult(result.index())
+                                            .getType()
+                                            .cast<RankedTensorType>();
+      if (origResultType != result.value().getType()) {
+        resultReplacements.push_back(rewriter.create<linalg::TensorReshapeOp>(
+            loc, origResultType, result.value(), reassociationMaps[index]));
+      } else {
+        resultReplacements.push_back(result.value());
+      }
+    }
+    rewriter.replaceOp(genericOp, resultReplacements);
+    return success();
+  }
+};
+} // namespace
+
+/// Patterns that are used to canonicalize the use of unit-extent dims for
+/// broadcasting.
+void mlir::populateLinalgFoldUnitExtentDimsPatterns(
+    MLIRContext *context, OwningRewritePatternList &patterns) {
+  patterns.insert<FoldUnitDimLoops, ReplaceUnitExtentTensors>(context);
+  TensorReshapeOp::getCanonicalizationPatterns(patterns, context);
+}
+
+namespace {
+/// Pass that removes unit-extent dims within generic ops.
+struct LinalgFoldUnitExtentDimsPass
+    : public LinalgFoldUnitExtentDimsBase<LinalgFoldUnitExtentDimsPass> {
+  void runOnFunction() override {
+    OwningRewritePatternList patterns;
+    FuncOp funcOp = getFunction();
+    MLIRContext *context = funcOp.getContext();
+    if (foldOneTripLoopsOnly)
+      patterns.insert<FoldUnitDimLoops>(context);
+    else
+      populateLinalgFoldUnitExtentDimsPatterns(context, patterns);
+    applyPatternsAndFoldGreedily(funcOp.getBody(), patterns);
+  }
+};
+} // namespace
+
+std::unique_ptr<OperationPass<FuncOp>>
+mlir::createLinalgFoldUnitExtentDimsPass() {
+  return std::make_unique<LinalgFoldUnitExtentDimsPass>();
+}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
index 3123f95452fda..3f3c1c53fc3aa 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp
@@ -575,8 +575,8 @@ struct FuseGenericOpsOnTensors {
       if (auto yieldOp = dyn_cast<YieldOp>(op)) {
         // Lookup the value the yield operation is mapped to.
         Value yieldVal = yieldOp.getOperand(0);
-        auto clonedVal = mapper.lookup(yieldVal);
-        mapper.map(consumerBlock.getArgument(consumerIdx), clonedVal);
+        if (Value clonedVal = mapper.lookupOrNull(yieldVal))
+          mapper.map(consumerBlock.getArgument(consumerIdx), clonedVal);
         continue;
       }
       rewriter.clone(op, mapper);
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
new file mode 100644
index 0000000000000..a5169c35d18d5
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -0,0 +1,165 @@
+// RUN: mlir-opt %s -linalg-fold-unit-extent-dims -split-input-file | FileCheck %s
+
+#accesses = [
+  affine_map<(i, j, k, l, m) -> (i, k, m)>,
+  affine_map<(i, j, k, l, m) -> (i, k, j, l, m)>
+]
+
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"],
+  indexing_maps = #accesses,
+  library_call = "some_external_func"
+}
+
+func @drop_one_trip_loops(%arg0 : tensor<?x1x?xf32>) -> tensor<?x1x?x1x?xf32>
+{
+  %0 = linalg.generic #trait %arg0 {
+       ^bb0(%arg1 : f32) :
+         linalg.yield %arg1 : f32
+       } : tensor<?x1x?xf32> -> tensor<?x1x?x1x?xf32>
+  return %0 : tensor<?x1x?x1x?xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+//   CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d2)>
+//   CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+//   CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+//   CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>
+//   CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>
+//   CHECK-DAG: #[[MAP6:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4)>
+// CHECK-LABEL: func @drop_one_trip_loops
+//       CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP0]], #[[MAP1]]]
+//       CHECK: linalg.generic
+//  CHECK-SAME:   indexing_maps = [#[[MAP2]], #[[MAP3]]]
+//  CHECK-SAME:   iterator_types = ["parallel", "parallel", "parallel"]
+//       CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP4]], #[[MAP5]], #[[MAP6]]]
+
+// -----
+
+#map0 = affine_map<(i, j) -> (i, j)>
+#access = [#map0, #map0]
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  iterator_types = ["parallel", "parallel"],
+  indexing_maps = #access,
+  library_call = "some_external_func"
+}
+
+func @drop_all_loops(%arg0 : tensor<1x1xf32>) -> tensor<1x1xf32>
+{
+  %0 = linalg.generic #trait %arg0 {
+       ^bb0(%arg1: f32) :
+         linalg.yield %arg1 : f32
+       } : tensor<1x1xf32> -> tensor<1x1xf32>
+  return %0 : tensor<1x1xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<() -> ()>
+// CHECK-LABEL: func @drop_all_loops
+//       CHECK:   linalg.tensor_reshape %{{.*}} []
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP0]]]
+//  CHECK-SAME:     iterator_types = []
+
+// -----
+
+#accesses = [
+  affine_map<(d0) -> (0, d0)>,
+  affine_map<(d0) -> (d0)>
+]
+
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  indexing_maps = #accesses,
+  iterator_types = ["parallel"],
+  library_call = "some_external_fn"
+}
+
+func @leading_dim_1_canonicalization(%arg0: tensor<1x5xf32>) -> tensor<5xf32> {
+  %0 = linalg.generic #trait %arg0 {
+  ^bb0(%arg2: f32):     // no predecessors
+    linalg.yield %arg2 : f32
+  }  : tensor<1x5xf32> -> tensor<5xf32>
+  return %0 : tensor<5xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+//   CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0)>
+// CHECK-LABEL: func @leading_dim_1_canonicalization
+//       CHECK:   linalg.tensor_reshape %{{.*}} [#[[MAP0]]]
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP1]], #[[MAP1]]]
+//  CHECK-SAME:     iterator_types = ["parallel"]
+// -----
+
+#accesses = [
+  affine_map<(d0, d1) -> (0, d1)>,
+  affine_map<(d0, d1) -> (d0, 0)>,
+  affine_map<(d0, d1) -> (d0, d1)>
+]
+
+#trait = {
+  args_in = 2,
+  args_out = 1,
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel"],
+  library_call = "some_external_fn"
+}
+
+func @broadcast_test(%arg0 : tensor<5xf32>, %arg1 : tensor<5xf32>) -> tensor<5x5xf32>
+{
+  %0 = linalg.tensor_reshape %arg0 [affine_map<(d0, d1) -> (d0, d1)>] :
+       tensor<5xf32> into tensor<1x5xf32>
+  %1 = linalg.tensor_reshape %arg1 [affine_map<(d0, d1) -> (d0, d1)>] :
+       tensor<5xf32> into tensor<5x1xf32>
+  %2 = linalg.generic #trait %0, %1 {
+       ^bb0(%arg2: f32, %arg3: f32):
+         %3 = addf %arg2, %arg3 : f32
+         linalg.yield %3 : f32
+       } : tensor<1x5xf32>, tensor<5x1xf32> -> tensor<5x5xf32>
+  return %2 : tensor<5x5xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d1)>
+//   CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d0)>
+//   CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-LABEL: func @broadcast_test
+//   CHECK-NOT:   linalg.tensor_reshape
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]
+//  CHECK-SAME:     iterator_types = ["parallel", "parallel"]
+//   CHECK-NOT:   linalg.tensor_reshape
+
+// -----
+
+#accesses = [
+  affine_map<(d0, d1) -> (0, 0)>,
+  affine_map<(d0, d1) -> (d0, d1)>
+]
+
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel"],
+  library_call = "some_external_fn"
+}
+
+func @broadcast_scalar(%arg0 : tensor<1x1xf32>) -> tensor<?x?xf32>
+{
+   %0 = linalg.generic #trait %arg0 {
+        ^bb0(%arg1 : f32):
+          linalg.yield %arg1 : f32
+        } : tensor<1x1xf32> -> tensor<?x?xf32>
+   return %0 : tensor<?x?xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> ()>
+//   CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-LABEL: func @broadcast_scalar
+//  CHECK-SAME:   %[[ARG0:.*]]: tensor<1x1xf32>
+//       CHECK:   %[[A:.*]] = linalg.tensor_reshape %[[ARG0]] []
+//  CHECK-SAME:     tensor<1x1xf32> into tensor<f32>
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]]]
+//  CHECK-SAME:     iterator_types = ["parallel", "parallel"]
+//  CHECK-SAME:     %[[A]]
diff --git a/mlir/test/Dialect/Linalg/fold-unit-trip-loops.mlir b/mlir/test/Dialect/Linalg/fold-unit-trip-loops.mlir
new file mode 100644
index 0000000000000..a977ab4cadd9f
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/fold-unit-trip-loops.mlir
@@ -0,0 +1,110 @@
+// RUN: mlir-opt %s -linalg-fold-unit-extent-dims="fold-one-trip-loops-only" -split-input-file | FileCheck %s
+
+#accesses = [
+  affine_map<(i, j, k, l, m) -> (i, k, m)>,
+  affine_map<(i, j, k, l, m) -> (i, k, j, l, m)>
+]
+
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"],
+  indexing_maps = #accesses,
+  library_call = "some_external_func"
+}
+
+func @drop_one_trip_loops(%arg0 : tensor<?x1x?xf32>) -> tensor<?x1x?x1x?xf32>
+{
+  %0 = linalg.generic #trait %arg0 {
+       ^bb0(%arg1 : f32) :
+         linalg.yield %arg1 : f32
+       } : tensor<?x1x?xf32> -> tensor<?x1x?x1x?xf32>
+  return %0 : tensor<?x1x?x1x?xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, 0, d2)>
+//   CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, 0, d1, 0, d2)>
+// CHECK-LABEL: func @drop_one_trip_loops
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]]]
+//  CHECK-SAME:     iterator_types = ["parallel", "parallel", "parallel"]
+
+// -----
+
+#map0 = affine_map<(i, j) -> (i, j)>
+#access = [#map0, #map0]
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  iterator_types = ["parallel", "parallel"],
+  indexing_maps = #access,
+  library_call = "some_external_func"
+}
+
+func @drop_all_loops(%arg0 : tensor<1x1xf32>) -> tensor<1x1xf32>
+{
+  %0 = linalg.generic #trait %arg0 {
+       ^bb0(%arg1: f32) :
+         linalg.yield %arg1 : f32
+       } : tensor<1x1xf32> -> tensor<1x1xf32>
+  return %0 : tensor<1x1xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<() -> (0, 0)>
+// CHECK-LABEL: func @drop_all_loops
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP0]]]
+//  CHECK-SAME:     iterator_types = []
+
+// -----
+
+#map0 = affine_map<(i, j) -> (i, j)>
+#access = [#map0, #map0]
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  iterator_types = ["parallel", "parallel"],
+  indexing_maps = #access,
+  library_call = "some_external_func"
+}
+
+func @drop_all_loops(%arg0 : memref<1x1xf32>, %arg1 : memref<1x1xf32>)
+{
+  linalg.generic #trait %arg0, %arg1 {
+    ^bb0(%arg2: f32, %arg3 : f32) :
+      linalg.yield %arg2 : f32
+    } : memref<1x1xf32>, memref<1x1xf32>
+  return
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<() -> (0, 0)>
+// CHECK-LABEL: func @drop_all_loops
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP0]]]
+//  CHECK-SAME:     iterator_types = []
+
+// -----
+
+#accesses = [
+  affine_map<(d0, d1) -> (d0, d1)>,
+  affine_map<(d0, d1) -> (d1)>
+]
+
+#trait = {
+  args_in = 1,
+  args_out = 1,
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel"],
+  library_call = "some_external_fn"
+}
+
+func @leading_dim_1_canonicalization(%arg0: tensor<1x5xf32>) -> tensor<5xf32> {
+  %0 = linalg.generic #trait %arg0 {
+  ^bb0(%arg2: f32):     // no predecessors
+    linalg.yield %arg2 : f32
+  }  : tensor<1x5xf32> -> tensor<5xf32>
+  return %0 : tensor<5xf32>
+}
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (0, d0)>
+//   CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0)>
+// CHECK-LABEL: func @leading_dim_1_canonicalization
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]]]
+//  CHECK-SAME:     iterator_types = ["parallel"]

From 0073c293a401774ac96b4b3d27f05e13f379f98e Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias@hieta.se>
Date: Thu, 28 May 2020 21:04:38 +0300
Subject: [PATCH 397/770] [clang] Avoid linking libdl unless needed

Differential Revision: https://reviews.llvm.org/D80492
---
 clang/tools/libclang/CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt
index ba286b6727724..9b34682cc49bb 100644
--- a/clang/tools/libclang/CMakeLists.txt
+++ b/clang/tools/libclang/CMakeLists.txt
@@ -66,9 +66,8 @@ if (LIBCLANG_INCLUDE_CLANG_TOOLS_EXTRA)
   endif ()
 endif ()
 
-find_library(DL_LIBRARY_PATH dl)
-if (DL_LIBRARY_PATH)
-  list(APPEND LIBS dl)
+if (HAVE_LIBDL)
+  list(APPEND LIBS ${CMAKE_DL_LIBS})
 endif()
 
 option(LIBCLANG_BUILD_STATIC

From 504d8d9d8a20fae1717bcf65dbc7577d0f26ffd7 Mon Sep 17 00:00:00 2001
From: Anthony Steinhauser <asteinhauser@google.com>
Date: Thu, 28 May 2020 11:19:20 -0700
Subject: [PATCH 398/770] [libc] Fixing the build command for benchmarks.

Building libc without clang fails with:
CMake Error at /home/asteinhauser/llvm-project/libc/CMakeLists.txt:49 (message):
'clang' and 'clang-tools-extra' are required in LLVM_ENABLE_PROJECTS to
lint llvm-libc. The linting step performs important checks to help prevent
the introduction of subtle bugs, but it may increase build times.

Reviewers: sivachandra

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D80495
---
 libc/utils/benchmarks/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/utils/benchmarks/README.md b/libc/utils/benchmarks/README.md
index fdd0223196f27..ed7d80e1c14dd 100644
--- a/libc/utils/benchmarks/README.md
+++ b/libc/utils/benchmarks/README.md
@@ -35,7 +35,7 @@ informations** and **benchmarking configuration**.
 
 ```shell
 cd llvm-project
-cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS=libc -DCMAKE_BUILD_TYPE=Release
+cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS='clang;clang-tools-extra;libc' -DCMAKE_BUILD_TYPE=Release
 make -C /tmp/build -j display-libc-memcpy-benchmark-small
 ```
 

From 97f3f0bab0982f84745c7ac5ce8fb6b0918ff718 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 6 May 2020 16:43:33 -0400
Subject: [PATCH 399/770] AMDGPU: Add intrinsic for s_setreg

This will be more useful with fenv access implemented.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  1 +
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   |  6 ++
 .../test/SemaOpenCL/builtins-amdgcn-error.cl  |  5 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 10 ++++
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |  7 ---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  | 11 ++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 28 ++++-----
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  2 +-
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  4 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll | 59 +++++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll    | 55 +++++++++++++++++
 11 files changed, 164 insertions(+), 24 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5633ccd5d744c..28379142b05ad 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -44,6 +44,7 @@ BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 // Instruction builtins.
 //===----------------------------------------------------------------------===//
 BUILTIN(__builtin_amdgcn_s_getreg, "UiIi", "n")
+BUILTIN(__builtin_amdgcn_s_setreg, "vIiUi", "n")
 BUILTIN(__builtin_amdgcn_s_getpc, "LUi", "n")
 BUILTIN(__builtin_amdgcn_s_waitcnt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_sendmsg, "vIiUi", "n")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 8f2f149103b34..3563ad464c66a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -715,6 +715,12 @@ kernel void test_mqsad_u32_u8(global uint4* out, ulong src0, uint src1, uint4 sr
   *out = __builtin_amdgcn_mqsad_u32_u8(src0, src1, src2);
 }
 
+// CHECK-LABEL: test_s_setreg(
+// CHECK: call void @llvm.amdgcn.s.setreg(i32 8193, i32 %val)
+kernel void test_s_setreg(uint val) {
+  __builtin_amdgcn_s_setreg(8193, val);
+}
+
 // CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}
 // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
 // CHECK-DAG: attributes #[[$NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
index ad5e8776b2e87..dbe2900b600bf 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
@@ -139,3 +139,8 @@ void test_fence() {
   const char ptr[] = "workgroup";
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, ptr); // expected-error {{expression is not a string literal}}
 }
+
+void test_s_setreg(int x, int y) {
+  __builtin_amdgcn_s_setreg(x, 0); // expected-error {{argument to '__builtin_amdgcn_s_setreg' must be a constant integer}}
+  __builtin_amdgcn_s_setreg(x, y); // expected-error {{argument to '__builtin_amdgcn_s_setreg' must be a constant integer}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e2d8f3cb1bd60..40449304ed04f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1207,6 +1207,16 @@ def int_amdgcn_s_getreg :
   [IntrInaccessibleMemOnly, IntrReadMem, IntrSpeculatable, ImmArg<ArgIndex<0>>]
 >;
 
+// Note this can be used to set FP environment properties that are
+// unsafe to change in non-strictfp functions. The register properties
+// available (and value required to access them) may differ per
+// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
+def int_amdgcn_s_setreg :
+  GCCBuiltin<"__builtin_amdgcn_s_setreg">,
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
+>;
+
 // int_amdgcn_s_getpc is provided to allow a specific style of position
 // independent code to determine the high part of its address when it is
 // known (through convention) that the code and any data of interest does
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 3b8f88271458d..59f9866b93b65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -202,13 +202,6 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [        // setcc
 
 def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
 
-def AMDGPUSetRegOp :  SDTypeProfile<0, 2, [
-  SDTCisInt<0>, SDTCisInt<1>
-]>;
-
-def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
-  SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
-
 def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
    SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f11563a66d410..c6e0cb2b9cfa1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2783,6 +2783,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
       return;
     }
+    case Intrinsic::amdgcn_s_setreg: {
+      constrainOpWithReadfirstlane(MI, MRI, 2);
+      return;
+    }
     default: {
       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -3924,6 +3928,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
+    case Intrinsic::amdgcn_s_setreg: {
+      // This must be an SGPR, but accept a VGPR.
+      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+                                   AMDGPU::SGPRRegBankID);
+      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+      break;
+    }
     case Intrinsic::amdgcn_end_cf:
     case Intrinsic::amdgcn_init_exec: {
       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 042087ec5a4de..3b8930c433a3a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7977,32 +7977,32 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
-  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
 
   const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
 
   if (!HasFP32Denormals) {
     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
 
-    SDValue EnableDenorm;
+    SDNode *EnableDenorm;
     if (Subtarget->hasDenormModeInst()) {
       const SDValue EnableDenormValue =
           getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
 
       EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
-                                 DAG.getEntryNode(), EnableDenormValue);
+                                 DAG.getEntryNode(), EnableDenormValue).getNode();
     } else {
       const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
                                                         SL, MVT::i32);
-      EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
-                                 DAG.getEntryNode(), EnableDenormValue,
-                                 BitField);
+      EnableDenorm =
+          DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
+                             {EnableDenormValue, BitField, DAG.getEntryNode()});
     }
 
     SDValue Ops[3] = {
       NegDivScale0,
-      EnableDenorm.getValue(0),
-      EnableDenorm.getValue(1)
+      SDValue(EnableDenorm, 0),
+      SDValue(EnableDenorm, 1)
     };
 
     NegDivScale0 = DAG.getMergeValues(Ops, SL);
@@ -8026,25 +8026,25 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
                              NumeratorScaled, Fma3);
 
   if (!HasFP32Denormals) {
-    SDValue DisableDenorm;
+    SDNode *DisableDenorm;
     if (Subtarget->hasDenormModeInst()) {
       const SDValue DisableDenormValue =
           getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
 
       DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
                                   Fma4.getValue(1), DisableDenormValue,
-                                  Fma4.getValue(2));
+                                  Fma4.getValue(2)).getNode();
     } else {
       const SDValue DisableDenormValue =
           DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
 
-      DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
-                                  Fma4.getValue(1), DisableDenormValue,
-                                  BitField, Fma4.getValue(2));
+      DisableDenorm = DAG.getMachineNode(
+          AMDGPU::S_SETREG_B32, SL, MVT::Other,
+          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
     }
 
     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
-                                      DisableDenorm, DAG.getRoot());
+                                      SDValue(DisableDenorm, 0), DAG.getRoot());
     DAG.setRoot(OutputChain);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 62b7f8318fd02..529e80e679689 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1131,7 +1131,7 @@ def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
 def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
 def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;
 
-def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+def hwreg : NamedOperandU32<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
 
 def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 7b8c2c27b8063..dbafea5a1347e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -801,13 +801,13 @@ def S_GETREG_B32 : SOPK_Pseudo <
 >;
 }
 
-let hasSideEffects = 1 in {
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
 
 def S_SETREG_B32 : SOPK_Pseudo <
   "s_setreg_b32",
   (outs), (ins SReg_32:$sdst, hwreg:$simm16),
   "$simm16, $sdst",
-  [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]> {
+  [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
   let Defs = [MODE];
   let Uses = [MODE];
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
new file mode 100644
index 0000000000000..85ed95eec0ae3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; Set FP32 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f32_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 3)
+  ret void
+}
+
+; Set FP64/FP16 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f64_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4225, i32 3)
+  ret void
+}
+
+; Set all fp_round to round to zero
+define amdgpu_kernel void @test_setreg_all_round_mode_rtz() {
+; GCN-LABEL: test_setreg_all_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 5), 7
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 8193, i32 7)
+  ret void
+}
+
+; Set FP32 fp_round to dynamic mode
+define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) {
+; GCN-LABEL: test_setreg_roundingmode_var:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+  ret void
+}
+
+define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
+; GCN-LABEL: test_setreg_roundingmode_var_vgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+  ret void
+}
+
+declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
new file mode 100644
index 0000000000000..88e6bd4adb7bf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; FIXME: This copy of the test is a subset of the -global-isel version, since the VGPR case doesn't work.
+
+; Set FP32 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f32_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 3)
+  ret void
+}
+
+; Set FP64/FP16 fp_round to round to zero
+define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() {
+; GCN-LABEL: test_setreg_f64_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 3), 3
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4225, i32 3)
+  ret void
+}
+
+; Set all fp_round to round to zero
+define amdgpu_kernel void @test_setreg_all_round_mode_rtz() {
+; GCN-LABEL: test_setreg_all_round_mode_rtz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 5), 7
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 8193, i32 7)
+  ret void
+}
+
+; Set FP32 fp_round to dynamic mode
+define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) {
+; GCN-LABEL: test_setreg_roundingmode_var:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+  ret void
+}
+
+; FIXME: Broken for DAG
+; define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
+;   call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+;   ret void
+; }
+
+declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #0
+
+attributes #0 = { nounwind }

From 281058226587d8c70172ff0fb1e55d58876da229 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitneyt@ca.ibm.com>
Date: Thu, 28 May 2020 18:23:56 +0000
Subject: [PATCH 400/770] [LoopUnroll] Support loops with exiting block that is
 neither header nor latch.

Summary: Remove the limitation in LoopUnrollPass that the exiting block
must be either the header or the latch.
Reviewer: dmgreen, jdoerfert, Meinersbur, kbarton, bmahjour, etiotto,
fhahn, efriedma
Reviewed By: etiotto, fhahn, efriedma
Subscribers: efriedma, lkail, xbolva00, hiraditya, zzheng, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D80477
---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      | 115 +++++++-----------
 .../Transforms/LoopUnroll/nonlatchcondbr.ll   |  69 +++++++++++
 2 files changed, 110 insertions(+), 74 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index d9323e70bef60..c926f1d100314 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -81,8 +81,8 @@ using namespace llvm;
 // TODO: Should these be here or in LoopUnroll?
 STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
 STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
-STATISTIC(NumUnrolledWithHeader, "Number of loops unrolled without a "
-                                 "conditional latch (completely or otherwise)");
+STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
+                               "latch (completely or otherwise)");
 
 static cl::opt<bool>
 UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
@@ -304,48 +304,30 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     return LoopUnrollResult::Unmodified;
   }
 
-  // The current loop unroll pass can unroll loops with a single latch or header
-  // that's a conditional branch exiting the loop.
+  // The current loop unroll pass can unroll loops that have
+  // (1) single latch; and
+  // (2a) latch is an exiting block; or
+  // (2b) latch is unconditional and there exists a single exiting block.
   // FIXME: The implementation can be extended to work with more complicated
   // cases, e.g. loops with multiple latches.
   BasicBlock *Header = L->getHeader();
-  BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator());
-  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
-
-  // FIXME: Support loops without conditional latch and multiple exiting blocks.
-  if (!BI ||
-      (BI->isUnconditional() && (!HeaderBI || HeaderBI->isUnconditional() ||
-                                 L->getExitingBlock() != Header))) {
+  BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+  // A conditional branch which exits the loop, which can be optimized to an
+  // unconditional branch in the unrolled loop in some cases.
+  BranchInst *ExitingBI = nullptr;
+  bool LatchIsExiting = L->isLoopExiting(LatchBlock);
+  if (LatchIsExiting)
+    ExitingBI = LatchBI;
+  else if (BasicBlock *ExitingBlock = L->getExitingBlock())
+    ExitingBI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+  if (!LatchBI || !ExitingBI) {
     LLVM_DEBUG(dbgs() << "  Can't unroll; loop not terminated by a conditional "
-                         "branch in the latch or header.\n");
-    return LoopUnrollResult::Unmodified;
-  }
-
-  auto CheckLatchSuccessors = [&](unsigned S1, unsigned S2) {
-    return BI->isConditional() && BI->getSuccessor(S1) == Header &&
-           !L->contains(BI->getSuccessor(S2));
-  };
-
-  // If we have a conditional latch, it must exit the loop.
-  if (BI && BI->isConditional() && !CheckLatchSuccessors(0, 1) &&
-      !CheckLatchSuccessors(1, 0)) {
-    LLVM_DEBUG(
-        dbgs() << "Can't unroll; a conditional latch must exit the loop");
-    return LoopUnrollResult::Unmodified;
-  }
-
-  auto CheckHeaderSuccessors = [&](unsigned S1, unsigned S2) {
-    return HeaderBI && HeaderBI->isConditional() &&
-           L->contains(HeaderBI->getSuccessor(S1)) &&
-           !L->contains(HeaderBI->getSuccessor(S2));
-  };
-
-  // If we do not have a conditional latch, the header must exit the loop.
-  if (BI && !BI->isConditional() && HeaderBI && HeaderBI->isConditional() &&
-      !CheckHeaderSuccessors(0, 1) && !CheckHeaderSuccessors(1, 0)) {
-    LLVM_DEBUG(dbgs() << "Can't unroll; conditional header must exit the loop");
+                         "branch in latch or a single exiting block.\n");
     return LoopUnrollResult::Unmodified;
   }
+  LLVM_DEBUG(dbgs() << "  Exiting Block = " << ExitingBI->getParent()->getName()
+                    << "\n");
 
   if (Header->hasAddressTaken()) {
     // The loop-rotate pass can be helpful to avoid this in many cases.
@@ -534,17 +516,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       SE->forgetTopmostLoop(L);
   }
 
-  bool ContinueOnTrue;
-  bool LatchIsExiting = BI->isConditional();
-  BasicBlock *LoopExit = nullptr;
-  if (LatchIsExiting) {
-    ContinueOnTrue = L->contains(BI->getSuccessor(0));
-    LoopExit = BI->getSuccessor(ContinueOnTrue);
-  } else {
-    NumUnrolledWithHeader++;
-    ContinueOnTrue = L->contains(HeaderBI->getSuccessor(0));
-    LoopExit = HeaderBI->getSuccessor(ContinueOnTrue);
-  }
+  if (!LatchIsExiting)
+    ++NumUnrolledNotLatch;
+  bool ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
+  BasicBlock *LoopExit = ExitingBI->getSuccessor(ContinueOnTrue);
 
   // For the first iteration of the loop, we should use the precloned values for
   // PHI nodes.  Insert associations now.
@@ -555,21 +530,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   }
 
   std::vector<BasicBlock *> Headers;
-  std::vector<BasicBlock *> HeaderSucc;
+  std::vector<BasicBlock *> ExitingBlocks;
+  std::vector<BasicBlock *> ExitingSucc;
   std::vector<BasicBlock *> Latches;
   Headers.push_back(Header);
   Latches.push_back(LatchBlock);
-
-  if (!LatchIsExiting) {
-    auto *Term = cast<BranchInst>(Header->getTerminator());
-    if (Term->isUnconditional() || L->contains(Term->getSuccessor(0))) {
-      assert(L->contains(Term->getSuccessor(0)));
-      HeaderSucc.push_back(Term->getSuccessor(0));
-    } else {
-      assert(L->contains(Term->getSuccessor(1)));
-      HeaderSucc.push_back(Term->getSuccessor(1));
-    }
-  }
+  ExitingBlocks.push_back(ExitingBI->getParent());
+  ExitingSucc.push_back(ExitingBI->getSuccessor(!ContinueOnTrue));
 
   // The current on-the-fly SSA update requires blocks to be processed in
   // reverse postorder so that LastValueMap contains the correct value at each
@@ -660,12 +627,12 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       if (*BB == LatchBlock)
         Latches.push_back(New);
 
-      // Keep track of the successor of the new header in the current iteration.
-      for (auto *Pred : predecessors(*BB))
-        if (Pred == Header) {
-          HeaderSucc.push_back(New);
-          break;
-        }
+      // Keep track of the exiting block and its successor block contained in
+      // the loop for the current iteration.
+      if (*BB == ExitingBlocks[0])
+        ExitingBlocks.push_back(New);
+      if (*BB == ExitingSucc[0])
+        ExitingSucc.push_back(New);
 
       NewBlocks.push_back(New);
       UnrolledLoopBlocks.push_back(New);
@@ -784,7 +751,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   if (!LatchIsExiting) {
     // If the latch is not exiting, we may be able to simplify the conditional
     // branches in the unrolled exiting blocks.
-    for (unsigned i = 0, e = Headers.size(); i != e; ++i) {
+    for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
       // The branch destination.
       unsigned j = (i + 1) % e;
       bool NeedConditional = true;
@@ -807,7 +774,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       // already correct.
       if (NeedConditional)
         continue;
-      setDest(Headers[i], HeaderSucc[i], HeaderSucc[i], NeedConditional,
+      setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
               ContinueOnTrue, false);
     }
 
@@ -833,8 +800,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
           ChildrenToUpdate.push_back(ChildBB);
       }
       BasicBlock *NewIDom;
-      BasicBlock *&TermBlock = LatchIsExiting ? LatchBlock : Header;
-      auto &TermBlocks = LatchIsExiting ? Latches : Headers;
+      BasicBlock *&TermBlock = ExitingBlocks[0];
+      auto &TermBlocks = ExitingBlocks;
       if (BB == TermBlock) {
         // The latch is special because we emit unconditional branches in
         // some cases where the original loop contained a conditional branch.
@@ -843,13 +810,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         // must also be a latch.  Specifically, the dominator is the first
         // latch which ends in a conditional branch, or the last latch if
         // there is no such latch.
-        // For loops exiting from the header, we limit the supported loops
-        // to have a single exiting block.
+        // For loops exiting from a non-latch exiting block, we limit the
+        // supported loops to have a single exiting block.
         NewIDom = TermBlocks.back();
         for (BasicBlock *Iter : TermBlocks) {
           Instruction *Term = Iter->getTerminator();
           if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
-            NewIDom = Iter;
+            NewIDom = DT->findNearestCommonDominator(Iter, LatchBlock);
             break;
           }
         }
diff --git a/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll b/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
new file mode 100644
index 0000000000000..547b05d1e186d
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-unroll -S | FileCheck %s
+; RUN: opt < %s -passes='require<opt-remark-emit>,unroll' -S | FileCheck %s
+
+define void @foo(i32* noalias %A) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
+; CHECK-NEXT:    br label [[FOR_HEADER:%.*]]
+; CHECK:       for.header:
+; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]]
+; CHECK:       for.body.for.body_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE]])
+; CHECK-NEXT:    br label [[FOR_BODY_1:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body.1:
+; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE_1:%.*]]
+; CHECK:       for.body.for.body_crit_edge.1:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    [[DOTPRE_1:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_1]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_1]])
+; CHECK-NEXT:    br label [[FOR_BODY_2:%.*]]
+; CHECK:       for.body.2:
+; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE_2:%.*]]
+; CHECK:       for.body.for.body_crit_edge.2:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
+; CHECK-NEXT:    br label [[FOR_BODY_3:%.*]]
+; CHECK:       for.body.3:
+; CHECK-NEXT:    br i1 false, label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.for.body_crit_edge.3:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %0 = load i32, i32* %A, align 4
+  call void @bar(i32 %0)
+  br label %for.header
+
+for.header:
+  %1 = phi i32 [ %0, %entry ], [ %.pre, %for.body.for.body_crit_edge ]
+  %i = phi i64 [ 0, %entry ], [ %inc, %for.body.for.body_crit_edge ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i
+  call void @bar(i32 %1)
+  br label %for.body
+
+for.body:
+  %inc = add nsw i64 %i, 1
+  %cmp = icmp slt i64 %inc, 4
+  br i1 %cmp, label %for.body.for.body_crit_edge, label %for.end
+
+for.body.for.body_crit_edge:
+  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %A, i64 %inc
+  %.pre = load i32, i32* %arrayidx.phi.trans.insert, align 4
+  br label %for.header
+
+for.end:
+  ret void
+}
+
+declare void @bar(i32)

From ebddf90a4e8ba062999f622cdcea2793ffc37426 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne@apple.com>
Date: Thu, 28 May 2020 14:28:09 -0400
Subject: [PATCH 401/770] [libc++] NFC: Remove outdated numbering in <bit>
 synopsis

---
 libcxx/include/bit | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libcxx/include/bit b/libcxx/include/bit
index 6dc85b5d01fb2..ae4605b191666 100644
--- a/libcxx/include/bit
+++ b/libcxx/include/bit
@@ -15,6 +15,7 @@
 
 namespace std {
 
+  // [bit.pow.two], integral powers of 2
   template <class T>
     constexpr bool ispow2(T x) noexcept; // C++20
   template <class T>
@@ -24,13 +25,13 @@ namespace std {
   template <class T>
     constexpr T log2p1(T x) noexcept;    // C++20
 
-  // 23.20.2, rotating
+  // [bit.rotate], rotating
   template<class T>
     constexpr T rotl(T x, unsigned int s) noexcept; // C++20
   template<class T>
     constexpr T rotr(T x, unsigned int s) noexcept; // C++20
 
-  // 23.20.3, counting
+  // [bit.count], counting
   template<class T>
     constexpr int countl_zero(T x) noexcept;  // C++20
   template<class T>
@@ -42,7 +43,7 @@ namespace std {
   template<class T>
     constexpr int popcount(T x) noexcept;     // C++20
 
-  // 20.15.9, endian
+  // [bit.endian], endian
   enum class endian {
     little = see below,        // C++20
     big = see below,           // C++20
@@ -350,7 +351,7 @@ _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
 bool __ispow2(_Tp __t) _NOEXCEPT
 {
     static_assert(__bitop_unsigned_integer<_Tp>::value, "__ispow2 requires unsigned");
-	return __t != 0 && (((__t & (__t - 1)) == 0));
+    return __t != 0 && (((__t & (__t - 1)) == 0));
 }
 
 

From 4859dd41707e3d2a007a58491f1cf20b9510b753 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 18 May 2020 11:32:46 -0400
Subject: [PATCH 402/770] AMDGPU: Handle rewriting ptrmask for more address
 spaces

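If the mask only clears bits in the low 32-bit half of a flat
pointer, those low bits are always preserved in the result address
space, so the ptrmask can be rewritten with the mask truncated to
32 bits. If the high bits are modified, they may need to be preserved
for some kind of user pointer tagging, so the intrinsic is not
rewritten in that case.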
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  35 +++-
 .../InferAddressSpaces/AMDGPU/ptrmask.ll      | 185 ++++++++++++++++++
 2 files changed, 211 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 324dcba86c2ca..eeedfe7a8c029 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -886,17 +886,34 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   case Intrinsic::ptrmask: {
     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
-    if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS))
-      return nullptr;
-
-    Module *M = II->getParent()->getParent()->getParent();
     Value *MaskOp = II->getArgOperand(1);
     Type *MaskTy = MaskOp->getType();
-    Function *NewDecl = Intrinsic::getDeclaration(M, Intrinsic::ptrmask,
-                                                  {NewV->getType(), MaskTy});
-    CallInst *NewCall = CallInst::Create(NewDecl->getFunctionType(), NewDecl,
-                                         {NewV, MaskOp}, "", II);
-    return NewCall;
+
+    bool DoTruncate = false;
+    if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
+      // All valid 64-bit to 32-bit casts work by chopping off the high
+      // bits. Any masking only clearing the low bits will also apply in the new
+      // address space.
+      if (DL.getPointerSizeInBits(OldAS) != 64 ||
+          DL.getPointerSizeInBits(NewAS) != 32)
+        return nullptr;
+
+      // TODO: Do we need to thread more context in here?
+      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
+      if (Known.countMinLeadingOnes() < 32)
+        return nullptr;
+
+      DoTruncate = true;
+    }
+
+    IRBuilder<> B(II);
+    if (DoTruncate) {
+      MaskTy = B.getInt32Ty();
+      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
+    }
+
+    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
+                             {NewV, MaskOp});
   }
   default:
     return nullptr;
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
index 6f25a3ec2adbe..ee0bb6319fdc0 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces -instsimplify %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 define i8 @ptrmask_cast_local_to_flat(i8 addrspace(3)* %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat(
 ; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
@@ -179,9 +181,192 @@ define i8 @multi_ptrmask_cast_region_to_flat(i8 addrspace(2)* %src.ptr, i64 %mas
   ret i8 %add
 }
 
+; Do not fold this since it clears a single high bit.
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffeffffffff(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffeffffffff(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 -4294967297)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -4294967297)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; Do not fold this since it clears a single high bit.
+define i8 @ptrmask_cast_local_to_flat_const_mask_7fffffffffffffff(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_7fffffffffffffff(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 9223372036854775807)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 9223372036854775807)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffff00000000(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffff00000000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 0)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -4294967296)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffff80000000(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffff80000000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -2147483648)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -2147483648)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; Test some align-down patterns. These only touch the low bits, which are preserved through the cast.
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffff0000(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffff0000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -65536)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -65536)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffffff00(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffffff00(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -256)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -256)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffe0(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffe0(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -32)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -32)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff0(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff0(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -16)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -16)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff8(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -8)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -8)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffc(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffc(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -4)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffe(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffe(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -2)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -2)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffff(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffff(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -1)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -1)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; Make sure non-constant masks can also be handled.
+define i8 @ptrmask_cast_local_to_flat_load_range_mask(i8 addrspace(3)* %src.ptr, i64 addrspace(1)* %mask.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_load_range_mask(
+; CHECK-NEXT:    [[LOAD_MASK:%.*]] = load i64, i64 addrspace(1)* [[MASK_PTR:%.*]], align 8, !range !0
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[LOAD_MASK]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 [[TMP1]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP2]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %load.mask = load i64, i64 addrspace(1)* %mask.ptr, align 8, !range !0
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %load.mask)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; This should not be folded, as the mask is implicitly zero extended,
+; so it would clear the high bits.
+define i8 @ptrmask_cast_local_to_flat_const_mask_32bit_neg4(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_32bit_neg4(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i32(i8* [[CAST]], i32 -4)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i32(i8* %cast, i32 -4)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
 declare i8* @llvm.ptrmask.p0i8.i64(i8*, i64) #0
+declare i8* @llvm.ptrmask.p0i8.i32(i8*, i32) #0
 declare i8 addrspace(5)* @llvm.ptrmask.p5i8.i32(i8 addrspace(5)*, i32) #0
 declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)*, i32) #0
 declare i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)*, i64) #0
 
 attributes #0 = { nounwind readnone speculatable willreturn }
+
+!0 = !{i64 -64, i64 -1}

From 58beb76b7bd2f7caa1df461b9db6629521c3b60b Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 11:28:58 -0700
Subject: [PATCH 403/770] [Statepoint] Convert a few more isStatepoint calls to
 idiomatic isa/cast

I'd apparently only grepped in the lib directories and missed a few uses in the Statepoint header itself.  Beyond the simple mechanical cleanup, this changes the return type of one routine to reflect the fact that it also returns a statepoint.
---
 llvm/include/llvm/IR/Statepoint.h                 | 15 ++++++---------
 .../CodeGen/SelectionDAG/StatepointLowering.cpp   |  4 ++--
 llvm/lib/IR/Verifier.cpp                          |  2 +-
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h
index ce3d5a655df82..5ca6939ce7731 100644
--- a/llvm/include/llvm/IR/Statepoint.h
+++ b/llvm/include/llvm/IR/Statepoint.h
@@ -147,11 +147,11 @@ class StatepointBase {
 
 protected:
   explicit StatepointBase(InstructionTy *I) {
-    StatepointCall = isStatepoint(I) ? cast<CallTy>(I) : nullptr;
+    StatepointCall = dyn_cast<GCStatepointInst>(I);
   }
 
   explicit StatepointBase(CallTy *Call) {
-    StatepointCall = isStatepoint(Call) ? Call : nullptr;
+    StatepointCall = dyn_cast<GCStatepointInst>(Call);
   }
 
 public:
@@ -369,15 +369,13 @@ class GCProjectionInst : public IntrinsicInst {
   }
 
   /// The statepoint with which this gc.relocate is associated.
-  const CallBase *getStatepoint() const {
+  const GCStatepointInst *getStatepoint() const {
     const Value *Token = getArgOperand(0);
 
     // This takes care both of relocates for call statepoints and relocates
     // on normal path of invoke statepoint.
-    if (!isa<LandingPadInst>(Token)) {
-      assert(isStatepoint(Token));
-      return cast<CallBase>(Token);
-    }
+    if (!isa<LandingPadInst>(Token))
+      return cast<GCStatepointInst>(Token);
 
     // This relocate is on exceptional path of an invoke statepoint
     const BasicBlock *InvokeBB =
@@ -386,9 +384,8 @@ class GCProjectionInst : public IntrinsicInst {
     assert(InvokeBB && "safepoints should have unique landingpads");
     assert(InvokeBB->getTerminator() &&
            "safepoint block should be well formed");
-    assert(isStatepoint(InvokeBB->getTerminator()));
 
-    return cast<CallBase>(InvokeBB->getTerminator());
+    return cast<GCStatepointInst>(InvokeBB->getTerminator());
   }
 };
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 664f56523d9b1..d826fe7b0936b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -970,7 +970,7 @@ void SelectionDAGBuilder::LowerCallSiteWithDeoptBundle(
 void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
   // The result value of the gc_result is simply the result of the actual
   // call.  We've already emitted this, so just grab the value.
-  const Instruction *I = CI.getStatepoint();
+  const GCStatepointInst *I = CI.getStatepoint();
 
   if (I->getParent() != CI.getParent()) {
     // Statepoint is in different basic block so we should have stored call
@@ -979,7 +979,7 @@ void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
     // register because statepoint and actual call return types can be
     // different, and getValue() will use CopyFromReg of the wrong type,
     // which is always i32 in our case.
-    Type *RetTy = cast<GCStatepointInst>(I)->getActualReturnType();
+    Type *RetTy = I->getActualReturnType();
     SDValue CopyFromReg = getCopyFromRegs(I, RetTy);
 
     assert(CopyFromReg.getNode());
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 388fc72417ade..e0d28b35efddf 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4733,7 +4733,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
 
     // Verify rest of the relocate arguments.
     const CallBase &StatepointCall =
-        *cast<CallBase>(cast<GCRelocateInst>(Call).getStatepoint());
+      *cast<GCRelocateInst>(Call).getStatepoint();
 
     // Both the base and derived must be piped through the safepoint.
     Value *Base = Call.getArgOperand(1);

From 6c824c81a911a2782c640c3854992160752578e0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Thu, 28 May 2020 11:36:36 -0700
Subject: [PATCH 404/770] AMDGPU/GlobalISel: precommit extractelement test.
 NFC.

---
 .../AMDGPU/GlobalISel/extractelement.ll       | 2094 ++++++++---------
 1 file changed, 995 insertions(+), 1099 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 443944408f339..202ddb0d21a28 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -1,149 +1,83 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GPRIDX %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=MOVREL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
 
 define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_const_s_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b32 s11, 0x41000000
-; GPRIDX-NEXT:    s_mov_b32 s10, 0x40e00000
-; GPRIDX-NEXT:    s_mov_b32 s9, 0x40c00000
-; GPRIDX-NEXT:    s_mov_b32 s8, 0x40a00000
-; GPRIDX-NEXT:    s_mov_b32 s7, 4.0
-; GPRIDX-NEXT:    s_mov_b32 s6, 0x40400000
-; GPRIDX-NEXT:    s_mov_b32 s5, 2.0
-; GPRIDX-NEXT:    s_mov_b32 s4, 1.0
-; GPRIDX-NEXT:    s_mov_b64 s[12:13], exec
-; GPRIDX-NEXT:  BB0_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s14, v0
-; GPRIDX-NEXT:    s_mov_b32 m0, s14
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s14, v0
-; GPRIDX-NEXT:    s_movrels_b32 s14, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s14
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB0_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[12:13]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8f32_const_s_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b32 s11, 0x41000000
-; MOVREL-NEXT:    s_mov_b32 s10, 0x40e00000
-; MOVREL-NEXT:    s_mov_b32 s9, 0x40c00000
-; MOVREL-NEXT:    s_mov_b32 s8, 0x40a00000
-; MOVREL-NEXT:    s_mov_b32 s7, 4.0
-; MOVREL-NEXT:    s_mov_b32 s6, 0x40400000
-; MOVREL-NEXT:    s_mov_b32 s5, 2.0
-; MOVREL-NEXT:    s_mov_b32 s4, 1.0
-; MOVREL-NEXT:    s_mov_b64 s[12:13], exec
-; MOVREL-NEXT:  BB0_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s14, v0
-; MOVREL-NEXT:    s_mov_b32 m0, s14
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s14, v0
-; MOVREL-NEXT:    s_movrels_b32 s14, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s14
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB0_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[12:13]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8f32_const_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s11, 0x41000000
+; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
+; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
+; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
+; GCN-NEXT:    s_mov_b32 s7, 4.0
+; GCN-NEXT:    s_mov_b32 s6, 0x40400000
+; GCN-NEXT:    s_mov_b32 s5, 2.0
+; GCN-NEXT:    s_mov_b32 s4, 1.0
+; GCN-NEXT:    s_mov_b64 s[12:13], exec
+; GCN-NEXT:  BB0_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mov_b32 m0, s14
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s14, v0
+; GCN-NEXT:    s_movrels_b32 s14, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s14
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB0_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[12:13]
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_const_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_const_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s4, 1.0
-; GPRIDX-NEXT:    s_mov_b32 m0, s2
-; GPRIDX-NEXT:    s_mov_b32 s11, 0x41000000
-; GPRIDX-NEXT:    s_mov_b32 s10, 0x40e00000
-; GPRIDX-NEXT:    s_mov_b32 s9, 0x40c00000
-; GPRIDX-NEXT:    s_mov_b32 s8, 0x40a00000
-; GPRIDX-NEXT:    s_mov_b32 s7, 4.0
-; GPRIDX-NEXT:    s_mov_b32 s6, 0x40400000
-; GPRIDX-NEXT:    s_mov_b32 s5, 2.0
-; GPRIDX-NEXT:    s_movrels_b32 s0, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_const_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s4, 1.0
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    s_mov_b32 s11, 0x41000000
-; MOVREL-NEXT:    s_mov_b32 s10, 0x40e00000
-; MOVREL-NEXT:    s_mov_b32 s9, 0x40c00000
-; MOVREL-NEXT:    s_mov_b32 s8, 0x40a00000
-; MOVREL-NEXT:    s_mov_b32 s7, 4.0
-; MOVREL-NEXT:    s_mov_b32 s6, 0x40400000
-; MOVREL-NEXT:    s_mov_b32 s5, 2.0
-; MOVREL-NEXT:    s_movrels_b32 s0, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_const_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s4, 1.0
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_mov_b32 s11, 0x41000000
+; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
+; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
+; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
+; GCN-NEXT:    s_mov_b32 s7, 4.0
+; GCN-NEXT:    s_mov_b32 s6, 0x40400000
+; GCN-NEXT:    s_mov_b32 s5, 2.0
+; GCN-NEXT:    s_movrels_b32 s0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_s_v(<8 x float> inreg %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_s_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b64 s[8:9], exec
-; GPRIDX-NEXT:  BB2_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s10, v0
-; GPRIDX-NEXT:    s_mov_b32 m0, s10
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s10, v0
-; GPRIDX-NEXT:    s_movrels_b32 s10, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s10
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB2_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[8:9]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_s_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b64 s[8:9], exec
-; MOVREL-NEXT:  BB2_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s10, v0
-; MOVREL-NEXT:    s_mov_b32 m0, s10
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s10, v0
-; MOVREL-NEXT:    s_movrels_b32 s10, s0
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s10
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB2_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[8:9]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b64 s[8:9], exec
+; GCN-NEXT:  BB2_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    s_mov_b32 m0, s10
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s10, v0
+; GCN-NEXT:    s_movrels_b32 s10, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB2_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
@@ -208,94 +142,52 @@ entry:
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_s_s(<8 x float> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 m0, s10
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_movrels_b32 s0, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 m0, s10
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_movrels_b32 s0, s0
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 m0, s10
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8i64_const_s_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[18:19], 8
-; GPRIDX-NEXT:    s_mov_b64 s[16:17], 7
-; GPRIDX-NEXT:    s_mov_b64 s[14:15], 6
-; GPRIDX-NEXT:    s_mov_b64 s[12:13], 5
-; GPRIDX-NEXT:    s_mov_b64 s[10:11], 4
-; GPRIDX-NEXT:    s_mov_b64 s[8:9], 3
-; GPRIDX-NEXT:    s_mov_b64 s[6:7], 2
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], 1
-; GPRIDX-NEXT:    s_mov_b64 s[20:21], exec
-; GPRIDX-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s22, v0
-; GPRIDX-NEXT:    s_lshl_b32 m0, s22, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s22, v0
-; GPRIDX-NEXT:    s_movrels_b32 s22, s4
-; GPRIDX-NEXT:    s_movrels_b32 s23, s5
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB6_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[20:21]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s22
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s23
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8i64_const_s_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[18:19], 8
-; MOVREL-NEXT:    s_mov_b64 s[16:17], 7
-; MOVREL-NEXT:    s_mov_b64 s[14:15], 6
-; MOVREL-NEXT:    s_mov_b64 s[12:13], 5
-; MOVREL-NEXT:    s_mov_b64 s[10:11], 4
-; MOVREL-NEXT:    s_mov_b64 s[8:9], 3
-; MOVREL-NEXT:    s_mov_b64 s[6:7], 2
-; MOVREL-NEXT:    s_mov_b64 s[4:5], 1
-; MOVREL-NEXT:    s_mov_b64 s[20:21], exec
-; MOVREL-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s22, v0
-; MOVREL-NEXT:    s_lshl_b32 m0, s22, 1
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s22, v0
-; MOVREL-NEXT:    s_movrels_b32 s22, s4
-; MOVREL-NEXT:    s_movrels_b32 s23, s5
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB6_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[20:21]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s22
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s23
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8i64_const_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[18:19], 8
+; GCN-NEXT:    s_mov_b64 s[16:17], 7
+; GCN-NEXT:    s_mov_b64 s[14:15], 6
+; GCN-NEXT:    s_mov_b64 s[12:13], 5
+; GCN-NEXT:    s_mov_b64 s[10:11], 4
+; GCN-NEXT:    s_mov_b64 s[8:9], 3
+; GCN-NEXT:    s_mov_b64 s[6:7], 2
+; GCN-NEXT:    s_mov_b64 s[4:5], 1
+; GCN-NEXT:    s_mov_b64 s[20:21], exec
+; GCN-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s22, v0
+; GCN-NEXT:    s_lshl_b32 m0, s22, 1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s22, v0
+; GCN-NEXT:    s_movrels_b32 s22, s4
+; GCN-NEXT:    s_movrels_b32 s23, s5
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB6_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[20:21]
+; GCN-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, i32 %sel
   ret i64 %ext
@@ -543,35 +435,20 @@ entry:
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_s_s_offset3(<8 x float> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_s_s_offset3:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 m0, s10
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_movrels_b32 s0, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_s_s_offset3:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 m0, s10
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_movrels_b32 s0, s3
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_s_s_offset3:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 m0, s10
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_movrels_b32 s0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %add = add i32 %sel, 3
   %ext = extractelement <8 x float> %vec, i32 %add
@@ -620,49 +497,27 @@ entry:
 }
 
 define amdgpu_ps double @dyn_extract_v8f64_s_s_offset1(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset1:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 m0, s18
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[2:3]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset1:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 m0, s18
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[2:3]
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset1:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 m0, s18
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, s17
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %add = add i32 %sel, 1
   %ext = extractelement <8 x double> %vec, i32 %add
@@ -670,49 +525,27 @@ entry:
 }
 
 define amdgpu_ps double @dyn_extract_v8f64_s_s_offset2(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset2:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 m0, s18
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[4:5]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset2:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 m0, s18
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[4:5]
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset2:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 m0, s18
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, s17
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[4:5]
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %add = add i32 %sel, 2
   %ext = extractelement <8 x double> %vec, i32 %add
@@ -720,7 +553,119 @@ entry:
 }
 
 define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset3:
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset3:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 m0, s18
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, s17
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[6:7]
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 3
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset4:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 m0, s18
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, s17
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[8:9]
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 4
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset5:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 m0, s18
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, s17
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[10:11]
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 5
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset6:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 m0, s18
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, s17
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[12:13]
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 6
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset7:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_mov_b32 s0, s2
 ; GPRIDX-NEXT:    s_mov_b32 s1, s3
@@ -730,7 +675,6 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec,
 ; GPRIDX-NEXT:    s_mov_b32 s5, s7
 ; GPRIDX-NEXT:    s_mov_b32 s6, s8
 ; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 m0, s18
 ; GPRIDX-NEXT:    s_mov_b32 s8, s10
 ; GPRIDX-NEXT:    s_mov_b32 s9, s11
 ; GPRIDX-NEXT:    s_mov_b32 s10, s12
@@ -739,10 +683,12 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec,
 ; GPRIDX-NEXT:    s_mov_b32 s13, s15
 ; GPRIDX-NEXT:    s_mov_b32 s14, s16
 ; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[6:7]
+; GPRIDX-NEXT:    s_mov_b32 m0, s18
+; GPRIDX-NEXT:    s_nop 0
+; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[14:15]
 ; GPRIDX-NEXT:    ; return to shader part epilog
 ;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset3:
+; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset7:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_mov_b32 s0, s2
 ; MOVREL-NEXT:    s_mov_b32 s1, s3
@@ -752,7 +698,6 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec,
 ; MOVREL-NEXT:    s_mov_b32 s5, s7
 ; MOVREL-NEXT:    s_mov_b32 s6, s8
 ; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 m0, s18
 ; MOVREL-NEXT:    s_mov_b32 s8, s10
 ; MOVREL-NEXT:    s_mov_b32 s9, s11
 ; MOVREL-NEXT:    s_mov_b32 s10, s12
@@ -761,259 +706,37 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec,
 ; MOVREL-NEXT:    s_mov_b32 s13, s15
 ; MOVREL-NEXT:    s_mov_b32 s14, s16
 ; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[6:7]
+; MOVREL-NEXT:    s_mov_b32 m0, s18
+; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[14:15]
 ; MOVREL-NEXT:    ; return to shader part epilog
 entry:
-  %add = add i32 %sel, 3
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset4:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 m0, s18
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[8:9]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset4:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 m0, s18
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[8:9]
-; MOVREL-NEXT:    ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 4
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset5:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 m0, s18
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[10:11]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset5:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 m0, s18
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[10:11]
-; MOVREL-NEXT:    ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 5
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset6:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 m0, s18
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[12:13]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset6:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 m0, s18
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[12:13]
-; MOVREL-NEXT:    ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 6
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset7:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_mov_b32 m0, s18
-; GPRIDX-NEXT:    s_nop 0
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[14:15]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset7:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_mov_b32 m0, s18
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[14:15]
-; MOVREL-NEXT:    ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 7
+  %add = add i32 %sel, 7
   %ext = extractelement <8 x double> %vec, i32 %add
   ret double %ext
 }
 
 define amdgpu_ps double @dyn_extract_v8f64_s_s_offsetm1(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offsetm1:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_add_i32 m0, s18, -1
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[0:1]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offsetm1:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_add_i32 m0, s18, -1
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[0:1]
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f64_s_s_offsetm1:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_add_i32 m0, s18, -1
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s15, s17
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %add = add i32 %sel, -1
   %ext = extractelement <8 x double> %vec, i32 %add
@@ -1310,260 +1033,140 @@ entry:
 }
 
 define amdgpu_ps float @dyn_extract_v16f32_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v16f32_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s4, 1.0
-; GPRIDX-NEXT:    s_mov_b32 m0, s2
-; GPRIDX-NEXT:    s_mov_b32 s19, 0x41800000
-; GPRIDX-NEXT:    s_mov_b32 s18, 0x41700000
-; GPRIDX-NEXT:    s_mov_b32 s17, 0x41600000
-; GPRIDX-NEXT:    s_mov_b32 s16, 0x41500000
-; GPRIDX-NEXT:    s_mov_b32 s15, 0x41400000
-; GPRIDX-NEXT:    s_mov_b32 s14, 0x41300000
-; GPRIDX-NEXT:    s_mov_b32 s13, 0x41200000
-; GPRIDX-NEXT:    s_mov_b32 s12, 0x41100000
-; GPRIDX-NEXT:    s_mov_b32 s11, 0x41000000
-; GPRIDX-NEXT:    s_mov_b32 s10, 0x40e00000
-; GPRIDX-NEXT:    s_mov_b32 s9, 0x40c00000
-; GPRIDX-NEXT:    s_mov_b32 s8, 0x40a00000
-; GPRIDX-NEXT:    s_mov_b32 s7, 4.0
-; GPRIDX-NEXT:    s_mov_b32 s6, 0x40400000
-; GPRIDX-NEXT:    s_mov_b32 s5, 2.0
-; GPRIDX-NEXT:    s_movrels_b32 s0, s4
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v16f32_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s4, 1.0
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    s_mov_b32 s19, 0x41800000
-; MOVREL-NEXT:    s_mov_b32 s18, 0x41700000
-; MOVREL-NEXT:    s_mov_b32 s17, 0x41600000
-; MOVREL-NEXT:    s_mov_b32 s16, 0x41500000
-; MOVREL-NEXT:    s_mov_b32 s15, 0x41400000
-; MOVREL-NEXT:    s_mov_b32 s14, 0x41300000
-; MOVREL-NEXT:    s_mov_b32 s13, 0x41200000
-; MOVREL-NEXT:    s_mov_b32 s12, 0x41100000
-; MOVREL-NEXT:    s_mov_b32 s11, 0x41000000
-; MOVREL-NEXT:    s_mov_b32 s10, 0x40e00000
-; MOVREL-NEXT:    s_mov_b32 s9, 0x40c00000
-; MOVREL-NEXT:    s_mov_b32 s8, 0x40a00000
-; MOVREL-NEXT:    s_mov_b32 s7, 4.0
-; MOVREL-NEXT:    s_mov_b32 s6, 0x40400000
-; MOVREL-NEXT:    s_mov_b32 s5, 2.0
-; MOVREL-NEXT:    s_movrels_b32 s0, s4
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v16f32_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s4, 1.0
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_mov_b32 s19, 0x41800000
+; GCN-NEXT:    s_mov_b32 s18, 0x41700000
+; GCN-NEXT:    s_mov_b32 s17, 0x41600000
+; GCN-NEXT:    s_mov_b32 s16, 0x41500000
+; GCN-NEXT:    s_mov_b32 s15, 0x41400000
+; GCN-NEXT:    s_mov_b32 s14, 0x41300000
+; GCN-NEXT:    s_mov_b32 s13, 0x41200000
+; GCN-NEXT:    s_mov_b32 s12, 0x41100000
+; GCN-NEXT:    s_mov_b32 s11, 0x41000000
+; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
+; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
+; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
+; GCN-NEXT:    s_mov_b32 s7, 4.0
+; GCN-NEXT:    s_mov_b32 s6, 0x40400000
+; GCN-NEXT:    s_mov_b32 s5, 2.0
+; GCN-NEXT:    s_movrels_b32 s0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v32f32_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v32f32_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s36, 1.0
-; GPRIDX-NEXT:    s_mov_b32 m0, s2
-; GPRIDX-NEXT:    s_mov_b32 s67, 0x42000000
-; GPRIDX-NEXT:    s_mov_b32 s66, 0x41f80000
-; GPRIDX-NEXT:    s_mov_b32 s65, 0x41f00000
-; GPRIDX-NEXT:    s_mov_b32 s64, 0x41e80000
-; GPRIDX-NEXT:    s_mov_b32 s63, 0x41e00000
-; GPRIDX-NEXT:    s_mov_b32 s62, 0x41d80000
-; GPRIDX-NEXT:    s_mov_b32 s61, 0x41d00000
-; GPRIDX-NEXT:    s_mov_b32 s60, 0x41c80000
-; GPRIDX-NEXT:    s_mov_b32 s59, 0x41c00000
-; GPRIDX-NEXT:    s_mov_b32 s58, 0x41b80000
-; GPRIDX-NEXT:    s_mov_b32 s57, 0x41b00000
-; GPRIDX-NEXT:    s_mov_b32 s56, 0x41a80000
-; GPRIDX-NEXT:    s_mov_b32 s55, 0x41a00000
-; GPRIDX-NEXT:    s_mov_b32 s54, 0x41980000
-; GPRIDX-NEXT:    s_mov_b32 s53, 0x41900000
-; GPRIDX-NEXT:    s_mov_b32 s52, 0x41880000
-; GPRIDX-NEXT:    s_mov_b32 s51, 0x41800000
-; GPRIDX-NEXT:    s_mov_b32 s50, 0x41700000
-; GPRIDX-NEXT:    s_mov_b32 s49, 0x41600000
-; GPRIDX-NEXT:    s_mov_b32 s48, 0x41500000
-; GPRIDX-NEXT:    s_mov_b32 s47, 0x41400000
-; GPRIDX-NEXT:    s_mov_b32 s46, 0x41300000
-; GPRIDX-NEXT:    s_mov_b32 s45, 0x41200000
-; GPRIDX-NEXT:    s_mov_b32 s44, 0x41100000
-; GPRIDX-NEXT:    s_mov_b32 s43, 0x41000000
-; GPRIDX-NEXT:    s_mov_b32 s42, 0x40e00000
-; GPRIDX-NEXT:    s_mov_b32 s41, 0x40c00000
-; GPRIDX-NEXT:    s_mov_b32 s40, 0x40a00000
-; GPRIDX-NEXT:    s_mov_b32 s39, 4.0
-; GPRIDX-NEXT:    s_mov_b32 s38, 0x40400000
-; GPRIDX-NEXT:    s_mov_b32 s37, 2.0
-; GPRIDX-NEXT:    s_movrels_b32 s0, s36
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v32f32_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s36, 1.0
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    s_mov_b32 s67, 0x42000000
-; MOVREL-NEXT:    s_mov_b32 s66, 0x41f80000
-; MOVREL-NEXT:    s_mov_b32 s65, 0x41f00000
-; MOVREL-NEXT:    s_mov_b32 s64, 0x41e80000
-; MOVREL-NEXT:    s_mov_b32 s63, 0x41e00000
-; MOVREL-NEXT:    s_mov_b32 s62, 0x41d80000
-; MOVREL-NEXT:    s_mov_b32 s61, 0x41d00000
-; MOVREL-NEXT:    s_mov_b32 s60, 0x41c80000
-; MOVREL-NEXT:    s_mov_b32 s59, 0x41c00000
-; MOVREL-NEXT:    s_mov_b32 s58, 0x41b80000
-; MOVREL-NEXT:    s_mov_b32 s57, 0x41b00000
-; MOVREL-NEXT:    s_mov_b32 s56, 0x41a80000
-; MOVREL-NEXT:    s_mov_b32 s55, 0x41a00000
-; MOVREL-NEXT:    s_mov_b32 s54, 0x41980000
-; MOVREL-NEXT:    s_mov_b32 s53, 0x41900000
-; MOVREL-NEXT:    s_mov_b32 s52, 0x41880000
-; MOVREL-NEXT:    s_mov_b32 s51, 0x41800000
-; MOVREL-NEXT:    s_mov_b32 s50, 0x41700000
-; MOVREL-NEXT:    s_mov_b32 s49, 0x41600000
-; MOVREL-NEXT:    s_mov_b32 s48, 0x41500000
-; MOVREL-NEXT:    s_mov_b32 s47, 0x41400000
-; MOVREL-NEXT:    s_mov_b32 s46, 0x41300000
-; MOVREL-NEXT:    s_mov_b32 s45, 0x41200000
-; MOVREL-NEXT:    s_mov_b32 s44, 0x41100000
-; MOVREL-NEXT:    s_mov_b32 s43, 0x41000000
-; MOVREL-NEXT:    s_mov_b32 s42, 0x40e00000
-; MOVREL-NEXT:    s_mov_b32 s41, 0x40c00000
-; MOVREL-NEXT:    s_mov_b32 s40, 0x40a00000
-; MOVREL-NEXT:    s_mov_b32 s39, 4.0
-; MOVREL-NEXT:    s_mov_b32 s38, 0x40400000
-; MOVREL-NEXT:    s_mov_b32 s37, 2.0
-; MOVREL-NEXT:    s_movrels_b32 s0, s36
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v32f32_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s36, 1.0
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_mov_b32 s67, 0x42000000
+; GCN-NEXT:    s_mov_b32 s66, 0x41f80000
+; GCN-NEXT:    s_mov_b32 s65, 0x41f00000
+; GCN-NEXT:    s_mov_b32 s64, 0x41e80000
+; GCN-NEXT:    s_mov_b32 s63, 0x41e00000
+; GCN-NEXT:    s_mov_b32 s62, 0x41d80000
+; GCN-NEXT:    s_mov_b32 s61, 0x41d00000
+; GCN-NEXT:    s_mov_b32 s60, 0x41c80000
+; GCN-NEXT:    s_mov_b32 s59, 0x41c00000
+; GCN-NEXT:    s_mov_b32 s58, 0x41b80000
+; GCN-NEXT:    s_mov_b32 s57, 0x41b00000
+; GCN-NEXT:    s_mov_b32 s56, 0x41a80000
+; GCN-NEXT:    s_mov_b32 s55, 0x41a00000
+; GCN-NEXT:    s_mov_b32 s54, 0x41980000
+; GCN-NEXT:    s_mov_b32 s53, 0x41900000
+; GCN-NEXT:    s_mov_b32 s52, 0x41880000
+; GCN-NEXT:    s_mov_b32 s51, 0x41800000
+; GCN-NEXT:    s_mov_b32 s50, 0x41700000
+; GCN-NEXT:    s_mov_b32 s49, 0x41600000
+; GCN-NEXT:    s_mov_b32 s48, 0x41500000
+; GCN-NEXT:    s_mov_b32 s47, 0x41400000
+; GCN-NEXT:    s_mov_b32 s46, 0x41300000
+; GCN-NEXT:    s_mov_b32 s45, 0x41200000
+; GCN-NEXT:    s_mov_b32 s44, 0x41100000
+; GCN-NEXT:    s_mov_b32 s43, 0x41000000
+; GCN-NEXT:    s_mov_b32 s42, 0x40e00000
+; GCN-NEXT:    s_mov_b32 s41, 0x40c00000
+; GCN-NEXT:    s_mov_b32 s40, 0x40a00000
+; GCN-NEXT:    s_mov_b32 s39, 4.0
+; GCN-NEXT:    s_mov_b32 s38, 0x40400000
+; GCN-NEXT:    s_mov_b32 s37, 2.0
+; GCN-NEXT:    s_movrels_b32 s0, s36
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v16f64_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s66, 0
-; GPRIDX-NEXT:    s_mov_b64 s[36:37], 1.0
-; GPRIDX-NEXT:    s_mov_b32 m0, s2
-; GPRIDX-NEXT:    s_mov_b32 s67, 0x40300000
-; GPRIDX-NEXT:    s_mov_b32 s65, 0x402e0000
-; GPRIDX-NEXT:    s_mov_b32 s64, s66
-; GPRIDX-NEXT:    s_mov_b32 s63, 0x402c0000
-; GPRIDX-NEXT:    s_mov_b32 s62, s66
-; GPRIDX-NEXT:    s_mov_b32 s61, 0x402a0000
-; GPRIDX-NEXT:    s_mov_b32 s60, s66
-; GPRIDX-NEXT:    s_mov_b32 s59, 0x40280000
-; GPRIDX-NEXT:    s_mov_b32 s58, s66
-; GPRIDX-NEXT:    s_mov_b32 s57, 0x40260000
-; GPRIDX-NEXT:    s_mov_b32 s56, s66
-; GPRIDX-NEXT:    s_mov_b32 s55, 0x40240000
-; GPRIDX-NEXT:    s_mov_b32 s54, s66
-; GPRIDX-NEXT:    s_mov_b32 s53, 0x40220000
-; GPRIDX-NEXT:    s_mov_b32 s52, s66
-; GPRIDX-NEXT:    s_mov_b32 s51, 0x40200000
-; GPRIDX-NEXT:    s_mov_b32 s50, s66
-; GPRIDX-NEXT:    s_mov_b32 s49, 0x401c0000
-; GPRIDX-NEXT:    s_mov_b32 s48, s66
-; GPRIDX-NEXT:    s_mov_b32 s47, 0x40180000
-; GPRIDX-NEXT:    s_mov_b32 s46, s66
-; GPRIDX-NEXT:    s_mov_b32 s45, 0x40140000
-; GPRIDX-NEXT:    s_mov_b32 s44, s66
-; GPRIDX-NEXT:    s_mov_b64 s[42:43], 4.0
-; GPRIDX-NEXT:    s_mov_b32 s41, 0x40080000
-; GPRIDX-NEXT:    s_mov_b32 s40, s66
-; GPRIDX-NEXT:    s_mov_b64 s[38:39], 2.0
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[36:37]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v16f64_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s66, 0
-; MOVREL-NEXT:    s_mov_b64 s[36:37], 1.0
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    s_mov_b32 s67, 0x40300000
-; MOVREL-NEXT:    s_mov_b32 s65, 0x402e0000
-; MOVREL-NEXT:    s_mov_b32 s64, s66
-; MOVREL-NEXT:    s_mov_b32 s63, 0x402c0000
-; MOVREL-NEXT:    s_mov_b32 s62, s66
-; MOVREL-NEXT:    s_mov_b32 s61, 0x402a0000
-; MOVREL-NEXT:    s_mov_b32 s60, s66
-; MOVREL-NEXT:    s_mov_b32 s59, 0x40280000
-; MOVREL-NEXT:    s_mov_b32 s58, s66
-; MOVREL-NEXT:    s_mov_b32 s57, 0x40260000
-; MOVREL-NEXT:    s_mov_b32 s56, s66
-; MOVREL-NEXT:    s_mov_b32 s55, 0x40240000
-; MOVREL-NEXT:    s_mov_b32 s54, s66
-; MOVREL-NEXT:    s_mov_b32 s53, 0x40220000
-; MOVREL-NEXT:    s_mov_b32 s52, s66
-; MOVREL-NEXT:    s_mov_b32 s51, 0x40200000
-; MOVREL-NEXT:    s_mov_b32 s50, s66
-; MOVREL-NEXT:    s_mov_b32 s49, 0x401c0000
-; MOVREL-NEXT:    s_mov_b32 s48, s66
-; MOVREL-NEXT:    s_mov_b32 s47, 0x40180000
-; MOVREL-NEXT:    s_mov_b32 s46, s66
-; MOVREL-NEXT:    s_mov_b32 s45, 0x40140000
-; MOVREL-NEXT:    s_mov_b32 s44, s66
-; MOVREL-NEXT:    s_mov_b64 s[42:43], 4.0
-; MOVREL-NEXT:    s_mov_b32 s41, 0x40080000
-; MOVREL-NEXT:    s_mov_b32 s40, s66
-; MOVREL-NEXT:    s_mov_b64 s[38:39], 2.0
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[36:37]
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v16f64_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s66, 0
+; GCN-NEXT:    s_mov_b64 s[36:37], 1.0
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_mov_b32 s67, 0x40300000
+; GCN-NEXT:    s_mov_b32 s65, 0x402e0000
+; GCN-NEXT:    s_mov_b32 s64, s66
+; GCN-NEXT:    s_mov_b32 s63, 0x402c0000
+; GCN-NEXT:    s_mov_b32 s62, s66
+; GCN-NEXT:    s_mov_b32 s61, 0x402a0000
+; GCN-NEXT:    s_mov_b32 s60, s66
+; GCN-NEXT:    s_mov_b32 s59, 0x40280000
+; GCN-NEXT:    s_mov_b32 s58, s66
+; GCN-NEXT:    s_mov_b32 s57, 0x40260000
+; GCN-NEXT:    s_mov_b32 s56, s66
+; GCN-NEXT:    s_mov_b32 s55, 0x40240000
+; GCN-NEXT:    s_mov_b32 s54, s66
+; GCN-NEXT:    s_mov_b32 s53, 0x40220000
+; GCN-NEXT:    s_mov_b32 s52, s66
+; GCN-NEXT:    s_mov_b32 s51, 0x40200000
+; GCN-NEXT:    s_mov_b32 s50, s66
+; GCN-NEXT:    s_mov_b32 s49, 0x401c0000
+; GCN-NEXT:    s_mov_b32 s48, s66
+; GCN-NEXT:    s_mov_b32 s47, 0x40180000
+; GCN-NEXT:    s_mov_b32 s46, s66
+; GCN-NEXT:    s_mov_b32 s45, 0x40140000
+; GCN-NEXT:    s_mov_b32 s44, s66
+; GCN-NEXT:    s_mov_b64 s[42:43], 4.0
+; GCN-NEXT:    s_mov_b32 s41, 0x40080000
+; GCN-NEXT:    s_mov_b32 s40, s66
+; GCN-NEXT:    s_mov_b64 s[38:39], 2.0
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[36:37]
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
   ret double %ext
 }
 
 define amdgpu_ps float @dyn_extract_v6f32_s_v(<6 x float> inreg %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f32_s_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b64 s[6:7], exec
-; GPRIDX-NEXT:  BB33_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s8, v0
-; GPRIDX-NEXT:    s_mov_b32 m0, s8
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
-; GPRIDX-NEXT:    s_movrels_b32 s8, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s8
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB33_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[6:7]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v6f32_s_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b64 s[6:7], exec
-; MOVREL-NEXT:  BB33_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s8, v0
-; MOVREL-NEXT:    s_mov_b32 m0, s8
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
-; MOVREL-NEXT:    s_movrels_b32 s8, s0
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s8
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB33_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[6:7]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v6f32_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
+; GCN-NEXT:  BB33_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s8, v0
+; GCN-NEXT:    s_mov_b32 m0, s8
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
+; GCN-NEXT:    s_movrels_b32 s8, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB33_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
   ret float %ext
@@ -1628,84 +1231,47 @@ entry:
 }
 
 define amdgpu_ps float @dyn_extract_v6f32_s_s(<6 x float> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f32_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 m0, s8
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_movrels_b32 s0, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v6f32_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 m0, s8
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_movrels_b32 s0, s0
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v6f32_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 m0, s8
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v7f32_s_v(<7 x float> inreg %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f32_s_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b64 s[8:9], exec
-; GPRIDX-NEXT:  BB37_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s7, v0
-; GPRIDX-NEXT:    s_mov_b32 m0, s7
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v0
-; GPRIDX-NEXT:    s_movrels_b32 s7, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s7
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB37_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[8:9]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v7f32_s_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b64 s[8:9], exec
-; MOVREL-NEXT:  BB37_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s7, v0
-; MOVREL-NEXT:    s_mov_b32 m0, s7
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v0
-; MOVREL-NEXT:    s_movrels_b32 s7, s0
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s7
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB37_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[8:9]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v7f32_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b64 s[8:9], exec
+; GCN-NEXT:  BB37_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s7, v0
+; GCN-NEXT:    s_mov_b32 m0, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v0
+; GCN-NEXT:    s_movrels_b32 s7, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB37_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
   ret float %ext
@@ -1770,94 +1336,52 @@ entry:
 }
 
 define amdgpu_ps float @dyn_extract_v7f32_s_s(<7 x float> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f32_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 m0, s9
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_movrels_b32 s0, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v7f32_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 m0, s9
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_movrels_b32 s0, s0
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v7f32_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 m0, s9
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f64_s_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s16, s2
-; GPRIDX-NEXT:    s_mov_b32 s17, s3
-; GPRIDX-NEXT:    s_mov_b32 s18, s4
-; GPRIDX-NEXT:    s_mov_b32 s19, s5
-; GPRIDX-NEXT:    s_mov_b32 s20, s6
-; GPRIDX-NEXT:    s_mov_b32 s21, s7
-; GPRIDX-NEXT:    s_mov_b32 s22, s8
-; GPRIDX-NEXT:    s_mov_b32 s23, s9
-; GPRIDX-NEXT:    s_mov_b32 s24, s10
-; GPRIDX-NEXT:    s_mov_b32 s25, s11
-; GPRIDX-NEXT:    s_mov_b32 s26, s12
-; GPRIDX-NEXT:    s_mov_b32 s27, s13
-; GPRIDX-NEXT:    s_mov_b64 s[2:3], exec
-; GPRIDX-NEXT:  BB41_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
-; GPRIDX-NEXT:    s_lshl_b32 m0, s0, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; GPRIDX-NEXT:    s_movrels_b32 s0, s16
-; GPRIDX-NEXT:    s_movrels_b32 s1, s17
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB41_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[2:3]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v6f64_s_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s16, s2
-; MOVREL-NEXT:    s_mov_b32 s17, s3
-; MOVREL-NEXT:    s_mov_b32 s18, s4
-; MOVREL-NEXT:    s_mov_b32 s19, s5
-; MOVREL-NEXT:    s_mov_b32 s20, s6
-; MOVREL-NEXT:    s_mov_b32 s21, s7
-; MOVREL-NEXT:    s_mov_b32 s22, s8
-; MOVREL-NEXT:    s_mov_b32 s23, s9
-; MOVREL-NEXT:    s_mov_b32 s24, s10
-; MOVREL-NEXT:    s_mov_b32 s25, s11
-; MOVREL-NEXT:    s_mov_b32 s26, s12
-; MOVREL-NEXT:    s_mov_b32 s27, s13
-; MOVREL-NEXT:    s_mov_b64 s[2:3], exec
-; MOVREL-NEXT:  BB41_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
-; MOVREL-NEXT:    s_lshl_b32 m0, s0, 1
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; MOVREL-NEXT:    s_movrels_b32 s0, s16
-; MOVREL-NEXT:    s_movrels_b32 s1, s17
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB41_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[2:3]
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v6f64_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s16, s2
+; GCN-NEXT:    s_mov_b32 s17, s3
+; GCN-NEXT:    s_mov_b32 s18, s4
+; GCN-NEXT:    s_mov_b32 s19, s5
+; GCN-NEXT:    s_mov_b32 s20, s6
+; GCN-NEXT:    s_mov_b32 s21, s7
+; GCN-NEXT:    s_mov_b32 s22, s8
+; GCN-NEXT:    s_mov_b32 s23, s9
+; GCN-NEXT:    s_mov_b32 s24, s10
+; GCN-NEXT:    s_mov_b32 s25, s11
+; GCN-NEXT:    s_mov_b32 s26, s12
+; GCN-NEXT:    s_mov_b32 s27, s13
+; GCN-NEXT:    s_mov_b64 s[2:3], exec
+; GCN-NEXT:  BB41_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    s_lshl_b32 m0, s0, 1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; GCN-NEXT:    s_movrels_b32 s0, s16
+; GCN-NEXT:    s_movrels_b32 s1, s17
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB41_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x double> %vec, i32 %sel
   ret double %ext
@@ -1934,106 +1458,58 @@ entry:
 }
 
 define amdgpu_ps double @dyn_extract_v6f64_s_s(<6 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f64_s_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 m0, s14
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[0:1]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v6f64_s_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 m0, s14
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[0:1]
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v6f64_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 m0, s14
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x double> %vec, i32 %sel
   ret double %ext
 }
 
 define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f64_s_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s16, s2
-; GPRIDX-NEXT:    s_mov_b32 s17, s3
-; GPRIDX-NEXT:    s_mov_b32 s18, s4
-; GPRIDX-NEXT:    s_mov_b32 s19, s5
-; GPRIDX-NEXT:    s_mov_b32 s20, s6
-; GPRIDX-NEXT:    s_mov_b32 s21, s7
-; GPRIDX-NEXT:    s_mov_b32 s22, s8
-; GPRIDX-NEXT:    s_mov_b32 s23, s9
-; GPRIDX-NEXT:    s_mov_b32 s24, s10
-; GPRIDX-NEXT:    s_mov_b32 s25, s11
-; GPRIDX-NEXT:    s_mov_b32 s26, s12
-; GPRIDX-NEXT:    s_mov_b32 s27, s13
-; GPRIDX-NEXT:    s_mov_b32 s28, s14
-; GPRIDX-NEXT:    s_mov_b32 s29, s15
-; GPRIDX-NEXT:    s_mov_b64 s[2:3], exec
-; GPRIDX-NEXT:  BB45_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s0, v0
-; GPRIDX-NEXT:    s_lshl_b32 m0, s0, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; GPRIDX-NEXT:    s_movrels_b32 s0, s16
-; GPRIDX-NEXT:    s_movrels_b32 s1, s17
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB45_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[2:3]
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v7f64_s_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s16, s2
-; MOVREL-NEXT:    s_mov_b32 s17, s3
-; MOVREL-NEXT:    s_mov_b32 s18, s4
-; MOVREL-NEXT:    s_mov_b32 s19, s5
-; MOVREL-NEXT:    s_mov_b32 s20, s6
-; MOVREL-NEXT:    s_mov_b32 s21, s7
-; MOVREL-NEXT:    s_mov_b32 s22, s8
-; MOVREL-NEXT:    s_mov_b32 s23, s9
-; MOVREL-NEXT:    s_mov_b32 s24, s10
-; MOVREL-NEXT:    s_mov_b32 s25, s11
-; MOVREL-NEXT:    s_mov_b32 s26, s12
-; MOVREL-NEXT:    s_mov_b32 s27, s13
-; MOVREL-NEXT:    s_mov_b32 s28, s14
-; MOVREL-NEXT:    s_mov_b32 s29, s15
-; MOVREL-NEXT:    s_mov_b64 s[2:3], exec
-; MOVREL-NEXT:  BB45_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s0, v0
-; MOVREL-NEXT:    s_lshl_b32 m0, s0, 1
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; MOVREL-NEXT:    s_movrels_b32 s0, s16
-; MOVREL-NEXT:    s_movrels_b32 s1, s17
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB45_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[2:3]
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v7f64_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s16, s2
+; GCN-NEXT:    s_mov_b32 s17, s3
+; GCN-NEXT:    s_mov_b32 s18, s4
+; GCN-NEXT:    s_mov_b32 s19, s5
+; GCN-NEXT:    s_mov_b32 s20, s6
+; GCN-NEXT:    s_mov_b32 s21, s7
+; GCN-NEXT:    s_mov_b32 s22, s8
+; GCN-NEXT:    s_mov_b32 s23, s9
+; GCN-NEXT:    s_mov_b32 s24, s10
+; GCN-NEXT:    s_mov_b32 s25, s11
+; GCN-NEXT:    s_mov_b32 s26, s12
+; GCN-NEXT:    s_mov_b32 s27, s13
+; GCN-NEXT:    s_mov_b32 s28, s14
+; GCN-NEXT:    s_mov_b32 s29, s15
+; GCN-NEXT:    s_mov_b64 s[2:3], exec
+; GCN-NEXT:  BB45_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    s_lshl_b32 m0, s0, 1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; GCN-NEXT:    s_movrels_b32 s0, s16
+; GCN-NEXT:    s_movrels_b32 s1, s17
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB45_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x double> %vec, i32 %sel
   ret double %ext
@@ -2110,46 +1586,466 @@ entry:
 }
 
 define amdgpu_ps double @dyn_extract_v7f64_s_s(<7 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f64_s_s:
+; GCN-LABEL: dyn_extract_v7f64_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 m0, s16
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <7 x double> %vec, i32 %sel
+  ret double %ext
+}
+
+define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v5f64_s_s:
+; GPRIDX:         .amd_kernel_code_t
+; GPRIDX-NEXT:     amd_code_version_major = 1
+; GPRIDX-NEXT:     amd_code_version_minor = 2
+; GPRIDX-NEXT:     amd_machine_kind = 1
+; GPRIDX-NEXT:     amd_machine_version_major = 9
+; GPRIDX-NEXT:     amd_machine_version_minor = 0
+; GPRIDX-NEXT:     amd_machine_version_stepping = 0
+; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
+; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
+; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     priority = 0
+; GPRIDX-NEXT:     float_mode = 240
+; GPRIDX-NEXT:     priv = 0
+; GPRIDX-NEXT:     enable_dx10_clamp = 1
+; GPRIDX-NEXT:     debug_mode = 0
+; GPRIDX-NEXT:     enable_ieee_mode = 1
+; GPRIDX-NEXT:     enable_wgp_mode = 0
+; GPRIDX-NEXT:     enable_mem_ordered = 0
+; GPRIDX-NEXT:     enable_fwd_progress = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT:     user_sgpr_count = 6
+; GPRIDX-NEXT:     enable_trap_handler = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_z = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_info = 0
+; GPRIDX-NEXT:     enable_vgpr_workitem_id = 0
+; GPRIDX-NEXT:     enable_exception_msb = 0
+; GPRIDX-NEXT:     granulated_lds_size = 0
+; GPRIDX-NEXT:     enable_exception = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GPRIDX-NEXT:     enable_sgpr_dispatch_ptr = 0
+; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
+; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GPRIDX-NEXT:     enable_wavefront_size32 = 0
+; GPRIDX-NEXT:     enable_ordered_append_gds = 0
+; GPRIDX-NEXT:     private_element_size = 1
+; GPRIDX-NEXT:     is_ptr64 = 1
+; GPRIDX-NEXT:     is_dynamic_callstack = 0
+; GPRIDX-NEXT:     is_debug_enabled = 0
+; GPRIDX-NEXT:     is_xnack_enabled = 0
+; GPRIDX-NEXT:     workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT:     workgroup_group_segment_byte_size = 0
+; GPRIDX-NEXT:     gds_segment_byte_size = 0
+; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
+; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
+; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     workitem_vgpr_count = 4
+; GPRIDX-NEXT:     reserved_vgpr_first = 0
+; GPRIDX-NEXT:     reserved_vgpr_count = 0
+; GPRIDX-NEXT:     reserved_sgpr_first = 0
+; GPRIDX-NEXT:     reserved_sgpr_count = 0
+; GPRIDX-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GPRIDX-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GPRIDX-NEXT:     kernarg_segment_alignment = 4
+; GPRIDX-NEXT:     group_segment_alignment = 4
+; GPRIDX-NEXT:     private_segment_alignment = 4
+; GPRIDX-NEXT:     wavefront_size = 6
+; GPRIDX-NEXT:     call_convention = -1
+; GPRIDX-NEXT:     runtime_loader_kernel_symbol = 0
+; GPRIDX-NEXT:    .end_amd_kernel_code_t
+; GPRIDX-NEXT:  ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GPRIDX-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GPRIDX-NEXT:    s_mov_b32 s16, 0
+; GPRIDX-NEXT:    s_mov_b64 s[8:9], 1.0
+; GPRIDX-NEXT:    s_mov_b32 s17, 0x40140000
+; GPRIDX-NEXT:    s_mov_b64 s[14:15], 4.0
+; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
+; GPRIDX-NEXT:    s_mov_b32 m0, s2
+; GPRIDX-NEXT:    s_mov_b32 s13, 0x40080000
+; GPRIDX-NEXT:    s_mov_b32 s12, s16
+; GPRIDX-NEXT:    s_mov_b64 s[10:11], 2.0
+; GPRIDX-NEXT:    s_movrels_b64 s[2:3], s[8:9]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s0
+; GPRIDX-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GPRIDX-NEXT:    s_endpgm
+;
+; MOVREL-LABEL: dyn_extract_v5f64_s_s:
+; MOVREL:         .amd_kernel_code_t
+; MOVREL-NEXT:     amd_code_version_major = 1
+; MOVREL-NEXT:     amd_code_version_minor = 2
+; MOVREL-NEXT:     amd_machine_kind = 1
+; MOVREL-NEXT:     amd_machine_version_major = 8
+; MOVREL-NEXT:     amd_machine_version_minor = 0
+; MOVREL-NEXT:     amd_machine_version_stepping = 3
+; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
+; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
+; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     priority = 0
+; MOVREL-NEXT:     float_mode = 240
+; MOVREL-NEXT:     priv = 0
+; MOVREL-NEXT:     enable_dx10_clamp = 1
+; MOVREL-NEXT:     debug_mode = 0
+; MOVREL-NEXT:     enable_ieee_mode = 1
+; MOVREL-NEXT:     enable_wgp_mode = 0
+; MOVREL-NEXT:     enable_mem_ordered = 0
+; MOVREL-NEXT:     enable_fwd_progress = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT:     user_sgpr_count = 6
+; MOVREL-NEXT:     enable_trap_handler = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_z = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_info = 0
+; MOVREL-NEXT:     enable_vgpr_workitem_id = 0
+; MOVREL-NEXT:     enable_exception_msb = 0
+; MOVREL-NEXT:     granulated_lds_size = 0
+; MOVREL-NEXT:     enable_exception = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_buffer = 1
+; MOVREL-NEXT:     enable_sgpr_dispatch_ptr = 0
+; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
+; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; MOVREL-NEXT:     enable_sgpr_dispatch_id = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; MOVREL-NEXT:     enable_wavefront_size32 = 0
+; MOVREL-NEXT:     enable_ordered_append_gds = 0
+; MOVREL-NEXT:     private_element_size = 1
+; MOVREL-NEXT:     is_ptr64 = 1
+; MOVREL-NEXT:     is_dynamic_callstack = 0
+; MOVREL-NEXT:     is_debug_enabled = 0
+; MOVREL-NEXT:     is_xnack_enabled = 0
+; MOVREL-NEXT:     workitem_private_segment_byte_size = 0
+; MOVREL-NEXT:     workgroup_group_segment_byte_size = 0
+; MOVREL-NEXT:     gds_segment_byte_size = 0
+; MOVREL-NEXT:     kernarg_segment_byte_size = 28
+; MOVREL-NEXT:     workgroup_fbarrier_count = 0
+; MOVREL-NEXT:     wavefront_sgpr_count = 24
+; MOVREL-NEXT:     workitem_vgpr_count = 4
+; MOVREL-NEXT:     reserved_vgpr_first = 0
+; MOVREL-NEXT:     reserved_vgpr_count = 0
+; MOVREL-NEXT:     reserved_sgpr_first = 0
+; MOVREL-NEXT:     reserved_sgpr_count = 0
+; MOVREL-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; MOVREL-NEXT:     debug_private_segment_buffer_sgpr = 0
+; MOVREL-NEXT:     kernarg_segment_alignment = 4
+; MOVREL-NEXT:     group_segment_alignment = 4
+; MOVREL-NEXT:     private_segment_alignment = 4
+; MOVREL-NEXT:     wavefront_size = 6
+; MOVREL-NEXT:     call_convention = -1
+; MOVREL-NEXT:     runtime_loader_kernel_symbol = 0
+; MOVREL-NEXT:    .end_amd_kernel_code_t
+; MOVREL-NEXT:  ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; MOVREL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; MOVREL-NEXT:    s_mov_b32 s16, 0
+; MOVREL-NEXT:    s_mov_b64 s[8:9], 1.0
+; MOVREL-NEXT:    s_mov_b32 s17, 0x40140000
+; MOVREL-NEXT:    s_mov_b64 s[14:15], 4.0
+; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
+; MOVREL-NEXT:    s_mov_b32 m0, s2
+; MOVREL-NEXT:    s_mov_b32 s13, 0x40080000
+; MOVREL-NEXT:    s_mov_b32 s12, s16
+; MOVREL-NEXT:    s_mov_b64 s[10:11], 2.0
+; MOVREL-NEXT:    s_movrels_b64 s[2:3], s[8:9]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
+; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; MOVREL-NEXT:    s_endpgm
+entry:
+  %ext = extractelement <5 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
+; GCN-LABEL: dyn_extract_v15f32_const_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s18, 0x41700000
+; GCN-NEXT:    s_mov_b32 s17, 0x41600000
+; GCN-NEXT:    s_mov_b32 s16, 0x41500000
+; GCN-NEXT:    s_mov_b32 s15, 0x41400000
+; GCN-NEXT:    s_mov_b32 s14, 0x41300000
+; GCN-NEXT:    s_mov_b32 s13, 0x41200000
+; GCN-NEXT:    s_mov_b32 s12, 0x41100000
+; GCN-NEXT:    s_mov_b32 s11, 0x41000000
+; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
+; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
+; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
+; GCN-NEXT:    s_mov_b32 s7, 4.0
+; GCN-NEXT:    s_mov_b32 s6, 0x40400000
+; GCN-NEXT:    s_mov_b32 s5, 2.0
+; GCN-NEXT:    s_mov_b32 s4, 1.0
+; GCN-NEXT:    s_mov_b64 s[20:21], exec
+; GCN-NEXT:  BB50_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s19, v0
+; GCN-NEXT:    s_mov_b32 m0, s19
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s19, v0
+; GCN-NEXT:    s_movrels_b32 s19, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB50_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[20:21]
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ext = extractelement <15 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v15f32_const_s_s(i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v15f32_const_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s4, 1.0
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_mov_b32 s18, 0x41700000
+; GCN-NEXT:    s_mov_b32 s17, 0x41600000
+; GCN-NEXT:    s_mov_b32 s16, 0x41500000
+; GCN-NEXT:    s_mov_b32 s15, 0x41400000
+; GCN-NEXT:    s_mov_b32 s14, 0x41300000
+; GCN-NEXT:    s_mov_b32 s13, 0x41200000
+; GCN-NEXT:    s_mov_b32 s12, 0x41100000
+; GCN-NEXT:    s_mov_b32 s11, 0x41000000
+; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
+; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
+; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
+; GCN-NEXT:    s_mov_b32 s7, 4.0
+; GCN-NEXT:    s_mov_b32 s6, 0x40400000
+; GCN-NEXT:    s_mov_b32 s5, 2.0
+; GCN-NEXT:    s_movrels_b32 s0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <15 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel) {
+; GCN-LABEL: dyn_extract_v15f32_s_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b64 s[16:17], exec
+; GCN-NEXT:  BB52_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s15, v0
+; GCN-NEXT:    s_mov_b32 m0, s15
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v0
+; GCN-NEXT:    s_movrels_b32 s15, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT:    s_xor_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execnz BB52_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[16:17]
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <15 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v15f32_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 m0, s16
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_mov_b32 s8, s10
-; GPRIDX-NEXT:    s_mov_b32 s9, s11
-; GPRIDX-NEXT:    s_mov_b32 s10, s12
-; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
+; GPRIDX-NEXT:  BB53_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v15
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
+; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB53_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v16
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: dyn_extract_v15f32_v_v:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
+; MOVREL-NEXT:  BB53_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v15
+; MOVREL-NEXT:    s_mov_b32 m0, s6
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
+; MOVREL-NEXT:    v_movrels_b32_e32 v16, v0
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB53_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v16
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ext = extractelement <15 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v15f32_v_s(<15 x float> %vec, i32 inreg %sel) {
+; GPRIDX-LABEL: dyn_extract_v15f32_v_s:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
+; GPRIDX-NEXT:    s_set_gpr_idx_off
 ; GPRIDX-NEXT:    ; return to shader part epilog
 ;
-; MOVREL-LABEL: dyn_extract_v7f64_s_s:
+; MOVREL-LABEL: dyn_extract_v15f32_v_s:
 ; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 m0, s16
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_mov_b32 s8, s10
-; MOVREL-NEXT:    s_mov_b32 s9, s11
-; MOVREL-NEXT:    s_mov_b32 s10, s12
-; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_movrels_b64 s[0:1], s[0:1]
+; MOVREL-NEXT:    s_mov_b32 m0, s2
+; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
 ; MOVREL-NEXT:    ; return to shader part epilog
 entry:
-  %ext = extractelement <7 x double> %vec, i32 %sel
-  ret double %ext
+  %ext = extractelement <15 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v15f32_s_s(<15 x float> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v15f32_s_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 m0, s17
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %ext = extractelement <15 x float> %vec, i32 %sel
+  ret float %ext
+}
+
+define amdgpu_ps float @dyn_extract_v15f32_s_s_offset3(<15 x float> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v15f32_s_s_offset3:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 m0, s17
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    s_mov_b32 s10, s12
+; GCN-NEXT:    s_mov_b32 s11, s13
+; GCN-NEXT:    s_mov_b32 s12, s14
+; GCN-NEXT:    s_mov_b32 s13, s15
+; GCN-NEXT:    s_mov_b32 s14, s16
+; GCN-NEXT:    s_movrels_b32 s0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 3
+  %ext = extractelement <15 x float> %vec, i32 %add
+  ret float %ext
+}
+
+define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v15f32_v_v_offset3:
+; GPRIDX:       ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
+; GPRIDX-NEXT:  BB57_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v15
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
+; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, v3
+; GPRIDX-NEXT:    s_set_gpr_idx_off
+; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT:    s_cbranch_execnz BB57_1
+; GPRIDX-NEXT:  ; %bb.2:
+; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, v16
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; MOVREL-LABEL: dyn_extract_v15f32_v_v_offset3:
+; MOVREL:       ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
+; MOVREL-NEXT:  BB57_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT:    v_readfirstlane_b32 s6, v15
+; MOVREL-NEXT:    s_mov_b32 m0, s6
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
+; MOVREL-NEXT:    v_movrels_b32_e32 v16, v3
+; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
+; MOVREL-NEXT:    s_cbranch_execnz BB57_1
+; MOVREL-NEXT:  ; %bb.2:
+; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, v16
+; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %add = add i32 %sel, 3
+  %ext = extractelement <15 x float> %vec, i32 %add
+  ret float %ext
 }

From 5ecf85a5fcb71fc616afef8d22d2bfd9c7854402 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Thu, 28 May 2020 19:49:12 +0100
Subject: [PATCH 405/770] [AArch64] Add native CPU detection for Neoverse N1

Map the CPU ID value 0xd0c to "neoverse-n1".

Patch by James Greenhalgh.

Differential Revision: https://reviews.llvm.org/D80736
---
 llvm/lib/Support/Host.cpp       | 1 +
 llvm/unittests/Support/Host.cpp | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index d80c7228f6e04..a3a0a4c2d01a3 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -204,6 +204,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
             .Case("0xd09", "cortex-a73")
             .Case("0xd0a", "cortex-a75")
             .Case("0xd0b", "cortex-a76")
+            .Case("0xd0c", "neoverse-n1")
             .Default("generic");
   }
 
diff --git a/llvm/unittests/Support/Host.cpp b/llvm/unittests/Support/Host.cpp
index dd43f041d0e84..7d43366631c58 100644
--- a/llvm/unittests/Support/Host.cpp
+++ b/llvm/unittests/Support/Host.cpp
@@ -100,6 +100,10 @@ TEST(getLinuxHostCPUName, AArch64) {
   EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n"
                                               "CPU part        : 0xd03"),
             "cortex-a53");
+
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n"
+                                              "CPU part        : 0xd0c"),
+            "neoverse-n1");
   // Verify that both CPU implementer and CPU part are checked:
   EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x40\n"
                                               "CPU part        : 0xd03"),

From 47ffc81830d3b1e7e26cfda591c77caddd16a049 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitneyt@ca.ibm.com>
Date: Thu, 28 May 2020 19:08:07 +0000
Subject: [PATCH 406/770] Revert "[LoopUnroll] Support loops with exiting block
 that is neither header nor"

This reverts commit 281058226587d8c70172ff0fb1e55d58876da229.

Revert until
http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-debian/builds/7334
is resolved.
---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      | 115 +++++++++++-------
 .../Transforms/LoopUnroll/nonlatchcondbr.ll   |  69 -----------
 2 files changed, 74 insertions(+), 110 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index c926f1d100314..d9323e70bef60 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -81,8 +81,8 @@ using namespace llvm;
 // TODO: Should these be here or in LoopUnroll?
 STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
 STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
-STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
-                               "latch (completely or otherwise)");
+STATISTIC(NumUnrolledWithHeader, "Number of loops unrolled without a "
+                                 "conditional latch (completely or otherwise)");
 
 static cl::opt<bool>
 UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
@@ -304,30 +304,48 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     return LoopUnrollResult::Unmodified;
   }
 
-  // The current loop unroll pass can unroll loops that have
-  // (1) single latch; and
-  // (2a) latch is an exiting block; or
-  // (2b) latch is unconditional and there exists a single exiting block.
+  // The current loop unroll pass can unroll loops with a single latch or header
+  // that's a conditional branch exiting the loop.
   // FIXME: The implementation can be extended to work with more complicated
   // cases, e.g. loops with multiple latches.
   BasicBlock *Header = L->getHeader();
-  BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
-
-  // A conditional branch which exits the loop, which can be optimized to an
-  // unconditional branch in the unrolled loop in some cases.
-  BranchInst *ExitingBI = nullptr;
-  bool LatchIsExiting = L->isLoopExiting(LatchBlock);
-  if (LatchIsExiting)
-    ExitingBI = LatchBI;
-  else if (BasicBlock *ExitingBlock = L->getExitingBlock())
-    ExitingBI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
-  if (!LatchBI || !ExitingBI) {
+  BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator());
+  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+  // FIXME: Support loops without conditional latch and multiple exiting blocks.
+  if (!BI ||
+      (BI->isUnconditional() && (!HeaderBI || HeaderBI->isUnconditional() ||
+                                 L->getExitingBlock() != Header))) {
     LLVM_DEBUG(dbgs() << "  Can't unroll; loop not terminated by a conditional "
-                         "branch in latch or a single exiting block.\n");
+                         "branch in the latch or header.\n");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  auto CheckLatchSuccessors = [&](unsigned S1, unsigned S2) {
+    return BI->isConditional() && BI->getSuccessor(S1) == Header &&
+           !L->contains(BI->getSuccessor(S2));
+  };
+
+  // If we have a conditional latch, it must exit the loop.
+  if (BI && BI->isConditional() && !CheckLatchSuccessors(0, 1) &&
+      !CheckLatchSuccessors(1, 0)) {
+    LLVM_DEBUG(
+        dbgs() << "Can't unroll; a conditional latch must exit the loop");
+    return LoopUnrollResult::Unmodified;
+  }
+
+  auto CheckHeaderSuccessors = [&](unsigned S1, unsigned S2) {
+    return HeaderBI && HeaderBI->isConditional() &&
+           L->contains(HeaderBI->getSuccessor(S1)) &&
+           !L->contains(HeaderBI->getSuccessor(S2));
+  };
+
+  // If we do not have a conditional latch, the header must exit the loop.
+  if (BI && !BI->isConditional() && HeaderBI && HeaderBI->isConditional() &&
+      !CheckHeaderSuccessors(0, 1) && !CheckHeaderSuccessors(1, 0)) {
+    LLVM_DEBUG(dbgs() << "Can't unroll; conditional header must exit the loop");
     return LoopUnrollResult::Unmodified;
   }
-  LLVM_DEBUG(dbgs() << "  Exiting Block = " << ExitingBI->getParent()->getName()
-                    << "\n");
 
   if (Header->hasAddressTaken()) {
     // The loop-rotate pass can be helpful to avoid this in many cases.
@@ -516,10 +534,17 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       SE->forgetTopmostLoop(L);
   }
 
-  if (!LatchIsExiting)
-    ++NumUnrolledNotLatch;
-  bool ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
-  BasicBlock *LoopExit = ExitingBI->getSuccessor(ContinueOnTrue);
+  bool ContinueOnTrue;
+  bool LatchIsExiting = BI->isConditional();
+  BasicBlock *LoopExit = nullptr;
+  if (LatchIsExiting) {
+    ContinueOnTrue = L->contains(BI->getSuccessor(0));
+    LoopExit = BI->getSuccessor(ContinueOnTrue);
+  } else {
+    NumUnrolledWithHeader++;
+    ContinueOnTrue = L->contains(HeaderBI->getSuccessor(0));
+    LoopExit = HeaderBI->getSuccessor(ContinueOnTrue);
+  }
 
   // For the first iteration of the loop, we should use the precloned values for
   // PHI nodes.  Insert associations now.
@@ -530,13 +555,21 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   }
 
   std::vector<BasicBlock *> Headers;
-  std::vector<BasicBlock *> ExitingBlocks;
-  std::vector<BasicBlock *> ExitingSucc;
+  std::vector<BasicBlock *> HeaderSucc;
   std::vector<BasicBlock *> Latches;
   Headers.push_back(Header);
   Latches.push_back(LatchBlock);
-  ExitingBlocks.push_back(ExitingBI->getParent());
-  ExitingSucc.push_back(ExitingBI->getSuccessor(!ContinueOnTrue));
+
+  if (!LatchIsExiting) {
+    auto *Term = cast<BranchInst>(Header->getTerminator());
+    if (Term->isUnconditional() || L->contains(Term->getSuccessor(0))) {
+      assert(L->contains(Term->getSuccessor(0)));
+      HeaderSucc.push_back(Term->getSuccessor(0));
+    } else {
+      assert(L->contains(Term->getSuccessor(1)));
+      HeaderSucc.push_back(Term->getSuccessor(1));
+    }
+  }
 
   // The current on-the-fly SSA update requires blocks to be processed in
   // reverse postorder so that LastValueMap contains the correct value at each
@@ -627,12 +660,12 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       if (*BB == LatchBlock)
         Latches.push_back(New);
 
-      // Keep track of the exiting block and its successor block contained in
-      // the loop for the current iteration.
-      if (*BB == ExitingBlocks[0])
-        ExitingBlocks.push_back(New);
-      if (*BB == ExitingSucc[0])
-        ExitingSucc.push_back(New);
+      // Keep track of the successor of the new header in the current iteration.
+      for (auto *Pred : predecessors(*BB))
+        if (Pred == Header) {
+          HeaderSucc.push_back(New);
+          break;
+        }
 
       NewBlocks.push_back(New);
       UnrolledLoopBlocks.push_back(New);
@@ -751,7 +784,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   if (!LatchIsExiting) {
     // If the latch is not exiting, we may be able to simplify the conditional
     // branches in the unrolled exiting blocks.
-    for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+    for (unsigned i = 0, e = Headers.size(); i != e; ++i) {
       // The branch destination.
       unsigned j = (i + 1) % e;
       bool NeedConditional = true;
@@ -774,7 +807,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       // already correct.
       if (NeedConditional)
         continue;
-      setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
+      setDest(Headers[i], HeaderSucc[i], HeaderSucc[i], NeedConditional,
               ContinueOnTrue, false);
     }
 
@@ -800,8 +833,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
           ChildrenToUpdate.push_back(ChildBB);
       }
       BasicBlock *NewIDom;
-      BasicBlock *&TermBlock = ExitingBlocks[0];
-      auto &TermBlocks = ExitingBlocks;
+      BasicBlock *&TermBlock = LatchIsExiting ? LatchBlock : Header;
+      auto &TermBlocks = LatchIsExiting ? Latches : Headers;
       if (BB == TermBlock) {
         // The latch is special because we emit unconditional branches in
         // some cases where the original loop contained a conditional branch.
@@ -810,13 +843,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         // must also be a latch.  Specifically, the dominator is the first
         // latch which ends in a conditional branch, or the last latch if
         // there is no such latch.
-        // For loops exiting from non latch exiting block, we limit the
-        // supported loops to have a single exiting block.
+        // For loops exiting from the header, we limit the supported loops
+        // to have a single exiting block.
         NewIDom = TermBlocks.back();
         for (BasicBlock *Iter : TermBlocks) {
           Instruction *Term = Iter->getTerminator();
           if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
-            NewIDom = DT->findNearestCommonDominator(Iter, LatchBlock);
+            NewIDom = Iter;
             break;
           }
         }
diff --git a/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll b/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
deleted file mode 100644
index 547b05d1e186d..0000000000000
--- a/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-unroll -S | FileCheck %s
-; RUN: opt < %s -passes='require<opt-remark-emit>,unroll' -S | FileCheck %s
-
-define void @foo(i32* noalias %A) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
-; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
-; CHECK-NEXT:    br label [[FOR_HEADER:%.*]]
-; CHECK:       for.header:
-; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]]
-; CHECK:       for.body.for.body_crit_edge:
-; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT]], align 4
-; CHECK-NEXT:    call void @bar(i32 [[DOTPRE]])
-; CHECK-NEXT:    br label [[FOR_BODY_1:%.*]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-; CHECK:       for.body.1:
-; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE_1:%.*]]
-; CHECK:       for.body.for.body_crit_edge.1:
-; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[DOTPRE_1:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_1]], align 4
-; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_1]])
-; CHECK-NEXT:    br label [[FOR_BODY_2:%.*]]
-; CHECK:       for.body.2:
-; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE_2:%.*]]
-; CHECK:       for.body.for.body_crit_edge.2:
-; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
-; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
-; CHECK-NEXT:    br label [[FOR_BODY_3:%.*]]
-; CHECK:       for.body.3:
-; CHECK-NEXT:    br i1 false, label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.for.body_crit_edge.3:
-; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; CHECK-NEXT:    unreachable
-;
-entry:
-  %0 = load i32, i32* %A, align 4
-  call void @bar(i32 %0)
-  br label %for.header
-
-for.header:
-  %1 = phi i32 [ %0, %entry ], [ %.pre, %for.body.for.body_crit_edge ]
-  %i = phi i64 [ 0, %entry ], [ %inc, %for.body.for.body_crit_edge ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i
-  call void @bar(i32 %1)
-  br label %for.body
-
-for.body:
-  %inc = add nsw i64 %i, 1
-  %cmp = icmp slt i64 %inc, 4
-  br i1 %cmp, label %for.body.for.body_crit_edge, label %for.end
-
-for.body.for.body_crit_edge:
-  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %A, i64 %inc
-  %.pre = load i32, i32* %arrayidx.phi.trans.insert, align 4
-  br label %for.header
-
-for.end:
-  ret void
-}
-
-declare void @bar(i32)

From f719e7d9e7c411b833aa7f40e7a2a6c891def843 Mon Sep 17 00:00:00 2001
From: aartbik <ajcbik@google.com>
Date: Thu, 28 May 2020 11:04:02 -0700
Subject: [PATCH 407/770] [llvm] [MatrixIntrinsics] Add row-major support for
 llvm.matrix.transpose

Summary:
Only column-major was supported so far. This adds row-major support as well.
Note that we probably also want very efficient SIMD implementations for the
various target platforms.

Bug:
https://bugs.llvm.org/show_bug.cgi?id=46085

Reviewers: nicolasvasilache, reidtatge, bkramer, fhahn, ftynse, andydavis1, craig.topper, dcaballe, mehdi_amini, anemet

Reviewed By: fhahn

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80673
---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  34 +++---
 .../transpose-double-row-major.ll             | 111 ++++++++++++++++++
 .../transpose-float-row-major.ll              | 111 ++++++++++++++++++
 .../transpose-i32-row-major.ll                | 111 ++++++++++++++++++
 4 files changed, 350 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/transpose-double-row-major.ll
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/transpose-float-row-major.ll
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/transpose-i32-row-major.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 45709cb65503f..801069bb97b4c 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1294,24 +1294,24 @@ class LowerMatrixIntrinsics {
     VectorType *VectorTy = cast<VectorType>(InputVal->getType());
     ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
     MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
-    assert(InputMatrix.isColumnMajor() &&
-           "Row-major code-gen not supported yet!");
-
-    for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) {
-      // Build a single column vector for this row. First initialize it.
-      Value *ResultColumn = UndefValue::get(
-          VectorType::get(VectorTy->getElementType(), ArgShape.NumColumns));
-
-      // Go through the elements of this row and insert it into the resulting
-      // column vector.
-      for (auto C : enumerate(InputMatrix.columns())) {
-        Value *Elt = Builder.CreateExtractElement(C.value(), Row);
-        // We insert at index Column since that is the row index after the
-        // transpose.
-        ResultColumn =
-            Builder.CreateInsertElement(ResultColumn, Elt, C.index());
+
+    const unsigned NewNumVecs =
+        InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns;
+    const unsigned NewNumElts =
+        InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows;
+
+    for (unsigned I = 0; I < NewNumVecs; ++I) {
+      // Build a single result vector. First initialize it.
+      Value *ResultVector = UndefValue::get(
+          VectorType::get(VectorTy->getElementType(), NewNumElts));
+      // Go through the old elements and insert them into the resulting vector.
+      for (auto J : enumerate(InputMatrix.vectors())) {
+        Value *Elt = Builder.CreateExtractElement(J.value(), I);
+        // Row and column indices are transposed.
+        ResultVector =
+            Builder.CreateInsertElement(ResultVector, Elt, J.index());
       }
-      Result.addVector(ResultColumn);
+      Result.addVector(ResultVector);
     }
 
     // TODO: Improve estimate of operations needed for transposes. Currently we
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-double-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-double-row-major.ll
new file mode 100644
index 0000000000000..5d607f4b57f1c
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-double-row-major.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=RM %s
+
+define <8 x double> @transpose(<8 x double> %a) {
+; RM-LABEL: @transpose(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <4 x double> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[SPLIT]], i64 1
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i64 0
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 1
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP5]], double [[TMP6]], i64 1
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[SPLIT]], i64 2
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP8]], i64 0
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 2
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP10]], i64 1
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <4 x double> [[SPLIT]], i64 3
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> undef, double [[TMP12]], i64 0
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 3
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP14]], i64 1
+; RM-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[TMP17:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[TMP18:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    ret <8 x double> [[TMP18]]
+;
+entry:
+  %c  = call <8 x double> @llvm.matrix.transpose(<8 x double> %a, i32 2, i32 4)
+  ret <8 x double> %c
+}
+
+declare <8 x double> @llvm.matrix.transpose(<8 x double>, i32, i32)
+
+define <8 x double> @transpose_single_column(<8 x double> %a) {
+; RM-LABEL: @transpose_single_column(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> undef, <1 x i32> zeroinitializer
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <1 x i32> <i32 1>
+; RM-NEXT:    [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <1 x i32> <i32 2>
+; RM-NEXT:    [[SPLIT3:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <1 x i32> <i32 3>
+; RM-NEXT:    [[SPLIT4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <1 x i32> <i32 4>
+; RM-NEXT:    [[SPLIT5:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <1 x i32> <i32 5>
+; RM-NEXT:    [[SPLIT6:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <1 x i32> <i32 6>
+; RM-NEXT:    [[SPLIT7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <1 x i32> <i32 7>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <1 x double> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <8 x double> undef, double [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <1 x double> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <8 x double> [[TMP1]], double [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <1 x double> [[SPLIT2]], i64 0
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <8 x double> [[TMP3]], double [[TMP4]], i64 2
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <1 x double> [[SPLIT3]], i64 0
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> [[TMP5]], double [[TMP6]], i64 3
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <1 x double> [[SPLIT4]], i64 0
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <8 x double> [[TMP7]], double [[TMP8]], i64 4
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <1 x double> [[SPLIT5]], i64 0
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP10]], i64 5
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <1 x double> [[SPLIT6]], i64 0
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP12]], i64 6
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <1 x double> [[SPLIT7]], i64 0
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP14]], i64 7
+; RM-NEXT:    ret <8 x double> [[TMP15]]
+;
+entry:
+  %c  = call <8 x double> @llvm.matrix.transpose(<8 x double> %a, i32 8, i32 1)
+  ret <8 x double> %c
+}
+
+declare <12 x double> @llvm.matrix.transpose.v12f64(<12 x double>, i32, i32)
+
+define <12 x double> @transpose_double_3x4(<12 x double> %a) {
+; RM-LABEL: @transpose_double_3x4(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <12 x double> [[A:%.*]], <12 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <12 x double> [[A]], <12 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    [[SPLIT2:%.*]] = shufflevector <12 x double> [[A]], <12 x double> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <4 x double> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[SPLIT2]], i64 0
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[SPLIT]], i64 1
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 1
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[SPLIT2]], i64 1
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <4 x double> [[SPLIT]], i64 2
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 2
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1
+; RM-NEXT:    [[TMP16:%.*]] = extractelement <4 x double> [[SPLIT2]], i64 2
+; RM-NEXT:    [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2
+; RM-NEXT:    [[TMP18:%.*]] = extractelement <4 x double> [[SPLIT]], i64 3
+; RM-NEXT:    [[TMP19:%.*]] = insertelement <3 x double> undef, double [[TMP18]], i64 0
+; RM-NEXT:    [[TMP20:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 3
+; RM-NEXT:    [[TMP21:%.*]] = insertelement <3 x double> [[TMP19]], double [[TMP20]], i64 1
+; RM-NEXT:    [[TMP22:%.*]] = extractelement <4 x double> [[SPLIT2]], i64 3
+; RM-NEXT:    [[TMP23:%.*]] = insertelement <3 x double> [[TMP21]], double [[TMP22]], i64 2
+; RM-NEXT:    [[TMP24:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> [[TMP11]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; RM-NEXT:    [[TMP25:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> [[TMP23]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; RM-NEXT:    [[TMP26:%.*]] = shufflevector <6 x double> [[TMP24]], <6 x double> [[TMP25]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; RM-NEXT:    ret <12 x double> [[TMP26]]
+;
+entry:
+  %c  = call <12 x double> @llvm.matrix.transpose.v12f64(<12 x double> %a, i32 3, i32 4)
+  ret <12 x double> %c
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-float-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-float-row-major.ll
new file mode 100644
index 0000000000000..8a9bf5ae3be46
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-float-row-major.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=RM %s
+
+define <8 x float> @transpose(<8 x float> %a) {
+; RM-LABEL: @transpose(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SPLIT]], i64 1
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i64 0
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 1
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP6]], i64 1
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[SPLIT]], i64 2
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 2
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP10]], i64 1
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[SPLIT]], i64 3
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i64 0
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 3
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP14]], i64 1
+; RM-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    ret <8 x float> [[TMP18]]
+;
+entry:
+  %c  = call <8 x float> @llvm.matrix.transpose(<8 x float> %a, i32 2, i32 4)
+  ret <8 x float> %c
+}
+
+declare <8 x float> @llvm.matrix.transpose(<8 x float>, i32, i32)
+
+define <8 x float> @transpose_single_column(<8 x float> %a) {
+; RM-LABEL: @transpose_single_column(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <1 x i32> zeroinitializer
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <1 x i32> <i32 1>
+; RM-NEXT:    [[SPLIT2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <1 x i32> <i32 2>
+; RM-NEXT:    [[SPLIT3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <1 x i32> <i32 3>
+; RM-NEXT:    [[SPLIT4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <1 x i32> <i32 4>
+; RM-NEXT:    [[SPLIT5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <1 x i32> <i32 5>
+; RM-NEXT:    [[SPLIT6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <1 x i32> <i32 6>
+; RM-NEXT:    [[SPLIT7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <1 x i32> <i32 7>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <1 x float> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> undef, float [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <1 x float> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP1]], float [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <1 x float> [[SPLIT2]], i64 0
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP3]], float [[TMP4]], i64 2
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <1 x float> [[SPLIT3]], i64 0
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP6]], i64 3
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <1 x float> [[SPLIT4]], i64 0
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP7]], float [[TMP8]], i64 4
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <1 x float> [[SPLIT5]], i64 0
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i64 5
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <1 x float> [[SPLIT6]], i64 0
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i64 6
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <1 x float> [[SPLIT7]], i64 0
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP14]], i64 7
+; RM-NEXT:    ret <8 x float> [[TMP15]]
+;
+entry:
+  %c  = call <8 x float> @llvm.matrix.transpose(<8 x float> %a, i32 8, i32 1)
+  ret <8 x float> %c
+}
+
+declare <12 x float> @llvm.matrix.transpose.v12f32(<12 x float>, i32, i32)
+
+define <12 x float> @transpose_float_3x4(<12 x float> %a) {
+; RM-LABEL: @transpose_float_3x4(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <12 x float> [[A:%.*]], <12 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <12 x float> [[A]], <12 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    [[SPLIT2:%.*]] = shufflevector <12 x float> [[A]], <12 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> undef, float [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <3 x float> [[TMP1]], float [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SPLIT2]], i64 0
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP4]], i64 2
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[SPLIT]], i64 1
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <3 x float> undef, float [[TMP6]], i64 0
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 1
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <3 x float> [[TMP7]], float [[TMP8]], i64 1
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[SPLIT2]], i64 1
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <3 x float> [[TMP9]], float [[TMP10]], i64 2
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[SPLIT]], i64 2
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <3 x float> undef, float [[TMP12]], i64 0
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 2
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <3 x float> [[TMP13]], float [[TMP14]], i64 1
+; RM-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[SPLIT2]], i64 2
+; RM-NEXT:    [[TMP17:%.*]] = insertelement <3 x float> [[TMP15]], float [[TMP16]], i64 2
+; RM-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[SPLIT]], i64 3
+; RM-NEXT:    [[TMP19:%.*]] = insertelement <3 x float> undef, float [[TMP18]], i64 0
+; RM-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[SPLIT1]], i64 3
+; RM-NEXT:    [[TMP21:%.*]] = insertelement <3 x float> [[TMP19]], float [[TMP20]], i64 1
+; RM-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[SPLIT2]], i64 3
+; RM-NEXT:    [[TMP23:%.*]] = insertelement <3 x float> [[TMP21]], float [[TMP22]], i64 2
+; RM-NEXT:    [[TMP24:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> [[TMP11]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; RM-NEXT:    [[TMP25:%.*]] = shufflevector <3 x float> [[TMP17]], <3 x float> [[TMP23]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; RM-NEXT:    [[TMP26:%.*]] = shufflevector <6 x float> [[TMP24]], <6 x float> [[TMP25]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; RM-NEXT:    ret <12 x float> [[TMP26]]
+;
+entry:
+  %c  = call <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %a, i32 3, i32 4)
+  ret <12 x float> %c
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-i32-row-major.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-i32-row-major.ll
new file mode 100644
index 0000000000000..2f23d5fd8fec7
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/transpose-i32-row-major.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-matrix-intrinsics -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=RM %s
+
+define <8 x i32> @transpose(<8 x i32> %a) {
+; RM-LABEL: @transpose(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 1
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i64 0
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 1
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP6]], i64 1
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 2
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[TMP8]], i64 0
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 2
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP10]], i64 1
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 3
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i64 0
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 3
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i64 1
+; RM-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    ret <8 x i32> [[TMP18]]
+;
+entry:
+  %c  = call <8 x i32> @llvm.matrix.transpose(<8 x i32> %a, i32 2, i32 4)
+  ret <8 x i32> %c
+}
+
+declare <8 x i32> @llvm.matrix.transpose(<8 x i32>, i32, i32)
+
+define <8 x i32> @transpose_single_column(<8 x i32> %a) {
+; RM-LABEL: @transpose_single_column(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <1 x i32> zeroinitializer
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <1 x i32> <i32 1>
+; RM-NEXT:    [[SPLIT2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <1 x i32> <i32 2>
+; RM-NEXT:    [[SPLIT3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <1 x i32> <i32 3>
+; RM-NEXT:    [[SPLIT4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <1 x i32> <i32 4>
+; RM-NEXT:    [[SPLIT5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <1 x i32> <i32 5>
+; RM-NEXT:    [[SPLIT6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <1 x i32> <i32 6>
+; RM-NEXT:    [[SPLIT7:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <1 x i32> <i32 7>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <1 x i32> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <1 x i32> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <1 x i32> [[SPLIT2]], i64 0
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP4]], i64 2
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <1 x i32> [[SPLIT3]], i64 0
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP6]], i64 3
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <1 x i32> [[SPLIT4]], i64 0
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP8]], i64 4
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <1 x i32> [[SPLIT5]], i64 0
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP10]], i64 5
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <1 x i32> [[SPLIT6]], i64 0
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i64 6
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <1 x i32> [[SPLIT7]], i64 0
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i64 7
+; RM-NEXT:    ret <8 x i32> [[TMP15]]
+;
+entry:
+  %c  = call <8 x i32> @llvm.matrix.transpose(<8 x i32> %a, i32 8, i32 1)
+  ret <8 x i32> %c
+}
+
+declare <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32>, i32, i32)
+
+define <12 x i32> @transpose_i32_3x4(<12 x i32> %a) {
+; RM-LABEL: @transpose_i32_3x4(
+; RM-NEXT:  entry:
+; RM-NEXT:    [[SPLIT:%.*]] = shufflevector <12 x i32> [[A:%.*]], <12 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; RM-NEXT:    [[SPLIT1:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; RM-NEXT:    [[SPLIT2:%.*]] = shufflevector <12 x i32> [[A]], <12 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; RM-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 0
+; RM-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[TMP0]], i64 0
+; RM-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 0
+; RM-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[TMP2]], i64 1
+; RM-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[SPLIT2]], i64 0
+; RM-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[TMP4]], i64 2
+; RM-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 1
+; RM-NEXT:    [[TMP7:%.*]] = insertelement <3 x i32> undef, i32 [[TMP6]], i64 0
+; RM-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 1
+; RM-NEXT:    [[TMP9:%.*]] = insertelement <3 x i32> [[TMP7]], i32 [[TMP8]], i64 1
+; RM-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[SPLIT2]], i64 1
+; RM-NEXT:    [[TMP11:%.*]] = insertelement <3 x i32> [[TMP9]], i32 [[TMP10]], i64 2
+; RM-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 2
+; RM-NEXT:    [[TMP13:%.*]] = insertelement <3 x i32> undef, i32 [[TMP12]], i64 0
+; RM-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 2
+; RM-NEXT:    [[TMP15:%.*]] = insertelement <3 x i32> [[TMP13]], i32 [[TMP14]], i64 1
+; RM-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[SPLIT2]], i64 2
+; RM-NEXT:    [[TMP17:%.*]] = insertelement <3 x i32> [[TMP15]], i32 [[TMP16]], i64 2
+; RM-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[SPLIT]], i64 3
+; RM-NEXT:    [[TMP19:%.*]] = insertelement <3 x i32> undef, i32 [[TMP18]], i64 0
+; RM-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[SPLIT1]], i64 3
+; RM-NEXT:    [[TMP21:%.*]] = insertelement <3 x i32> [[TMP19]], i32 [[TMP20]], i64 1
+; RM-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[SPLIT2]], i64 3
+; RM-NEXT:    [[TMP23:%.*]] = insertelement <3 x i32> [[TMP21]], i32 [[TMP22]], i64 2
+; RM-NEXT:    [[TMP24:%.*]] = shufflevector <3 x i32> [[TMP5]], <3 x i32> [[TMP11]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; RM-NEXT:    [[TMP25:%.*]] = shufflevector <3 x i32> [[TMP17]], <3 x i32> [[TMP23]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; RM-NEXT:    [[TMP26:%.*]] = shufflevector <6 x i32> [[TMP24]], <6 x i32> [[TMP25]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; RM-NEXT:    ret <12 x i32> [[TMP26]]
+;
+entry:
+  %c  = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %a, i32 3, i32 4)
+  ret <12 x i32> %c
+}

From 06803d7abc230d0d0e74e5b7f7f283970cb02b9f Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Thu, 28 May 2020 10:27:52 -0700
Subject: [PATCH 408/770] Add tests for preallocated + musttail

Summary:
Follow-up to https://reviews.llvm.org/D80581.
It turns out the codegen part already worked, so only tests needed to be added.
I manually verified that in these tests the generated code for inalloca
and preallocated were identical.

Reviewers: efriedma, hans

Subscribers: llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80742
---
 llvm/test/CodeGen/X86/musttail-indirect.ll | 75 +++++++++++++++++++++-
 llvm/test/CodeGen/X86/musttail-thiscall.ll | 13 +++-
 2 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/X86/musttail-indirect.ll b/llvm/test/CodeGen/X86/musttail-indirect.ll
index 285ad9dcf4c91..5d2e0694e3444 100644
--- a/llvm/test/CodeGen/X86/musttail-indirect.ll
+++ b/llvm/test/CodeGen/X86/musttail-indirect.ll
@@ -22,8 +22,6 @@
 ; Each member pointer creates a thunk.  The ones with inalloca are required to
 ; tail calls by the ABI, even at O0.
 
-; TODO: add tests for preallocated/musttail once supported
-
 %struct.B = type { i32 (...)** }
 %struct.A = type { i32 }
 
@@ -54,6 +52,21 @@ entry:
   ret i32 %3
 }
 
+; Preallocated thunks shouldn't require any stores to the stack.
+; CHECK-LABEL: g_thunk_preallocated:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @g_thunk_preallocated(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>)) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+  %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)**, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+  %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %0)
+  ret i32 %3
+}
+
 ; CHECK-LABEL: h_thunk:
 ; CHECK: jmpl
 ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
@@ -68,6 +81,20 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: h_thunk_preallocated:
+; CHECK: jmpl
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK-NOT: ret
+define x86_thiscallcc void @h_thunk_preallocated(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>)) {
+entry:
+  %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+  %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)**, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2
+  %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+  musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %0)
+  ret void
+}
+
 ; CHECK-LABEL: i_thunk:
 ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
 ; CHECK: jmpl
@@ -82,6 +109,20 @@ entry:
   ret %struct.A* %3
 }
 
+; CHECK-LABEL: i_thunk_preallocated:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc %struct.A* @i_thunk_preallocated(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A*, %struct.A, i32, %struct.A }>)) {
+entry:
+  %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)***
+  %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)**, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3
+  %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn
+  %3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A*, %struct.A, i32, %struct.A }>) %0)
+  ret %struct.A* %3
+}
+
 ; CHECK-LABEL: j_thunk:
 ; CHECK: jmpl
 ; CHECK-NOT: ret
@@ -111,6 +152,22 @@ entry:
   ret i32 %3
 }
 
+; CHECK-LABEL: _stdcall_thunk_preallocated@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_stdcallcc i32 @stdcall_thunk_preallocated(<{ %struct.B*, %struct.A }>* preallocated(<{ %struct.B*, %struct.A }>)) {
+entry:
+  %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>, <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0
+  %this = load %struct.B*, %struct.B** %this_ptr
+  %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)***
+  %vtable = load i32 (<{ %struct.B*, %struct.A }>*)**, i32 (<{ %struct.B*, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)*, i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (<{ %struct.B*, %struct.A }>*)*, i32 (<{ %struct.B*, %struct.A }>*)** %vfn
+  %3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* preallocated(<{ %struct.B*, %struct.A }>) %0)
+  ret i32 %3
+}
+
 ; CHECK-LABEL: @fastcall_thunk@8:
 ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
 ; CHECK: jmpl
@@ -124,3 +181,17 @@ entry:
   %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0)
   ret i32 %3
 }
+
+; CHECK-LABEL: @fastcall_thunk_preallocated@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_fastcallcc i32 @fastcall_thunk_preallocated(%struct.B* inreg %this, <{ %struct.A }>* preallocated(<{ %struct.A }>)) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)***
+  %vtable = load i32 (%struct.B*, <{ %struct.A }>*)**, i32 (%struct.B*, <{ %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (%struct.B*, <{ %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A }>*)** %vfn
+  %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* preallocated(<{ %struct.A }>) %0)
+  ret i32 %3
+}
diff --git a/llvm/test/CodeGen/X86/musttail-thiscall.ll b/llvm/test/CodeGen/X86/musttail-thiscall.ll
index 5cc8faa48e754..682f85e1eb852 100644
--- a/llvm/test/CodeGen/X86/musttail-thiscall.ll
+++ b/llvm/test/CodeGen/X86/musttail-thiscall.ll
@@ -1,8 +1,6 @@
 ; RUN: llc -verify-machineinstrs -mtriple=i686-- < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=i686-- -O0 < %s | FileCheck %s
 
-; TODO: add tests for preallocated/musttail once supported
-
 ; CHECK-LABEL: t1:
 ; CHECK: jmp {{_?}}t1_callee
 define x86_thiscallcc void @t1(i8* %this) {
@@ -31,3 +29,14 @@ define x86_thiscallcc i8* @t3(i8* %this, <{ i8*, i32 }>* inalloca %args) {
   ret i8* %rv
 }
 declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args);
+
+; CHECK-LABEL: t4:
+; CHECK: jmp {{_?}}t4_callee
+define x86_thiscallcc i8* @t4(i8* %this, <{ i8*, i32 }>* preallocated(<{ i8*, i32 }>) %args) {
+  %adj = getelementptr i8, i8* %this, i32 4
+  %a_ptr = getelementptr <{ i8*, i32 }>, <{ i8*, i32 }>* %args, i32 0, i32 1
+  store i32 0, i32* %a_ptr
+  %rv = musttail call x86_thiscallcc i8* @t4_callee(i8* %adj, <{ i8*, i32 }>* preallocated(<{ i8*, i32 }>) %args)
+  ret i8* %rv
+}
+declare x86_thiscallcc i8* @t4_callee(i8* %this, <{ i8*, i32 }>* preallocated(<{ i8*, i32 }>) %args);

From eca41919d28b0616140a63c6a97483098ec1ffee Mon Sep 17 00:00:00 2001
From: Eduardo Caldas <ecaldas@google.com>
Date: Thu, 28 May 2020 18:28:36 +0200
Subject: [PATCH 409/770] Improve test infrastructure in SyntaxTree

Summary:
* Test if the code sourcing the SyntaxTree compiles
* Output compiler errors and warnings to stderr
* Fix tests with code that did not compile

Reviewers: gribozavr2

Reviewed By: gribozavr2

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80731
---
 clang/unittests/Tooling/Syntax/TreeTest.cpp | 311 ++++++++++----------
 1 file changed, 154 insertions(+), 157 deletions(-)

diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp
index e81e3c2b83542..7051074d3b33a 100644
--- a/clang/unittests/Tooling/Syntax/TreeTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp
@@ -15,6 +15,7 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendAction.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "clang/Tooling/Syntax/BuildTree.h"
@@ -97,8 +98,12 @@ class SyntaxTreeTest : public ::testing::Test {
 
     constexpr const char *FileName = "./input.cpp";
     FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+
     if (!Diags->getClient())
-      Diags->setClient(new IgnoringDiagConsumer);
+      Diags->setClient(new TextDiagnosticPrinter(llvm::errs(), DiagOpts.get()));
+    Diags->setSeverityForGroup(diag::Flavor::WarningOrError, "unused-value",
+                               diag::Severity::Ignored, SourceLocation());
+
     // Prepare to run a compiler.
     std::vector<const char *> Args = {
         "syntax-test", "-target",       Target.c_str(),
@@ -117,7 +122,11 @@ class SyntaxTreeTest : public ::testing::Test {
 
     syntax::TranslationUnit *Root = nullptr;
     BuildSyntaxTreeAction Recorder(Root, this->Arena);
-    if (!Compiler.ExecuteAction(Recorder)) {
+
+    // Action could not be executed but the frontend didn't identify any errors
+    // in the code ==> problem in setting up the action.
+    if (!Compiler.ExecuteAction(Recorder) &&
+        Diags->getClient()->getNumErrors() == 0) {
       ADD_FAILURE() << "failed to run the frontend";
       std::abort();
     }
@@ -143,6 +152,8 @@ class SyntaxTreeTest : public ::testing::Test {
         continue;
       }
       auto *Root = buildTree(Code, Target);
+      EXPECT_EQ(Diags->getClient()->getNumErrors(), 0u)
+          << "Source file has syntax errors, they were printed to the test log";
       std::string Actual = std::string(StringRef(Root->dump(*Arena)).trim());
       EXPECT_EQ(Expected, Actual)
           << "for target " << Target << " the resulting dump is:\n"
@@ -180,8 +191,10 @@ class SyntaxTreeTest : public ::testing::Test {
   }
 
   // Data fields.
+  llvm::IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts =
+      new DiagnosticOptions();
   llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
-      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+      new DiagnosticsEngine(new DiagnosticIDs, DiagOpts.get());
   IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
       new llvm::vfs::InMemoryFileSystem;
   llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
@@ -517,11 +530,11 @@ TEST_F(SyntaxTreeTest, UnhandledStatement) {
   // Unhandled statements should end up as 'unknown statement'.
   // This example uses a 'label statement', which does not yet have a syntax
   // counterpart.
-  expectTreeDumpEqual("void main() { foo: return 100; }",
+  expectTreeDumpEqual("int main() { foo: return 100; }",
                       R"txt(
 *: TranslationUnit
 `-SimpleDeclaration
-  |-void
+  |-int
   |-SimpleDeclarator
   | |-main
   | `-ParametersAndQualifiers
@@ -1166,7 +1179,7 @@ TEST_F(SyntaxTreeTest, FreeStandingClasses) {
   // Free-standing classes, must live inside a SimpleDeclaration.
   expectTreeDumpEqual(
       R"cpp(
-sturct X;
+struct X;
 struct X {};
 
 struct Y *y1;
@@ -1177,7 +1190,7 @@ struct {} *a1;
       R"txt(
 *: TranslationUnit
 |-SimpleDeclaration
-| |-sturct
+| |-struct
 | |-X
 | `-;
 |-SimpleDeclaration
@@ -1660,7 +1673,7 @@ TEST_F(SyntaxTreeTest, ArraySubscriptsInDeclarators) {
 int a[10];
 int b[1][2][3];
 int c[] = {1,2,3};
-void f(int xs[static 10]);
+// void f(int xs[static 10]);
     )cpp",
       R"txt(
 *: TranslationUnit
@@ -1694,163 +1707,146 @@ void f(int xs[static 10]);
 | |   | `-3
 | |   `-]
 | `-;
-|-SimpleDeclaration
-| |-int
-| |-SimpleDeclarator
-| | |-c
-| | |-ArraySubscript
-| | | |-[
-| | | `-]
-| | |-=
-| | `-UnknownExpression
-| |   `-UnknownExpression
-| |     |-{
-| |     |-UnknownExpression
-| |     | `-1
-| |     |-,
-| |     |-UnknownExpression
-| |     | `-2
-| |     |-,
-| |     |-UnknownExpression
-| |     | `-3
-| |     `-}
-| `-;
 `-SimpleDeclaration
-  |-void
+  |-int
   |-SimpleDeclarator
-  | |-f
-  | `-ParametersAndQualifiers
-  |   |-(
-  |   |-SimpleDeclaration
-  |   | |-int
-  |   | `-SimpleDeclarator
-  |   |   |-xs
-  |   |   `-ArraySubscript
-  |   |     |-[
-  |   |     |-static
-  |   |     |-UnknownExpression
-  |   |     | `-10
-  |   |     `-]
-  |   `-)
-  `-;
-       )txt");
+  | |-c
+  | |-ArraySubscript
+  | | |-[
+  | | `-]
+  | |-=
+  | `-UnknownExpression
+  |   `-UnknownExpression
+  |     |-{
+  |     |-UnknownExpression
+  |     | `-1
+  |     |-,
+  |     |-UnknownExpression
+  |     | `-2
+  |     |-,
+  |     |-UnknownExpression
+  |     | `-3
+  |     `-}
+  `-;       )txt");
 }
 
 TEST_F(SyntaxTreeTest, ParameterListsInDeclarators) {
   expectTreeDumpEqual(
       R"cpp(
-int a() const;
-int b() volatile;
-int c() &;
-int d() &&;
-int foo(int a, int b);
-int foo(
-  const int a,
-  volatile int b,
-  const volatile int c,
-  int* d,
-  int& e,
-  int&& f
-);
-    )cpp",
+struct Test {
+  int a() const;
+  int b() volatile;
+  int c() &;
+  int d() &&;
+  int foo(int a, int b);
+  int foo(const int a, volatile int b, const volatile int c, int* d,
+          int& e, int&& f);
+};
+      )cpp",
       R"txt(
 *: TranslationUnit
-|-SimpleDeclaration
-| |-int
-| |-SimpleDeclarator
-| | |-a
-| | `-ParametersAndQualifiers
-| |   |-(
-| |   |-)
-| |   `-const
-| `-;
-|-SimpleDeclaration
-| |-int
-| |-SimpleDeclarator
-| | |-b
-| | `-ParametersAndQualifiers
-| |   |-(
-| |   |-)
-| |   `-volatile
-| `-;
-|-SimpleDeclaration
-| |-int
-| |-SimpleDeclarator
-| | |-c
-| | `-ParametersAndQualifiers
-| |   |-(
-| |   |-)
-| |   `-&
-| `-;
-|-SimpleDeclaration
-| |-int
-| |-SimpleDeclarator
-| | |-d
-| | `-ParametersAndQualifiers
-| |   |-(
-| |   |-)
-| |   `-&&
-| `-;
-|-SimpleDeclaration
-| |-int
-| |-SimpleDeclarator
-| | |-foo
-| | `-ParametersAndQualifiers
-| |   |-(
-| |   |-SimpleDeclaration
-| |   | |-int
-| |   | `-SimpleDeclarator
-| |   |   `-a
-| |   |-,
-| |   |-SimpleDeclaration
-| |   | |-int
-| |   | `-SimpleDeclarator
-| |   |   `-b
-| |   `-)
-| `-;
 `-SimpleDeclaration
-  |-int
-  |-SimpleDeclarator
-  | |-foo
-  | `-ParametersAndQualifiers
-  |   |-(
-  |   |-SimpleDeclaration
-  |   | |-const
-  |   | |-int
-  |   | `-SimpleDeclarator
-  |   |   `-a
-  |   |-,
-  |   |-SimpleDeclaration
-  |   | |-volatile
-  |   | |-int
-  |   | `-SimpleDeclarator
-  |   |   `-b
-  |   |-,
-  |   |-SimpleDeclaration
-  |   | |-const
-  |   | |-volatile
-  |   | |-int
-  |   | `-SimpleDeclarator
-  |   |   `-c
-  |   |-,
-  |   |-SimpleDeclaration
-  |   | |-int
-  |   | `-SimpleDeclarator
-  |   |   |-*
-  |   |   `-d
-  |   |-,
-  |   |-SimpleDeclaration
-  |   | |-int
-  |   | `-SimpleDeclarator
-  |   |   |-&
-  |   |   `-e
-  |   |-,
-  |   |-SimpleDeclaration
-  |   | |-int
-  |   | `-SimpleDeclarator
-  |   |   |-&&
-  |   |   `-f
-  |   `-)
+  |-struct
+  |-Test
+  |-{
+  |-SimpleDeclaration
+  | |-int
+  | |-SimpleDeclarator
+  | | |-a
+  | | `-ParametersAndQualifiers
+  | |   |-(
+  | |   |-)
+  | |   `-const
+  | `-;
+  |-SimpleDeclaration
+  | |-int
+  | |-SimpleDeclarator
+  | | |-b
+  | | `-ParametersAndQualifiers
+  | |   |-(
+  | |   |-)
+  | |   `-volatile
+  | `-;
+  |-SimpleDeclaration
+  | |-int
+  | |-SimpleDeclarator
+  | | |-c
+  | | `-ParametersAndQualifiers
+  | |   |-(
+  | |   |-)
+  | |   `-&
+  | `-;
+  |-SimpleDeclaration
+  | |-int
+  | |-SimpleDeclarator
+  | | |-d
+  | | `-ParametersAndQualifiers
+  | |   |-(
+  | |   |-)
+  | |   `-&&
+  | `-;
+  |-SimpleDeclaration
+  | |-int
+  | |-SimpleDeclarator
+  | | |-foo
+  | | `-ParametersAndQualifiers
+  | |   |-(
+  | |   |-SimpleDeclaration
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   `-a
+  | |   |-,
+  | |   |-SimpleDeclaration
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   `-b
+  | |   `-)
+  | `-;
+  |-SimpleDeclaration
+  | |-int
+  | |-SimpleDeclarator
+  | | |-foo
+  | | `-ParametersAndQualifiers
+  | |   |-(
+  | |   |-SimpleDeclaration
+  | |   | |-const
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   `-a
+  | |   |-,
+  | |   |-SimpleDeclaration
+  | |   | |-volatile
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   `-b
+  | |   |-,
+  | |   |-SimpleDeclaration
+  | |   | |-const
+  | |   | |-volatile
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   `-c
+  | |   |-,
+  | |   |-SimpleDeclaration
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   |-*
+  | |   |   `-d
+  | |   |-,
+  | |   |-SimpleDeclaration
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   |-&
+  | |   |   `-e
+  | |   |-,
+  | |   |-SimpleDeclaration
+  | |   | |-int
+  | |   | `-SimpleDeclarator
+  | |   |   |-&&
+  | |   |   `-f
+  | |   `-)
+  | `-;
+  |-}
   `-;
        )txt");
 }
@@ -1860,7 +1856,7 @@ TEST_F(SyntaxTreeTest, TrailingConst) {
       R"cpp(
 struct X {
   int foo() const;
-}
+};
     )cpp",
       R"txt(
 *: TranslationUnit
@@ -1877,7 +1873,8 @@ struct X {
   | |   |-)
   | |   `-const
   | `-;
-  `-}
+  |-}
+  `-;
     )txt");
 }
 

From 51401a676c036f2bd4e6b4b38f3538615799de40 Mon Sep 17 00:00:00 2001
From: Vy Nguyen <vyng@google.com>
Date: Thu, 28 May 2020 00:33:13 -0400
Subject: [PATCH 410/770] add isAtPosition narrowing matcher for parmVarDecl

Differential Revision: https://reviews.llvm.org/D80603
---
 clang/docs/LibASTMatchersReference.html       | 17 ++++++++
 clang/include/clang/ASTMatchers/ASTMatchers.h | 28 +++++++++++++
 clang/lib/ASTMatchers/Dynamic/Registry.cpp    |  1 +
 .../ASTMatchers/ASTMatchersNarrowingTest.cpp  | 39 +++++++++++++++++++
 4 files changed, 85 insertions(+)

diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index f57352389e4ce..9db6795eb5fab 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -4671,6 +4671,23 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1VarDecl.html">VarDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;
 </pre></td></tr>
 
+
+<tr><td>Matcher&lt;clang::ParmVarDecl&gt;</td><td class="name" onclick="toggle('isAtPosition0')"><a name="isAtPosition0Anchor">isAtPosition</a></td><td>unsigned N</td></tr>
+<tr><td colspan="4" class="doc" id="isAtPosition0"><pre>Matches the ParmVarDecl nodes that are at the N'th position in the parameter
+list. The parameter list could be that of either a block, function, or
+objc-method.
+
+
+Given
+
+void f(int a, int b, int c) {
+}
+
+``parmVarDecl(isAtPosition(0))`` matches ``int a``.
+
+``parmVarDecl(isAtPosition(1))`` matches ``int b``.
+</pre></td></tr>
+
 <!--END_NARROWING_MATCHERS -->
 </table>
 
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index a750747c9aa3f..a3747faa139c8 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -4257,6 +4257,34 @@ AST_POLYMORPHIC_MATCHER_P2(forEachArgumentWithParam,
   return Matched;
 }
 
+/// Matches the ParmVarDecl nodes that are at the N'th position in the parameter
+/// list. The parameter list could be that of either a block, function, or
+/// objc-method.
+///
+///
+/// Given
+///
+/// \code
+/// void f(int a, int b, int c) {
+/// }
+/// \endcode
+///
+/// ``parmVarDecl(isAtPosition(0))`` matches ``int a``.
+///
+/// ``parmVarDecl(isAtPosition(1))`` matches ``int b``.
+AST_MATCHER_P(clang::ParmVarDecl, isAtPosition, unsigned, N) {
+  const clang::DeclContext *Context = Node.getParentFunctionOrMethod();
+
+  if (const auto *Decl = dyn_cast_or_null<FunctionDecl>(Context))
+    return N < Decl->param_size() && Decl->getParamDecl(N) == &Node;
+  if (const auto *Decl = dyn_cast_or_null<BlockDecl>(Context))
+    return N < Decl->param_size() && Decl->getParamDecl(N) == &Node;
+  if (const auto *Decl = dyn_cast_or_null<ObjCMethodDecl>(Context))
+    return N < Decl->param_size() && Decl->getParamDecl(N) == &Node;
+
+  return false;
+}
+
 /// Matches any parameter of a function or an ObjC method declaration or a
 /// block.
 ///
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index 0a7d09e55c885..14d9bbb3e52d1 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -219,6 +219,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(floatLiteral);
   REGISTER_MATCHER(forEach);
   REGISTER_MATCHER(forEachArgumentWithParam);
+  REGISTER_MATCHER(isAtPosition);
   REGISTER_MATCHER(forEachConstructorInitializer);
   REGISTER_MATCHER(forEachDescendant);
   REGISTER_MATCHER(forEachOverridden);
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index a7d58528c0fb1..929188abf6acd 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -2643,6 +2643,45 @@ TEST(HasDefaultArgument, Basic) {
                       parmVarDecl(hasDefaultArgument())));
 }
 
+TEST(IsAtPosition, Basic) {
+  // Positions are zero-based; an index past the last parameter never matches.
+  EXPECT_TRUE(matches("void x(int a, int b) {}", parmVarDecl(isAtPosition(0))));
+  EXPECT_TRUE(matches("void x(int a, int b) {}", parmVarDecl(isAtPosition(1))));
+  EXPECT_TRUE(notMatches("void x(int val) {}", parmVarDecl(isAtPosition(1))));
+}
+
+TEST(IsAtPosition, FunctionDecl) {
+  EXPECT_TRUE(matches("void x(int a);", parmVarDecl(isAtPosition(0))));
+  EXPECT_TRUE(matches("void x(int a, int b);", parmVarDecl(isAtPosition(0))));
+  EXPECT_TRUE(matches("void x(int a, int b);", parmVarDecl(isAtPosition(1))));
+  EXPECT_TRUE(notMatches("void x(int val);", parmVarDecl(isAtPosition(1))));
+}
+
+TEST(IsAtPosition, Lambda) {
+  EXPECT_TRUE(
+      matches("void x() { [](int a) {};  }", parmVarDecl(isAtPosition(0))));
+  EXPECT_TRUE(matches("void x() { [](int a, int b) {}; }",
+                      parmVarDecl(isAtPosition(0))));
+  EXPECT_TRUE(matches("void x() { [](int a, int b) {}; }",
+                      parmVarDecl(isAtPosition(1))));
+  EXPECT_TRUE(
+      notMatches("void x() { [](int val) {}; }", parmVarDecl(isAtPosition(1))));
+}
+
+TEST(IsAtPosition, BlockDecl) {
+  EXPECT_TRUE(matchesObjC(
+      "void func()  { void (^my_block)(int arg) = ^void(int arg) {}; } ",
+      parmVarDecl(isAtPosition(0))));
+
+  EXPECT_TRUE(matchesObjC("void func()  { void (^my_block)(int x, int y) = "
+                          "^void(int x, int y) {}; } ",
+                          parmVarDecl(isAtPosition(1))));
+
+  EXPECT_TRUE(notMatchesObjC(
+      "void func()  { void (^my_block)(int arg) = ^void(int arg) {}; } ",
+      parmVarDecl(isAtPosition(1))));
+}
+
 TEST(IsArray, Basic) {
   EXPECT_TRUE(matches("struct MyClass {}; MyClass *p1 = new MyClass[10];",
                       cxxNewExpr(isArray())));

From ac1dc1336ad76d719445d706654ca0ec4ff5557c Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 28 May 2020 13:06:44 -0700
Subject: [PATCH 411/770] [Docs] Correct description of lldbinit behavior
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jim pointed out that "every time somebody has touched the documentation
on startup files they have stated that we source the application one and
then the global one, even though in actual fact we’ve never done that."

Indeed, when we read the application specific .lldbinit file, the global
one is not read. This patch updates the man page to reflect that.
---
 lldb/docs/man/lldb.rst | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/lldb/docs/man/lldb.rst b/lldb/docs/man/lldb.rst
index 842a693f47518..87be124d788e7 100644
--- a/lldb/docs/man/lldb.rst
+++ b/lldb/docs/man/lldb.rst
@@ -303,13 +303,17 @@ CONFIGURATION FILES
 -------------------
 
 :program:`lldb` reads things like settings, aliases and commands from the
-.lldbinit file. First, it will read the application specific init file whose
-name is ~/.lldbinit followed by a "-" and the name of the current program. This
-would be ~/.lldbinit-lldb for the command line :program:`lldb` and
-~/.lldbinit-Xcode for Xcode. Secondly, the global ~/.lldbinit will be read.
-Finally, :program:`lldb` will look for an .lldbinit file in the current working
-directory. For security reasons, :program:`lldb` will print a warning and not
-source this file by default. This behavior can be changed by changing the
+.lldbinit file.
+
+First, it will read the application specific init file whose name is
+~/.lldbinit followed by a "-" and the name of the current program. This would
+be ~/.lldbinit-lldb for the command line :program:`lldb` and ~/.lldbinit-Xcode
+for Xcode. If there is no application specific init file, the global
+~/.lldbinit is read.
+
+Secondly, it will look for an .lldbinit file in the current working directory.
+For security reasons, :program:`lldb` will print a warning and not source this
+file by default. This behavior can be changed by changing the
 target.load-cwd-lldbinit setting.
 
 To always load the .lldbinit file in the current working directory, add the

From 2f430f7a51693c9d5c648179f2341b541be44000 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 20:35:06 -0700
Subject: [PATCH 412/770] [StackSafety] Remove SetMetadata parameter

---
 clang/lib/CodeGen/BackendUtil.cpp                |  2 +-
 llvm/include/llvm/Analysis/StackSafetyAnalysis.h |  5 ++---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp        | 11 +++++------
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 33627f3a67334..e746aef1a62ff 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -353,7 +353,7 @@ static void addDataFlowSanitizerPass(const PassManagerBuilder &Builder,
 
 static void addMemTagOptimizationPasses(const PassManagerBuilder &Builder,
                                         legacy::PassManagerBase &PM) {
-  PM.add(createStackSafetyGlobalInfoWrapperPass(/*SetMetadata=*/true));
+  PM.add(createStackSafetyGlobalInfoWrapperPass());
 }
 
 static TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple,
diff --git a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
index df7ccac5b4b92..b5589611c8e28 100644
--- a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
+++ b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
@@ -126,12 +126,11 @@ class StackSafetyGlobalAnnotatorPass
 /// (legacy pass manager).
 class StackSafetyGlobalInfoWrapperPass : public ModulePass {
   StackSafetyGlobalInfo SSGI;
-  bool SetMetadata;
 
 public:
   static char ID;
 
-  StackSafetyGlobalInfoWrapperPass(bool SetMetadata = false);
+  StackSafetyGlobalInfoWrapperPass();
 
   const StackSafetyGlobalInfo &getResult() const { return SSGI; }
 
@@ -141,7 +140,7 @@ class StackSafetyGlobalInfoWrapperPass : public ModulePass {
   bool runOnModule(Module &M) override;
 };
 
-ModulePass *createStackSafetyGlobalInfoWrapperPass(bool SetMetadata);
+ModulePass *createStackSafetyGlobalInfoWrapperPass();
 
 } // end namespace llvm
 
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 6eeffe6066dfc..e969639973a46 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -712,9 +712,8 @@ StackSafetyGlobalAnnotatorPass::run(Module &M, ModuleAnalysisManager &AM) {
 
 char StackSafetyGlobalInfoWrapperPass::ID = 0;
 
-StackSafetyGlobalInfoWrapperPass::StackSafetyGlobalInfoWrapperPass(
-    bool SetMetadata)
-    : ModulePass(ID), SetMetadata(SetMetadata) {
+StackSafetyGlobalInfoWrapperPass::StackSafetyGlobalInfoWrapperPass()
+    : ModulePass(ID) {
   initializeStackSafetyGlobalInfoWrapperPassPass(
       *PassRegistry::getPassRegistry());
 }
@@ -738,11 +737,11 @@ bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
             .Info;
       });
   SSGI = SSDFA.run();
-  return SetMetadata ? SSGI.setMetadata(M) : false;
+  return SSGI.setMetadata(M);
 }
 
-ModulePass *llvm::createStackSafetyGlobalInfoWrapperPass(bool SetMetadata) {
-  return new StackSafetyGlobalInfoWrapperPass(SetMetadata);
+ModulePass *llvm::createStackSafetyGlobalInfoWrapperPass() {
+  return new StackSafetyGlobalInfoWrapperPass();
 }
 
 static const char LocalPassArg[] = "stack-safety-local";

From 892c71a5bb72cfcce1f0e94e3a0fd314d4606977 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Wed, 27 May 2020 22:21:39 -0700
Subject: [PATCH 413/770] [StackSafety] Don't run dataflow on allocas

We need to process only parameters. Alloca accesses can be calculated
afterwards.
Also, don't create fake functions for aliases; just resolve them on
initialization.
---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     | 197 +++++++++++-------
 .../Analysis/StackSafetyAnalysis/ipa-alias.ll |  36 ----
 2 files changed, 117 insertions(+), 116 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index e969639973a46..cdb952ba32780 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -137,31 +137,19 @@ ConstantRange getStaticAllocaSizeRange(const AllocaInst &AI) {
   return R;
 }
 
-/// Describes uses of allocas and parameters inside of a single function.
 struct FunctionInfo {
   SmallVector<UseInfo, 4> Allocas;
   SmallVector<UseInfo, 4> Params;
-  const GlobalValue *GV = nullptr;
   // TODO: describe return value as depending on one or more of its arguments.
 
   // StackSafetyDataFlowAnalysis counter stored here for faster access.
   int UpdateCount = 0;
 
-  FunctionInfo() = default;
-  FunctionInfo(const Function *F) : GV(F){};
-  explicit FunctionInfo(const GlobalAlias *A);
-
-  bool IsDSOLocal() const { return GV->isDSOLocal(); };
-
-  bool IsInterposable() const { return GV->isInterposable(); };
-
-  StringRef getName() const { return GV->getName(); }
-
   void print(raw_ostream &O, StringRef Name, const Function *F) const {
     // TODO: Consider different printout format after
     // StackSafetyDataFlowAnalysis. Calls and parameters are irrelevant then.
-    O << "  @" << Name << (IsDSOLocal() ? "" : " dso_preemptable")
-      << (IsInterposable() ? " interposable" : "") << "\n";
+    O << "  @" << Name << ((F && F->isDSOLocal()) ? "" : " dso_preemptable")
+      << ((F && F->isInterposable()) ? " interposable" : "") << "\n";
 
     O << "    args uses:\n";
     size_t Pos = 0;
@@ -190,18 +178,6 @@ struct FunctionInfo {
   }
 };
 
-FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) {
-  unsigned PointerSize = A->getParent()->getDataLayout().getPointerSizeInBits();
-  const GlobalObject *Aliasee = A->getBaseObject();
-  const FunctionType *Type = cast<FunctionType>(Aliasee->getValueType());
-  // 'Forward' all parameters to this alias to the aliasee
-  for (unsigned ArgNo = 0; ArgNo < Type->getNumParams(); ArgNo++) {
-    Params.emplace_back(PointerSize);
-    UseInfo &US = Params.back();
-    US.Calls.emplace_back(Aliasee, ArgNo, ConstantRange(APInt(PointerSize, 0)));
-  }
-}
-
 } // namespace
 
 struct StackSafetyInfo::InfoTy {
@@ -404,7 +380,7 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, UseInfo &US) {
 }
 
 FunctionInfo StackSafetyLocalAnalysis::run() {
-  FunctionInfo Info(&F);
+  FunctionInfo Info;
   assert(!F.isDeclaration() &&
          "Can't run StackSafety on a function declaration");
 
@@ -433,15 +409,13 @@ class StackSafetyDataFlowAnalysis {
   using FunctionMap = std::map<const GlobalValue *, FunctionInfo>;
 
   FunctionMap Functions;
+  const ConstantRange UnknownRange;
+
   // Callee-to-Caller multimap.
   DenseMap<const GlobalValue *, SmallVector<const GlobalValue *, 4>> Callers;
   SetVector<const GlobalValue *> WorkList;
 
-  unsigned PointerSize = 0;
-  const ConstantRange UnknownRange;
 
-  ConstantRange getArgumentAccessRange(const GlobalValue *Callee,
-                                       unsigned ParamNo) const;
   bool updateOneUse(UseInfo &US, bool UpdateToFullSet);
   void updateOneNode(const GlobalValue *Callee, FunctionInfo &FS);
   void updateOneNode(const GlobalValue *Callee) {
@@ -456,25 +430,24 @@ class StackSafetyDataFlowAnalysis {
   void verifyFixedPoint();
 #endif
 
+  uint32_t findPointerWidth() const {
+    for (auto &F : Functions)
+      for (auto &P : F.second.Params)
+        return P.Range.getBitWidth();
+    return 1;
+  }
+
 public:
-  StackSafetyDataFlowAnalysis(
-      Module &M, std::function<const FunctionInfo &(Function &)> FI);
-  GVToSSI run();
-};
+  explicit StackSafetyDataFlowAnalysis(FunctionMap Functions)
+      : Functions(std::move(Functions)),
+        UnknownRange(ConstantRange::getFull(findPointerWidth())) {}
 
-StackSafetyDataFlowAnalysis::StackSafetyDataFlowAnalysis(
-    Module &M, std::function<const FunctionInfo &(Function &)> FI)
-    : PointerSize(M.getDataLayout().getPointerSizeInBits()),
-      UnknownRange(PointerSize, true) {
-  // Without ThinLTO, run the local analysis for every function in the TU and
-  // then run the DFA.
-  for (auto &F : M.functions())
-    if (!F.isDeclaration())
-      Functions.emplace(&F, FI(F));
-  for (auto &A : M.aliases())
-    if (isa<Function>(A.getBaseObject()))
-      Functions.emplace(&A, FunctionInfo(&A));
-}
+  const FunctionMap &run();
+
+  // FIXME: Accept offset.
+  ConstantRange getArgumentAccessRange(const GlobalValue *Callee,
+                                       unsigned ParamNo) const;
+};
 
 ConstantRange
 StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
@@ -484,10 +457,6 @@ StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
   if (IT == Functions.end())
     return UnknownRange;
   const FunctionInfo &FS = IT->second;
-  // The definition of this symbol may not be the definition in this linkage
-  // unit.
-  if (!FS.IsDSOLocal() || FS.IsInterposable())
-    return UnknownRange;
   if (ParamNo >= FS.Params.size()) // possibly vararg
     return UnknownRange;
   return FS.Params[ParamNo].Range;
@@ -517,8 +486,6 @@ void StackSafetyDataFlowAnalysis::updateOneNode(const GlobalValue *Callee,
                                                 FunctionInfo &FS) {
   bool UpdateToFullSet = FS.UpdateCount > StackSafetyMaxIterations;
   bool Changed = false;
-  for (auto &AS : FS.Allocas)
-    Changed |= updateOneUse(AS, UpdateToFullSet);
   for (auto &PS : FS.Params)
     Changed |= updateOneUse(PS, UpdateToFullSet);
 
@@ -542,9 +509,6 @@ void StackSafetyDataFlowAnalysis::runDataFlow() {
   for (auto &F : Functions) {
     Callees.clear();
     FunctionInfo &FS = F.second;
-    for (auto &AS : FS.Allocas)
-      for (auto &CS : AS.Calls)
-        Callees.push_back(CS.Callee);
     for (auto &PS : FS.Params)
       for (auto &CS : PS.Calls)
         Callees.push_back(CS.Callee);
@@ -573,14 +537,11 @@ void StackSafetyDataFlowAnalysis::verifyFixedPoint() {
 }
 #endif
 
-GVToSSI StackSafetyDataFlowAnalysis::run() {
+const StackSafetyDataFlowAnalysis::FunctionMap &
+StackSafetyDataFlowAnalysis::run() {
   runDataFlow();
   LLVM_DEBUG(verifyFixedPoint());
-
-  GVToSSI SSI;
-  for (auto &F : Functions)
-    SSI.emplace(F.first, makeSSI(F.second));
-  return SSI;
+  return Functions;
 }
 
 bool setStackSafetyMetadata(Module &M, const GVToSSI &SSGI) {
@@ -608,6 +569,78 @@ bool setStackSafetyMetadata(Module &M, const GVToSSI &SSGI) {
   return Changed;
 }
 
+const Function *FindCalleeInModule(const GlobalValue *GV) {
+  while (GV) {
+    if (GV->isInterposable() || !GV->isDSOLocal())
+      return nullptr;
+    if (const Function *F = dyn_cast<Function>(GV))
+      return F;
+    const GlobalAlias *A = dyn_cast<GlobalAlias>(GV);
+    if (!A)
+      return nullptr;
+    GV = A->getBaseObject();
+    if (GV == A)
+      return nullptr;
+  }
+  return nullptr;
+}
+
+void ResolveAllCalls(UseInfo &Use) {
+  ConstantRange FullSet(Use.Range.getBitWidth(), true);
+  for (auto &C : Use.Calls) {
+    const Function *F = FindCalleeInModule(C.Callee);
+    if (F) {
+      C.Callee = F;
+      continue;
+    }
+
+    return Use.updateRange(FullSet);
+  }
+}
+
+void ResolveAllCalls(SmallVectorImpl<UseInfo> &Values) {
+  for (auto &V : Values)
+    ResolveAllCalls(V);
+}
+
+GVToSSI createGlobalStackSafetyInfo(
+    std::map<const GlobalValue *, FunctionInfo> Functions) {
+  GVToSSI SSI;
+  if (Functions.empty())
+    return SSI;
+
+  // FIXME: Simplify printing and remove copying here.
+  auto Copy = Functions;
+
+  for (auto &FI : Copy)
+    ResolveAllCalls(FI.second.Params);
+
+  StackSafetyDataFlowAnalysis SSDFA(std::move(Copy));
+
+  for (auto &F : SSDFA.run()) {
+    auto FI = F.second;
+    size_t Pos = 0;
+    for (auto &A : FI.Allocas) {
+      ResolveAllCalls(A);
+      for (auto &C : A.Calls) {
+        ConstantRange R = SSDFA.getArgumentAccessRange(C.Callee, C.ParamNo);
+        A.updateRange(R.add(C.Offset));
+      }
+      // FIXME: This is needed only to preserve calls in print() results.
+      A.Calls = Functions[F.first].Allocas[Pos].Calls;
+      ++Pos;
+    }
+    Pos = 0;
+    for (auto &P : FI.Params) {
+      P.Calls = Functions[F.first].Params[Pos].Calls;
+      ++Pos;
+    }
+    SSI.emplace(F.first, makeSSI(std::move(FI)));
+  }
+
+  return SSI;
+}
+
 } // end anonymous namespace
 
 StackSafetyInfo::StackSafetyInfo(StackSafetyInfo &&) = default;
@@ -636,10 +669,6 @@ void StackSafetyGlobalInfo::print(raw_ostream &O) const {
       O << "\n";
     }
   }
-  for (auto &A : M.aliases()) {
-    SSGI.find(&A)->second.print(O, A);
-    O << "\n";
-  }
 }
 
 LLVM_DUMP_METHOD void StackSafetyGlobalInfo::dump() const { print(dbgs()); }
@@ -689,11 +718,17 @@ StackSafetyGlobalAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
-  StackSafetyDataFlowAnalysis SSDFA(
-      M, [&FAM](Function &F) -> const FunctionInfo & {
-        return FAM.getResult<StackSafetyAnalysis>(F).getInfo().Info;
-      });
-  return SSDFA.run();
+  // FIXME: Lookup Module Summary.
+  std::map<const GlobalValue *, FunctionInfo> Functions;
+
+  for (auto &F : M.functions()) {
+    if (!F.isDeclaration()) {
+      auto FI = FAM.getResult<StackSafetyAnalysis>(F).getInfo().Info;
+      Functions.emplace(&F, std::move(FI));
+    }
+  }
+
+  return createGlobalStackSafetyInfo(std::move(Functions));
 }
 
 PreservedAnalyses StackSafetyGlobalPrinterPass::run(Module &M,
@@ -729,14 +764,16 @@ void StackSafetyGlobalInfoWrapperPass::getAnalysisUsage(
 }
 
 bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
-  StackSafetyDataFlowAnalysis SSDFA(
-      M, [this](Function &F) -> const FunctionInfo & {
-        return getAnalysis<StackSafetyInfoWrapperPass>(F)
-            .getResult()
-            .getInfo()
-            .Info;
-      });
-  SSGI = SSDFA.run();
+  std::map<const GlobalValue *, FunctionInfo> Functions;
+  for (auto &F : M.functions()) {
+    if (!F.isDeclaration()) {
+      auto FI =
+          getAnalysis<StackSafetyInfoWrapperPass>(F).getResult().getInfo().Info;
+      Functions.emplace(&F, std::move(FI));
+    }
+  }
+
+  SSGI = createGlobalStackSafetyInfo(std::move(Functions));
   return SSGI.setMetadata(M);
 }
 
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll b/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
index d77e7a68925a2..cfb6528e34e8a 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
@@ -95,39 +95,3 @@ entry:
 ; CHECK-NEXT: p[]: [0,1){{$}}
 ; CHECK-NEXT: allocas uses:
 ; CHECK-NOT: ]:
-
-; GLOBAL-LABEL: @InterposableAliasWrite1 interposable{{$}}
-; GLOBAL-NEXT: args uses:
-; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
-; GLOBAL-NEXT: allocas uses:
-; GLOBAL-NOT: ]:
-
-; GLOBAL-LABEL: @PreemptableAliasWrite1 dso_preemptable{{$}}
-; GLOBAL-NEXT: args uses:
-; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
-; GLOBAL-NEXT: allocas uses:
-; GLOBAL-NOT: ]:
-
-; GLOBAL-LABEL: @AliasToPreemptableAliasWrite1{{$}}
-; GLOBAL-NEXT: args uses:
-; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
-; GLOBAL-NEXT: allocas uses:
-; GLOBAL-NOT: ]:
-
-; GLOBAL-LABEL: @AliasWrite1{{$}}
-; GLOBAL-NEXT: args uses:
-; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
-; GLOBAL-NEXT: allocas uses:
-; GLOBAL-NOT: ]:
-
-; GLOBAL-LABEL: @BitcastAliasWrite1{{$}}
-; GLOBAL-NEXT: args uses:
-; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
-; GLOBAL-NEXT: allocas uses:
-; GLOBAL-NOT: ]:
-
-; GLOBAL-LABEL: @AliasToBitcastAliasWrite1{{$}}
-; GLOBAL-NEXT: args uses:
-; GLOBAL-NEXT: <N/A>[]: [0,1), @Write1(arg0, [0,1)){{$}}
-; GLOBAL-NEXT: allocas uses:
-; GLOBAL-NOT: ]:

From 2622cfbcd5d47d7320d253964a725422e8773781 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Thu, 28 May 2020 13:01:02 -0700
Subject: [PATCH 414/770] [NFC,StackSafety] Move internal offset calculation

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 28 +++++++++++++++--------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index cdb952ba32780..15ddea6a957c2 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -444,14 +444,14 @@ class StackSafetyDataFlowAnalysis {
 
   const FunctionMap &run();
 
-  // FIXME: Accept offset.
   ConstantRange getArgumentAccessRange(const GlobalValue *Callee,
-                                       unsigned ParamNo) const;
+                                       unsigned ParamNo,
+                                       const ConstantRange &Offsets) const;
 };
 
-ConstantRange
-StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
-                                                    unsigned ParamNo) const {
+ConstantRange StackSafetyDataFlowAnalysis::getArgumentAccessRange(
+    const GlobalValue *Callee, unsigned ParamNo,
+    const ConstantRange &Offsets) const {
   auto IT = Functions.find(Callee);
   // Unknown callee (outside of LTO domain or an indirect call).
   if (IT == Functions.end())
@@ -459,7 +459,15 @@ StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
   const FunctionInfo &FS = IT->second;
   if (ParamNo >= FS.Params.size()) // possibly vararg
     return UnknownRange;
-  return FS.Params[ParamNo].Range;
+  auto &Access = FS.Params[ParamNo].Range;
+  if (Access.isEmptySet())
+    return Access;
+  if (Access.isFullSet() || Offsets.isFullSet())
+    return UnknownRange;
+  if (Offsets.signedAddMayOverflow(Access) !=
+      ConstantRange::OverflowResult::NeverOverflows)
+    return UnknownRange;
+  return Access.add(Offsets);
 }
 
 bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US,
@@ -469,8 +477,8 @@ bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US,
     assert(!CS.Offset.isEmptySet() &&
            "Param range can't be empty-set, invalid offset range");
 
-    ConstantRange CalleeRange = getArgumentAccessRange(CS.Callee, CS.ParamNo);
-    CalleeRange = CalleeRange.add(CS.Offset);
+    ConstantRange CalleeRange =
+        getArgumentAccessRange(CS.Callee, CS.ParamNo, CS.Offset);
     if (!US.Range.contains(CalleeRange)) {
       Changed = true;
       if (UpdateToFullSet)
@@ -623,8 +631,8 @@ GVToSSI createGlobalStackSafetyInfo(
     for (auto &A : FI.Allocas) {
       ResolveAllCalls(A);
       for (auto &C : A.Calls) {
-        ConstantRange R = SSDFA.getArgumentAccessRange(C.Callee, C.ParamNo);
-        A.updateRange(R.add(C.Offset));
+        A.updateRange(
+            SSDFA.getArgumentAccessRange(C.Callee, C.ParamNo, C.Offset));
       }
       // FIXME: This is needed only to preserve calls in print() results.
       A.Calls = Functions[F.first].Allocas[Pos].Calls;

From 0e6628d37f99b3baaab662b9d1fa9a1e39d3aeb8 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Thu, 28 May 2020 01:07:31 -0700
Subject: [PATCH 415/770] [StackSafety] Lazy calculations

We are going to convert this into a pure analysis, so
processing will be delayed until the first safety request.
---
 .../llvm/Analysis/StackSafetyAnalysis.h       |  33 ++--
 llvm/lib/Analysis/StackSafetyAnalysis.cpp     | 142 ++++++++++--------
 2 files changed, 105 insertions(+), 70 deletions(-)

diff --git a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
index b5589611c8e28..246f44f19f47f 100644
--- a/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
+++ b/llvm/include/llvm/Analysis/StackSafetyAnalysis.h
@@ -19,6 +19,7 @@
 namespace llvm {
 
 class AllocaInst;
+class ScalarEvolution;
 
 /// Interface to access stack safety analysis results for single function.
 class StackSafetyInfo {
@@ -26,30 +27,40 @@ class StackSafetyInfo {
   struct InfoTy;
 
 private:
-  std::unique_ptr<InfoTy> Info;
+  Function *F = nullptr;
+  std::function<ScalarEvolution &()> GetSE;
+  mutable std::unique_ptr<InfoTy> Info;
 
 public:
-  StackSafetyInfo(InfoTy Info);
+  StackSafetyInfo();
+  StackSafetyInfo(Function *F, std::function<ScalarEvolution &()> GetSE);
   StackSafetyInfo(StackSafetyInfo &&);
   StackSafetyInfo &operator=(StackSafetyInfo &&);
   ~StackSafetyInfo();
 
-  const InfoTy &getInfo() const { return *Info; }
+  const InfoTy &getInfo() const;
 
   // TODO: Add useful for client methods.
-  void print(raw_ostream &O, const GlobalValue &F) const;
+  void print(raw_ostream &O) const;
 };
 
 class StackSafetyGlobalInfo {
 public:
-  using GVToSSI = std::map<const GlobalValue *, StackSafetyInfo>;
+  struct InfoTy;
 
 private:
-  GVToSSI SSGI;
+  Module *M = nullptr;
+  std::function<const StackSafetyInfo &(Function &F)> GetSSI;
+  mutable std::unique_ptr<InfoTy> Info;
+  const InfoTy &getInfo() const;
 
 public:
-  StackSafetyGlobalInfo() = default;
-  StackSafetyGlobalInfo(GVToSSI SSGI) : SSGI(std::move(SSGI)) {}
+  StackSafetyGlobalInfo();
+  StackSafetyGlobalInfo(
+      Module *M, std::function<const StackSafetyInfo &(Function &F)> GetSSI);
+  StackSafetyGlobalInfo(StackSafetyGlobalInfo &&);
+  StackSafetyGlobalInfo &operator=(StackSafetyGlobalInfo &&);
+  ~StackSafetyGlobalInfo();
 
   bool setMetadata(Module &M) const;
   void print(raw_ostream &O) const;
@@ -77,14 +88,13 @@ class StackSafetyPrinterPass : public PassInfoMixin<StackSafetyPrinterPass> {
 
 /// StackSafetyInfo wrapper for the legacy pass manager
 class StackSafetyInfoWrapperPass : public FunctionPass {
-  Optional<StackSafetyInfo> SSI;
-  const Function *F = nullptr;
+  StackSafetyInfo SSI;
 
 public:
   static char ID;
   StackSafetyInfoWrapperPass();
 
-  const StackSafetyInfo &getResult() const { return *SSI; }
+  const StackSafetyInfo &getResult() const { return SSI; }
 
   void print(raw_ostream &O, const Module *M) const override;
   void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -131,6 +141,7 @@ class StackSafetyGlobalInfoWrapperPass : public ModulePass {
   static char ID;
 
   StackSafetyGlobalInfoWrapperPass();
+  ~StackSafetyGlobalInfoWrapperPass();
 
   const StackSafetyGlobalInfo &getResult() const { return SSGI; }
 
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 15ddea6a957c2..2c8a5e33c847f 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -33,8 +33,6 @@ static cl::opt<int> StackSafetyMaxIterations("stack-safety-max-iterations",
 
 namespace {
 
-using GVToSSI = StackSafetyGlobalInfo::GVToSSI;
-
 /// Rewrite an SCEV expression for a memory access address to an expression that
 /// represents offset from the given alloca.
 class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
@@ -178,15 +176,17 @@ struct FunctionInfo {
   }
 };
 
+using GVToSSI = std::map<const GlobalValue *, FunctionInfo>;
+
 } // namespace
 
 struct StackSafetyInfo::InfoTy {
   FunctionInfo Info;
 };
 
-StackSafetyInfo makeSSI(FunctionInfo Info) {
-  return StackSafetyInfo(StackSafetyInfo::InfoTy{std::move(Info)});
-}
+struct StackSafetyGlobalInfo::InfoTy {
+  GVToSSI Info;
+};
 
 namespace {
 
@@ -430,17 +430,10 @@ class StackSafetyDataFlowAnalysis {
   void verifyFixedPoint();
 #endif
 
-  uint32_t findPointerWidth() const {
-    for (auto &F : Functions)
-      for (auto &P : F.second.Params)
-        return P.Range.getBitWidth();
-    return 1;
-  }
-
 public:
-  explicit StackSafetyDataFlowAnalysis(FunctionMap Functions)
+  StackSafetyDataFlowAnalysis(uint32_t PointerBitWidth, FunctionMap Functions)
       : Functions(std::move(Functions)),
-        UnknownRange(ConstantRange::getFull(findPointerWidth())) {}
+        UnknownRange(ConstantRange::getFull(PointerBitWidth)) {}
 
   const FunctionMap &run();
 
@@ -560,7 +553,7 @@ bool setStackSafetyMetadata(Module &M, const GVToSSI &SSGI) {
     auto Iter = SSGI.find(&F);
     if (Iter == SSGI.end())
       continue;
-    const FunctionInfo &Summary = Iter->second.getInfo().Info;
+    const FunctionInfo &Summary = Iter->second;
     size_t Pos = 0;
     for (auto &I : instructions(F)) {
       if (auto AI = dyn_cast<AllocaInst>(&I)) {
@@ -623,11 +616,16 @@ GVToSSI createGlobalStackSafetyInfo(
   for (auto &FI : Copy)
     ResolveAllCalls(FI.second.Params);
 
-  StackSafetyDataFlowAnalysis SSDFA(std::move(Copy));
+  uint32_t PointerSize = Copy.begin()
+                             ->first->getParent()
+                             ->getDataLayout()
+                             .getMaxPointerSizeInBits();
+  StackSafetyDataFlowAnalysis SSDFA(PointerSize, std::move(Copy));
 
   for (auto &F : SSDFA.run()) {
     auto FI = F.second;
     size_t Pos = 0;
+    auto &SrcF = Functions[F.first];
     for (auto &A : FI.Allocas) {
       ResolveAllCalls(A);
       for (auto &C : A.Calls) {
@@ -635,15 +633,15 @@ GVToSSI createGlobalStackSafetyInfo(
             SSDFA.getArgumentAccessRange(C.Callee, C.ParamNo, C.Offset));
       }
       // FIXME: This is needed only to preserve calls in print() results.
-      A.Calls = Functions[F.first].Allocas[Pos].Calls;
+      A.Calls = SrcF.Allocas[Pos].Calls;
       ++Pos;
     }
     Pos = 0;
     for (auto &P : FI.Params) {
-      P.Calls = Functions[F.first].Params[Pos].Calls;
+      P.Calls = SrcF.Params[Pos].Calls;
       ++Pos;
     }
-    SSI.emplace(F.first, makeSSI(std::move(FI)));
+    SSI[F.first] = std::move(FI);
   }
 
   return SSI;
@@ -651,29 +649,70 @@ GVToSSI createGlobalStackSafetyInfo(
 
 } // end anonymous namespace
 
+StackSafetyInfo::StackSafetyInfo() = default;
+
+StackSafetyInfo::StackSafetyInfo(Function *F,
+                                 std::function<ScalarEvolution &()> GetSE)
+    : F(F), GetSE(GetSE) {}
+
 StackSafetyInfo::StackSafetyInfo(StackSafetyInfo &&) = default;
-StackSafetyInfo &StackSafetyInfo::operator=(StackSafetyInfo &&) = default;
 
-StackSafetyInfo::StackSafetyInfo(InfoTy Info)
-    : Info(new InfoTy(std::move(Info))) {}
+StackSafetyInfo &StackSafetyInfo::operator=(StackSafetyInfo &&) = default;
 
 StackSafetyInfo::~StackSafetyInfo() = default;
 
-void StackSafetyInfo::print(raw_ostream &O, const GlobalValue &F) const {
-  Info->Info.print(O, F.getName(), dyn_cast<Function>(&F));
+const StackSafetyInfo::InfoTy &StackSafetyInfo::getInfo() const {
+  if (!Info) {
+    StackSafetyLocalAnalysis SSLA(*F, GetSE());
+    Info.reset(new InfoTy{SSLA.run()});
+  }
+  return *Info;
+}
+
+void StackSafetyInfo::print(raw_ostream &O) const {
+  getInfo().Info.print(O, F->getName(), dyn_cast<Function>(F));
+}
+
+const StackSafetyGlobalInfo::InfoTy &StackSafetyGlobalInfo::getInfo() const {
+  if (!Info) {
+    std::map<const GlobalValue *, FunctionInfo> Functions;
+    for (auto &F : M->functions()) {
+      if (!F.isDeclaration()) {
+        auto FI = GetSSI(F).getInfo().Info;
+        Functions.emplace(&F, std::move(FI));
+      }
+    }
+    Info.reset(new InfoTy{createGlobalStackSafetyInfo(std::move(Functions))});
+  }
+  return *Info;
 }
 
+StackSafetyGlobalInfo::StackSafetyGlobalInfo() = default;
+
+StackSafetyGlobalInfo::StackSafetyGlobalInfo(
+    Module *M, std::function<const StackSafetyInfo &(Function &F)> GetSSI)
+    : M(M), GetSSI(GetSSI) {}
+
+StackSafetyGlobalInfo::StackSafetyGlobalInfo(StackSafetyGlobalInfo &&) =
+    default;
+
+StackSafetyGlobalInfo &
+StackSafetyGlobalInfo::operator=(StackSafetyGlobalInfo &&) = default;
+
+StackSafetyGlobalInfo::~StackSafetyGlobalInfo() = default;
+
 bool StackSafetyGlobalInfo::setMetadata(Module &M) const {
-  return setStackSafetyMetadata(M, SSGI);
+  return setStackSafetyMetadata(M, getInfo().Info);
 }
 
 void StackSafetyGlobalInfo::print(raw_ostream &O) const {
-  if (SSGI.empty())
+  auto &SSI = getInfo().Info;
+  if (SSI.empty())
     return;
-  const Module &M = *SSGI.begin()->first->getParent();
+  const Module &M = *SSI.begin()->first->getParent();
   for (auto &F : M.functions()) {
     if (!F.isDeclaration()) {
-      SSGI.find(&F)->second.print(O, F);
+      SSI.find(&F)->second.print(O, F.getName(), &F);
       O << "\n";
     }
   }
@@ -685,14 +724,15 @@ AnalysisKey StackSafetyAnalysis::Key;
 
 StackSafetyInfo StackSafetyAnalysis::run(Function &F,
                                          FunctionAnalysisManager &AM) {
-  StackSafetyLocalAnalysis SSLA(F, AM.getResult<ScalarEvolutionAnalysis>(F));
-  return makeSSI(SSLA.run());
+  return StackSafetyInfo(&F, [&AM, &F]() -> ScalarEvolution & {
+    return AM.getResult<ScalarEvolutionAnalysis>(F);
+  });
 }
 
 PreservedAnalyses StackSafetyPrinterPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
   OS << "'Stack Safety Local Analysis' for function '" << F.getName() << "'\n";
-  AM.getResult<StackSafetyAnalysis>(F).print(OS, F);
+  AM.getResult<StackSafetyAnalysis>(F).print(OS);
   return PreservedAnalyses::all();
 }
 
@@ -703,19 +743,17 @@ StackSafetyInfoWrapperPass::StackSafetyInfoWrapperPass() : FunctionPass(ID) {
 }
 
 void StackSafetyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<ScalarEvolutionWrapperPass>();
+  AU.addRequiredTransitive<ScalarEvolutionWrapperPass>();
   AU.setPreservesAll();
 }
 
 void StackSafetyInfoWrapperPass::print(raw_ostream &O, const Module *M) const {
-  SSI->print(O, *F);
+  SSI.print(O);
 }
 
 bool StackSafetyInfoWrapperPass::runOnFunction(Function &F) {
-  StackSafetyLocalAnalysis SSLA(
-      F, getAnalysis<ScalarEvolutionWrapperPass>().getSE());
-  SSI = makeSSI(SSLA.run());
-  this->F = &F;
+  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  SSI = {&F, [SE]() -> ScalarEvolution & { return *SE; }};
   return false;
 }
 
@@ -725,18 +763,9 @@ StackSafetyGlobalInfo
 StackSafetyGlobalAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
-  // FIXME: Lookup Module Summary.
-  std::map<const GlobalValue *, FunctionInfo> Functions;
-
-  for (auto &F : M.functions()) {
-    if (!F.isDeclaration()) {
-      auto FI = FAM.getResult<StackSafetyAnalysis>(F).getInfo().Info;
-      Functions.emplace(&F, std::move(FI));
-    }
-  }
-
-  return createGlobalStackSafetyInfo(std::move(Functions));
+  return {&M, [&FAM](Function &F) -> const StackSafetyInfo & {
+            return FAM.getResult<StackSafetyAnalysis>(F);
+          }};
 }
 
 PreservedAnalyses StackSafetyGlobalPrinterPass::run(Module &M,
@@ -761,6 +790,8 @@ StackSafetyGlobalInfoWrapperPass::StackSafetyGlobalInfoWrapperPass()
       *PassRegistry::getPassRegistry());
 }
 
+StackSafetyGlobalInfoWrapperPass::~StackSafetyGlobalInfoWrapperPass() = default;
+
 void StackSafetyGlobalInfoWrapperPass::print(raw_ostream &O,
                                              const Module *M) const {
   SSGI.print(O);
@@ -772,16 +803,9 @@ void StackSafetyGlobalInfoWrapperPass::getAnalysisUsage(
 }
 
 bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
-  std::map<const GlobalValue *, FunctionInfo> Functions;
-  for (auto &F : M.functions()) {
-    if (!F.isDeclaration()) {
-      auto FI =
-          getAnalysis<StackSafetyInfoWrapperPass>(F).getResult().getInfo().Info;
-      Functions.emplace(&F, std::move(FI));
-    }
-  }
-
-  SSGI = createGlobalStackSafetyInfo(std::move(Functions));
+  SSGI = {&M, [this](Function &F) -> const StackSafetyInfo & {
+            return getAnalysis<StackSafetyInfoWrapperPass>(F).getResult();
+          }};
   return SSGI.setMetadata(M);
 }
 

From 81b79011a77f97798236af6d716e5d352790d54b Mon Sep 17 00:00:00 2001
From: Greg Clayton <gclayton@fb.com>
Date: Thu, 28 May 2020 13:29:48 -0700
Subject: [PATCH 416/770] [lldb-vscode] Make it possible to run vsce package

Summary:
Running `vsce package` to package lldb-vscode as an installable .vsix file errors with:

```
ERROR  Invalid publisher name 'llvm.org'. Expected the identifier of a publisher, not its human-friendly name.
```

This patch fixes the publisher name and bumps a required dependency so that `vsce package` succeeds.

Reviewers: clayborg

Reviewed By: clayborg

Tags: #lldb

Differential Revision: https://reviews.llvm.org/D80569
---
 lldb/tools/lldb-vscode/package.json | 47 +++++++++++++++--------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/lldb/tools/lldb-vscode/package.json b/lldb/tools/lldb-vscode/package.json
index 1df16d0dfe35d..f4408d3607d9e 100644
--- a/lldb/tools/lldb-vscode/package.json
+++ b/lldb/tools/lldb-vscode/package.json
@@ -2,7 +2,8 @@
 	"name": "lldb-vscode",
 	"displayName": "LLDB native Debug stub",
 	"version": "0.1.0",
-	"publisher": "llvm.org",
+	"publisher": "llvm",
+	"repository": "llvm.org",
 	"description": "Debug adapter for LLDB which uses a C++ tool to interface directly with LLDB.",
 	"author": {
 		"name": "Greg Clayton",
@@ -16,7 +17,6 @@
 		"vscode": "^1.18.0",
 		"node": "^7.9.0"
 	},
-	"icon": "images/lldb.png",
 	"categories": [
 		"Debuggers"
 	],
@@ -29,7 +29,7 @@
 		"vscode": "1.1.10",
 		"vscode-debugadapter-testsupport": "1.25.0",
 		"tslint": "5.8.0",
-		"vsce": "1.35.0"
+		"vsce": "^1.36.3"
 	},
 	"contributes": {
 		"debuggers": [
@@ -70,7 +70,10 @@
 								"description": "Path to the program to debug."
 							},
 							"args": {
-								"type": [ "array", "string" ],
+								"type": [
+									"array",
+									"string"
+								],
 								"description": "Program arguments.",
 								"default": []
 							},
@@ -131,29 +134,29 @@
 								"description": "Name of the execution platform to override value derived from the program file."
 							},
 							"initCommands": {
-									"type": "array",
-									"description": "Initialization commands executed upon debugger startup.",
-									"default": []
+								"type": "array",
+								"description": "Initialization commands executed upon debugger startup.",
+								"default": []
 							},
 							"preRunCommands": {
-									"type": "array",
-									"description": "Commands executed just before the program is launched.",
-									"default": []
+								"type": "array",
+								"description": "Commands executed just before the program is launched.",
+								"default": []
 							},
 							"launchCommands": {
-									"type": "array",
-									"description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail.",
-									"default": []
+								"type": "array",
+								"description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail.",
+								"default": []
 							},
 							"stopCommands": {
-									"type": "array",
-									"description": "Commands executed each time the program stops.",
-									"default": []
+								"type": "array",
+								"description": "Commands executed each time the program stops.",
+								"default": []
 							},
 							"exitCommands": {
-									"type": "array",
-									"description": "Commands executed at the end of debugging session.",
-									"default": []
+								"type": "array",
+								"description": "Commands executed at the end of debugging session.",
+								"default": []
 							}
 						}
 					},
@@ -161,12 +164,12 @@
 						"properties": {
 							"program": {
 								"type": "string",
-									"description": "Path to the program to attach to."
+								"description": "Path to the program to attach to."
 							},
 							"pid": {
 								"type": [
-										"number",
-										"string"
+									"number",
+									"string"
 								],
 								"description": "System process ID to attach to."
 							},

From 9021ce9576e438ae5a6fdb574327d30ea6b67fa8 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Thu, 28 May 2020 15:48:05 -0500
Subject: [PATCH 417/770] [Clang] Enable KF and KC mode for [_Complex]
 __float128

The headers provided with recent GNU toolchains for PPC have code that includes
typedefs such as:

typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__KC__)))

This patch allows clang to compile programs that contain

#include <math.h>

when -mfloat128 is specified; such programs currently fail to compile.

Fixes: https://bugs.llvm.org/show_bug.cgi?id=46068

Differential revision: https://reviews.llvm.org/D80374
---
 clang/include/clang/AST/ASTContext.h   |  2 +-
 clang/include/clang/Basic/TargetInfo.h |  9 +++++++--
 clang/lib/AST/ASTContext.cpp           |  6 ++++--
 clang/lib/Basic/TargetInfo.cpp         |  7 ++++++-
 clang/lib/Sema/SemaDeclAttr.cpp        | 16 ++++++++++++----
 clang/test/Sema/attr-mode.c            | 11 +++++++++++
 6 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 509ada3c96962..a5bb9a34c2fb3 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -657,7 +657,7 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// getRealTypeForBitwidth -
   /// sets floating point QualTy according to specified bitwidth.
   /// Returns empty type if there is no appropriate target types.
-  QualType getRealTypeForBitwidth(unsigned DestWidth) const;
+  QualType getRealTypeForBitwidth(unsigned DestWidth, bool ExplicitIEEE) const;
 
   bool AtomicUsesUnsupportedLibcall(const AtomicExpr *E) const;
 
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 910a4d6846aaa..0a5379225caf3 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -368,8 +368,13 @@ class TargetInfo : public virtual TransferrableTargetInfo,
   virtual IntType getLeastIntTypeByWidth(unsigned BitWidth,
                                          bool IsSigned) const;
 
-  /// Return floating point type with specified width.
-  RealType getRealTypeByWidth(unsigned BitWidth) const;
+  /// Return floating point type with specified width. On PPC, there are
+  /// three possible types for 128-bit floating point: "PPC double-double",
+  /// IEEE 754R quad precision, and "long double" (which under the covers
+  /// is represented as one of those two). At this time, there is no support
+  /// for an explicit "PPC double-double" type (i.e. __ibm128) so we only
+  /// need to differentiate between "long double" and IEEE quad precision.
+  RealType getRealTypeByWidth(unsigned BitWidth, bool ExplicitIEEE) const;
 
   /// Return the alignment (in bits) of the specified integer type enum.
   ///
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index c457a5537168a..bfb6014027f44 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -10644,8 +10644,10 @@ QualType ASTContext::getIntTypeForBitwidth(unsigned DestWidth,
 /// getRealTypeForBitwidth -
 /// sets floating point QualTy according to specified bitwidth.
 /// Returns empty type if there is no appropriate target types.
-QualType ASTContext::getRealTypeForBitwidth(unsigned DestWidth) const {
-  TargetInfo::RealType Ty = getTargetInfo().getRealTypeByWidth(DestWidth);
+QualType ASTContext::getRealTypeForBitwidth(unsigned DestWidth,
+                                            bool ExplicitIEEE) const {
+  TargetInfo::RealType Ty =
+      getTargetInfo().getRealTypeByWidth(DestWidth, ExplicitIEEE);
   switch (Ty) {
   case TargetInfo::Float:
     return FloatTy;
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 2f1e044bb106d..a3c8da5885b8e 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -265,7 +265,8 @@ TargetInfo::IntType TargetInfo::getLeastIntTypeByWidth(unsigned BitWidth,
   return NoInt;
 }
 
-TargetInfo::RealType TargetInfo::getRealTypeByWidth(unsigned BitWidth) const {
+TargetInfo::RealType TargetInfo::getRealTypeByWidth(unsigned BitWidth,
+                                                    bool ExplicitIEEE) const {
   if (getFloatWidth() == BitWidth)
     return Float;
   if (getDoubleWidth() == BitWidth)
@@ -277,6 +278,10 @@ TargetInfo::RealType TargetInfo::getRealTypeByWidth(unsigned BitWidth) const {
       return LongDouble;
     break;
   case 128:
+    // The caller explicitly asked for an IEEE compliant type but we still
+    // have to check if the target supports it.
+    if (ExplicitIEEE)
+      return hasFloat128Type() ? Float128 : NoFloat;
     if (&getLongDoubleFormat() == &llvm::APFloat::PPCDoubleDouble() ||
         &getLongDoubleFormat() == &llvm::APFloat::IEEEquad())
       return LongDouble;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 0e062ba74a085..df44b6fcf2af5 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3942,7 +3942,8 @@ bool Sema::checkMSInheritanceAttrOnDefinition(
 /// parseModeAttrArg - Parses attribute mode string and returns parsed type
 /// attribute.
 static void parseModeAttrArg(Sema &S, StringRef Str, unsigned &DestWidth,
-                             bool &IntegerMode, bool &ComplexMode) {
+                             bool &IntegerMode, bool &ComplexMode,
+                             bool &ExplicitIEEE) {
   IntegerMode = true;
   ComplexMode = false;
   switch (Str.size()) {
@@ -3963,7 +3964,12 @@ static void parseModeAttrArg(Sema &S, StringRef Str, unsigned &DestWidth,
     case 'X':
       DestWidth = 96;
       break;
+    case 'K': // KFmode - IEEE quad precision (__float128)
+      ExplicitIEEE = true;
+      DestWidth = Str[1] == 'I' ? 0 : 128;
+      break;
     case 'T':
+      ExplicitIEEE = false;
       DestWidth = 128;
       break;
     }
@@ -4024,6 +4030,7 @@ void Sema::AddModeAttr(Decl *D, const AttributeCommonInfo &CI,
   unsigned DestWidth = 0;
   bool IntegerMode = true;
   bool ComplexMode = false;
+  bool ExplicitIEEE = false;
   llvm::APInt VectorSize(64, 0);
   if (Str.size() >= 4 && Str[0] == 'V') {
     // Minimal length of vector mode is 4: 'V' + NUMBER(>=1) + TYPE(>=2).
@@ -4036,7 +4043,7 @@ void Sema::AddModeAttr(Decl *D, const AttributeCommonInfo &CI,
         !Str.substr(1, VectorStringLength).getAsInteger(10, VectorSize) &&
         VectorSize.isPowerOf2()) {
       parseModeAttrArg(*this, Str.substr(VectorStringLength + 1), DestWidth,
-                       IntegerMode, ComplexMode);
+                       IntegerMode, ComplexMode, ExplicitIEEE);
       // Avoid duplicate warning from template instantiation.
       if (!InInstantiation)
         Diag(AttrLoc, diag::warn_vector_mode_deprecated);
@@ -4046,7 +4053,8 @@ void Sema::AddModeAttr(Decl *D, const AttributeCommonInfo &CI,
   }
 
   if (!VectorSize)
-    parseModeAttrArg(*this, Str, DestWidth, IntegerMode, ComplexMode);
+    parseModeAttrArg(*this, Str, DestWidth, IntegerMode, ComplexMode,
+                     ExplicitIEEE);
 
   // FIXME: Sync this with InitializePredefinedMacros; we need to match int8_t
   // and friends, at least with glibc.
@@ -4112,7 +4120,7 @@ void Sema::AddModeAttr(Decl *D, const AttributeCommonInfo &CI,
     NewElemTy = Context.getIntTypeForBitwidth(DestWidth,
                                               OldElemTy->isSignedIntegerType());
   else
-    NewElemTy = Context.getRealTypeForBitwidth(DestWidth);
+    NewElemTy = Context.getRealTypeForBitwidth(DestWidth, ExplicitIEEE);
 
   if (NewElemTy.isNull()) {
     Diag(AttrLoc, diag::err_machine_mode) << 1 /*Unsupported*/ << Name;
diff --git a/clang/test/Sema/attr-mode.c b/clang/test/Sema/attr-mode.c
index c89cb65241919..a4bac90b99fb8 100644
--- a/clang/test/Sema/attr-mode.c
+++ b/clang/test/Sema/attr-mode.c
@@ -4,6 +4,8 @@
 // RUN:   -verify %s
 // RUN: %clang_cc1 -triple powerpc64-pc-linux-gnu -DTEST_64BIT_PPC64 -fsyntax-only \
 // RUN:   -verify %s
+// RUN: %clang_cc1 -triple powerpc64-pc-linux-gnu -DTEST_F128_PPC64 -fsyntax-only \
+// RUN:   -verify -target-feature +float128 %s
 // RUN: %clang_cc1 -triple x86_64-pc-linux-gnux32 -DTEST_64BIT_X86 -fsyntax-only \
 // RUN:   -verify %s
 // RUN: %clang_cc1 -triple mips-linux-gnu -DTEST_MIPS_32 -fsyntax-only \
@@ -90,6 +92,15 @@ void f_ft128_arg(long double *x);
 void f_ft128_complex_arg(_Complex long double *x);
 void test_TFtype(f128ibm *a) { f_ft128_arg (a); }
 void test_TCtype(c128ibm *a) { f_ft128_complex_arg (a); }
+#elif TEST_F128_PPC64
+typedef int invalid_7 __attribute((mode(KF))); // expected-error{{type of machine mode does not match type of base type}}
+typedef int invalid_8 __attribute((mode(KI))); // expected-error{{unknown machine mode}}
+typedef _Complex float cf128 __attribute__((mode(KC)));
+typedef float f128 __attribute__((mode(KF)));
+void f_f128_arg(__float128 *x);
+void f_f128_complex_arg(_Complex __float128 *x);
+void test_KFtype(f128 *a) { f_f128_arg(a); }
+void test_KCtype(cf128 *a) { f_f128_complex_arg(a); }
 #elif TEST_MIPS_32
 typedef unsigned int gcc_unwind_word __attribute__((mode(unwind_word)));
 int foo[sizeof(gcc_unwind_word) == 4 ? 1 : -1];

From 4d6cda9bdaca01f581ef23904f54443ef5c2acac Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 12:31:49 -0700
Subject: [PATCH 418/770] [Statepoint] Use iterator_range.empty [NFC]

---
 llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index d826fe7b0936b..fd961d1908a40 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -869,7 +869,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
   SI.ID = I.getID();
 
   if (auto Opt = I.getOperandBundle(LLVMContext::OB_deopt)) {
-    assert(ISP.deopt_begin() == ISP.deopt_end() &&
+    assert(ISP.deopt_operands().empty() &&
            "can't list both deopt operands and deopt bundle");
     auto &Inputs = Opt->Inputs;
     SI.DeoptState = ArrayRef<const Use>(Inputs.begin(), Inputs.end());
@@ -877,7 +877,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
     SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end());
   }
   if (auto Opt = I.getOperandBundle(LLVMContext::OB_gc_transition)) {
-    assert(ISP.gc_transition_args_begin() == ISP.gc_transition_args_end() &&
+    assert(ISP.gc_transition_args().empty() &&
            "can't list both gc_transition operands and bundle");
     auto &Inputs = Opt->Inputs;
     SI.GCTransitionArgs = ArrayRef<const Use>(Inputs.begin(), Inputs.end());

From a0d2fd4a1f785230120df2bb5f74917dc0c357e5 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 13:34:12 -0700
Subject: [PATCH 419/770] [Statepoint] Sink actual_args and gc_args to
 GCStatepointInst [NFC]

These are the two operand sets which are expected to survive more than another week or so.  Instead of bothering to update the deopt and gc-transition operands, we'll just wait until those are removed and delete the code.

For those following along, this is likely to be the last (major) change in this sequence for about a week.  I want to wait until all of this has been merged downstream to ensure I haven't introduced any bugs (and migrate some downstream code to the new interfaces).  Once that's done, we should be able to delete Statepoint/ImmutableStatepoint without too much work.
---
 llvm/include/llvm/IR/Statepoint.h             | 82 +++++++++++++------
 .../SelectionDAG/StatepointLowering.cpp       |  2 +-
 .../Scalar/RewriteStatepointsForGC.cpp        | 19 ++---
 3 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h
index 5ca6939ce7731..81679e2612da7 100644
--- a/llvm/include/llvm/IR/Statepoint.h
+++ b/llvm/include/llvm/IR/Statepoint.h
@@ -134,6 +134,53 @@ class GCStatepointInst : public CallBase {
       cast<PointerType>(getActualCalledOperand()->getType())->getElementType();
     return cast<FunctionType>(CalleeTy)->getReturnType();
   }
+
+
+  /// Return the number of arguments to the underlying call.
+  size_t actual_arg_size() const { return getNumCallArgs(); }
+  /// Return an iterator to the beginning of the underlying call's arguments
+  const_op_iterator actual_arg_begin() const {
+    assert(CallArgsBeginPos <= (int)arg_size());
+    return arg_begin() + CallArgsBeginPos;
+  }
+  /// Return an end iterator of the arguments to the underlying call
+  const_op_iterator actual_arg_end() const {
+    auto I = actual_arg_begin() + actual_arg_size();
+    assert((arg_end() - I) >= 0);
+    return I;
+  }
+  /// range adapter for actual call arguments
+  iterator_range<const_op_iterator> actual_args() const {
+    return make_range(actual_arg_begin(), actual_arg_end());
+  }
+
+  /// Returns an iterator to the beginning of the argument range describing gc
+  /// values for the statepoint.
+  const_op_iterator gc_args_begin() const {
+    // The current format has two length prefix bundles between call args and
+    // start of gc args.  This will be removed in the near future.
+    const Value *NumGCTransitionArgs = *actual_arg_end();
+    uint64_t NumTrans = cast<ConstantInt>(NumGCTransitionArgs)->getZExtValue();
+    const_op_iterator trans_end = actual_arg_end() + 1 + NumTrans;
+    const Value *NumDeoptArgs = *trans_end;
+    uint64_t NumDeopt = cast<ConstantInt>(NumDeoptArgs)->getZExtValue();
+    auto I = trans_end + 1 + NumDeopt;
+    assert((arg_end() - I) >= 0);
+    return I;
+  }
+
+  /// Return an end iterator for the gc argument range
+  const_op_iterator gc_args_end() const { return arg_end(); }
+
+  /// Return the operand index at which the gc args begin
+  unsigned gcArgsStartIdx() const {
+    return gc_args_begin() - op_begin();
+  }
+
+  /// range adapter for gc arguments
+  iterator_range<const_op_iterator> gc_args() const {
+    return make_range(gc_args_begin(), gc_args_end());
+  }
 };
 
 /// A wrapper around a GC intrinsic call, this provides most of the actual
@@ -201,16 +248,11 @@ class StatepointBase {
     return getCall()->doesNotThrow() || (F ? F->doesNotThrow() : false);
   }
 
-
-  size_t arg_size() const { return getNumCallArgs(); }
-  arg_iterator arg_begin() const {
-    assert(CallArgsBeginPos <= (int)getCall()->arg_size());
-    return getCall()->arg_begin() + CallArgsBeginPos;
-  }
-  arg_iterator arg_end() const {
-    auto I = arg_begin() + arg_size();
-    assert((getCall()->arg_end() - I) >= 0);
-    return I;
+  size_t arg_size() const { return getCall()->actual_arg_size(); }
+  arg_iterator arg_begin() const { return getCall()->actual_arg_begin(); }
+  arg_iterator arg_end() const { return getCall()->actual_arg_end(); }
+  iterator_range<arg_iterator> call_args() const {
+    return getCall()->actual_args();
   }
 
   ValueTy *getArgument(unsigned Index) {
@@ -218,11 +260,6 @@ class StatepointBase {
     return *(arg_begin() + Index);
   }
 
-  /// range adapter for call arguments
-  iterator_range<arg_iterator> call_args() const {
-    return make_range(arg_begin(), arg_end());
-  }
-
   /// Return true if the call or the callee has the given attribute.
   bool paramHasAttr(unsigned i, Attribute::AttrKind A) const {
     Function *F = getCalledFunction();
@@ -274,16 +311,15 @@ class StatepointBase {
     return make_range(deopt_begin(), deopt_end());
   }
 
-  arg_iterator gc_args_begin() const { return deopt_end(); }
-  arg_iterator gc_args_end() const { return getCall()->arg_end(); }
-
-  unsigned gcArgsStartIdx() const {
-    return gc_args_begin() - getCall()->op_begin();
+  arg_iterator gc_args_begin() const {
+    auto I = getCall()->gc_args_begin();
+    assert(I == deopt_end());
+    return I;
   }
-
-  /// range adapter for gc arguments
+  arg_iterator gc_args_end() const { return getCall()->gc_args_end(); }
+  unsigned gcArgsStartIdx() const { return getCall()->gcArgsStartIdx(); }
   iterator_range<arg_iterator> gc_args() const {
-    return make_range(gc_args_begin(), gc_args_end());
+    return getCall()->gc_args();
   }
 
   /// Get list of all gc reloactes linked to this statepoint
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index fd961d1908a40..4f51efd094723 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -864,7 +864,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
     }
   }
 
-  SI.GCArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
+  SI.GCArgs = ArrayRef<const Use>(I.gc_args_begin(), I.gc_args_end());
   SI.StatepointInstr = &I;
   SI.ID = I.getID();
 
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index ec14bca90801a..9742132320509 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -271,7 +271,7 @@ struct PartiallyConstructedSafepointRecord {
 
   /// The *new* gc.statepoint instruction itself.  This produces the token
   /// that normal path gc.relocates and the gc.result are tied to.
-  Instruction *StatepointToken;
+  GCStatepointInst *StatepointToken;
 
   /// Instruction to which exceptional gc relocates are attached
   /// Makes it easier to iterate through them during relocationViaAlloca.
@@ -1546,7 +1546,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
   }
 
   // Create the statepoint given all the arguments
-  Instruction *Token = nullptr;
+  GCStatepointInst *Token = nullptr;
   if (auto *CI = dyn_cast<CallInst>(Call)) {
     CallInst *SPCall = Builder.CreateGCStatepointCall(
         StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
@@ -1562,7 +1562,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
     SPCall->setAttributes(
         legalizeCallAttributes(CI->getContext(), CI->getAttributes()));
 
-    Token = SPCall;
+    Token = cast<GCStatepointInst>(SPCall);
 
     // Put the following gc_result and gc_relocate calls immediately after the
     // the old call (which we're about to delete)
@@ -1589,7 +1589,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
     SPInvoke->setAttributes(
         legalizeCallAttributes(II->getContext(), II->getAttributes()));
 
-    Token = SPInvoke;
+    Token = cast<GCStatepointInst>(SPInvoke);
 
     // Generate gc relocates in exceptional path
     BasicBlock *UnwindBlock = II->getUnwindDest();
@@ -1604,7 +1604,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
     Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
     Result.UnwindToken = ExceptionalToken;
 
-    const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+    const unsigned LiveStartIdx = Token->gcArgsStartIdx();
     CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken,
                       Builder);
 
@@ -1652,7 +1652,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
   Result.StatepointToken = Token;
 
   // Second, create a gc.relocate for every live variable
-  const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+  const unsigned LiveStartIdx = Token->gcArgsStartIdx();
   CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);
 }
 
@@ -2409,9 +2409,8 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
     // That Value* no longer exists and we need to use the new gc_result.
     // Thankfully, the live set is embedded in the statepoint (and updated), so
     // we just grab that.
-    Statepoint Statepoint(Info.StatepointToken);
-    Live.insert(Live.end(), Statepoint.gc_args_begin(),
-                Statepoint.gc_args_end());
+    Live.insert(Live.end(), Info.StatepointToken->gc_args_begin(),
+                Info.StatepointToken->gc_args_end());
 #ifndef NDEBUG
     // Do some basic sanity checks on our liveness results before performing
     // relocation.  Relocation can and will turn mistakes in liveness results
@@ -2419,7 +2418,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
     // TODO: It would be nice to test consistency as well
     assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
            "statepoint must be reachable or liveness is meaningless");
-    for (Value *V : Statepoint.gc_args()) {
+    for (Value *V : Info.StatepointToken->gc_args()) {
       if (!isa<Instruction>(V))
         // Non-instruction values trivial dominate all possible uses
         continue;

From 9d065477942ffa6d9085188c0d1d2cb95a41baf6 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 13:49:41 -0700
Subject: [PATCH 420/770] [Statepoints] Sink routines for grabbing projections
 to GCStatepointInst [NFC]

Mechanical movement, nothing more.
---
 llvm/include/llvm/IR/Statepoint.h             | 44 ++++++++++---------
 .../SelectionDAG/StatepointLowering.cpp       |  4 +-
 2 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h
index 81679e2612da7..d31484207c10b 100644
--- a/llvm/include/llvm/IR/Statepoint.h
+++ b/llvm/include/llvm/IR/Statepoint.h
@@ -181,6 +181,23 @@ class GCStatepointInst : public CallBase {
   iterator_range<const_op_iterator> gc_args() const {
     return make_range(gc_args_begin(), gc_args_end());
   }
+
+
+  /// Get list of all gc relocates linked to this statepoint
+  /// May contain several relocations for the same base/derived pair.
+  /// For example this could happen due to relocations on unwinding
+  /// path of invoke.
+  inline std::vector<const GCRelocateInst *> getGCRelocates() const;
+
+  /// Get the experimental_gc_result call tied to this statepoint if there is
+  /// one, otherwise return nullptr.
+  const GCResultInst *getGCResult() const {
+    for (auto *U : users())
+      if (auto *GRI = dyn_cast<GCResultInst>(U))
+        return GRI;
+    return nullptr;
+  }
+
 };
 
 /// A wrapper around a GC intrinsic call, this provides most of the actual
@@ -322,20 +339,11 @@ class StatepointBase {
     return getCall()->gc_args();
   }
 
-  /// Get list of all gc reloactes linked to this statepoint
-  /// May contain several relocations for the same base/derived pair.
-  /// For example this could happen due to relocations on unwinding
-  /// path of invoke.
-  std::vector<const GCRelocateInst *> getRelocates() const;
-
-  /// Get the experimental_gc_result call tied to this statepoint.  Can be
-  /// nullptr if there isn't a gc_result tied to this statepoint.  Guaranteed to
-  /// be a CallInst if non-null.
+  std::vector<const GCRelocateInst *> getRelocates() const {
+    return getCall()->getGCRelocates();
+  }
   const GCResultInst *getGCResult() const {
-    for (auto *U : getInstruction()->users())
-      if (auto *GRI = dyn_cast<GCResultInst>(U))
-        return GRI;
-    return nullptr;
+    return getCall()->getGCResult();
   }
 
 #ifndef NDEBUG
@@ -470,21 +478,17 @@ class GCResultInst : public GCProjectionInst {
   }
 };
 
-template <typename FunTy, typename InstructionTy, typename ValueTy,
-          typename CallTy>
-std::vector<const GCRelocateInst *>
-StatepointBase<FunTy, InstructionTy, ValueTy, CallTy>::getRelocates()
-    const {
+std::vector<const GCRelocateInst *> GCStatepointInst::getGCRelocates() const {
   std::vector<const GCRelocateInst *> Result;
 
   // Search for relocated pointers.  Note that working backwards from the
   // gc_relocates ensures that we only get pairs which are actually relocated
   // and used after the statepoint.
-  for (const User *U : StatepointCall->users())
+  for (const User *U : users())
     if (auto *Relocate = dyn_cast<GCRelocateInst>(U))
       Result.push_back(Relocate);
 
-  auto *StatepointInvoke = dyn_cast<InvokeInst>(StatepointCall);
+  auto *StatepointInvoke = dyn_cast<InvokeInst>(this);
   if (!StatepointInvoke)
     return Result;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 4f51efd094723..3e8911859e2de 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -854,7 +854,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
   // separately with half the space. This would require a format rev and a
   // fairly major rework of the STATEPOINT node though.
   SmallSet<SDValue, 8> Seen;
-  for (const GCRelocateInst *Relocate : ISP.getRelocates()) {
+  for (const GCRelocateInst *Relocate : I.getGCRelocates()) {
     SI.GCRelocates.push_back(Relocate);
 
     SDValue DerivedSD = getValue(Relocate->getDerivedPtr());
@@ -893,7 +893,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
   SDValue ReturnValue = LowerAsSTATEPOINT(SI);
 
   // Export the result value if needed
-  const GCResultInst *GCResult = ISP.getGCResult();
+  const GCResultInst *GCResult = I.getGCResult();
   Type *RetTy = I.getActualReturnType();
   if (!RetTy->isVoidTy() && GCResult) {
     if (GCResult->getParent() != I.getParent()) {

From 0aa201eaf97681f59b72baee6552aa1b9b5c9129 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Wed, 27 May 2020 13:13:13 -0700
Subject: [PATCH 421/770] [MachineLICM] Assert that locations from debug insts
 are not lost

Summary:
Assert that MachineLICM does not move a debug instruction and then drop
its debug location. Later passes require each debug instruction to have
a location.

Testing: check-llvm, clang stage2 RelWithDebInfo build (x86_64)

Reviewers: aprantl, davide, chrisjackson, jmorse

Subscribers: hiraditya, asbirlea, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80665
---
 llvm/lib/CodeGen/MachineLICM.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 2a60858b6de21..98638b9fa7377 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -638,6 +638,7 @@ void MachineLICMBase::HoistPostRA(MachineInstr *MI, unsigned Def) {
   // Since we are moving the instruction out of its basic block, we do not
   // retain its debug location. Doing so would degrade the debugging
   // experience and adversely affect the accuracy of profiling information.
+  assert(!MI->isDebugInstr() && "Should not hoist debug inst");
   MI->setDebugLoc(DebugLoc());
 
   // Add register to livein list to all the BBs in the current loop since a
@@ -841,6 +842,7 @@ void MachineLICMBase::SinkIntoLoop() {
 
     // The instruction is is moved from its basic block, so do not retain the
     // debug information.
+    assert(!I->isDebugInstr() && "Should not sink debug inst");
     I->setDebugLoc(DebugLoc());
   }
 }
@@ -1536,6 +1538,7 @@ bool MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
     // Since we are moving the instruction out of its basic block, we do not
     // retain its debug location. Doing so would degrade the debugging
     // experience and adversely affect the accuracy of profiling information.
+    assert(!MI->isDebugInstr() && "Should not hoist debug inst");
     MI->setDebugLoc(DebugLoc());
 
     // Update register pressure for BBs from header to this block.

From 4855534d10cea3dd93d33da13ceb3381b0c588e6 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Wed, 27 May 2020 15:44:10 -0700
Subject: [PATCH 422/770] [MachineVerifier] Verify that a DBG_VALUE has a debug
 location

Summary:
Verify that each DBG_VALUE has a debug location. This is required by
LiveDebugValues, and perhaps by other late passes.

There's an exception for tests: lots of tests use a two-operand form of
DBG_VALUE for convenience. There's no reason to prevent that.

This is an extension of D80665, but there's no dependency.

Reviewers: aprantl, jmorse, davide, chrisjackson

Subscribers: hiraditya, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, Jim, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80670
---
 llvm/lib/CodeGen/MachineVerifier.cpp          |  7 +++++++
 llvm/test/CodeGen/Hexagon/early-if-debug.mir  | 11 +++++------
 .../MIR/Generic/dbg-value-missing-loc.mir     | 19 +++++++++++++++++++
 .../MIR/X86/instructions-debug-location.mir   | 12 ++++++------
 .../CodeGen/MIR/X86/metadata-operands.mir     |  2 +-
 .../RISCV/select-optimize-multiple.mir        | 19 +++++++++----------
 llvm/test/CodeGen/X86/machine-cp-debug.mir    |  3 +--
 llvm/test/CodeGen/X86/pr38952.mir             |  4 ++--
 8 files changed, 50 insertions(+), 27 deletions(-)
 create mode 100644 llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir

diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index f07856d799c9f..b6121c79aad2d 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1488,6 +1488,13 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
   if (MI->isInlineAsm())
     verifyInlineAsm(MI);
 
+  // A fully-formed DBG_VALUE must have a location. Ignore partially formed
+  // DBG_VALUEs: these are convenient to use in tests, but should never get
+  // generated.
+  if (MI->isDebugValue() && MI->getNumOperands() == 4)
+    if (!MI->getDebugLoc())
+      report("Missing DebugLoc for debug instruction", MI);
+
   // Check the MachineMemOperands for basic consistency.
   for (MachineMemOperand *Op : MI->memoperands()) {
     if (Op->isLoad() && !MI->mayLoad())
diff --git a/llvm/test/CodeGen/Hexagon/early-if-debug.mir b/llvm/test/CodeGen/Hexagon/early-if-debug.mir
index b76f41019a047..0eb2ba71a49fb 100644
--- a/llvm/test/CodeGen/Hexagon/early-if-debug.mir
+++ b/llvm/test/CodeGen/Hexagon/early-if-debug.mir
@@ -18,7 +18,6 @@
   define void @foo() {
     ret void
   }
-  !1 = !DIExpression()
 ...
 ---
 name: foo
@@ -40,11 +39,11 @@ body:             |
     J2_jump %bb.1, implicit-def dead $pc
 
   bb.1:
-    DBG_VALUE %0, $noreg, !1, !1
-    DBG_VALUE %0, $noreg, !1, !1
-    DBG_VALUE %0, $noreg, !1, !1
-    DBG_VALUE %0, $noreg, !1, !1
-    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg
+    DBG_VALUE %0, $noreg
+    DBG_VALUE %0, $noreg
+    DBG_VALUE %0, $noreg
+    DBG_VALUE %0, $noreg
     %3 = A2_tfrsi 321
 
   bb.2:
diff --git a/llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir b/llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir
new file mode 100644
index 0000000000000..d44ba086c7435
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/Generic/dbg-value-missing-loc.mir
@@ -0,0 +1,19 @@
+# RUN: not --crash llc -run-pass machineverifier -o - %s 2>&1 | FileCheck %s
+
+# CHECK: Bad machine code: Missing DebugLoc for debug instruction
+# CHECK: - instruction: DBG_VALUE 1, 2, 3, 4
+
+--- |
+
+  define i32 @foo() {
+  entry:
+    ret i32 0
+  }
+
+...
+---
+name:            foo
+body: |
+  bb.0.entry:
+    DBG_VALUE 1, 2, 3, 4
+...
diff --git a/llvm/test/CodeGen/MIR/X86/instructions-debug-location.mir b/llvm/test/CodeGen/MIR/X86/instructions-debug-location.mir
index 05bd4174dc20f..9c6fe3aa708a6 100644
--- a/llvm/test/CodeGen/MIR/X86/instructions-debug-location.mir
+++ b/llvm/test/CodeGen/MIR/X86/instructions-debug-location.mir
@@ -99,12 +99,12 @@ body: |
     liveins: $edi
 
     %0 = COPY $edi
-  ; CHECK:      DBG_VALUE $noreg, i32 0, !DIExpression(), !12
-  ; CHECK-NEXT: DBG_VALUE $noreg, i64 -22, !DIExpression(), !12
-  ; CHECK-NEXT: DBG_VALUE $noreg, i128 123492148938512984928424384934328985928, !DIExpression(), !12
-    DBG_VALUE _, i32 0, !DIExpression(), !13
-    DBG_VALUE _, i64 -22, !DIExpression(), !13
-    DBG_VALUE _, i128 123492148938512984928424384934328985928, !DIExpression(), !13
+  ; CHECK:      DBG_VALUE $noreg, i32 0, !11, !DIExpression()
+  ; CHECK-NEXT: DBG_VALUE $noreg, i64 -22, !11, !DIExpression()
+  ; CHECK-NEXT: DBG_VALUE $noreg, i128 123492148938512984928424384934328985928, !11, !DIExpression()
+    DBG_VALUE _, i32 0, !12, !DIExpression(), debug-location !13
+    DBG_VALUE _, i64 -22, !12, !DIExpression(), debug-location !13
+    DBG_VALUE _, i128 123492148938512984928424384934328985928, !12, !DIExpression(), debug-location !13
     MOV32mr %stack.0.x.addr, 1, _, 0, _, %0
     $eax = COPY %0
     RETQ $eax
diff --git a/llvm/test/CodeGen/MIR/X86/metadata-operands.mir b/llvm/test/CodeGen/MIR/X86/metadata-operands.mir
index 7dcb5f9b810a4..5375485d320ce 100644
--- a/llvm/test/CodeGen/MIR/X86/metadata-operands.mir
+++ b/llvm/test/CodeGen/MIR/X86/metadata-operands.mir
@@ -53,7 +53,7 @@ body: |
     ; CHECK:      %0:gr32 = COPY $edi
     ; CHECK-NEXT: DBG_VALUE $noreg, 0, !11, !DIExpression()
     %0 = COPY $edi
-    DBG_VALUE _, 0, !12, !DIExpression()
+    DBG_VALUE _, 0, !12, !DIExpression(), debug-location !13
     MOV32mr %stack.0.x.addr, 1, _, 0, _, %0
     $eax = COPY %0
     RETQ $eax
diff --git a/llvm/test/CodeGen/RISCV/select-optimize-multiple.mir b/llvm/test/CodeGen/RISCV/select-optimize-multiple.mir
index d93758344f992..d7575b87b55c9 100644
--- a/llvm/test/CodeGen/RISCV/select-optimize-multiple.mir
+++ b/llvm/test/CodeGen/RISCV/select-optimize-multiple.mir
@@ -13,7 +13,6 @@
   define void @cmov_interleaved_debug_value() {
     ret void
   }
-  !1 = !DIExpression()
 ...
 ---
 # Here we have a sequence of select instructions with a non-select instruction
@@ -139,14 +138,14 @@ body:             |
     ; RV32I: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1
     ; RV32I: [[COPY4:%[0-9]+]]:gpr = COPY $x0
     ; RV32I: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY3]], 1
-    ; RV32I: DBG_VALUE [[ADDI]], $noreg, !DIExpression(), !DIExpression()
+    ; RV32I: DBG_VALUE [[ADDI]], $noreg
     ; RV32I: BNE [[ANDI]], [[COPY4]], %bb.2
     ; RV32I: .1:
     ; RV32I: .2:
     ; RV32I: [[PHI:%[0-9]+]]:gpr = PHI [[COPY2]], %bb.0, [[COPY1]], %bb.1
     ; RV32I: [[PHI1:%[0-9]+]]:gpr = PHI [[COPY]], %bb.0, [[COPY1]], %bb.1
-    ; RV32I: DBG_VALUE [[PHI]], $noreg, !DIExpression(), !DIExpression()
-    ; RV32I: DBG_VALUE [[PHI1]], $noreg, !DIExpression(), !DIExpression()
+    ; RV32I: DBG_VALUE [[PHI]], $noreg
+    ; RV32I: DBG_VALUE [[PHI1]], $noreg
     ; RV32I: [[ADD:%[0-9]+]]:gpr = ADD [[PHI]], killed [[PHI1]]
     ; RV32I: $x10 = COPY [[ADD]]
     ; RV32I: PseudoRET implicit $x10
@@ -160,14 +159,14 @@ body:             |
     ; RV64I: [[ANDI:%[0-9]+]]:gpr = ANDI [[COPY3]], 1
     ; RV64I: [[COPY4:%[0-9]+]]:gpr = COPY $x0
     ; RV64I: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY3]], 1
-    ; RV64I: DBG_VALUE [[ADDI]], $noreg, !DIExpression(), !DIExpression()
+    ; RV64I: DBG_VALUE [[ADDI]], $noreg
     ; RV64I: BNE [[ANDI]], [[COPY4]], %bb.2
     ; RV64I: .1:
     ; RV64I: .2:
     ; RV64I: [[PHI:%[0-9]+]]:gpr = PHI [[COPY2]], %bb.0, [[COPY1]], %bb.1
     ; RV64I: [[PHI1:%[0-9]+]]:gpr = PHI [[COPY]], %bb.0, [[COPY1]], %bb.1
-    ; RV64I: DBG_VALUE [[PHI]], $noreg, !DIExpression(), !DIExpression()
-    ; RV64I: DBG_VALUE [[PHI1]], $noreg, !DIExpression(), !DIExpression()
+    ; RV64I: DBG_VALUE [[PHI]], $noreg
+    ; RV64I: DBG_VALUE [[PHI1]], $noreg
     ; RV64I: [[ADD:%[0-9]+]]:gpr = ADD [[PHI]], killed [[PHI1]]
     ; RV64I: $x10 = COPY [[ADD]]
     ; RV64I: PseudoRET implicit $x10
@@ -178,11 +177,11 @@ body:             |
     %5:gpr = ANDI %0, 1
     %6:gpr = COPY $x0
     %7:gpr = Select_GPR_Using_CC_GPR %5, %6, 22, %1, %2
-    DBG_VALUE %7, $noreg, !1, !1
+    DBG_VALUE %7, $noreg
     %8:gpr = ADDI %0, 1
-    DBG_VALUE %8, $noreg, !1, !1
+    DBG_VALUE %8, $noreg
     %9:gpr = Select_GPR_Using_CC_GPR %5, %6, 22, %3, %2
-    DBG_VALUE %9, $noreg, !1, !1
+    DBG_VALUE %9, $noreg
     %10:gpr = ADD %7, killed %9
     $x10 = COPY %10
     PseudoRET implicit $x10
diff --git a/llvm/test/CodeGen/X86/machine-cp-debug.mir b/llvm/test/CodeGen/X86/machine-cp-debug.mir
index a3230e8910cb3..e998d324e332a 100644
--- a/llvm/test/CodeGen/X86/machine-cp-debug.mir
+++ b/llvm/test/CodeGen/X86/machine-cp-debug.mir
@@ -9,7 +9,6 @@
   define void @fred() {
     ret void
   }
-  !1 = !DIExpression()
 ...
 
 ---
@@ -19,5 +18,5 @@ body: |
   bb.0:
     liveins: $eax
     $ebx = COPY $eax
-    DBG_VALUE $ebx, _, !1, !1
+    DBG_VALUE $ebx, _
 ...
diff --git a/llvm/test/CodeGen/X86/pr38952.mir b/llvm/test/CodeGen/X86/pr38952.mir
index ba2ffa8fefc29..f085f6601396f 100644
--- a/llvm/test/CodeGen/X86/pr38952.mir
+++ b/llvm/test/CodeGen/X86/pr38952.mir
@@ -72,10 +72,10 @@ body:             |
   ; Test that the DBG_VALUE on ebx below is sunk with the def of ebx, despite
   ; not being adjacent to the def, see PR38952
 
-    DBG_VALUE $edi, $noreg, !21, !DIExpression()
+    DBG_VALUE $edi, $noreg
     renamable $ebx = COPY $edi
     renamable $eax = MOV32r0 implicit-def dead $eflags
-    DBG_VALUE $ebx, $noreg, !21, !DIExpression()
+    DBG_VALUE $ebx, $noreg
     CMP32ri $edi, 255, implicit-def $eflags
     JCC_1 %bb.2, 15, implicit killed $eflags
     JMP_1 %bb.1

From d11155d273af00f75c2b40a5ca3007463f9808c1 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Wed, 27 May 2020 13:22:10 -0700
Subject: [PATCH 423/770] [LiveDebugValues] Add cutoffs to avoid pathological
 behavior

Summary:
We received a report of LiveDebugValues consuming 25GB+ of RAM when
compiling code generated by Unity's IL2CPP scripting backend.

There's an initial 5GB spike due to repeatedly copying cached lists of
MachineBasicBlocks within the UserValueScopes members of VarLocs.

But the larger scaling issue arises due to the fact that prior to range
extension, there are 81K basic blocks and 156K DBG_VALUEs: given enough
memory, LiveDebugValues would insert 101 million MIs (I counted this by
incrementing a counter inside of VarLoc::BuildDbgValue).

It seems like LiveDebugValues would have to be rearchitected to support
this kind of input (we'd need some new representation for DBG_VALUEs that
get inserted into ~every block via flushPendingLocs). OTOH, large globs
of auto-generated code are typically not debugged interactively.

So: add cutoffs to disable range extension when the input is too big. I
chose the cutoffs experimentally, erring on the conservative side. When
compiling a large collection of Apple software, range extension never
got disabled.

rdar://63418929

Reviewers: aprantl, friss, jmorse, Orlando

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80662
---
 llvm/lib/CodeGen/LiveDebugValues.cpp          | 28 ++++++
 .../MIR/X86/live-debug-values-cutoffs.mir     | 99 +++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir

diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp
index 00a6149a05404..2d11a23e9ede4 100644
--- a/llvm/lib/CodeGen/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -81,6 +81,18 @@ using namespace llvm;
 STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
 STATISTIC(NumRemoved, "Number of DBG_VALUE instructions removed");
 
+// Options to prevent pathological compile-time behavior. If InputBBLimit and
+// InputDbgValueLimit are both exceeded, range extension is disabled.
+static cl::opt<unsigned> InputBBLimit(
+    "livedebugvalues-input-bb-limit",
+    cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"),
+    cl::init(10000), cl::Hidden);
+static cl::opt<unsigned> InputDbgValueLimit(
+    "livedebugvalues-input-dbg-value-limit",
+    cl::desc(
+        "Maximum input DBG_VALUE insts supported by debug range extension"),
+    cl::init(50000), cl::Hidden);
+
 // If @MI is a DBG_VALUE with debug value described by a defined
 // register, returns the number of this register. In the other case, returns 0.
 static Register isDbgValueDescribedByReg(const MachineInstr &MI) {
@@ -1753,6 +1765,22 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
     Worklist.push(RPONumber);
     ++RPONumber;
   }
+
+  if (RPONumber > InputBBLimit) {
+    unsigned NumInputDbgValues = 0;
+    for (auto &MBB : MF)
+      for (auto &MI : MBB)
+        if (MI.isDebugValue())
+          ++NumInputDbgValues;
+    if (NumInputDbgValues > InputDbgValueLimit) {
+      LLVM_DEBUG(dbgs() << "Disabling LiveDebugValues: " << MF.getName()
+                        << " has " << RPONumber << " basic blocks and "
+                        << NumInputDbgValues
+                        << " input DBG_VALUEs, exceeding limits.\n");
+      return false;
+    }
+  }
+
   // This is a standard "union of predecessor outs" dataflow problem.
   // To solve it, we perform join() and process() using the two worklist method
   // until the ranges converge.
diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir
new file mode 100644
index 0000000000000..f14b746d29710
--- /dev/null
+++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir
@@ -0,0 +1,99 @@
+# Test cutoffs for livedebugvalues debug range extension.
+# Disable LDV if the input-bb-limit AND the input-dbg-value-limit are both exceeded.
+
+# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \
+# RUN:   -livedebugvalues-input-bb-limit=1 \
+# RUN:   -livedebugvalues-input-dbg-value-limit=1 \
+# RUN:   | FileCheck %s -check-prefix=LDV-DISABLED
+
+# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \
+# RUN:   -livedebugvalues-input-bb-limit=1 \
+# RUN:   -livedebugvalues-input-dbg-value-limit=10 \
+# RUN:   | FileCheck %s -check-prefix=LDV-ENABLED
+
+# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \
+# RUN:   -livedebugvalues-input-bb-limit=10 \
+# RUN:   -livedebugvalues-input-dbg-value-limit=1 \
+# RUN:   | FileCheck %s -check-prefix=LDV-ENABLED
+
+# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \
+# RUN:   -livedebugvalues-input-bb-limit=10 \
+# RUN:   -livedebugvalues-input-dbg-value-limit=10 \
+# RUN:   | FileCheck %s -check-prefix=LDV-ENABLED
+
+# LDV-DISABLED-LABEL: bb.1.exit
+# LDV-DISABLED-NEXT: $edi = MOV32rm
+
+# LDV-ENABLED-LABEL: bb.1.exit
+# LDV-ENABLED-NEXT: DBG_VALUE $rsp, 0, {{.*}}, !DIExpression(DW_OP_plus_uconst, 4)
+# LDV-ENABLED-NEXT: $edi = MOV32rm
+
+--- |
+  target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+  declare i32 @use(i32)
+
+  define i32 @foo(i32 %x) !dbg !6 {
+  entry:
+    %y = add i32 %x, %x, !dbg !12
+    call void @llvm.dbg.value(metadata i32 %y, metadata !9, metadata !DIExpression()), !dbg !12
+    br label %exit, !dbg !13
+
+  exit:                                             ; preds = %entry
+    %z = call i32 @use(i32 %y), !dbg !14
+    call void @llvm.dbg.value(metadata i32 %z, metadata !11, metadata !DIExpression()), !dbg !14
+    ret i32 %z, !dbg !15
+  }
+
+  declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.debugify = !{!3, !4}
+  !llvm.module.flags = !{!5}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "/tmp/t.ll", directory: "/")
+  !2 = !{}
+  !3 = !{i32 4}
+  !4 = !{i32 2}
+  !5 = !{i32 2, !"Debug Info Version", i32 3}
+  !6 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !7, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+  !7 = !DISubroutineType(types: !2)
+  !8 = !{!9, !11}
+  !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+  !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+  !11 = !DILocalVariable(name: "2", scope: !6, file: !1, line: 3, type: !10)
+  !12 = !DILocation(line: 1, column: 1, scope: !6)
+  !13 = !DILocation(line: 2, column: 1, scope: !6)
+  !14 = !DILocation(line: 3, column: 1, scope: !6)
+  !15 = !DILocation(line: 4, column: 1, scope: !6)
+
+...
+---
+name:            foo
+liveins:
+  - { reg: '$edi', virtual-reg: '' }
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $edi
+
+    frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    renamable $edi = ADD32rr renamable $edi, killed renamable $edi, implicit-def $eflags, debug-location !12
+    DBG_VALUE renamable $edi, $noreg, !9, !DIExpression(), debug-location !12
+    MOV32mr $rsp, 1, $noreg, 4, $noreg, killed $edi :: (store 4 into %stack.0)
+    DBG_VALUE $rsp, 0, !9, !DIExpression(DW_OP_plus_uconst, 4), debug-location !12
+
+  bb.1.exit:
+    $edi = MOV32rm $rsp, 1, $noreg, 4, $noreg :: (load 4 from %stack.0)
+    CALL64pcrel32 @use, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit-def $eax, debug-location !14
+    DBG_VALUE renamable $eax, $noreg, !11, !DIExpression(), debug-location !14
+    $rcx = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !15
+    RETQ implicit killed $eax, debug-location !15
+
+...

From e8e7b2cb46bb16a8939567d770c3f69df35e7bdc Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 28 May 2020 17:49:01 +0100
Subject: [PATCH 424/770] [ARM] More tests for MVE LSR and float issues. NFC

---
 .../CodeGen/Thumb2/mve-float32regloops.ll     | 630 ++++++++++++++++++
 1 file changed, 630 insertions(+)

diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 111a5871a17b8..45cb9fc5b4bcd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1395,6 +1395,636 @@ if.end:                                           ; preds = %while.end, %if.then
   ret void
 }
 
+%struct.arm_biquad_cascade_stereo_df2T_instance_f32 = type { i8, float*, float* }
+define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) {
+; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    ldrb.w lr, [r0]
+; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    ldrd r12, r0, [r0, #4]
+; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    strd r4, r4, [sp, #16]
+; CHECK-NEXT:    beq .LBB17_5
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:    movs r5, #2
+; CHECK-NEXT:    viwdup.u32 q0, r4, r5, #1
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:  .LBB17_2: @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB17_3 Depth 2
+; CHECK-NEXT:    mov r7, lr
+; CHECK-NEXT:    ldr.w lr, [r0, #12]
+; CHECK-NEXT:    ldrd r5, r6, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r12]
+; CHECK-NEXT:    vldr s12, [r0, #8]
+; CHECK-NEXT:    vdup.32 q2, lr
+; CHECK-NEXT:    vldr s14, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q1, [r4]
+; CHECK-NEXT:    vdup.32 q1, r6
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vmov.f32 s7, s12
+; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:  .LBB17_3: @ Parent Loop BB17_2 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vldrw.u32 q4, [r1, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q5, [r4, q0, uxtw #2]
+; CHECK-NEXT:    adds r1, #8
+; CHECK-NEXT:    vfma.f32 q5, q4, r5
+; CHECK-NEXT:    vstmia r6, {s20, s21}
+; CHECK-NEXT:    adds r6, #8
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #8]
+; CHECK-NEXT:    vfma.f32 q3, q5, q2
+; CHECK-NEXT:    vfma.f32 q3, q4, q1
+; CHECK-NEXT:    vstrw.32 q3, [r4]
+; CHECK-NEXT:    le lr, .LBB17_3
+; CHECK-NEXT:  @ %bb.4: @ in Loop: Header=BB17_2 Depth=1
+; CHECK-NEXT:    mov lr, r7
+; CHECK-NEXT:    adds r0, #20
+; CHECK-NEXT:    subs.w lr, r7, #1
+; CHECK-NEXT:    vstrb.8 q3, [r12], #16
+; CHECK-NEXT:    mov r1, r2
+; CHECK-NEXT:    bne .LBB17_2
+; CHECK-NEXT:    b .LBB17_7
+; CHECK-NEXT:  .LBB17_5: @ %.preheader
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB17_6: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q0, [r12], #16
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    le lr, .LBB17_6
+; CHECK-NEXT:  .LBB17_7:
+; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+  %5 = alloca [6 x float], align 4
+  %6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1
+  %7 = load float*, float** %6, align 4
+  %8 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 2
+  %9 = load float*, float** %8, align 4
+  %10 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 0
+  %11 = load i8, i8* %10, align 4
+  %12 = zext i8 %11 to i32
+  %13 = bitcast [6 x float]* %5 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %13) #5
+  %14 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 0, i32 2, i32 1)
+  %15 = extractvalue { <4 x i32>, i32 } %14, 0
+  %16 = getelementptr inbounds [6 x float], [6 x float]* %5, i32 0, i32 4
+  store float 0.000000e+00, float* %16, align 4
+  %17 = getelementptr inbounds [6 x float], [6 x float]* %5, i32 0, i32 5
+  store float 0.000000e+00, float* %17, align 4
+  %18 = bitcast [6 x float]* %5 to <4 x float>*
+  %19 = icmp eq i32 %3, 0
+  %20 = bitcast [6 x float]* %5 to i32*
+  %21 = getelementptr inbounds [6 x float], [6 x float]* %5, i32 0, i32 2
+  %22 = bitcast float* %21 to <4 x float>*
+  br i1 %19, label %23, label %31
+
+23:                                               ; preds = %4, %23
+  %24 = phi i32 [ %29, %23 ], [ %12, %4 ]
+  %25 = phi float* [ %28, %23 ], [ %7, %4 ]
+  %26 = bitcast float* %25 to <4 x float>*
+  %27 = load <4 x float>, <4 x float>* %26, align 8
+  store <4 x float> %27, <4 x float>* %18, align 4
+  %28 = getelementptr inbounds float, float* %25, i32 4
+  %29 = add i32 %24, -1
+  %30 = icmp eq i32 %29, 0
+  br i1 %30, label %82, label %23
+
+31:                                               ; preds = %4, %77
+  %32 = phi i32 [ %80, %77 ], [ %12, %4 ]
+  %33 = phi float* [ %78, %77 ], [ %9, %4 ]
+  %34 = phi float* [ %79, %77 ], [ %7, %4 ]
+  %35 = phi float* [ %2, %77 ], [ %1, %4 ]
+  %36 = getelementptr inbounds float, float* %33, i32 1
+  %37 = load float, float* %33, align 4
+  %38 = getelementptr inbounds float, float* %33, i32 2
+  %39 = load float, float* %36, align 4
+  %40 = getelementptr inbounds float, float* %33, i32 3
+  %41 = load float, float* %38, align 4
+  %42 = getelementptr inbounds float, float* %33, i32 4
+  %43 = load float, float* %40, align 4
+  %44 = load float, float* %42, align 4
+  %45 = insertelement <4 x float> undef, float %43, i32 0
+  %46 = shufflevector <4 x float> %45, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+  %47 = insertelement <4 x float> %46, float %44, i32 2
+  %48 = insertelement <4 x float> %47, float %44, i32 3
+  %49 = insertelement <4 x float> undef, float %39, i32 0
+  %50 = shufflevector <4 x float> %49, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+  %51 = insertelement <4 x float> %50, float %41, i32 2
+  %52 = insertelement <4 x float> %51, float %41, i32 3
+  %53 = bitcast float* %34 to <4 x float>*
+  %54 = load <4 x float>, <4 x float>* %53, align 8
+  store <4 x float> %54, <4 x float>* %18, align 4
+  %55 = insertelement <4 x float> undef, float %37, i32 0
+  %56 = shufflevector <4 x float> %55, <4 x float> undef, <4 x i32> zeroinitializer
+  br label %57
+
+57:                                               ; preds = %31, %57
+  %58 = phi float* [ %35, %31 ], [ %74, %57 ]
+  %59 = phi float* [ %2, %31 ], [ %70, %57 ]
+  %60 = phi i32 [ %3, %31 ], [ %75, %57 ]
+  %61 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* nonnull %20, <4 x i32> %15, i32 32, i32 2, i32 1)
+  %62 = bitcast <4 x i32> %61 to <4 x float>
+  %63 = bitcast float* %58 to i32*
+  %64 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %63, <4 x i32> %15, i32 32, i32 2, i32 1)
+  %65 = bitcast <4 x i32> %64 to <4 x float>
+  %66 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %65, <4 x float> %56, <4 x float> %62)
+  %67 = extractelement <4 x float> %66, i32 0
+  %68 = getelementptr inbounds float, float* %59, i32 1
+  store float %67, float* %59, align 4
+  %69 = extractelement <4 x float> %66, i32 1
+  %70 = getelementptr inbounds float, float* %59, i32 2
+  store float %69, float* %68, align 4
+  %71 = load <4 x float>, <4 x float>* %22, align 4
+  %72 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %66, <4 x float> %48, <4 x float> %71)
+  %73 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %65, <4 x float> %52, <4 x float> %72)
+  store <4 x float> %73, <4 x float>* %18, align 4
+  %74 = getelementptr inbounds float, float* %58, i32 2
+  %75 = add i32 %60, -1
+  %76 = icmp eq i32 %75, 0
+  br i1 %76, label %77, label %57
+
+77:                                               ; preds = %57
+  %78 = getelementptr inbounds float, float* %33, i32 5
+  store <4 x float> %73, <4 x float>* %53, align 4
+  %79 = getelementptr inbounds float, float* %34, i32 4
+  %80 = add i32 %32, -1
+  %81 = icmp eq i32 %80, 0
+  br i1 %81, label %82, label %31
+
+82:                                               ; preds = %77, %23
+  call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %13) #5
+  ret void
+}
+
+define arm_aapcs_vfpcc void @fms(float* nocapture readonly %pSrc1, float* nocapture readonly %pSrc2, float* nocapture readonly %pSrc3, float* nocapture %pDst, i32 %N, i32 %M) {
+; CHECK-LABEL: fms:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    ldr.w lr, [sp, #16]
+; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    cmp.w r4, lr, lsr #2
+; CHECK-NEXT:    beq .LBB18_5
+; CHECK-NEXT:  @ %bb.1: @ %do.body.preheader
+; CHECK-NEXT:    ldr.w r12, [sp, #20]
+; CHECK-NEXT:    lsr.w r5, lr, #2
+; CHECK-NEXT:  .LBB18_2: @ %do.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB18_3 Depth 2
+; CHECK-NEXT:    ldr r4, [r2]
+; CHECK-NEXT:    dls lr, r5
+; CHECK-NEXT:    vdup.32 q0, r4
+; CHECK-NEXT:  .LBB18_3: @ %while.body
+; CHECK-NEXT:    @ Parent Loop BB18_2 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
+; CHECK-NEXT:    vfms.f32 q2, q0, q1
+; CHECK-NEXT:    vstrb.8 q2, [r3], #16
+; CHECK-NEXT:    le lr, .LBB18_3
+; CHECK-NEXT:  @ %bb.4: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB18_2 Depth=1
+; CHECK-NEXT:    subs.w r12, r12, #1
+; CHECK-NEXT:    add.w r2, r2, #4
+; CHECK-NEXT:    bne .LBB18_2
+; CHECK-NEXT:  .LBB18_5: @ %do.end
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %shr = lshr i32 %N, 2
+  %cmp15 = icmp eq i32 %shr, 0
+  br i1 %cmp15, label %do.end, label %do.body
+
+do.body:                                          ; preds = %entry, %while.end
+  %pDst.addr.0 = phi float* [ %add.ptr2, %while.end ], [ %pDst, %entry ]
+  %M.addr.0 = phi i32 [ %dec3, %while.end ], [ %M, %entry ]
+  %pSrc3.addr.0 = phi float* [ %incdec.ptr, %while.end ], [ %pSrc3, %entry ]
+  %pSrc2.addr.0 = phi float* [ %add.ptr1, %while.end ], [ %pSrc2, %entry ]
+  %pSrc1.addr.0 = phi float* [ %add.ptr, %while.end ], [ %pSrc1, %entry ]
+  %0 = load float, float* %pSrc3.addr.0, align 4
+  %.splatinsert = insertelement <4 x float> undef, float %0, i32 0
+  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  br label %while.body
+
+while.body:                                       ; preds = %do.body, %while.body
+  %pSrc1.addr.119 = phi float* [ %pSrc1.addr.0, %do.body ], [ %add.ptr, %while.body ]
+  %pSrc2.addr.118 = phi float* [ %pSrc2.addr.0, %do.body ], [ %add.ptr1, %while.body ]
+  %blkCnt.017 = phi i32 [ %shr, %do.body ], [ %dec, %while.body ]
+  %pDst.addr.116 = phi float* [ %pDst.addr.0, %do.body ], [ %add.ptr2, %while.body ]
+  %1 = bitcast float* %pSrc1.addr.119 to <4 x float>*
+  %2 = load <4 x float>, <4 x float>* %1, align 4
+  %3 = bitcast float* %pSrc2.addr.118 to <4 x float>*
+  %4 = load <4 x float>, <4 x float>* %3, align 4
+  %5 = fneg fast <4 x float> %4
+  %6 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %.splat, <4 x float> %5, <4 x float> %2)
+  %7 = bitcast float* %pDst.addr.116 to <4 x float>*
+  store <4 x float> %6, <4 x float>* %7, align 4
+  %add.ptr = getelementptr inbounds float, float* %pSrc1.addr.119, i32 4
+  %add.ptr1 = getelementptr inbounds float, float* %pSrc2.addr.118, i32 4
+  %add.ptr2 = getelementptr inbounds float, float* %pDst.addr.116, i32 4
+  %dec = add nsw i32 %blkCnt.017, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body
+  %incdec.ptr = getelementptr inbounds float, float* %pSrc3.addr.0, i32 1
+  %dec3 = add i32 %M.addr.0, -1
+  %cmp4 = icmp eq i32 %dec3, 0
+  br i1 %cmp4, label %do.end, label %do.body
+
+do.end:                                           ; preds = %while.end, %entry
+  ret void
+}
+
+
+%struct.arm_biquad_casd_df1_inst_f32 = type { i32, float*, float* }
+define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_df1_inst_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) {
+; CHECK-LABEL: arm_biquad_cascade_df1_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #88
+; CHECK-NEXT:    sub sp, #88
+; CHECK-NEXT:    ldrd r12, r10, [r0]
+; CHECK-NEXT:    @ implicit-def: $s2
+; CHECK-NEXT:    and r7, r3, #3
+; CHECK-NEXT:    ldr.w r11, [r0, #8]
+; CHECK-NEXT:    lsrs r0, r3, #2
+; CHECK-NEXT:    str r0, [sp, #60] @ 4-byte Spill
+; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    b .LBB19_3
+; CHECK-NEXT:  .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT:    vmov.f32 s0, s10
+; CHECK-NEXT:    vmov.f32 s7, s6
+; CHECK-NEXT:  .LBB19_2: @ %if.end69
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vstr s8, [r10]
+; CHECK-NEXT:    subs.w r12, r12, #1
+; CHECK-NEXT:    vstr s0, [r10, #4]
+; CHECK-NEXT:    add.w r11, r11, #128
+; CHECK-NEXT:    vstr s14, [r10, #8]
+; CHECK-NEXT:    mov r1, r2
+; CHECK-NEXT:    vstr s7, [r10, #12]
+; CHECK-NEXT:    add.w r10, r10, #16
+; CHECK-NEXT:    beq.w .LBB19_13
+; CHECK-NEXT:  .LBB19_3: @ %do.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB19_5 Depth 2
+; CHECK-NEXT:    vldr s7, [r10, #8]
+; CHECK-NEXT:    mov r5, r2
+; CHECK-NEXT:    ldr r0, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT:    vldr s8, [r10]
+; CHECK-NEXT:    vldr s10, [r10, #4]
+; CHECK-NEXT:    vldr s6, [r10, #12]
+; CHECK-NEXT:    wls lr, r0, .LBB19_6
+; CHECK-NEXT:  @ %bb.4: @ %while.body.lr.ph
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    ldrd r5, lr, [sp, #56] @ 8-byte Folded Reload
+; CHECK-NEXT:  .LBB19_5: @ %while.body
+; CHECK-NEXT:    @ Parent Loop BB19_3 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    vldr s8, [r1, #12]
+; CHECK-NEXT:    vldrw.u32 q0, [r11, #112]
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vldr s10, [r1, #8]
+; CHECK-NEXT:    vmov r7, s7
+; CHECK-NEXT:    vmov r9, s6
+; CHECK-NEXT:    vldrw.u32 q1, [r11]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov r8, s8
+; CHECK-NEXT:    vldrw.u32 q0, [r11, #16]
+; CHECK-NEXT:    ldr r6, [r1, #4]
+; CHECK-NEXT:    vldrw.u32 q7, [r11, #32]
+; CHECK-NEXT:    vmul.f32 q1, q1, r8
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vldrw.u32 q3, [r11, #48]
+; CHECK-NEXT:    vfma.f32 q1, q0, r3
+; CHECK-NEXT:    ldr r3, [r1]
+; CHECK-NEXT:    vfma.f32 q1, q7, r6
+; CHECK-NEXT:    vldrw.u32 q6, [r11, #64]
+; CHECK-NEXT:    vfma.f32 q1, q3, r3
+; CHECK-NEXT:    vldrw.u32 q5, [r11, #80]
+; CHECK-NEXT:    vfma.f32 q1, q6, r4
+; CHECK-NEXT:    vldrw.u32 q4, [r11, #96]
+; CHECK-NEXT:    vfma.f32 q1, q5, r0
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f32 q1, q4, r7
+; CHECK-NEXT:    adds r1, #16
+; CHECK-NEXT:    vfma.f32 q1, q0, r9
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vstrb.8 q1, [r5], #16
+; CHECK-NEXT:    le lr, .LBB19_5
+; CHECK-NEXT:  .LBB19_6: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    cmp r7, #0
+; CHECK-NEXT:    beq .LBB19_1
+; CHECK-NEXT:  @ %bb.7: @ %if.then
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vldr s24, [r1]
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vldr s0, [r1, #4]
+; CHECK-NEXT:    vldrw.u32 q3, [r11]
+; CHECK-NEXT:    vldr s3, [r1, #12]
+; CHECK-NEXT:    vldrw.u32 q4, [r11, #32]
+; CHECK-NEXT:    vldr s1, [r1, #8]
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vldrw.u32 q2, [r11, #96]
+; CHECK-NEXT:    vmov r6, s3
+; CHECK-NEXT:    vmul.f32 q3, q3, r6
+; CHECK-NEXT:    vmov r6, s1
+; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r11, #112]
+; CHECK-NEXT:    vldrw.u32 q5, [r11, #48]
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r11, #80]
+; CHECK-NEXT:    vldrw.u32 q7, [r11, #64]
+; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [r11, #16]
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    cmp r7, #1
+; CHECK-NEXT:    vfma.f32 q3, q2, r6
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f32 q3, q4, r4
+; CHECK-NEXT:    vmov lr, s6
+; CHECK-NEXT:    vfma.f32 q3, q5, r3
+; CHECK-NEXT:    vfma.f32 q3, q7, r0
+; CHECK-NEXT:    vfma.f32 q3, q2, r1
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f32 q3, q2, r2
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f32 q3, q2, lr
+; CHECK-NEXT:    bne .LBB19_9
+; CHECK-NEXT:  @ %bb.8: @ %if.then58
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vstr s12, [r5]
+; CHECK-NEXT:    vmov.f32 s8, s24
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s14, s12
+; CHECK-NEXT:    b .LBB19_11
+; CHECK-NEXT:  .LBB19_9: @ %if.else
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    cmp r7, #2
+; CHECK-NEXT:    vstmia r5, {s12, s13}
+; CHECK-NEXT:    bne .LBB19_12
+; CHECK-NEXT:  @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vmov.f32 s8, s0
+; CHECK-NEXT:    vmov.f32 s14, s13
+; CHECK-NEXT:    vmov.f32 s0, s24
+; CHECK-NEXT:    vmov.f32 s7, s12
+; CHECK-NEXT:  .LBB19_11: @ %if.end69
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT:    b .LBB19_2
+; CHECK-NEXT:  .LBB19_12: @ %if.else64
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    vmov.f32 s7, s13
+; CHECK-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vstr s14, [r5, #8]
+; CHECK-NEXT:    vmov.f32 s8, s1
+; CHECK-NEXT:    b .LBB19_2
+; CHECK-NEXT:  .LBB19_13: @ %do.end
+; CHECK-NEXT:    add sp, #88
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry:                                            ; loads the three fields of %S, derives loop counts from %blockSize
+  %pState1 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, %struct.arm_biquad_casd_df1_inst_f32* %S, i32 0, i32 1
+  %0 = load float*, float** %pState1, align 4
+  %pCoeffs2 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, %struct.arm_biquad_casd_df1_inst_f32* %S, i32 0, i32 2
+  %1 = load float*, float** %pCoeffs2, align 4
+  %numStages = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, %struct.arm_biquad_casd_df1_inst_f32* %S, i32 0, i32 0
+  %2 = load i32, i32* %numStages, align 4
+  %shr = lshr i32 %blockSize, 2 ; blockSize / 4: while.body trip count (4 floats per trip)
+  %cmp201 = icmp eq i32 %shr, 0
+  %and = and i32 %blockSize, 3 ; blockSize % 4: 0 skips if.then; 1 -> if.then58, 2 -> if.else, 3 -> if.else + if.else64
+  %tobool = icmp eq i32 %and, 0
+  %cmp57 = icmp eq i32 %and, 1
+  %cmp60 = icmp eq i32 %and, 2
+  br label %do.body
+
+do.body:                                          ; preds = %if.end69, %entry
+  %pState.0 = phi float* [ %0, %entry ], [ %incdec.ptr73, %if.end69 ]
+  %pCoeffs.0 = phi float* [ %1, %entry ], [ %add.ptr74, %if.end69 ]
+  %pIn.0 = phi float* [ %pSrc, %entry ], [ %pDst, %if.end69 ] ; first stage reads %pSrc, later stages read %pDst
+  %X3.0 = phi float [ undef, %entry ], [ %X3.2, %if.end69 ] ; undef on entry; with %shr == 0 it can reach %Xn2.1 via %X3.1.lcssa
+  %stage.0 = phi i32 [ %2, %entry ], [ %dec75, %if.end69 ]
+  %3 = load float, float* %pState.0, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %pState.0, i32 1
+  %4 = load float, float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %pState.0, i32 2
+  %5 = load float, float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float, float* %pState.0, i32 3
+  %6 = load float, float* %arrayidx5, align 4
+  br i1 %cmp201, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %do.body
+  %7 = bitcast float* %pCoeffs.0 to <4 x float>*
+  %arrayidx9 = getelementptr inbounds float, float* %pCoeffs.0, i32 4
+  %8 = bitcast float* %arrayidx9 to <4 x float>*
+  %arrayidx12 = getelementptr inbounds float, float* %pCoeffs.0, i32 8
+  %9 = bitcast float* %arrayidx12 to <4 x float>*
+  %arrayidx15 = getelementptr inbounds float, float* %pCoeffs.0, i32 12
+  %10 = bitcast float* %arrayidx15 to <4 x float>*
+  %arrayidx18 = getelementptr inbounds float, float* %pCoeffs.0, i32 16
+  %11 = bitcast float* %arrayidx18 to <4 x float>*
+  %arrayidx21 = getelementptr inbounds float, float* %pCoeffs.0, i32 20
+  %12 = bitcast float* %arrayidx21 to <4 x float>*
+  %arrayidx24 = getelementptr inbounds float, float* %pCoeffs.0, i32 24
+  %13 = bitcast float* %arrayidx24 to <4 x float>*
+  %arrayidx27 = getelementptr inbounds float, float* %pCoeffs.0, i32 28
+  %14 = bitcast float* %arrayidx27 to <4 x float>*
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.body
+  %sample.0208 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
+  %pIn.1207 = phi float* [ %pIn.0, %while.body.lr.ph ], [ %incdec.ptr8, %while.body ]
+  %pOut.1206 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+  %Yn2.0205 = phi float [ %6, %while.body.lr.ph ], [ %37, %while.body ]
+  %Yn1.0204 = phi float [ %5, %while.body.lr.ph ], [ %36, %while.body ]
+  %Xn2.0203 = phi float [ %4, %while.body.lr.ph ], [ %17, %while.body ]
+  %Xn1.0202 = phi float [ %3, %while.body.lr.ph ], [ %18, %while.body ]
+  %incdec.ptr = getelementptr inbounds float, float* %pIn.1207, i32 1
+  %15 = load float, float* %pIn.1207, align 4
+  %incdec.ptr6 = getelementptr inbounds float, float* %pIn.1207, i32 2
+  %16 = load float, float* %incdec.ptr, align 4
+  %incdec.ptr7 = getelementptr inbounds float, float* %pIn.1207, i32 3
+  %17 = load float, float* %incdec.ptr6, align 4
+  %incdec.ptr8 = getelementptr inbounds float, float* %pIn.1207, i32 4
+  %18 = load float, float* %incdec.ptr7, align 4
+  %19 = load <4 x float>, <4 x float>* %7, align 4
+  %.splatinsert = insertelement <4 x float> undef, float %18, i32 0
+  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+  %20 = fmul fast <4 x float> %.splat, %19
+  %21 = load <4 x float>, <4 x float>* %8, align 4
+  %.splatinsert10 = insertelement <4 x float> undef, float %17, i32 0
+  %.splat11 = shufflevector <4 x float> %.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
+  %22 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %21, <4 x float> %.splat11, <4 x float> %20)
+  %23 = load <4 x float>, <4 x float>* %9, align 4
+  %.splatinsert13 = insertelement <4 x float> undef, float %16, i32 0
+  %.splat14 = shufflevector <4 x float> %.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
+  %24 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %23, <4 x float> %.splat14, <4 x float> %22)
+  %25 = load <4 x float>, <4 x float>* %10, align 4
+  %.splatinsert16 = insertelement <4 x float> undef, float %15, i32 0
+  %.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
+  %26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %25, <4 x float> %.splat17, <4 x float> %24)
+  %27 = load <4 x float>, <4 x float>* %11, align 4
+  %.splatinsert19 = insertelement <4 x float> undef, float %Xn1.0202, i32 0
+  %.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
+  %28 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %27, <4 x float> %.splat20, <4 x float> %26)
+  %29 = load <4 x float>, <4 x float>* %12, align 4
+  %.splatinsert22 = insertelement <4 x float> undef, float %Xn2.0203, i32 0
+  %.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
+  %30 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %29, <4 x float> %.splat23, <4 x float> %28)
+  %31 = load <4 x float>, <4 x float>* %13, align 4
+  %.splatinsert25 = insertelement <4 x float> undef, float %Yn1.0204, i32 0
+  %.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
+  %32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %31, <4 x float> %.splat26, <4 x float> %30)
+  %33 = load <4 x float>, <4 x float>* %14, align 4
+  %.splatinsert28 = insertelement <4 x float> undef, float %Yn2.0205, i32 0
+  %.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
+  %34 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %33, <4 x float> %.splat29, <4 x float> %32)
+  %35 = bitcast float* %pOut.1206 to <4 x float>*
+  store <4 x float> %34, <4 x float>* %35, align 4
+  %add.ptr = getelementptr inbounds float, float* %pOut.1206, i32 4
+  %36 = extractelement <4 x float> %34, i32 3
+  %37 = extractelement <4 x float> %34, i32 2
+  %dec = add nsw i32 %sample.0208, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %do.body
+  %Xn1.0.lcssa = phi float [ %3, %do.body ], [ %18, %while.body ]
+  %Xn2.0.lcssa = phi float [ %4, %do.body ], [ %17, %while.body ]
+  %Yn1.0.lcssa = phi float [ %5, %do.body ], [ %36, %while.body ]
+  %Yn2.0.lcssa = phi float [ %6, %do.body ], [ %37, %while.body ]
+  %pOut.1.lcssa = phi float* [ %pDst, %do.body ], [ %add.ptr, %while.body ]
+  %pIn.1.lcssa = phi float* [ %pIn.0, %do.body ], [ %incdec.ptr8, %while.body ]
+  %X3.1.lcssa = phi float [ %X3.0, %do.body ], [ %18, %while.body ]
+  br i1 %tobool, label %if.end69, label %if.then
+
+if.then:                                          ; preds = %while.end
+  %incdec.ptr30 = getelementptr inbounds float, float* %pIn.1.lcssa, i32 1
+  %38 = load float, float* %pIn.1.lcssa, align 4
+  %incdec.ptr31 = getelementptr inbounds float, float* %pIn.1.lcssa, i32 2
+  %39 = load float, float* %incdec.ptr30, align 4
+  %incdec.ptr32 = getelementptr inbounds float, float* %pIn.1.lcssa, i32 3
+  %40 = load float, float* %incdec.ptr31, align 4
+  %41 = load float, float* %incdec.ptr32, align 4
+  %42 = bitcast float* %pCoeffs.0 to <4 x float>*
+  %43 = load <4 x float>, <4 x float>* %42, align 4
+  %.splatinsert34 = insertelement <4 x float> undef, float %41, i32 0
+  %.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
+  %44 = fmul fast <4 x float> %.splat35, %43
+  %arrayidx36 = getelementptr inbounds float, float* %pCoeffs.0, i32 4
+  %45 = bitcast float* %arrayidx36 to <4 x float>*
+  %46 = load <4 x float>, <4 x float>* %45, align 4
+  %.splatinsert37 = insertelement <4 x float> undef, float %40, i32 0
+  %.splat38 = shufflevector <4 x float> %.splatinsert37, <4 x float> undef, <4 x i32> zeroinitializer
+  %47 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %46, <4 x float> %.splat38, <4 x float> %44)
+  %arrayidx39 = getelementptr inbounds float, float* %pCoeffs.0, i32 8
+  %48 = bitcast float* %arrayidx39 to <4 x float>*
+  %49 = load <4 x float>, <4 x float>* %48, align 4
+  %.splatinsert40 = insertelement <4 x float> undef, float %39, i32 0
+  %.splat41 = shufflevector <4 x float> %.splatinsert40, <4 x float> undef, <4 x i32> zeroinitializer
+  %50 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %49, <4 x float> %.splat41, <4 x float> %47)
+  %arrayidx42 = getelementptr inbounds float, float* %pCoeffs.0, i32 12
+  %51 = bitcast float* %arrayidx42 to <4 x float>*
+  %52 = load <4 x float>, <4 x float>* %51, align 4
+  %.splatinsert43 = insertelement <4 x float> undef, float %38, i32 0
+  %.splat44 = shufflevector <4 x float> %.splatinsert43, <4 x float> undef, <4 x i32> zeroinitializer
+  %53 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %52, <4 x float> %.splat44, <4 x float> %50)
+  %arrayidx45 = getelementptr inbounds float, float* %pCoeffs.0, i32 16
+  %54 = bitcast float* %arrayidx45 to <4 x float>*
+  %55 = load <4 x float>, <4 x float>* %54, align 4
+  %.splatinsert46 = insertelement <4 x float> undef, float %Xn1.0.lcssa, i32 0
+  %.splat47 = shufflevector <4 x float> %.splatinsert46, <4 x float> undef, <4 x i32> zeroinitializer
+  %56 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %55, <4 x float> %.splat47, <4 x float> %53)
+  %arrayidx48 = getelementptr inbounds float, float* %pCoeffs.0, i32 20
+  %57 = bitcast float* %arrayidx48 to <4 x float>*
+  %58 = load <4 x float>, <4 x float>* %57, align 4
+  %.splatinsert49 = insertelement <4 x float> undef, float %Xn2.0.lcssa, i32 0
+  %.splat50 = shufflevector <4 x float> %.splatinsert49, <4 x float> undef, <4 x i32> zeroinitializer
+  %59 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %58, <4 x float> %.splat50, <4 x float> %56)
+  %arrayidx51 = getelementptr inbounds float, float* %pCoeffs.0, i32 24
+  %60 = bitcast float* %arrayidx51 to <4 x float>*
+  %61 = load <4 x float>, <4 x float>* %60, align 4
+  %.splatinsert52 = insertelement <4 x float> undef, float %Yn1.0.lcssa, i32 0
+  %.splat53 = shufflevector <4 x float> %.splatinsert52, <4 x float> undef, <4 x i32> zeroinitializer
+  %62 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %61, <4 x float> %.splat53, <4 x float> %59)
+  %arrayidx54 = getelementptr inbounds float, float* %pCoeffs.0, i32 28
+  %63 = bitcast float* %arrayidx54 to <4 x float>*
+  %64 = load <4 x float>, <4 x float>* %63, align 4
+  %.splatinsert55 = insertelement <4 x float> undef, float %Yn2.0.lcssa, i32 0
+  %.splat56 = shufflevector <4 x float> %.splatinsert55, <4 x float> undef, <4 x i32> zeroinitializer
+  %65 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %64, <4 x float> %.splat56, <4 x float> %62)
+  %66 = extractelement <4 x float> %65, i32 0
+  br i1 %cmp57, label %if.then58, label %if.else
+
+if.then58:                                        ; preds = %if.then
+  store float %66, float* %pOut.1.lcssa, align 4
+  br label %if.end69
+
+if.else:                                          ; preds = %if.then
+  %incdec.ptr62 = getelementptr inbounds float, float* %pOut.1.lcssa, i32 1
+  store float %66, float* %pOut.1.lcssa, align 4
+  %67 = extractelement <4 x float> %65, i32 1
+  store float %67, float* %incdec.ptr62, align 4
+  br i1 %cmp60, label %if.end69, label %if.else64
+
+if.else64:                                        ; preds = %if.else
+  %incdec.ptr63 = getelementptr inbounds float, float* %pOut.1.lcssa, i32 2
+  %68 = extractelement <4 x float> %65, i32 2
+  store float %68, float* %incdec.ptr63, align 4
+  br label %if.end69
+
+if.end69:                                         ; preds = %if.else, %while.end, %if.then58, %if.else64
+  %Xn1.1 = phi float [ %38, %if.then58 ], [ %40, %if.else64 ], [ %Xn1.0.lcssa, %while.end ], [ %39, %if.else ]
+  %Xn2.1 = phi float [ %X3.1.lcssa, %if.then58 ], [ %39, %if.else64 ], [ %Xn2.0.lcssa, %while.end ], [ %38, %if.else ]
+  %Yn1.1 = phi float [ %66, %if.then58 ], [ %68, %if.else64 ], [ %Yn1.0.lcssa, %while.end ], [ %67, %if.else ]
+  %Yn2.1 = phi float [ %Yn1.0.lcssa, %if.then58 ], [ %67, %if.else64 ], [ %Yn2.0.lcssa, %while.end ], [ %66, %if.else ]
+  %X3.2 = phi float [ %41, %if.then58 ], [ %41, %if.else64 ], [ %X3.1.lcssa, %while.end ], [ %41, %if.else ]
+  store float %Xn1.1, float* %pState.0, align 4
+  store float %Xn2.1, float* %arrayidx3, align 4
+  store float %Yn1.1, float* %arrayidx4, align 4
+  %incdec.ptr73 = getelementptr inbounds float, float* %pState.0, i32 4 ; 4 state floats per stage
+  store float %Yn2.1, float* %arrayidx5, align 4
+  %add.ptr74 = getelementptr inbounds float, float* %pCoeffs.0, i32 32 ; 32 coefficient floats (8 x <4 x float>) per stage
+  %dec75 = add i32 %stage.0, -1
+  %cmp76 = icmp eq i32 %dec75, 0
+  br i1 %cmp76, label %do.end, label %do.body
+
+do.end:                                           ; preds = %if.end69
+  ret void
+}
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
+declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32*, <4 x i32>, i32, i32, i32)
 declare void @llvm.assume(i1)
 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

From 8b3155829a99fecc9e62f84d95668d0e85a0dd62 Mon Sep 17 00:00:00 2001
From: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Date: Thu, 28 May 2020 13:50:32 -0700
Subject: [PATCH 425/770] [MLIR] Fix build when NVPTX is not enabled

In this case, neither target is selected, but there is still a dependence
on the MC library (through the TargetOptions.h include)
---
 mlir/lib/Conversion/GPUCommon/CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index 91c281614214b..2b85c237731b2 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -1,6 +1,5 @@
 if (MLIR_CUDA_CONVERSIONS_ENABLED)
   set(NVPTX_LIBS
-    MC
     NVPTXCodeGen
     NVPTXDesc
     NVPTXInfo
@@ -9,7 +8,6 @@ endif()
 
 if (MLIR_ROCM_CONVERSIONS_ENABLED)
   set(AMDGPU_LIBS
-    MC
     AMDGPUCodeGen
     AMDGPUDesc
     AMDGPUInfo
@@ -26,6 +24,7 @@ add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms
 
   LINK_COMPONENTS
   Core
+  MC
   ${AMDGPU_LIBS}
   ${NVPTX_LIBS}
 

From 3bff62d45f83ab2c480d82809cd91a32b4a6553d Mon Sep 17 00:00:00 2001
From: Marius Brehler <marius.brehler@iml.fraunhofer.de>
Date: Thu, 28 May 2020 12:34:44 -0700
Subject: [PATCH 426/770] [mlir] Extend standalone example by
 standalone-translate

Extend the standalone example with a standalone-translate tool, based on mlir-translate.

Differential Revision: https://reviews.llvm.org/D80737
---
 mlir/examples/standalone/CMakeLists.txt       |   1 +
 mlir/examples/standalone/README.md            |   2 +-
 .../standalone-translate/CMakeLists.txt       |  24 ++++
 .../standalone-translate.cpp                  | 114 ++++++++++++++++++
 mlir/examples/standalone/test/CMakeLists.txt  |  11 +-
 .../test/Standalone/standalone-translate.mlir |   8 ++
 mlir/examples/standalone/test/lit.cfg.py      |   5 +-
 mlir/test/Examples/standalone/test.toy        |   4 +-
 8 files changed, 159 insertions(+), 10 deletions(-)
 create mode 100644 mlir/examples/standalone/standalone-translate/CMakeLists.txt
 create mode 100644 mlir/examples/standalone/standalone-translate/standalone-translate.cpp
 create mode 100644 mlir/examples/standalone/test/Standalone/standalone-translate.mlir

diff --git a/mlir/examples/standalone/CMakeLists.txt b/mlir/examples/standalone/CMakeLists.txt
index fe309b501963a..9f30f70f949fd 100644
--- a/mlir/examples/standalone/CMakeLists.txt
+++ b/mlir/examples/standalone/CMakeLists.txt
@@ -44,3 +44,4 @@ add_subdirectory(include)
 add_subdirectory(lib)
 add_subdirectory(test)
 add_subdirectory(standalone-opt)
+add_subdirectory(standalone-translate)
diff --git a/mlir/examples/standalone/README.md b/mlir/examples/standalone/README.md
index dd2fad7a5d810..b1ca6275e598f 100644
--- a/mlir/examples/standalone/README.md
+++ b/mlir/examples/standalone/README.md
@@ -8,7 +8,7 @@ This setup assumes that you have built LLVM and MLIR in `$BUILD_DIR` and install
 ```sh
 mkdir build && cd build
 cmake -G Ninja .. -DMLIR_DIR=$PREFIX/lib/cmake/mlir -DLLVM_EXTERNAL_LIT=$BUILD_DIR/bin/llvm-lit
-cmake --build . --target check-standalone-opt
+cmake --build . --target check-standalone
 ```
 To build the documentation from the TableGen description of the dialect operations, run
 ```sh
diff --git a/mlir/examples/standalone/standalone-translate/CMakeLists.txt b/mlir/examples/standalone/standalone-translate/CMakeLists.txt
new file mode 100644
index 0000000000000..137f7947cfac7
--- /dev/null
+++ b/mlir/examples/standalone/standalone-translate/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(translation_libs GLOBAL PROPERTY MLIR_TRANSLATION_LIBS)
+
+add_llvm_executable(standalone-translate
+  standalone-translate.cpp
+  )
+llvm_update_compile_flags(standalone-translate)
+target_link_libraries(standalone-translate
+  PRIVATE
+  ${dialect_libs} # value of the MLIR_DIALECT_LIBS global property read above
+  ${translation_libs} # value of the MLIR_TRANSLATION_LIBS global property
+  MLIRIR
+  MLIRParser
+  MLIRPass
+  MLIRSPIRV # NOTE(review): may duplicate an entry of ${dialect_libs} - confirm
+  MLIRTranslation
+  MLIRSupport
+  )
+
+mlir_check_link_libraries(standalone-translate)
diff --git a/mlir/examples/standalone/standalone-translate/standalone-translate.cpp b/mlir/examples/standalone/standalone-translate/standalone-translate.cpp
new file mode 100644
index 0000000000000..c81e8dc14bc2d
--- /dev/null
+++ b/mlir/examples/standalone/standalone-translate/standalone-translate.cpp
@@ -0,0 +1,114 @@
+//===- standalone-translate.cpp ---------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a command line utility that translates a file from/to MLIR using one
+// of the registered translations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllTranslations.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Support/ToolUtilities.h"
+#include "mlir/Translation.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+#include "Standalone/StandaloneDialect.h"
+
+static llvm::cl::opt<std::string> inputFilename(llvm::cl::Positional,
+                                                llvm::cl::desc("<input file>"),
+                                                llvm::cl::init("-"));
+
+static llvm::cl::opt<std::string>
+    outputFilename("o", llvm::cl::desc("Output filename"),
+                   llvm::cl::value_desc("filename"), llvm::cl::init("-"));
+
+static llvm::cl::opt<bool>
+    splitInputFile("split-input-file",
+                   llvm::cl::desc("Split the input file into pieces and "
+                                  "process each chunk independently"),
+                   llvm::cl::init(false)); // main() uses splitAndProcessBuffer
+
+static llvm::cl::opt<bool> verifyDiagnostics(
+    "verify-diagnostics",
+    llvm::cl::desc("Check that emitted diagnostics match "
+                   "expected-* lines on the corresponding line"),
+    llvm::cl::init(false)); // main() then installs the verifier handler
+
+int main(int argc, char **argv) { // Exit code: 0 on success, 1 on any error.
+  mlir::registerAllDialects();
+  mlir::registerAllTranslations();
+
+  mlir::registerDialect<mlir::standalone::StandaloneDialect>();
+  // TODO: Register standalone translations here.
+
+  llvm::InitLLVM y(argc, argv);
+
+  // Add flags for all the registered translations; picking one is required.
+  llvm::cl::opt<const mlir::TranslateFunction *, false, mlir::TranslationParser>
+      translationRequested("", llvm::cl::desc("Translation to perform"),
+                           llvm::cl::Required);
+  mlir::registerAsmPrinterCLOptions();
+  mlir::registerMLIRContextCLOptions();
+  llvm::cl::ParseCommandLineOptions(argc, argv, "MLIR translation driver\n");
+
+  std::string errorMessage;
+  auto input = mlir::openInputFile(inputFilename, &errorMessage);
+  if (!input) {
+    llvm::errs() << errorMessage << "\n";
+    return 1;
+  }
+
+  auto output = mlir::openOutputFile(outputFilename, &errorMessage);
+  if (!output) {
+    llvm::errs() << errorMessage << "\n";
+    return 1;
+  }
+
+  // Runs the requested translation on one buffer with a new MLIRContext.
+  auto processBuffer = [&](std::unique_ptr<llvm::MemoryBuffer> ownedBuffer,
+                           llvm::raw_ostream &os) {
+    mlir::MLIRContext context;
+    context.allowUnregisteredDialects();
+    context.printOpOnDiagnostic(!verifyDiagnostics); // off in verify mode
+    llvm::SourceMgr sourceMgr;
+    sourceMgr.AddNewSourceBuffer(std::move(ownedBuffer), llvm::SMLoc());
+
+    if (!verifyDiagnostics) { // Normal flow: translation result is returned.
+      mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context);
+      return (*translationRequested)(sourceMgr, os, &context);
+    }
+
+    // In the diagnostic verification flow, we ignore whether the translation
+    // failed (in most cases, it is expected to fail). Instead, we check if the
+    // diagnostics were produced as expected.
+    mlir::SourceMgrDiagnosticVerifierHandler sourceMgrHandler(sourceMgr,
+                                                              &context);
+    (*translationRequested)(sourceMgr, os, &context);
+    return sourceMgrHandler.verify();
+  };
+
+  if (splitInputFile) {
+    if (failed(mlir::splitAndProcessBuffer(std::move(input), processBuffer,
+                                           output->os())))
+      return 1;
+  } else {
+    if (failed(processBuffer(std::move(input), output->os())))
+      return 1;
+  }
+
+  output->keep(); // Reached only on success; error paths return first.
+  return 0;
+}
diff --git a/mlir/examples/standalone/test/CMakeLists.txt b/mlir/examples/standalone/test/CMakeLists.txt
index 8f31f9f1dfaa9..29da2b87ba4fc 100644
--- a/mlir/examples/standalone/test/CMakeLists.txt
+++ b/mlir/examples/standalone/test/CMakeLists.txt
@@ -5,15 +5,16 @@ configure_lit_site_cfg(
         ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
 )
 
-set(STANDALONE_OPT_TEST_DEPENDS
+set(STANDALONE_TEST_DEPENDS
         FileCheck count not
         standalone-opt
+        standalone-translate
         )
 
-add_lit_testsuite(check-standalone-opt "Running the standalone-opt regression tests"
+add_lit_testsuite(check-standalone "Running the standalone regression tests"
         ${CMAKE_CURRENT_BINARY_DIR}
-        DEPENDS ${STANDALONE_OPT_TEST_DEPENDS}
+        DEPENDS ${STANDALONE_TEST_DEPENDS}
         )
-set_target_properties(check-standalone-opt PROPERTIES FOLDER "Tests")
+set_target_properties(check-standalone PROPERTIES FOLDER "Tests")
 
-add_lit_testsuites(STANDALONE_OPT ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${STANDALONE_OPT_TEST_DEPENDS})
+add_lit_testsuites(STANDALONE ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${STANDALONE_TEST_DEPENDS})
diff --git a/mlir/examples/standalone/test/Standalone/standalone-translate.mlir b/mlir/examples/standalone/test/Standalone/standalone-translate.mlir
new file mode 100644
index 0000000000000..2a096c38e1281
--- /dev/null
+++ b/mlir/examples/standalone/test/Standalone/standalone-translate.mlir
@@ -0,0 +1,8 @@
+// RUN: standalone-translate --help | FileCheck %s
+// CHECK: --avx512-mlir-to-llvmir
+// CHECK: --deserialize-spirv
+// CHECK: --import-llvm
+// CHECK: --mlir-to-llvmir
+// CHECK: --mlir-to-nvvmir
+// CHECK: --mlir-to-rocdlir
+// CHECK: --serialize-spirv
diff --git a/mlir/examples/standalone/test/lit.cfg.py b/mlir/examples/standalone/test/lit.cfg.py
index 049fbc73cd816..9fb5b543f0065 100644
--- a/mlir/examples/standalone/test/lit.cfg.py
+++ b/mlir/examples/standalone/test/lit.cfg.py
@@ -16,7 +16,7 @@
 # Configuration file for the 'lit' test runner.
 
 # name: The name of this test suite.
-config.name = 'STANDALONE_OPT'
+config.name = 'STANDALONE'
 
 config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
 
@@ -54,7 +54,8 @@
 
 tool_dirs = [config.standalone_tools_dir, config.llvm_tools_dir]
 tools = [
-    'standalone-opt'
+    'standalone-opt',
+    'standalone-translate'
 ]
 
 llvm_config.add_tool_substitutions(tools, tool_dirs)
diff --git a/mlir/test/Examples/standalone/test.toy b/mlir/test/Examples/standalone/test.toy
index 4f9ba5cc78e11..034fd9385d464 100644
--- a/mlir/test/Examples/standalone/test.toy
+++ b/mlir/test/Examples/standalone/test.toy
@@ -1,4 +1,4 @@
-# RUN: %cmake %mlir_src_root/examples/standalone -DCMAKE_CXX_COMPILER=%host_cxx -DCMAKE_C_COMPILER=%host_cc -DMLIR_DIR=%llvm_lib_dir/cmake/mlir ; %cmake --build . --target check-standalone-opt | tee %t | FileCheck %s
+# RUN: %cmake %mlir_src_root/examples/standalone -DCMAKE_CXX_COMPILER=%host_cxx -DCMAKE_C_COMPILER=%host_cc -DMLIR_DIR=%llvm_lib_dir/cmake/mlir ; %cmake --build . --target check-standalone | tee %t | FileCheck %s
 
-# CHECK: Expected Passes: 1
+# CHECK: Expected Passes: 2
 # UNSUPPORTED: windows, android

From 434d122e94a74cbfb08e901821590faad9b6dcd9 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Thu, 28 May 2020 14:13:05 -0700
Subject: [PATCH 427/770] [SVE] Eliminate calls to default-false
 VectorType::get() from Analysis

Reviewers: efriedma, fpetrogalli, kmclaughlin, sunfish

Reviewed By: fpetrogalli

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80324
---
 llvm/lib/Analysis/ConstantFolding.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 7eafc7a6623f7..53f1c144c546a 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -118,8 +118,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
       // to simplify things.
       if (SrcEltTy->isFloatingPointTy()) {
         unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
-        Type *SrcIVTy =
-          VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElts);
+        auto *SrcIVTy = FixedVectorType::get(
+            IntegerType::get(C->getContext(), FPWidth), NumSrcElts);
         // Ask IR to do the conversion now that #elts line up.
         C = ConstantExpr::getBitCast(C, SrcIVTy);
       }
@@ -175,8 +175,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
   if (DstEltTy->isFloatingPointTy()) {
     // Fold to an vector of integers with same size as our FP type.
     unsigned FPWidth = DstEltTy->getPrimitiveSizeInBits();
-    Type *DestIVTy =
-      VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumDstElt);
+    auto *DestIVTy = FixedVectorType::get(
+        IntegerType::get(C->getContext(), FPWidth), NumDstElt);
     // Recursively handle this integer conversion, if possible.
     C = FoldBitCast(C, DestIVTy, DL);
 
@@ -188,8 +188,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
   // it to integer first.
   if (SrcEltTy->isFloatingPointTy()) {
     unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
-    Type *SrcIVTy =
-      VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElt);
+    auto *SrcIVTy = FixedVectorType::get(
+        IntegerType::get(C->getContext(), FPWidth), NumSrcElt);
     // Ask IR to do the conversion now that #elts line up.
     C = ConstantExpr::getBitCast(C, SrcIVTy);
     // If IR wasn't able to fold it, bail out.

From 922fa2fce38b0bd97921b91ff1cdc57f18d3569c Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Wed, 27 May 2020 23:12:36 -0700
Subject: [PATCH 428/770] Run Coverage pass before other *San passes under new
 pass manager, round 2

Summary:
This was attempted once before in https://reviews.llvm.org/D79698, but
was reverted due to the coverage pass running in the wrong part of the
pipeline. This commit puts it in the same place as the other sanitizers.

This changes PassBuilder.OptimizerLastEPCallbacks to work on a
ModulePassManager instead of a FunctionPassManager. That is because
SanitizerCoverage cannot (easily) be split into a module pass and a
function pass like some of the other sanitizers since in its current
implementation it conditionally inserts module constructors based on
whether or not it successfully modified functions.

This fixes compiler-rt/test/msan/coverage-levels.cpp under the new pass
manager (last check-msan test).

Subscribers: hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D80692
---
 clang/lib/CodeGen/BackendUtil.cpp      | 50 +++++++++++++++++---------
 llvm/include/llvm/Passes/PassBuilder.h |  4 +--
 llvm/lib/Passes/PassBuilder.cpp        |  6 ++--
 llvm/tools/opt/NewPMDriver.cpp         |  2 +-
 4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index e746aef1a62ff..dd5016333920d 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/LTO/LTOBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -1001,6 +1002,15 @@ static void addSanitizersAtO0(ModulePassManager &MPM,
                               const Triple &TargetTriple,
                               const LangOptions &LangOpts,
                               const CodeGenOptions &CodeGenOpts) {
+  if (CodeGenOpts.SanitizeCoverageType ||
+      CodeGenOpts.SanitizeCoverageIndirectCalls ||
+      CodeGenOpts.SanitizeCoverageTraceCmp) {
+    auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
+    MPM.addPass(ModuleSanitizerCoveragePass(
+        SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
+        CodeGenOpts.SanitizeCoverageBlacklistFiles));
+  }
+
   auto ASanPass = [&](SanitizerMask Mask, bool CompileKernel) {
     MPM.addPass(RequireAnalysisPass<ASanGlobalsMetadataAnalysis, Module>());
     bool Recover = CodeGenOpts.SanitizeRecover.has(Mask);
@@ -1249,6 +1259,20 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
             [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
               FPM.addPass(BoundsCheckingPass());
             });
+
+      if (CodeGenOpts.SanitizeCoverageType ||
+          CodeGenOpts.SanitizeCoverageIndirectCalls ||
+          CodeGenOpts.SanitizeCoverageTraceCmp) {
+        PB.registerOptimizerLastEPCallback(
+            [this](ModulePassManager &MPM,
+                   PassBuilder::OptimizationLevel Level) {
+              auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
+              MPM.addPass(ModuleSanitizerCoveragePass(
+                  SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
+                  CodeGenOpts.SanitizeCoverageBlacklistFiles));
+            });
+      }
+
       if (LangOpts.Sanitize.has(SanitizerKind::Memory)) {
         int TrackOrigins = CodeGenOpts.SanitizeMemoryTrackOrigins;
         bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::Memory);
@@ -1257,17 +1281,19 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
               MPM.addPass(MemorySanitizerPass({TrackOrigins, Recover, false}));
             });
         PB.registerOptimizerLastEPCallback(
-            [TrackOrigins, Recover](FunctionPassManager &FPM,
+            [TrackOrigins, Recover](ModulePassManager &MPM,
                                     PassBuilder::OptimizationLevel Level) {
-              FPM.addPass(MemorySanitizerPass({TrackOrigins, Recover, false}));
+              MPM.addPass(createModuleToFunctionPassAdaptor(
+                  MemorySanitizerPass({TrackOrigins, Recover, false})));
             });
       }
       if (LangOpts.Sanitize.has(SanitizerKind::Thread)) {
         PB.registerPipelineStartEPCallback(
             [](ModulePassManager &MPM) { MPM.addPass(ThreadSanitizerPass()); });
         PB.registerOptimizerLastEPCallback(
-            [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
-              FPM.addPass(ThreadSanitizerPass());
+            [](ModulePassManager &MPM, PassBuilder::OptimizationLevel Level) {
+              MPM.addPass(
+                  createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
             });
       }
       if (LangOpts.Sanitize.has(SanitizerKind::Address)) {
@@ -1278,10 +1304,11 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
         bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::Address);
         bool UseAfterScope = CodeGenOpts.SanitizeAddressUseAfterScope;
         PB.registerOptimizerLastEPCallback(
-            [Recover, UseAfterScope](FunctionPassManager &FPM,
+            [Recover, UseAfterScope](ModulePassManager &MPM,
                                      PassBuilder::OptimizationLevel Level) {
-              FPM.addPass(AddressSanitizerPass(
-                  /*CompileKernel=*/false, Recover, UseAfterScope));
+              MPM.addPass(
+                  createModuleToFunctionPassAdaptor(AddressSanitizerPass(
+                      /*CompileKernel=*/false, Recover, UseAfterScope)));
             });
         bool ModuleUseAfterScope = asanUseGlobalsGC(TargetTriple, CodeGenOpts);
         bool UseOdrIndicator = CodeGenOpts.SanitizeAddressUseOdrIndicator;
@@ -1325,15 +1352,6 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
       }
     }
 
-    if (CodeGenOpts.SanitizeCoverageType ||
-        CodeGenOpts.SanitizeCoverageIndirectCalls ||
-        CodeGenOpts.SanitizeCoverageTraceCmp) {
-      auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
-      MPM.addPass(ModuleSanitizerCoveragePass(
-          SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
-          CodeGenOpts.SanitizeCoverageBlacklistFiles));
-    }
-
     if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) {
       bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::HWAddress);
       MPM.addPass(HWAddressSanitizerPass(
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 391d144d5dcdf..d5a70c2ae132d 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -600,7 +600,7 @@ class PassBuilder {
   /// is not triggered at O0. Extensions to the O0 pipeline should append their
   /// passes to the end of the overall pipeline.
   void registerOptimizerLastEPCallback(
-      const std::function<void(FunctionPassManager &, OptimizationLevel)> &C) {
+      const std::function<void(ModulePassManager &, OptimizationLevel)> &C) {
     OptimizerLastEPCallbacks.push_back(C);
   }
 
@@ -728,7 +728,7 @@ class PassBuilder {
       CGSCCOptimizerLateEPCallbacks;
   SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
       VectorizerStartEPCallbacks;
-  SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
+  SmallVector<std::function<void(ModulePassManager &, OptimizationLevel)>, 2>
       OptimizerLastEPCallbacks;
   // Module callbacks
   SmallVector<std::function<void(ModulePassManager &)>, 2>
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 0999f7872d12c..1b1701cbe2619 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1073,12 +1073,12 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
   if (PTO.Coroutines)
     OptimizePM.addPass(CoroCleanupPass());
 
-  for (auto &C : OptimizerLastEPCallbacks)
-    C(OptimizePM, Level);
-
   // Add the core optimizing pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
 
+  for (auto &C : OptimizerLastEPCallbacks)
+    C(MPM, Level);
+
   if (PTO.CallGraphProfile)
     MPM.addPass(CGProfilePass());
 
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 2e84ca49b6e0b..c99ad2f7b4dcf 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -194,7 +194,7 @@ static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
         });
   if (tryParsePipelineText<FunctionPassManager>(PB, OptimizerLastEPPipeline))
     PB.registerOptimizerLastEPCallback(
-        [&PB, VerifyEachPass, DebugLogging](FunctionPassManager &PM,
+        [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM,
                                             PassBuilder::OptimizationLevel) {
           ExitOnError Err("Unable to parse OptimizerLastEP pipeline: ");
           Err(PB.parsePassPipeline(PM, OptimizerLastEPPipeline, VerifyEachPass,

From 6eb56794023c606224b82366a04f98efd9a564eb Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Thu, 28 May 2020 14:25:44 -0700
Subject: [PATCH 429/770] [NFC,StackSafety] clang-tidy warning fixes

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 34 +++++++++++------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 2c8a5e33c847f..1c59d5f2af966 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -118,7 +118,7 @@ ConstantRange getStaticAllocaSizeRange(const AllocaInst &AI) {
   if (APSize.isNonPositive())
     return R;
   if (AI.isArrayAllocation()) {
-    auto C = dyn_cast<ConstantInt>(AI.getArraySize());
+    const auto *C = dyn_cast<ConstantInt>(AI.getArraySize());
     if (!C)
       return R;
     bool Overflow = false;
@@ -163,7 +163,7 @@ struct FunctionInfo {
     if (F) {
       size_t Pos = 0;
       for (auto &I : instructions(F)) {
-        if (auto AI = dyn_cast<AllocaInst>(&I)) {
+        if (const auto *AI = dyn_cast<AllocaInst>(&I)) {
           auto &AS = Allocas[Pos];
           O << "      " << AI->getName() << "["
             << getStaticAllocaSizeRange(*AI).getUpper() << "]: " << AS << "\n";
@@ -263,7 +263,7 @@ ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
 
 ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
     const MemIntrinsic *MI, const Use &U, Value *Base) {
-  if (auto MTI = dyn_cast<MemTransferInst>(MI)) {
+  if (const auto *MTI = dyn_cast<MemTransferInst>(MI)) {
     if (MTI->getRawSource() != U && MTI->getRawDest() != U)
       return ConstantRange::getEmpty(PointerSize);
   } else {
@@ -297,7 +297,7 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, UseInfo &US) {
   while (!WorkList.empty()) {
     const Value *V = WorkList.pop_back_val();
     for (const Use &UI : V->uses()) {
-      auto I = cast<const Instruction>(UI.getUser());
+      const auto *I = cast<const Instruction>(UI.getUser());
       assert(V == UI.get());
 
       switch (I->getOpcode()) {
@@ -352,13 +352,11 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, UseInfo &US) {
 
         assert(isa<Function>(Callee) || isa<GlobalAlias>(Callee));
 
-        auto B = CB.arg_begin(), E = CB.arg_end();
         int Found = 0;
-        for (auto A = B; A != E; ++A) {
-          if (A->get() == V) {
+        for (size_t ArgNo = 0; ArgNo < CB.getNumArgOperands(); ++ArgNo) {
+          if (CB.getArgOperand(ArgNo) == V) {
             ++Found;
-            ConstantRange Offset = offsetFrom(UI, Ptr);
-            US.Calls.emplace_back(Callee, A - B, Offset);
+            US.Calls.emplace_back(Callee, ArgNo, offsetFrom(UI, Ptr));
           }
         }
         if (!Found) {
@@ -387,7 +385,7 @@ FunctionInfo StackSafetyLocalAnalysis::run() {
   LLVM_DEBUG(dbgs() << "[StackSafety] " << F.getName() << "\n");
 
   for (auto &I : instructions(F)) {
-    if (auto AI = dyn_cast<AllocaInst>(&I)) {
+    if (auto *AI = dyn_cast<AllocaInst>(&I)) {
       Info.Allocas.emplace_back(PointerSize);
       UseInfo &AS = Info.Allocas.back();
       analyzeAllUses(AI, AS);
@@ -556,7 +554,7 @@ bool setStackSafetyMetadata(Module &M, const GVToSSI &SSGI) {
     const FunctionInfo &Summary = Iter->second;
     size_t Pos = 0;
     for (auto &I : instructions(F)) {
-      if (auto AI = dyn_cast<AllocaInst>(&I)) {
+      if (auto *AI = dyn_cast<AllocaInst>(&I)) {
         auto &AS = Summary.Allocas[Pos];
         if (getStaticAllocaSizeRange(*AI).contains(AS.Range)) {
           AI->setMetadata(M.getMDKindID("stack-safe"),
@@ -570,7 +568,7 @@ bool setStackSafetyMetadata(Module &M, const GVToSSI &SSGI) {
   return Changed;
 }
 
-const Function *FindCalleeInModule(const GlobalValue *GV) {
+const Function *findCalleeInModule(const GlobalValue *GV) {
   while (GV) {
     if (GV->isInterposable() || !GV->isDSOLocal())
       return nullptr;
@@ -586,10 +584,10 @@ const Function *FindCalleeInModule(const GlobalValue *GV) {
   return nullptr;
 }
 
-void ResolveAllCalls(UseInfo &Use) {
+void resolveAllCalls(UseInfo &Use) {
   ConstantRange FullSet(Use.Range.getBitWidth(), true);
   for (auto &C : Use.Calls) {
-    const Function *F = FindCalleeInModule(C.Callee);
+    const Function *F = findCalleeInModule(C.Callee);
     if (F) {
       C.Callee = F;
       continue;
@@ -599,9 +597,9 @@ void ResolveAllCalls(UseInfo &Use) {
   }
 }
 
-void ResolveAllCalls(SmallVectorImpl<UseInfo> &Values) {
+void resolveAllCalls(SmallVectorImpl<UseInfo> &Values) {
   for (auto &V : Values)
-    ResolveAllCalls(V);
+    resolveAllCalls(V);
 }
 
 GVToSSI createGlobalStackSafetyInfo(
@@ -614,7 +612,7 @@ GVToSSI createGlobalStackSafetyInfo(
   auto Copy = Functions;
 
   for (auto &FI : Copy)
-    ResolveAllCalls(FI.second.Params);
+    resolveAllCalls(FI.second.Params);
 
   uint32_t PointerSize = Copy.begin()
                              ->first->getParent()
@@ -627,7 +625,7 @@ GVToSSI createGlobalStackSafetyInfo(
     size_t Pos = 0;
     auto &SrcF = Functions[F.first];
     for (auto &A : FI.Allocas) {
-      ResolveAllCalls(A);
+      resolveAllCalls(A);
       for (auto &C : A.Calls) {
         A.updateRange(
             SSDFA.getArgumentAccessRange(C.Callee, C.ParamNo, C.Offset));

From 519959ad825bfad0b62a2012c064e582788d8f21 Mon Sep 17 00:00:00 2001
From: Evgenii Stepanov <eugenis@google.com>
Date: Thu, 28 May 2020 14:30:19 -0700
Subject: [PATCH 430/770] [scudo] Fix deadlock in
 ScudoWrappersCTest.DisableForkEnable test.

pthread_cond_wait needs a predicate loop around it to handle spurious wakeups,
as well as the case where the signal is sent before the wait begins.
---
 compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
index 8b2bc6ecbd5b6..b41908cf47814 100644
--- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
@@ -372,6 +372,7 @@ TEST(ScudoWrappersCTest, Fork) {
 
 static pthread_mutex_t Mutex;
 static pthread_cond_t Conditional = PTHREAD_COND_INITIALIZER;
+static bool Ready;
 
 static void *enableMalloc(void *Unused) {
   // Initialize the allocator for this thread.
@@ -382,6 +383,7 @@ static void *enableMalloc(void *Unused) {
 
   // Signal the main thread we are ready.
   pthread_mutex_lock(&Mutex);
+  Ready = true;
   pthread_cond_signal(&Conditional);
   pthread_mutex_unlock(&Mutex);
 
@@ -398,7 +400,8 @@ TEST(ScudoWrappersCTest, DisableForkEnable) {
 
   // Wait for the thread to be warmed up.
   pthread_mutex_lock(&Mutex);
-  pthread_cond_wait(&Conditional, &Mutex);
+  while (!Ready)
+    pthread_cond_wait(&Conditional, &Mutex);
   pthread_mutex_unlock(&Mutex);
 
   // Disable the allocator and fork. fork should succeed after malloc_enable.

From 27304b1737a3ff0bca39ddd3ed11b96a624a1d6d Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 14:17:00 -0700
Subject: [PATCH 431/770] [Tests] Switch a few statepoint tests to using
 operand bundles

We've started (D80598) the process of migrating away from the inline operand lists in statepoints to using explicit operand bundles.  Update a few tests to reflect the new preference.  More to come; these were simply the ones outside any obvious grouping.
---
 .../test/Analysis/CallGraph/non-leaf-intrinsics.ll |  2 +-
 .../Analysis/LazyCallGraph/non-leaf-intrinsics.ll  |  2 +-
 llvm/test/Transforms/InstCombine/token.ll          |  4 ++--
 .../Transforms/RewriteStatepointsForGC/basics.ll   |  2 +-
 llvm/test/Transforms/Util/strip-gc-relocates.ll    | 14 +++++++-------
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/test/Analysis/CallGraph/non-leaf-intrinsics.ll b/llvm/test/Analysis/CallGraph/non-leaf-intrinsics.ll
index 5caecf7e2244d..31c4260723ceb 100644
--- a/llvm/test/Analysis/CallGraph/non-leaf-intrinsics.ll
+++ b/llvm/test/Analysis/CallGraph/non-leaf-intrinsics.ll
@@ -10,7 +10,7 @@ define private void @f() {
 define void @calls_statepoint(i8 addrspace(1)* %arg) gc "statepoint-example" {
 entry:
   %cast = bitcast i8 addrspace(1)* %arg to i64 addrspace(1)*
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 10, i32 0, i8 addrspace(1)* %arg, i64 addrspace(1)* %cast, i8 addrspace(1)* %arg, i8 addrspace(1)* %arg)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %arg, i64 addrspace(1)* %cast, i8 addrspace(1)* %arg, i8 addrspace(1)* %arg) ["deopt" (i32 0, i32 0, i32 0, i32 10, i32 0)]
   ret void
 }
 
diff --git a/llvm/test/Analysis/LazyCallGraph/non-leaf-intrinsics.ll b/llvm/test/Analysis/LazyCallGraph/non-leaf-intrinsics.ll
index f5dc72ff9e97e..7bc585190397f 100644
--- a/llvm/test/Analysis/LazyCallGraph/non-leaf-intrinsics.ll
+++ b/llvm/test/Analysis/LazyCallGraph/non-leaf-intrinsics.ll
@@ -12,7 +12,7 @@ define void @calls_statepoint(i8 addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-NEXT:  -> f
 entry:
   %cast = bitcast i8 addrspace(1)* %arg to i64 addrspace(1)*
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 10, i32 0, i8 addrspace(1)* %arg, i64 addrspace(1)* %cast, i8 addrspace(1)* %arg, i8 addrspace(1)* %arg)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %arg, i64 addrspace(1)* %cast, i8 addrspace(1)* %arg, i8 addrspace(1)* %arg) ["deopt" (i32 0, i32 0, i32 0, i32 10, i32 0)]
   ret void
 }
 
diff --git a/llvm/test/Transforms/InstCombine/token.ll b/llvm/test/Transforms/InstCombine/token.ll
index f96b85b4f2257..a0e5bb9a53629 100644
--- a/llvm/test/Transforms/InstCombine/token.ll
+++ b/llvm/test/Transforms/InstCombine/token.ll
@@ -93,13 +93,13 @@ bb:
   unreachable
 
 unreachable:
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
   ret void
 }
 
 ; CHECK-LABEL: define void @test4(
 ; CHECK: unreachable:
-; CHECK:   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK:   call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
 ; CHECK:   ret void
 
 
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/basics.ll b/llvm/test/Transforms/RewriteStatepointsForGC/basics.ll
index 9b611079114ea..6c90f529de463 100644
--- a/llvm/test/Transforms/RewriteStatepointsForGC/basics.ll
+++ b/llvm/test/Transforms/RewriteStatepointsForGC/basics.ll
@@ -82,7 +82,7 @@ entry:
 ; CHECK-LABEL: entry:
 ; CHECK-NEXT: gc.statepoint
 ; CHECK-NOT: %obj.relocated = call coldcc i8 addrspace(1)*
-  %0 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %0 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
   ret i8 addrspace(1)* %obj
 }
 
diff --git a/llvm/test/Transforms/Util/strip-gc-relocates.ll b/llvm/test/Transforms/Util/strip-gc-relocates.ll
index 359de7b3efe78..45f71117af0af 100644
--- a/llvm/test/Transforms/Util/strip-gc-relocates.ll
+++ b/llvm/test/Transforms/Util/strip-gc-relocates.ll
@@ -17,8 +17,8 @@ entry:
 ; CHECK: gc.statepoint
 ; CHECK-NOT: gc.relocate
 ; CHECK: ret i32 addrspace(1)* %arg
-  %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
-  %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 8, i32 8) ; (%arg, %arg)
+  %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %arg) ["deopt" (i32 100)]
+  %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7) ; (%arg, %arg)
   %arg.relocated.casted = bitcast i8 addrspace(1)* %arg.relocated to i32 addrspace(1)*
   ret i32 addrspace(1)* %arg.relocated.casted
 }
@@ -58,18 +58,18 @@ define i32 addrspace(1)* @test3(i32 addrspace(1)* %arg) gc "statepoint-example"
 ; CHECK-LABEL: unwind_dest:
 ; CHECK-NOT: gc.relocate
 entry:
-  %statepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+  %statepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %arg) ["deopt" (i32 100)]
           to label %normal_dest unwind label %unwind_dest
 
 normal_dest:                                      ; preds = %entry
-  %arg.relocated1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 8, i32 8) ; (%arg, %arg)
+  %arg.relocated1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7) ; (%arg, %arg)
   %arg.relocated1.casted = bitcast i8 addrspace(1)* %arg.relocated1 to i32 addrspace(1)*
   ret i32 addrspace(1)* %arg.relocated1.casted
 
 unwind_dest:                                      ; preds = %entry
   %lpad = landingpad token
           cleanup
-  %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %lpad, i32 8, i32 8) ; (%arg, %arg)
+  %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %lpad, i32 7, i32 7) ; (%arg, %arg)
   %arg.relocated.casted = bitcast i8 addrspace(1)* %arg.relocated to i32 addrspace(1)*
   resume token undef
 }
@@ -110,8 +110,8 @@ entry:
 ; CHECK-LABEL: test5
 ; CHECK: gc.statepoint
 ; CHECK-NOT: gc.relocate
-  %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
-  %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 8, i32 8) ; (%arg, %arg)
+  %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %arg) ["deopt" (i32 100)]
+  %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7) ; (%arg, %arg)
   ret i8 addrspace(1)* %arg.relocated
 }
 

From 15000255d18b84d8bf42541a037c399ce4d5a018 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 14:34:47 -0700
Subject: [PATCH 432/770] [Tests] Remove deopt operands from SafepointIRVerifier
 tests

This linter has nothing to do with deopt, and the operands had clearly been copied blindly from another source.  Rather than migrating them to deopt operand bundles, let's just simplify the tests.
---
 .../SafepointIRVerifier/basic-use-after-reloc.ll     |  4 ++--
 llvm/test/SafepointIRVerifier/compares.ll            | 12 ++++++------
 llvm/test/SafepointIRVerifier/constant-bases.ll      | 12 ++++++------
 .../unreachable-block-tolerant.ll                    |  6 +++---
 .../SafepointIRVerifier/unrecorded-live-at-sp.ll     | 12 ++++++------
 llvm/test/SafepointIRVerifier/uses-in-phi-nodes.ll   |  8 ++++----
 6 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/llvm/test/SafepointIRVerifier/basic-use-after-reloc.ll b/llvm/test/SafepointIRVerifier/basic-use-after-reloc.ll
index 4b0746c9f5275..09db75968eb13 100644
--- a/llvm/test/SafepointIRVerifier/basic-use-after-reloc.ll
+++ b/llvm/test/SafepointIRVerifier/basic-use-after-reloc.ll
@@ -9,8 +9,8 @@
 ; Function Attrs: nounwind
 define %jObject addrspace(1)* @test(%jObject addrspace(1)* %arg) gc "statepoint-example" {
 bci_0:
-  %safepoint_token3 = tail call token (i64, i32, double (double)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f64f64f(i64 0, i32 0, double (double)* undef, i32 1, i32 0, double undef, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, %jObject addrspace(1)* %arg)
-  %arg2.relocated4 = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token3, i32 13, i32 13)
+  %safepoint_token3 = tail call token (i64, i32, double (double)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f64f64f(i64 0, i32 0, double (double)* undef, i32 1, i32 0, double undef, i32 0, i32 0, %jObject addrspace(1)* %arg)
+  %arg2.relocated4 = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token3, i32 8, i32 8)
   ret %jObject addrspace(1)* %arg
 ; CHECK: Illegal use of unrelocated value found!
 ; CHECK-NEXT: Def: %jObject addrspace(1)* %arg
diff --git a/llvm/test/SafepointIRVerifier/compares.ll b/llvm/test/SafepointIRVerifier/compares.ll
index a14fc44e9814c..e4bbc0080e30f 100644
--- a/llvm/test/SafepointIRVerifier/compares.ll
+++ b/llvm/test/SafepointIRVerifier/compares.ll
@@ -9,7 +9,7 @@ define i8 addrspace(1)* @test1(i64 %arg, i8 addrspace(1)* %addr) gc "statepoint-
 ; CHECK: No illegal uses found by SafepointIRVerifier in: test1
 entry:
   %load_addr = getelementptr i8, i8 addrspace(1)* %addr, i64 %arg
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   %cmp = icmp eq i8 addrspace(1)* %load_addr, null
   ret i8 addrspace(1)* null
 }
@@ -19,7 +19,7 @@ define void @test2(i64 %arg, i1 %cond, i8 addrspace(1)* %addr) gc "statepoint-ex
 ; CHECK: No illegal uses found by SafepointIRVerifier in: test2
   %load_addr = getelementptr i8, i8 addrspace(1)* null, i64 %arg
   %load_addr_sel = select i1 %cond, i8 addrspace(1)* null, i8 addrspace(1)* %load_addr
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   %cmp = icmp eq i8 addrspace(1)* %addr, %load_addr_sel
   ret void
 }
@@ -32,7 +32,7 @@ define void @test3(i64 %arg, i32 addrspace(1)* %addr) gc "statepoint-example" {
 entry:
   %load_addr = getelementptr i32, i32 addrspace(1)* %addr, i64 %arg
   %load_addr_const = getelementptr i32, i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*), i64 %arg
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   %cmp = icmp eq i32 addrspace(1)* %load_addr, %load_addr_const
   ret void
 }
@@ -53,7 +53,7 @@ split:
 
 join:
   %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split]
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   %cmp = icmp eq i8 addrspace(1)* %load_addr, %base
   ret void
 }
@@ -65,7 +65,7 @@ define void @test5(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) g
 ; CHECK: No illegal uses found by SafepointIRVerifier in: test5
   %load_addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg
   %load_addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 %arg
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   %cmp = icmp eq i8 addrspace(1)* %load_addr1, %load_addr2
   ret void
 }
@@ -76,7 +76,7 @@ define void @test6(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) g
 ; CHECK-LABEL: Verifying gc pointers in function: test6
 ; CHECK: Illegal use of unrelocated value found!
   %load_addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2 , i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2)
   %ptr2.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7) ; base2, base2
   %cmp = icmp eq i8 addrspace(1)* %load_addr1, %ptr2.relocated
   ret void
diff --git a/llvm/test/SafepointIRVerifier/constant-bases.ll b/llvm/test/SafepointIRVerifier/constant-bases.ll
index 52a2a46d068d0..b6705d97f826e 100644
--- a/llvm/test/SafepointIRVerifier/constant-bases.ll
+++ b/llvm/test/SafepointIRVerifier/constant-bases.ll
@@ -3,7 +3,7 @@
 define i8 addrspace(1)* @test1(i64 %arg) gc "statepoint-example" {
 ; CHECK: No illegal uses found by SafepointIRVerifier in: test1
 entry:
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   ret i8 addrspace(1)* null
 }
 
@@ -11,7 +11,7 @@ define i8 addrspace(1)* @test2(i64 %arg) gc "statepoint-example" {
 ; CHECK: No illegal uses found by SafepointIRVerifier in: test2
 entry:
   %load_addr = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   ret i8 addrspace(1)* %load_addr
 }
 
@@ -20,7 +20,7 @@ define i8 addrspace(1)* @test3(i64 %arg) gc "statepoint-example" {
 entry:
   %load_addr = getelementptr i32, i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*), i64 %arg
   %load_addr.cast = bitcast i32 addrspace(1)* %load_addr to i8 addrspace(1)*
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   ret i8 addrspace(1)* %load_addr.cast
 }
 
@@ -36,7 +36,7 @@ split:
 
 join:
   %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split]
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   ret i8 addrspace(1)* %load_addr
 }
 
@@ -46,7 +46,7 @@ entry:
   %load_addr.1 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg
   %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg
   %load_addr = select i1 %cond, i8 addrspace(1)* %load_addr.1, i8 addrspace(1)* %load_addr.2
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   ret i8 addrspace(1)* %load_addr
 }
 
@@ -63,7 +63,7 @@ split:
 
 join:
   %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split]
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   ret i8 addrspace(1)* %load_addr
 }
 
diff --git a/llvm/test/SafepointIRVerifier/unreachable-block-tolerant.ll b/llvm/test/SafepointIRVerifier/unreachable-block-tolerant.ll
index 724794b458f92..01af9eee0b315 100644
--- a/llvm/test/SafepointIRVerifier/unreachable-block-tolerant.ll
+++ b/llvm/test/SafepointIRVerifier/unreachable-block-tolerant.ll
@@ -8,8 +8,8 @@
 define %jObject addrspace(1)* @test(%jObject addrspace(1)* %arg) gc "statepoint-example" {
 ; CHECK-LABEL: Verifying gc pointers in function: test
 ; CHECK-NEXT:  No illegal uses found by SafepointIRVerifier in: test
-  %safepoint_token3 = tail call token (i64, i32, double (double)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f64f64f(i64 0, i32 0, double (double)* undef, i32 1, i32 0, double undef, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, %jObject addrspace(1)* %arg)
-  %arg2.relocated4 = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token3, i32 13, i32 13)
+  %safepoint_token3 = tail call token (i64, i32, double (double)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f64f64f(i64 0, i32 0, double (double)* undef, i32 1, i32 0, double undef, i32 0, i32 0, %jObject addrspace(1)* %arg)
+  %arg2.relocated4 = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token3, i32 8, i32 8)
   ret %jObject addrspace(1)* %arg2.relocated4
 
 unreachable:
@@ -30,7 +30,7 @@ define void @test2(i8 addrspace(1)* %arg) gc "statepoint-example" {
   br label %right
 
  left:
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   br label %merge
 
  right:
diff --git a/llvm/test/SafepointIRVerifier/unrecorded-live-at-sp.ll b/llvm/test/SafepointIRVerifier/unrecorded-live-at-sp.ll
index 5cd4aa7414548..c3608834fa5bc 100644
--- a/llvm/test/SafepointIRVerifier/unrecorded-live-at-sp.ll
+++ b/llvm/test/SafepointIRVerifier/unrecorded-live-at-sp.ll
@@ -2,7 +2,7 @@
 
 ; CHECK:      Illegal use of unrelocated value found!
 ; CHECK-NEXT: Def:   %base_phi4 = phi %jObject addrspace(1)* addrspace(1)* [ %addr98.relocated, %not_zero146 ], [ %cast6, %bci_37-aload ], !is_base_value !0
-; CHECK-NEXT: Use:   %safepoint_token = tail call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, %jObject addrspace(1)* %base_phi1, %jObject addrspace(1)* addrspace(1)* %base_phi4, %jObject addrspace(1)* addrspace(1)* %relocated4, %jObject addrspace(1)* %relocated7)
+; CHECK-NEXT: Use:   %safepoint_token = tail call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* undef, i32 0, i32 0, i32 0, i32 0, %jObject addrspace(1)* %base_phi1, %jObject addrspace(1)* addrspace(1)* %base_phi4, %jObject addrspace(1)* addrspace(1)* %relocated4, %jObject addrspace(1)* %relocated7)
 
 
 %jObject = type { [8 x i8] }
@@ -42,12 +42,12 @@ not_zero179:                                      ; preds = %not_zero146, %bci_3
   %relocated7 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %relocated8, %bci_37-aload ]
   %base_phi4 = phi %jObject addrspace(1)* addrspace(1)* [ %addr98.relocated, %not_zero146 ], [ %cast6, %bci_37-aload ], !is_base_value !0
   %relocated4 = phi %jObject addrspace(1)* addrspace(1)* [ %addr98.relocated, %not_zero146 ], [ %addr98, %bci_37-aload ]
-  %safepoint_token = tail call  token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, %jObject addrspace(1)* %base_phi1, %jObject addrspace(1)* addrspace(1)* %base_phi4, %jObject addrspace(1)* addrspace(1)* %relocated4, %jObject addrspace(1)* %relocated7)
+  %safepoint_token = tail call  token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* undef, i32 0, i32 0, i32 0, i32 0, %jObject addrspace(1)* %base_phi1, %jObject addrspace(1)* addrspace(1)* %base_phi4, %jObject addrspace(1)* addrspace(1)* %relocated4, %jObject addrspace(1)* %relocated7)
   %tmp4 = call i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
-  %base_phi1.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 12, i32 12)
-  %base_phi4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 13, i32 13)
-  %relocated4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 13, i32 14)
-  %relocated7.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 12, i32 15)
+  %base_phi1.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 7, i32 7)
+  %base_phi4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 8, i32 8)
+  %relocated4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 8, i32 9)
+  %relocated7.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 7, i32 10)
   %addr636 = bitcast %jObject addrspace(1)* addrspace(1)* %relocated4.relocated to %jObject addrspace(1)* addrspace(1)*
   br label %bci_37-aload
 }
diff --git a/llvm/test/SafepointIRVerifier/uses-in-phi-nodes.ll b/llvm/test/SafepointIRVerifier/uses-in-phi-nodes.ll
index bbf98577230d5..70c41810974ff 100644
--- a/llvm/test/SafepointIRVerifier/uses-in-phi-nodes.ll
+++ b/llvm/test/SafepointIRVerifier/uses-in-phi-nodes.ll
@@ -6,7 +6,7 @@ define i8 addrspace(1)* @test.not.ok.0(i8 addrspace(1)* %arg) gc "statepoint-exa
   br i1 undef, label %left, label %right
 
  left:
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   br label %merge
 
  right:
@@ -26,7 +26,7 @@ define i8 addrspace(1)* @test.not.ok.1(i8 addrspace(1)* %arg) gc "statepoint-exa
   br i1 undef, label %left, label %right
 
  left:
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   br label %merge
 
  right:
@@ -46,7 +46,7 @@ define i8 addrspace(1)* @test.ok.0(i8 addrspace(1)* %arg) gc "statepoint-example
   br i1 undef, label %left, label %right
 
  left:
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   br label %merge
 
  right:
@@ -154,7 +154,7 @@ define void @test.unrelocated-phi.cmp.ok(i8 addrspace(1)* %arg) gc "statepoint-e
   br i1 undef, label %left, label %right
 
  left:
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0)
   br label %merge
 
  right:

From e3fb8446f2ec3953348f3c773004cf2aa28a8c04 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Thu, 28 May 2020 14:38:05 -0700
Subject: [PATCH 433/770] Revert "Run Coverage pass before other *San passes
 under new pass manager, round 2"

This reverts commit 922fa2fce38b0bd97921b91ff1cdc57f18d3569c.
---
 clang/lib/CodeGen/BackendUtil.cpp      | 50 +++++++++-----------------
 llvm/include/llvm/Passes/PassBuilder.h |  4 +--
 llvm/lib/Passes/PassBuilder.cpp        |  6 ++--
 llvm/tools/opt/NewPMDriver.cpp         |  2 +-
 4 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index dd5016333920d..e746aef1a62ff 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -32,7 +32,6 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSummaryIndex.h"
-#include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/LTO/LTOBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -1002,15 +1001,6 @@ static void addSanitizersAtO0(ModulePassManager &MPM,
                               const Triple &TargetTriple,
                               const LangOptions &LangOpts,
                               const CodeGenOptions &CodeGenOpts) {
-  if (CodeGenOpts.SanitizeCoverageType ||
-      CodeGenOpts.SanitizeCoverageIndirectCalls ||
-      CodeGenOpts.SanitizeCoverageTraceCmp) {
-    auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
-    MPM.addPass(ModuleSanitizerCoveragePass(
-        SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
-        CodeGenOpts.SanitizeCoverageBlacklistFiles));
-  }
-
   auto ASanPass = [&](SanitizerMask Mask, bool CompileKernel) {
     MPM.addPass(RequireAnalysisPass<ASanGlobalsMetadataAnalysis, Module>());
     bool Recover = CodeGenOpts.SanitizeRecover.has(Mask);
@@ -1259,20 +1249,6 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
             [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
               FPM.addPass(BoundsCheckingPass());
             });
-
-      if (CodeGenOpts.SanitizeCoverageType ||
-          CodeGenOpts.SanitizeCoverageIndirectCalls ||
-          CodeGenOpts.SanitizeCoverageTraceCmp) {
-        PB.registerOptimizerLastEPCallback(
-            [this](ModulePassManager &MPM,
-                   PassBuilder::OptimizationLevel Level) {
-              auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
-              MPM.addPass(ModuleSanitizerCoveragePass(
-                  SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
-                  CodeGenOpts.SanitizeCoverageBlacklistFiles));
-            });
-      }
-
       if (LangOpts.Sanitize.has(SanitizerKind::Memory)) {
         int TrackOrigins = CodeGenOpts.SanitizeMemoryTrackOrigins;
         bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::Memory);
@@ -1281,19 +1257,17 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
               MPM.addPass(MemorySanitizerPass({TrackOrigins, Recover, false}));
             });
         PB.registerOptimizerLastEPCallback(
-            [TrackOrigins, Recover](ModulePassManager &MPM,
+            [TrackOrigins, Recover](FunctionPassManager &FPM,
                                     PassBuilder::OptimizationLevel Level) {
-              MPM.addPass(createModuleToFunctionPassAdaptor(
-                  MemorySanitizerPass({TrackOrigins, Recover, false})));
+              FPM.addPass(MemorySanitizerPass({TrackOrigins, Recover, false}));
             });
       }
       if (LangOpts.Sanitize.has(SanitizerKind::Thread)) {
         PB.registerPipelineStartEPCallback(
             [](ModulePassManager &MPM) { MPM.addPass(ThreadSanitizerPass()); });
         PB.registerOptimizerLastEPCallback(
-            [](ModulePassManager &MPM, PassBuilder::OptimizationLevel Level) {
-              MPM.addPass(
-                  createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
+            [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
+              FPM.addPass(ThreadSanitizerPass());
             });
       }
       if (LangOpts.Sanitize.has(SanitizerKind::Address)) {
@@ -1304,11 +1278,10 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
         bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::Address);
         bool UseAfterScope = CodeGenOpts.SanitizeAddressUseAfterScope;
         PB.registerOptimizerLastEPCallback(
-            [Recover, UseAfterScope](ModulePassManager &MPM,
+            [Recover, UseAfterScope](FunctionPassManager &FPM,
                                      PassBuilder::OptimizationLevel Level) {
-              MPM.addPass(
-                  createModuleToFunctionPassAdaptor(AddressSanitizerPass(
-                      /*CompileKernel=*/false, Recover, UseAfterScope)));
+              FPM.addPass(AddressSanitizerPass(
+                  /*CompileKernel=*/false, Recover, UseAfterScope));
             });
         bool ModuleUseAfterScope = asanUseGlobalsGC(TargetTriple, CodeGenOpts);
         bool UseOdrIndicator = CodeGenOpts.SanitizeAddressUseOdrIndicator;
@@ -1352,6 +1325,15 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
       }
     }
 
+    if (CodeGenOpts.SanitizeCoverageType ||
+        CodeGenOpts.SanitizeCoverageIndirectCalls ||
+        CodeGenOpts.SanitizeCoverageTraceCmp) {
+      auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
+      MPM.addPass(ModuleSanitizerCoveragePass(
+          SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
+          CodeGenOpts.SanitizeCoverageBlacklistFiles));
+    }
+
     if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) {
       bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::HWAddress);
       MPM.addPass(HWAddressSanitizerPass(
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index d5a70c2ae132d..391d144d5dcdf 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -600,7 +600,7 @@ class PassBuilder {
   /// is not triggered at O0. Extensions to the O0 pipeline should append their
   /// passes to the end of the overall pipeline.
   void registerOptimizerLastEPCallback(
-      const std::function<void(ModulePassManager &, OptimizationLevel)> &C) {
+      const std::function<void(FunctionPassManager &, OptimizationLevel)> &C) {
     OptimizerLastEPCallbacks.push_back(C);
   }
 
@@ -728,7 +728,7 @@ class PassBuilder {
       CGSCCOptimizerLateEPCallbacks;
   SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
       VectorizerStartEPCallbacks;
-  SmallVector<std::function<void(ModulePassManager &, OptimizationLevel)>, 2>
+  SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
       OptimizerLastEPCallbacks;
   // Module callbacks
   SmallVector<std::function<void(ModulePassManager &)>, 2>
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 1b1701cbe2619..0999f7872d12c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1073,12 +1073,12 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
   if (PTO.Coroutines)
     OptimizePM.addPass(CoroCleanupPass());
 
+  for (auto &C : OptimizerLastEPCallbacks)
+    C(OptimizePM, Level);
+
   // Add the core optimizing pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
 
-  for (auto &C : OptimizerLastEPCallbacks)
-    C(MPM, Level);
-
   if (PTO.CallGraphProfile)
     MPM.addPass(CGProfilePass());
 
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index c99ad2f7b4dcf..2e84ca49b6e0b 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -194,7 +194,7 @@ static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
         });
   if (tryParsePipelineText<FunctionPassManager>(PB, OptimizerLastEPPipeline))
     PB.registerOptimizerLastEPCallback(
-        [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM,
+        [&PB, VerifyEachPass, DebugLogging](FunctionPassManager &PM,
                                             PassBuilder::OptimizationLevel) {
           ExitOnError Err("Unable to parse OptimizerLastEP pipeline: ");
           Err(PB.parsePassPipeline(PM, OptimizerLastEPPipeline, VerifyEachPass,

From a9c7b498621b41e8e25ec8e23a6ba38dc8230282 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Thu, 28 May 2020 14:24:13 -0700
Subject: [PATCH 434/770] [SVE] Eliminate calls to default-false
 VectorType::get() from mlir

Reviewers: efriedma, ftynse, c-rhodes, david-arm, rriddle

Reviewed By: ftynse

Subscribers: tschuett, rkruppe, psnobl, mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80340
---
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 3a055d04b962b..2c6478ddd1213 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -1925,7 +1925,8 @@ LLVMType LLVMType::setStructTyBody(LLVMType structType,
 LLVMType LLVMType::getVectorTy(LLVMType elementType, unsigned numElements) {
   // Lock access to the dialect as this may modify the LLVM context.
   return getLocked(&elementType.getDialect(), [=] {
-    return llvm::VectorType::get(elementType.getUnderlyingType(), numElements);
+    return llvm::FixedVectorType::get(elementType.getUnderlyingType(),
+                                      numElements);
   });
 }
 

From 10bb03c1c1901de225352814e0e51096b3d4c656 Mon Sep 17 00:00:00 2001
From: Siva Chandra Reddy <sivachandra@google.com>
Date: Thu, 28 May 2020 14:52:39 -0700
Subject: [PATCH 435/770] [libc][NFC][Obvious] Remove line break from a CMake
 message.

The line break was giving an impression of something going wrong.
---
 libc/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index ce78d0a5aebf7..df0448b74edfd 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -32,8 +32,8 @@ if(COMMAND_RETURN_CODE EQUAL 0)
   set(COMPILER_RESOURCE_DIR
     "${COMPILER_RESOURCE_DIR}" CACHE PATH "path to compiler resource dir"
   )
-  message(STATUS "Set COMPILER_RESOURCE_DIR to
-                  ${COMPILER_RESOURCE_DIR} using --print-resource-dir")
+  message(STATUS "Set COMPILER_RESOURCE_DIR to "
+                 "${COMPILER_RESOURCE_DIR} using --print-resource-dir")
 else()
   set(COMPILER_RESOURCE_DIR OFF)
   message(STATUS "COMPILER_RESOURCE_DIR not set

From 5f9e0466f273602ba3ec5aa886b1bba8dfde66f4 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Thu, 28 May 2020 17:55:21 -0400
Subject: [PATCH 436/770] [mlir][Vector] Fix vector.transfer alignment
 calculation

https://reviews.llvm.org/D79246 introduces alignment propagation for vector transfer operations. Unfortunately, the alignment calculation is incorrect and can result in crashes.

This revision fixes the calculation by using the natural alignment of the memref elemental type, instead of the resulting vector type.

If more alignment is desired, it can be done in 2 ways:
1. use a proper vector.type_cast to transform a memref<axbxcxdxf32> into a memref<axbxvector<cxdxf32>> giving a natural alignment of vector<cxdxf32>
2. add an alignment attribute to vector transfer operations and propagate it.

With this change the alignment in the relevant tests goes down from 128 to 4.

Lastly, a few minor cleanups are performed and the custom `isMinorIdentityMap` is deprecated.

Differential Revision: https://reviews.llvm.org/D80734
---
 mlir/include/mlir/Conversion/Passes.td        |  14 ++
 .../mlir/Conversion/VectorToSCF/VectorToSCF.h |   5 +
 mlir/include/mlir/Dialect/Vector/VectorOps.h  |   5 +
 mlir/include/mlir/InitAllPasses.h             |   1 +
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      | 191 ++++++++----------
 .../Conversion/VectorToSCF/VectorToSCF.cpp    |  33 ++-
 .../VectorToLLVM/vector-to-llvm.mlir          |   4 +-
 .../VectorToSCF/vector-to-loops.mlir          |   4 +-
 mlir/test/lib/Transforms/CMakeLists.txt       |   1 -
 .../Transforms/TestVectorToSCFConversion.cpp  |  48 -----
 mlir/tools/mlir-opt/mlir-opt.cpp              |   2 -
 11 files changed, 147 insertions(+), 161 deletions(-)
 delete mode 100644 mlir/test/lib/Transforms/TestVectorToSCFConversion.cpp

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 65d05a7aea53b..5d8318483c3ca 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -271,6 +271,20 @@ def ConvertStandardToSPIRV : Pass<"convert-std-to-spirv", "ModuleOp"> {
   let constructor = "mlir::createConvertStandardToSPIRVPass()";
 }
 
+//===----------------------------------------------------------------------===//
+// VectorToSCF
+//===----------------------------------------------------------------------===//
+
+def ConvertVectorToSCF : FunctionPass<"convert-vector-to-scf"> {
+  let summary = "Lower the operations from the vector dialect into the SCF "
+                "dialect";
+  let constructor = "mlir::createConvertVectorToSCFPass()";
+  let options = [
+    Option<"fullUnroll", "full-unroll", "bool", /*default=*/"false",
+           "Perform full unrolling when converting vector transfers to SCF">,
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // VectorToLLVM
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h b/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h
index d7a6f829f10fd..f34a5762dfa38 100644
--- a/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h
+++ b/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h
@@ -14,6 +14,7 @@
 namespace mlir {
 class MLIRContext;
 class OwningRewritePatternList;
+class Pass;
 
 /// Control whether unrolling is used when lowering vector transfer ops to SCF.
 ///
@@ -164,6 +165,10 @@ void populateVectorToSCFConversionPatterns(
     OwningRewritePatternList &patterns, MLIRContext *context,
     const VectorTransferToSCFOptions &options = VectorTransferToSCFOptions());
 
+/// Create a pass to convert a subset of vector ops to SCF.
+std::unique_ptr<Pass> createConvertVectorToSCFPass(
+    const VectorTransferToSCFOptions &options = VectorTransferToSCFOptions());
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_VECTORTOSCF_VECTORTOSCF_H_
diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.h b/mlir/include/mlir/Dialect/Vector/VectorOps.h
index 423c72da64712..8c8424e8ef8f9 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.h
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.h
@@ -56,6 +56,11 @@ enum class VectorContractLowering {
 /// Structure to control the behavior of vector transform patterns.
 struct VectorTransformsOptions {
   VectorContractLowering vectorContractLowering = VectorContractLowering::FMA;
+  VectorTransformsOptions &
+  setVectorTransformsOptions(VectorContractLowering opt) {
+    vectorContractLowering = opt;
+    return *this;
+  }
 };
 
 /// Collect a set of transformation patterns that are related to contracting
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index fb2ac1ee086f2..95f9ce1c4e1fa 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -28,6 +28,7 @@
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
 #include "mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h"
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 5b3a01c7512f3..4185eae170a41 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -124,6 +124,89 @@ static SmallVector<int64_t, 4> getI64SubArray(ArrayAttr arrayAttr,
   return res;
 }
 
+template <typename TransferOp>
+LogicalResult getVectorTransferAlignment(LLVMTypeConverter &typeConverter,
+                                         TransferOp xferOp, unsigned &align) {
+  Type elementTy =
+      typeConverter.convertType(xferOp.getMemRefType().getElementType());
+  if (!elementTy)
+    return failure();
+
+  auto dataLayout = typeConverter.getDialect()->getLLVMModule().getDataLayout();
+  align = dataLayout.getPrefTypeAlignment(
+      elementTy.cast<LLVM::LLVMType>().getUnderlyingType());
+  return success();
+}
+
+static LogicalResult
+replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter,
+                                 LLVMTypeConverter &typeConverter, Location loc,
+                                 TransferReadOp xferOp,
+                                 ArrayRef<Value> operands, Value dataPtr) {
+  rewriter.replaceOpWithNewOp<LLVM::LoadOp>(xferOp, dataPtr);
+  return success();
+}
+
+static LogicalResult
+replaceTransferOpWithMasked(ConversionPatternRewriter &rewriter,
+                            LLVMTypeConverter &typeConverter, Location loc,
+                            TransferReadOp xferOp, ArrayRef<Value> operands,
+                            Value dataPtr, Value mask) {
+  auto toLLVMTy = [&](Type t) { return typeConverter.convertType(t); };
+  VectorType fillType = xferOp.getVectorType();
+  Value fill = rewriter.create<SplatOp>(loc, fillType, xferOp.padding());
+  fill = rewriter.create<LLVM::DialectCastOp>(loc, toLLVMTy(fillType), fill);
+
+  Type vecTy = typeConverter.convertType(xferOp.getVectorType());
+  if (!vecTy)
+    return failure();
+
+  unsigned align;
+  if (failed(getVectorTransferAlignment(typeConverter, xferOp, align)))
+    return failure();
+
+  rewriter.replaceOpWithNewOp<LLVM::MaskedLoadOp>(
+      xferOp, vecTy, dataPtr, mask, ValueRange{fill},
+      rewriter.getI32IntegerAttr(align));
+  return success();
+}
+
+static LogicalResult
+replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter,
+                                 LLVMTypeConverter &typeConverter, Location loc,
+                                 TransferWriteOp xferOp,
+                                 ArrayRef<Value> operands, Value dataPtr) {
+  auto adaptor = TransferWriteOpOperandAdaptor(operands);
+  rewriter.replaceOpWithNewOp<LLVM::StoreOp>(xferOp, adaptor.vector(), dataPtr);
+  return success();
+}
+
+static LogicalResult
+replaceTransferOpWithMasked(ConversionPatternRewriter &rewriter,
+                            LLVMTypeConverter &typeConverter, Location loc,
+                            TransferWriteOp xferOp, ArrayRef<Value> operands,
+                            Value dataPtr, Value mask) {
+  unsigned align;
+  if (failed(getVectorTransferAlignment(typeConverter, xferOp, align)))
+    return failure();
+
+  auto adaptor = TransferWriteOpOperandAdaptor(operands);
+  rewriter.replaceOpWithNewOp<LLVM::MaskedStoreOp>(
+      xferOp, adaptor.vector(), dataPtr, mask,
+      rewriter.getI32IntegerAttr(align));
+  return success();
+}
+
+static TransferReadOpOperandAdaptor
+getTransferOpAdapter(TransferReadOp xferOp, ArrayRef<Value> operands) {
+  return TransferReadOpOperandAdaptor(operands);
+}
+
+static TransferWriteOpOperandAdaptor
+getTransferOpAdapter(TransferWriteOp xferOp, ArrayRef<Value> operands) {
+  return TransferWriteOpOperandAdaptor(operands);
+}
+
 namespace {
 
 /// Conversion pattern for a vector.matrix_multiply.
@@ -767,108 +850,6 @@ class VectorTypeCastOpConversion : public ConvertToLLVMPattern {
   }
 };
 
-LogicalResult getLLVMTypeAndAlignment(LLVMTypeConverter &typeConverter,
-                                      Type type, LLVM::LLVMType &llvmType,
-                                      unsigned &align) {
-  auto convertedType = typeConverter.convertType(type);
-  if (!convertedType)
-    return failure();
-
-  llvmType = convertedType.template cast<LLVM::LLVMType>();
-  auto dataLayout = typeConverter.getDialect()->getLLVMModule().getDataLayout();
-  align = dataLayout.getPrefTypeAlignment(llvmType.getUnderlyingType());
-  return success();
-}
-
-LogicalResult
-replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter,
-                                 LLVMTypeConverter &typeConverter, Location loc,
-                                 TransferReadOp xferOp,
-                                 ArrayRef<Value> operands, Value dataPtr) {
-  LLVM::LLVMType vecTy;
-  unsigned align;
-  if (failed(getLLVMTypeAndAlignment(typeConverter, xferOp.getVectorType(),
-                                     vecTy, align)))
-    return failure();
-  rewriter.replaceOpWithNewOp<LLVM::LoadOp>(xferOp, dataPtr);
-  return success();
-}
-
-LogicalResult replaceTransferOpWithMasked(ConversionPatternRewriter &rewriter,
-                                          LLVMTypeConverter &typeConverter,
-                                          Location loc, TransferReadOp xferOp,
-                                          ArrayRef<Value> operands,
-                                          Value dataPtr, Value mask) {
-  auto toLLVMTy = [&](Type t) { return typeConverter.convertType(t); };
-  VectorType fillType = xferOp.getVectorType();
-  Value fill = rewriter.create<SplatOp>(loc, fillType, xferOp.padding());
-  fill = rewriter.create<LLVM::DialectCastOp>(loc, toLLVMTy(fillType), fill);
-
-  LLVM::LLVMType vecTy;
-  unsigned align;
-  if (failed(getLLVMTypeAndAlignment(typeConverter, xferOp.getVectorType(),
-                                     vecTy, align)))
-    return failure();
-
-  rewriter.replaceOpWithNewOp<LLVM::MaskedLoadOp>(
-      xferOp, vecTy, dataPtr, mask, ValueRange{fill},
-      rewriter.getI32IntegerAttr(align));
-  return success();
-}
-
-LogicalResult
-replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter,
-                                 LLVMTypeConverter &typeConverter, Location loc,
-                                 TransferWriteOp xferOp,
-                                 ArrayRef<Value> operands, Value dataPtr) {
-  auto adaptor = TransferWriteOpOperandAdaptor(operands);
-  LLVM::LLVMType vecTy;
-  unsigned align;
-  if (failed(getLLVMTypeAndAlignment(typeConverter, xferOp.getVectorType(),
-                                     vecTy, align)))
-    return failure();
-  rewriter.replaceOpWithNewOp<LLVM::StoreOp>(xferOp, adaptor.vector(), dataPtr);
-  return success();
-}
-
-LogicalResult replaceTransferOpWithMasked(ConversionPatternRewriter &rewriter,
-                                          LLVMTypeConverter &typeConverter,
-                                          Location loc, TransferWriteOp xferOp,
-                                          ArrayRef<Value> operands,
-                                          Value dataPtr, Value mask) {
-  auto adaptor = TransferWriteOpOperandAdaptor(operands);
-  LLVM::LLVMType vecTy;
-  unsigned align;
-  if (failed(getLLVMTypeAndAlignment(typeConverter, xferOp.getVectorType(),
-                                     vecTy, align)))
-    return failure();
-
-  rewriter.replaceOpWithNewOp<LLVM::MaskedStoreOp>(
-      xferOp, adaptor.vector(), dataPtr, mask,
-      rewriter.getI32IntegerAttr(align));
-  return success();
-}
-
-static TransferReadOpOperandAdaptor
-getTransferOpAdapter(TransferReadOp xferOp, ArrayRef<Value> operands) {
-  return TransferReadOpOperandAdaptor(operands);
-}
-
-static TransferWriteOpOperandAdaptor
-getTransferOpAdapter(TransferWriteOp xferOp, ArrayRef<Value> operands) {
-  return TransferWriteOpOperandAdaptor(operands);
-}
-
-bool isMinorIdentity(AffineMap map, unsigned rank) {
-  if (map.getNumResults() < rank)
-    return false;
-  unsigned startDim = map.getNumDims() - rank;
-  for (unsigned i = 0; i < rank; ++i)
-    if (map.getResult(i) != getAffineDimExpr(startDim + i, map.getContext()))
-      return false;
-  return true;
-}
-
 /// Conversion pattern that converts a 1-D vector transfer read/write op in a
 /// sequence of:
 /// 1. Bitcast or addrspacecast to vector form.
@@ -892,8 +873,10 @@ class VectorTransferConversion : public ConvertToLLVMPattern {
     if (xferOp.getVectorType().getRank() > 1 ||
         llvm::size(xferOp.indices()) == 0)
       return failure();
-    if (!isMinorIdentity(xferOp.permutation_map(),
-                         xferOp.getVectorType().getRank()))
+    if (xferOp.permutation_map() !=
+        AffineMap::getMinorIdentityMap(xferOp.permutation_map().getNumInputs(),
+                                       xferOp.getVectorType().getRank(),
+                                       op->getContext()))
       return failure();
 
     auto toLLVMTy = [&](Type t) { return typeConverter.convertType(t); };
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
index 8c72800819a5b..6816bc749de9f 100644
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -13,6 +13,8 @@
 #include <type_traits>
 
 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
+
+#include "../PassDetail.h"
 #include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
 #include "mlir/Dialect/SCF/EDSC/Builders.h"
 #include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
@@ -29,6 +31,8 @@
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/Types.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Passes.h"
 
 using namespace mlir;
 using namespace mlir::edsc;
@@ -349,7 +353,7 @@ LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
 }
 
 } // namespace
-  
+
 /// Analyzes the `transfer` to find an access dimension along the fastest remote
 /// MemRef dimension. If such a dimension with coalescing properties is found,
 /// `pivs` and `vectorBoundsCapture` are swapped so that the invocation of
@@ -435,7 +439,7 @@ clip(TransferOpTy transfer, MemRefBoundsCapture &bounds, ArrayRef<Value> ivs) {
 }
 
 namespace mlir {
-  
+
 template <typename TransferOpTy>
 VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
     VectorTransferToSCFOptions options, MLIRContext *context)
@@ -631,3 +635,28 @@ void populateVectorToSCFConversionPatterns(
 
 } // namespace mlir
 
+namespace {
+
+struct ConvertVectorToSCFPass
+    : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
+  ConvertVectorToSCFPass() = default;
+  ConvertVectorToSCFPass(const ConvertVectorToSCFPass &pass) {}
+  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
+    this->fullUnroll = options.unroll;
+  }
+
+  void runOnFunction() override {
+    OwningRewritePatternList patterns;
+    auto *context = getFunction().getContext();
+    populateVectorToSCFConversionPatterns(
+        patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
+    applyPatternsAndFoldGreedily(getFunction(), patterns);
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass>
+mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
+  return std::make_unique<ConvertVectorToSCFPass>(options);
+}
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 6150ac78fc2a5..3662c24288865 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -818,7 +818,7 @@ func @transfer_read_1d(%A : memref<?xf32>, %base: index) -> vector<17xf32> {
 //       CHECK: %[[PASS_THROUGH:.*]] =  llvm.mlir.constant(dense<7.000000e+00> :
 //  CHECK-SAME:  vector<17xf32>) : !llvm<"<17 x float>">
 //       CHECK: %[[loaded:.*]] = llvm.intr.masked.load %[[vecPtr]], %[[mask]],
-//  CHECK-SAME: %[[PASS_THROUGH]] {alignment = 128 : i32} :
+//  CHECK-SAME: %[[PASS_THROUGH]] {alignment = 4 : i32} :
 //  CHECK-SAME: (!llvm<"<17 x float>*">, !llvm<"<17 x i1>">, !llvm<"<17 x float>">) -> !llvm<"<17 x float>">
 
 //
@@ -850,7 +850,7 @@ func @transfer_read_1d(%A : memref<?xf32>, %base: index) -> vector<17xf32> {
 //
 // 5. Rewrite as a masked write.
 //       CHECK: llvm.intr.masked.store %[[loaded]], %[[vecPtr_b]], %[[mask_b]]
-//  CHECK-SAME: {alignment = 128 : i32} :
+//  CHECK-SAME: {alignment = 4 : i32} :
 //  CHECK-SAME: !llvm<"<17 x float>">, !llvm<"<17 x i1>"> into !llvm<"<17 x float>*">
 
 func @transfer_read_2d_to_1d(%A : memref<?x?xf32>, %base0: index, %base1: index) -> vector<17xf32> {
diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
index dc35058cfd893..d4f22d2f66a6c 100644
--- a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
+++ b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -test-convert-vector-to-scf -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-convert-vector-to-scf=full-unroll=true -split-input-file | FileCheck %s --check-prefix=FULL-UNROLL
+// RUN: mlir-opt %s -convert-vector-to-scf -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -split-input-file | FileCheck %s --check-prefix=FULL-UNROLL
 
 // CHECK-LABEL: func @materialize_read_1d() {
 func @materialize_read_1d() {
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 4ea7498d34822..6069570316a8f 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -20,7 +20,6 @@ add_mlir_library(MLIRTestTransforms
   TestMemRefBoundCheck.cpp
   TestMemRefDependenceCheck.cpp
   TestMemRefStrideCalculation.cpp
-  TestVectorToSCFConversion.cpp
   TestVectorTransforms.cpp
 
   EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Transforms/TestVectorToSCFConversion.cpp b/mlir/test/lib/Transforms/TestVectorToSCFConversion.cpp
deleted file mode 100644
index 7a83e20e47acd..0000000000000
--- a/mlir/test/lib/Transforms/TestVectorToSCFConversion.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===- TestVectorToSCFConversion.cpp - Test VectorTransfers lowering ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <type_traits>
-
-#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/Passes.h"
-
-using namespace mlir;
-
-namespace {
-
-struct TestVectorToSCFPass
-    : public PassWrapper<TestVectorToSCFPass, FunctionPass> {
-  TestVectorToSCFPass() = default;
-  TestVectorToSCFPass(const TestVectorToSCFPass &pass) {}
-
-  Option<bool> fullUnroll{
-      *this, "full-unroll",
-      llvm::cl::desc(
-          "Perform full unrolling when converting vector transfers to SCF"),
-      llvm::cl::init(false)};
-
-  void runOnFunction() override {
-    OwningRewritePatternList patterns;
-    auto *context = &getContext();
-    populateVectorToSCFConversionPatterns(
-        patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
-    applyPatternsAndFoldGreedily(getFunction(), patterns);
-  }
-};
-
-} // end anonymous namespace
-
-namespace mlir {
-void registerTestVectorToSCFPass() {
-  PassRegistration<TestVectorToSCFPass> pass(
-      "test-convert-vector-to-scf",
-      "Converts vector transfer ops to loops over scalars and vector casts");
-}
-} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 159a7fd4bca54..2764b23b7b35e 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -62,7 +62,6 @@ void registerTestOpaqueLoc();
 void registerTestParallelismDetection();
 void registerTestGpuParallelLoopMappingPass();
 void registerTestVectorConversions();
-void registerTestVectorToSCFPass();
 void registerVectorizerTestPass();
 } // namespace mlir
 
@@ -133,7 +132,6 @@ void registerTestPasses() {
   registerTestParallelismDetection();
   registerTestGpuParallelLoopMappingPass();
   registerTestVectorConversions();
-  registerTestVectorToSCFPass();
   registerVectorizerTestPass();
 }
 #endif

From c7614faa05fa276f34626748af7f975630e46097 Mon Sep 17 00:00:00 2001
From: Siva Chandra Reddy <sivachandra@google.com>
Date: Thu, 28 May 2020 14:57:36 -0700
Subject: [PATCH 437/770] [libc][NFC][Obvious] Fix few header guards in
 src/threads.

---
 libc/src/threads/mtx_init.h    | 6 +++---
 libc/src/threads/mtx_lock.h    | 6 +++---
 libc/src/threads/mtx_unlock.h  | 6 +++---
 libc/src/threads/thrd_create.h | 6 +++---
 libc/src/threads/thrd_join.h   | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/libc/src/threads/mtx_init.h b/libc/src/threads/mtx_init.h
index d85089ff8c2e9..7eed5ece6d5e7 100644
--- a/libc/src/threads/mtx_init.h
+++ b/libc/src/threads/mtx_init.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_THREADS_LINUX_MTX_INIT_H
-#define LLVM_LIBC_SRC_THREADS_LINUX_MTX_INIT_H
+#ifndef LLVM_LIBC_SRC_THREADS_MTX_INIT_H
+#define LLVM_LIBC_SRC_THREADS_MTX_INIT_H
 
 #include "include/threads.h"
 
@@ -17,4 +17,4 @@ int mtx_init(mtx_t *mutex, int type);
 
 } // namespace __llvm_libc
 
-#endif // LLVM_LIBC_SRC_THREADS_LINUX_MTX_INIT_H
+#endif // LLVM_LIBC_SRC_THREADS_MTX_INIT_H
diff --git a/libc/src/threads/mtx_lock.h b/libc/src/threads/mtx_lock.h
index fee6285b8daec..5086f773f0fe6 100644
--- a/libc/src/threads/mtx_lock.h
+++ b/libc/src/threads/mtx_lock.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_THREADS_LINUX_MTX_LOCK_H
-#define LLVM_LIBC_SRC_THREADS_LINUX_MTX_LOCK_H
+#ifndef LLVM_LIBC_SRC_THREADS_MTX_LOCK_H
+#define LLVM_LIBC_SRC_THREADS_MTX_LOCK_H
 
 #include "include/threads.h"
 
@@ -17,4 +17,4 @@ int mtx_lock(mtx_t *mutex);
 
 } // namespace __llvm_libc
 
-#endif // LLVM_LIBC_SRC_THREADS_LINUX_MTX_LOCK_H
+#endif // LLVM_LIBC_SRC_THREADS_MTX_LOCK_H
diff --git a/libc/src/threads/mtx_unlock.h b/libc/src/threads/mtx_unlock.h
index b7ae913170a41..55f0b4a7a6655 100644
--- a/libc/src/threads/mtx_unlock.h
+++ b/libc/src/threads/mtx_unlock.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_THREADS_LINUX_MTX_UNLOCK_H
-#define LLVM_LIBC_SRC_THREADS_LINUX_MTX_UNLOCK_H
+#ifndef LLVM_LIBC_SRC_THREADS_MTX_UNLOCK_H
+#define LLVM_LIBC_SRC_THREADS_MTX_UNLOCK_H
 
 #include "include/threads.h"
 
@@ -17,4 +17,4 @@ int mtx_unlock(mtx_t *mutex);
 
 } // namespace __llvm_libc
 
-#endif // LLVM_LIBC_SRC_THREADS_LINUX_MTX_UNLOCK_H
+#endif // LLVM_LIBC_SRC_THREADS_MTX_UNLOCK_H
diff --git a/libc/src/threads/thrd_create.h b/libc/src/threads/thrd_create.h
index 2ad3c7c737b25..d2bb7dfad41c5 100644
--- a/libc/src/threads/thrd_create.h
+++ b/libc/src/threads/thrd_create.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_THREADS_LINUX_THRD_CREATE_H
-#define LLVM_LIBC_SRC_THREADS_LINUX_THRD_CREATE_H
+#ifndef LLVM_LIBC_SRC_THREADS_THRD_CREATE_H
+#define LLVM_LIBC_SRC_THREADS_THRD_CREATE_H
 
 #include "include/threads.h"
 
@@ -17,4 +17,4 @@ int thrd_create(thrd_t *thread, thrd_start_t func, void *arg);
 
 } // namespace __llvm_libc
 
-#endif // LLVM_LIBC_SRC_THREADS_LINUX_THRD_CREATE_H
+#endif // LLVM_LIBC_SRC_THREADS_THRD_CREATE_H
diff --git a/libc/src/threads/thrd_join.h b/libc/src/threads/thrd_join.h
index 9df4644423d89..fc36503dc521c 100644
--- a/libc/src/threads/thrd_join.h
+++ b/libc/src/threads/thrd_join.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_THREADS_LINUX_THRD_JOIN_H
-#define LLVM_LIBC_SRC_THREADS_LINUX_THRD_JOIN_H
+#ifndef LLVM_LIBC_SRC_THREADS_THRD_JOIN_H
+#define LLVM_LIBC_SRC_THREADS_THRD_JOIN_H
 
 #include "include/threads.h"
 
@@ -17,4 +17,4 @@ int thrd_join(thrd_t *thread, int *retval);
 
 } // namespace __llvm_libc
 
-#endif // LLVM_LIBC_SRC_THREADS_LINUX_THRD_JOIN_H
+#endif // LLVM_LIBC_SRC_THREADS_THRD_JOIN_H

From ac1d93c53b6a527d43e29a93ff2df9948b7c3e87 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 28 May 2020 15:10:29 -0700
Subject: [PATCH 438/770] [X86] Fix a comment reference to registers R8L..R15L
 to use R8B..R15B like everywhere else. NFC

A new Intel SDM was released today that also fixes this issue in
some documentation.
---
 llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index baf842b12a27b..8e5311054e24a 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -374,7 +374,7 @@ enum ModRMDecisionType {
   ENUM_ENTRY(ENCODING_IW,     "2-byte")                                        \
   ENUM_ENTRY(ENCODING_ID,     "4-byte")                                        \
   ENUM_ENTRY(ENCODING_IO,     "8-byte")                                        \
-  ENUM_ENTRY(ENCODING_RB,     "(AL..DIL, R8L..R15L) Register code added to "   \
+  ENUM_ENTRY(ENCODING_RB,     "(AL..DIL, R8B..R15B) Register code added to "   \
                               "the opcode byte")                               \
   ENUM_ENTRY(ENCODING_RW,     "(AX..DI, R8W..R15W)")                           \
   ENUM_ENTRY(ENCODING_RD,     "(EAX..EDI, R8D..R15D)")                         \

From 85bf78df654bfbf021192332b9f6dac4f43b01bb Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 15:16:36 -0700
Subject: [PATCH 439/770] [Tests] Update a few more statepoint tests

Starting to work through the hard ones now, progress likely to slow dramatically.
---
 .../CodeGen/X86/combineIncDecVector-crash.ll   | 11 ++++++-----
 .../test/CodeGen/X86/non-value-mem-operand.mir |  6 +++---
 llvm/test/CodeGen/X86/statepoint-allocas.ll    |  2 +-
 .../CodeGen/X86/statepoint-stackmap-format.ll  | 18 +++++++++---------
 llvm/test/CodeGen/X86/statepoint-uniqueing.ll  | 16 ++++++++--------
 5 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll b/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
index 8dea7a5fdcdce..a6d7d9fac36a4 100644
--- a/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
+++ b/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
@@ -29,11 +29,12 @@ bci_0:
    %token418 = call token (i64, i32, i8 * (i64, i32, i32, i32)*, i32,
 i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i8i64i32i32i32f(i64
 2882400000, i32 0, i8 * (i64, i32, i32, i32)* nonnull @newarray, i32 4,
-i32 0, i64 undef, i32 10, i32 10, i32 400, i32 0, i32 35, i32 0, i32 1,
-i32 0, i32 43, i32 1, i32 13, i32 0, i32 3, i32 400, i32 3, i32 %0, i32
-4, i64 %1, i32 7, i8* null, i32 3, i32 -11464, i32 7, i8* null, i32 3,
-i32 -243, i32 3, i32 14, i32 3, i32 117, i32 3, i32 -13, i32 3, i32 -15,
-i32 3, i32 -210, i32 3, i32 541, i32 7, i8* null)
+i32 0, i64 undef, i32 10, i32 10, i32 400, i32 0, i32 0) ["deopt"
+(i32 35, i32 0, i32 1, i32 0, i32 43, i32 1, i32 13, i32 0, i32 3,
+i32 400, i32 3, i32 %0, i32 4, i64 %1, i32 7, i8* null, i32 3,
+i32 -11464, i32 7, i8* null, i32 3, i32 -243, i32 3, i32 14, i32 3,
+i32 117, i32 3, i32 -13, i32 3, i32 -15, i32 3, i32 -210, i32 3,
+i32 541, i32 7, i8* null)]
    %v2 = load atomic float, float * undef unordered, align 4
    %v3 = load <4 x i32>, <4 x i32> * undef, align 4
    %v4 = add <4 x i32> %v3, <i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/CodeGen/X86/non-value-mem-operand.mir b/llvm/test/CodeGen/X86/non-value-mem-operand.mir
index 8682d0657b211..f4da5729db02b 100644
--- a/llvm/test/CodeGen/X86/non-value-mem-operand.mir
+++ b/llvm/test/CodeGen/X86/non-value-mem-operand.mir
@@ -34,7 +34,7 @@
     br i1 undef, label %bb26, label %bb15
   
   bb15:                                             ; preds = %bb7
-    %tmp16 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* nonnull @wibble, i32 0, i32 0, i32 0, i32 30, i32 1, i32 0, i32 99, i32 0, i32 12, i32 0, i32 10, i32 %tmp10, i32 10, i32 0, i32 10, i32 %tmp12, i32 10, i32 undef, i32 6, float undef, i32 7, double %tmp13, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* %tmp, i32 7, double undef, i32 99, i8* null, i8* undef)
+    %tmp16 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* nonnull @wibble, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 99, i32 0, i32 12, i32 0, i32 10, i32 %tmp10, i32 10, i32 0, i32 10, i32 %tmp12, i32 10, i32 undef, i32 6, float undef, i32 7, double %tmp13, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* %tmp, i32 7, double undef, i32 99, i8* null, i8* undef)]
     br label %bb26
   
   bb26:                                             ; preds = %bb15, %bb7
@@ -95,11 +95,11 @@
     %tmp53 = phi double [ 2.000000e+00, %bb51.loopexit ], [ 0.000000e+00, %bb2 ]
     %tmp54 = phi i32 [ %tmp9, %bb51.loopexit ], [ undef, %bb2 ]
     %tmp56 = add i32 %tmp54, 0
-    %tmp57 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 -121, i32 0, i32 38, i32 1, i32 0, i32 270, i32 4, i32 12, i32 0, i32 11, i64 undef, i32 99, i8* null, i32 10, i32 %tmp56, i32 6, float undef, i32 99, i8* null, i32 99, i8* null, i32 10, i32 %tmp52, i32 10, i32 undef, i32 99, i8* null, i32 7, double %tmp53, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* undef, i32 99, i8* null, i32 99, i8* null, i8* undef)
+    %tmp57 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 -121, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 270, i32 4, i32 12, i32 0, i32 11, i64 undef, i32 99, i8* null, i32 10, i32 %tmp56, i32 6, float undef, i32 99, i8* null, i32 99, i8* null, i32 10, i32 %tmp52, i32 10, i32 undef, i32 99, i8* null, i32 7, double %tmp53, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* undef, i32 99, i8* null, i32 99, i8* null, i8* undef)]
     unreachable
   
   bb59:                                             ; preds = %bb2
-    %tmp61 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 8, i32 0, i32 38, i32 1, i32 0, i32 123, i32 4, i32 12, i32 0, i32 13, i8* null, i32 99, i32 undef, i32 13, i8* null, i32 10, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i8* null, i32 99, float undef, i32 99, double undef, i32 99, i8* null, i32 99, double undef, i32 99, i8* null, i32 13, i8* null, i32 99, double undef, i32 99, i8* null)
+    %tmp61 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 8, i32 0, i32 0) ["deopt" (i32 1, i32 0, i32 123, i32 4, i32 12, i32 0, i32 13, i8* null, i32 99, i32 undef, i32 13, i8* null, i32 10, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i8* null, i32 99, float undef, i32 99, double undef, i32 99, i8* null, i32 99, double undef, i32 99, i8* null, i32 13, i8* null, i32 99, double undef, i32 99, i8* null)]
     unreachable
   }
   
diff --git a/llvm/test/CodeGen/X86/statepoint-allocas.ll b/llvm/test/CodeGen/X86/statepoint-allocas.ll
index e469f38b311eb..ba2b6ef8ca43c 100644
--- a/llvm/test/CodeGen/X86/statepoint-allocas.ll
+++ b/llvm/test/CodeGen/X86/statepoint-allocas.ll
@@ -47,7 +47,7 @@ define i32 addrspace(1)* @test2(i32 addrspace(1)* %ptr) gc "statepoint-example"
 entry:
   %alloca = alloca i32 addrspace(1)*, align 8
   store i32 addrspace(1)* %ptr, i32 addrspace(1)** %alloca
-  call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 1, i32 addrspace(1)** %alloca)
+  call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 addrspace(1)** %alloca)]
   ret i32 addrspace(1)* null
 }
 
diff --git a/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll b/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll
index ee71f6e70ef40..71f7663cafd6c 100644
--- a/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -25,11 +25,11 @@ entry:
   %metadata1 = alloca i32 addrspace(1)*, i32 2, align 8
   store i32 addrspace(1)* null, i32 addrspace(1)** %metadata1
   %ptr_derived = getelementptr i32, i32 addrspace(1)* %ptr_base, i32 %arg
-  %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr_base, i32 addrspace(1)* null, i32 addrspace(1)* %ptr_base, i32 addrspace(1)* %ptr_derived, i32 addrspace(1)* null)
+  %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr_base, i32 addrspace(1)* %ptr_derived, i32 addrspace(1)* null) ["deopt" (i32 addrspace(1)* %ptr_base, i32 addrspace(1)* null)]
   %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
-  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 9, i32 9)
-  %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 9, i32 10)
-  %c = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 11, i32 11)
+  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7)
+  %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 8)
+  %c = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 9, i32 9)
 ; 
   ret i1 %call1
 }
@@ -53,11 +53,11 @@ define i1 @test_derived_arg(i32 addrspace(1)* %ptr_base,
 entry:
   %metadata1 = alloca i32 addrspace(1)*, i32 2, align 8
   store i32 addrspace(1)* null, i32 addrspace(1)** %metadata1
-  %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr_base, i32 addrspace(1)* null, i32 addrspace(1)* %ptr_base, i32 addrspace(1)* %ptr_derived, i32 addrspace(1)* null)
+  %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr_base, i32 addrspace(1)* %ptr_derived, i32 addrspace(1)* null) ["deopt" (i32 addrspace(1)* %ptr_base, i32 addrspace(1)* null)]
   %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
-  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 9, i32 9)
-  %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 9, i32 10)
-  %c = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 11, i32 11)
+  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7)
+  %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 8)
+  %c = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 9, i32 9)
 ; 
   ret i1 %call1
 }
@@ -116,7 +116,7 @@ entry:
   br label %bb
 
 bb:                                               ; preds = %entry
-  %statepoint_token = call token (i64, i32, void (%struct*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp0s_structsf(i64 0, i32 0, void (%struct*)* @use, i32 1, i32 0, %struct* %x, i32 0, i32 1, %struct* %x)
+  %statepoint_token = call token (i64, i32, void (%struct*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp0s_structsf(i64 0, i32 0, void (%struct*)* @use, i32 1, i32 0, %struct* %x, i32 0, i32 0) ["deopt" (%struct* %x)]
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/statepoint-uniqueing.ll b/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
index ae7c076ab12e8..2a2fb10a7c2a0 100644
--- a/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
+++ b/llvm/test/CodeGen/X86/statepoint-uniqueing.ll
@@ -27,9 +27,9 @@ define void @test_gcrelocate_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-ex
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
-      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 undef, i32 addrspace(1)* %ptr, i32 addrspace(1)* %ptr)
-  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 9, i32 9)
-  %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 10, i32 10)
+      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr, i32 addrspace(1)* %ptr) ["deopt" (i32 addrspace(1)* %ptr, i32 undef)]
+  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 7, i32 7)
+  %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 8, i32 8)
   call void (...) @use(i32 addrspace(1)* %a, i32 addrspace(1)* %b)
   ret void
 }
@@ -52,9 +52,9 @@ define void @test_gcptr_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example
 ; CHECK-NEXT:    retq
   %ptr2 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
   %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
-      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 undef, i32 addrspace(1)* %ptr, i8 addrspace(1)* %ptr2)
-  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 9, i32 9)
-  %b = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 10, i32 10)
+      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr, i8 addrspace(1)* %ptr2) ["deopt" (i32 addrspace(1)* %ptr, i32 undef)]
+  %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 7, i32 7)
+  %b = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 8, i32 8)
   call void (...) @use(i32 addrspace(1)* %a, i8 addrspace(1)* %b)
   ret void
 }
@@ -73,7 +73,7 @@ define void @test_deopt_use(i32 addrspace(1)* %ptr) gc "statepoint-example" {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   tail call token (i64, i32, void ()*, i32, i32, ...)
-      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 undef, i32 addrspace(1)* %ptr)
+      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr) ["deopt" (i32 addrspace(1)* %ptr, i32 undef)]
   ret void
 }
 
@@ -89,6 +89,6 @@ define void @test_dse(i32 addrspace(1)* %ptr) gc "statepoint-example" {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   tail call token (i64, i32, void ()*, i32, i32, ...)
-      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 undef, i32 undef, i32 addrspace(1)* %ptr)
+      @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %ptr)
   ret void
 }

From bce702e5f28fd5358fb734932828f030babf0499 Mon Sep 17 00:00:00 2001
From: Eric Christopher <echristo@gmail.com>
Date: Thu, 28 May 2020 14:54:49 -0700
Subject: [PATCH 440/770] unsigned -> Register for readability.

---
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index eb5b7ef4dab5d..d1f2f4fce08ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -619,7 +619,7 @@ bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
   // we don't have anything that canonicalizes operand order.
   if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
     if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) {
-      unsigned Op1 = getRegForValue(I->getOperand(1));
+      Register Op1 = getRegForValue(I->getOperand(1));
       if (!Op1)
         return false;
       bool Op1IsKill = hasTrivialKill(I->getOperand(1));
@@ -1941,7 +1941,7 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
       return selectCast(I, ISD::ZERO_EXTEND);
     if (DstVT.bitsLT(SrcVT))
       return selectCast(I, ISD::TRUNCATE);
-    unsigned Reg = getRegForValue(I->getOperand(0));
+    Register Reg = getRegForValue(I->getOperand(0));
     if (!Reg)
       return false;
     updateValueMap(I, Reg);
@@ -2071,7 +2071,7 @@ Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op,
     if (!MRI.constrainRegClass(Op, RegClass)) {
       // If it's not legal to COPY between the register classes, something
       // has gone very wrong before we got here.
-      unsigned NewOp = createResultReg(RegClass);
+      Register NewOp = createResultReg(RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), NewOp).addReg(Op);
       return NewOp;

From b0b2507717ca904ffe50248a44a5c653946b6732 Mon Sep 17 00:00:00 2001
From: Marius Brehler <marius.brehler@iml.fraunhofer.de>
Date: Fri, 29 May 2020 00:31:28 +0200
Subject: [PATCH 441/770] [mlir] Add test to check if standalone dialect is
 registered

Summary: Add a test to check if the standalone dialect is registered within standalone-opt. This is similar to the mlir-opt commandline.mlir test.

Reviewers: Kayjukh, stephenneuendorffer

Reviewed By: Kayjukh

Subscribers: mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, liufengdb, Joonsoo, grosul1, frgossen, jurahul, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80764
---
 mlir/examples/standalone/test/Standalone/standalone-opt.mlir | 3 +++
 mlir/test/Examples/standalone/test.toy                       | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)
 create mode 100644 mlir/examples/standalone/test/Standalone/standalone-opt.mlir

diff --git a/mlir/examples/standalone/test/Standalone/standalone-opt.mlir b/mlir/examples/standalone/test/Standalone/standalone-opt.mlir
new file mode 100644
index 0000000000000..fac08144ec39c
--- /dev/null
+++ b/mlir/examples/standalone/test/Standalone/standalone-opt.mlir
@@ -0,0 +1,3 @@
+// RUN: standalone-opt --show-dialects | FileCheck %s
+// CHECK: Registered Dialects:
+// CHECK: standalone
diff --git a/mlir/test/Examples/standalone/test.toy b/mlir/test/Examples/standalone/test.toy
index 034fd9385d464..161427951a989 100644
--- a/mlir/test/Examples/standalone/test.toy
+++ b/mlir/test/Examples/standalone/test.toy
@@ -1,4 +1,4 @@
 # RUN: %cmake %mlir_src_root/examples/standalone -DCMAKE_CXX_COMPILER=%host_cxx -DCMAKE_C_COMPILER=%host_cc -DMLIR_DIR=%llvm_lib_dir/cmake/mlir ; %cmake --build . --target check-standalone | tee %t | FileCheck %s
 
-# CHECK: Expected Passes: 2
+# CHECK: Expected Passes: 3
 # UNSUPPORTED: windows, android

From 0dfb43deb6d5511a8ea69eeb7373a212ebd6c9c1 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Thu, 28 May 2020 15:02:18 -0700
Subject: [PATCH 442/770] Fix handling of default arguments in
 __attribute__((enable_if)).

We didn't properly build default argument expressions previously -- we
failed to build the wrapper CXXDefaultArgExpr node, which meant that
std::source_location misbehaved, and we didn't perform default argument
instantiation when necessary, which meant that dependent default
arguments in function templates didn't work at all.
---
 clang/include/clang/Sema/Sema.h  |  3 ++-
 clang/lib/Sema/SemaExpr.cpp      |  3 ++-
 clang/lib/Sema/SemaOverload.cpp  | 42 +++++++++++++++-----------------
 clang/test/SemaCXX/enable_if.cpp | 12 +++++++++
 4 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index e63f65e2580cc..dc7ee2ddd0b89 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -3371,7 +3371,8 @@ class Sema final {
 
   /// Check the enable_if expressions on the given function. Returns the first
   /// failing attribute, or NULL if they were all successful.
-  EnableIfAttr *CheckEnableIf(FunctionDecl *Function, ArrayRef<Expr *> Args,
+  EnableIfAttr *CheckEnableIf(FunctionDecl *Function, SourceLocation CallLoc,
+                              ArrayRef<Expr *> Args,
                               bool MissingImplicitThis = false);
 
   /// Find the failed Boolean condition within a given Boolean
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 261e69b440524..4063289711cc5 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6060,7 +6060,8 @@ static void checkDirectCallValidity(Sema &S, const Expr *Fn,
   if (Callee->getMinRequiredArguments() > ArgExprs.size())
     return;
 
-  if (const EnableIfAttr *Attr = S.CheckEnableIf(Callee, ArgExprs, true)) {
+  if (const EnableIfAttr *Attr =
+          S.CheckEnableIf(Callee, Fn->getBeginLoc(), ArgExprs, true)) {
     S.Diag(Fn->getBeginLoc(),
            isa<CXXMethodDecl>(Callee)
                ? diag::err_ovl_no_viable_member_function_in_call
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 1b00b2b18572b..ad75529debdba 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6356,7 +6356,8 @@ void Sema::AddOverloadCandidate(
     }
   }
 
-  if (EnableIfAttr *FailedAttr = CheckEnableIf(Function, Args)) {
+  if (EnableIfAttr *FailedAttr =
+          CheckEnableIf(Function, CandidateSet.getLocation(), Args)) {
     Candidate.Viable = false;
     Candidate.FailureKind = ovl_fail_enable_if;
     Candidate.DeductionFailure.Data = FailedAttr;
@@ -6462,11 +6463,10 @@ Sema::SelectBestMethod(Selector Sel, MultiExprArg Args, bool IsInstance,
   return nullptr;
 }
 
-static bool
-convertArgsForAvailabilityChecks(Sema &S, FunctionDecl *Function, Expr *ThisArg,
-                                 ArrayRef<Expr *> Args, Sema::SFINAETrap &Trap,
-                                 bool MissingImplicitThis, Expr *&ConvertedThis,
-                                 SmallVectorImpl<Expr *> &ConvertedArgs) {
+static bool convertArgsForAvailabilityChecks(
+    Sema &S, FunctionDecl *Function, Expr *ThisArg, SourceLocation CallLoc,
+    ArrayRef<Expr *> Args, Sema::SFINAETrap &Trap, bool MissingImplicitThis,
+    Expr *&ConvertedThis, SmallVectorImpl<Expr *> &ConvertedArgs) {
   if (ThisArg) {
     CXXMethodDecl *Method = cast<CXXMethodDecl>(Function);
     assert(!isa<CXXConstructorDecl>(Method) &&
@@ -6511,17 +6511,7 @@ convertArgsForAvailabilityChecks(Sema &S, FunctionDecl *Function, Expr *ThisArg,
   if (!Function->isVariadic() && Args.size() < Function->getNumParams()) {
     for (unsigned i = Args.size(), e = Function->getNumParams(); i != e; ++i) {
       ParmVarDecl *P = Function->getParamDecl(i);
-      Expr *DefArg = P->hasUninstantiatedDefaultArg()
-                         ? P->getUninstantiatedDefaultArg()
-                         : P->getDefaultArg();
-      // This can only happen in code completion, i.e. when PartialOverloading
-      // is true.
-      if (!DefArg)
-        return false;
-      ExprResult R =
-          S.PerformCopyInitialization(InitializedEntity::InitializeParameter(
-                                          S.Context, Function->getParamDecl(i)),
-                                      SourceLocation(), DefArg);
+      ExprResult R = S.BuildCXXDefaultArgExpr(CallLoc, Function, P);
       if (R.isInvalid())
         return false;
       ConvertedArgs.push_back(R.get());
@@ -6533,7 +6523,9 @@ convertArgsForAvailabilityChecks(Sema &S, FunctionDecl *Function, Expr *ThisArg,
   return true;
 }
 
-EnableIfAttr *Sema::CheckEnableIf(FunctionDecl *Function, ArrayRef<Expr *> Args,
+EnableIfAttr *Sema::CheckEnableIf(FunctionDecl *Function,
+                                  SourceLocation CallLoc,
+                                  ArrayRef<Expr *> Args,
                                   bool MissingImplicitThis) {
   auto EnableIfAttrs = Function->specific_attrs<EnableIfAttr>();
   if (EnableIfAttrs.begin() == EnableIfAttrs.end())
@@ -6544,7 +6536,7 @@ EnableIfAttr *Sema::CheckEnableIf(FunctionDecl *Function, ArrayRef<Expr *> Args,
   // FIXME: We should look into making enable_if late-parsed.
   Expr *DiscardedThis;
   if (!convertArgsForAvailabilityChecks(
-          *this, Function, /*ThisArg=*/nullptr, Args, Trap,
+          *this, Function, /*ThisArg=*/nullptr, CallLoc, Args, Trap,
           /*MissingImplicitThis=*/true, DiscardedThis, ConvertedArgs))
     return *EnableIfAttrs.begin();
 
@@ -6874,7 +6866,8 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,
     }
   }
 
-  if (EnableIfAttr *FailedAttr = CheckEnableIf(Method, Args, true)) {
+  if (EnableIfAttr *FailedAttr =
+          CheckEnableIf(Method, CandidateSet.getLocation(), Args, true)) {
     Candidate.Viable = false;
     Candidate.FailureKind = ovl_fail_enable_if;
     Candidate.DeductionFailure.Data = FailedAttr;
@@ -7327,7 +7320,8 @@ void Sema::AddConversionCandidate(
            "Can only end up with a standard conversion sequence or failure");
   }
 
-  if (EnableIfAttr *FailedAttr = CheckEnableIf(Conversion, None)) {
+  if (EnableIfAttr *FailedAttr =
+          CheckEnableIf(Conversion, CandidateSet.getLocation(), None)) {
     Candidate.Viable = false;
     Candidate.FailureKind = ovl_fail_enable_if;
     Candidate.DeductionFailure.Data = FailedAttr;
@@ -7497,7 +7491,8 @@ void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion,
     }
   }
 
-  if (EnableIfAttr *FailedAttr = CheckEnableIf(Conversion, None)) {
+  if (EnableIfAttr *FailedAttr =
+          CheckEnableIf(Conversion, CandidateSet.getLocation(), None)) {
     Candidate.Viable = false;
     Candidate.FailureKind = ovl_fail_enable_if;
     Candidate.DeductionFailure.Data = FailedAttr;
@@ -14130,7 +14125,8 @@ Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
   // resolution process, we still need to handle the enable_if attribute. Do
   // that here, so it will not hide previous -- and more relevant -- errors.
   if (auto *MemE = dyn_cast<MemberExpr>(NakedMemExpr)) {
-    if (const EnableIfAttr *Attr = CheckEnableIf(Method, Args, true)) {
+    if (const EnableIfAttr *Attr =
+            CheckEnableIf(Method, LParenLoc, Args, true)) {
       Diag(MemE->getMemberLoc(),
            diag::err_ovl_no_viable_member_function_in_call)
           << Method << Method->getSourceRange();
diff --git a/clang/test/SemaCXX/enable_if.cpp b/clang/test/SemaCXX/enable_if.cpp
index 37664276e4708..50d898959c454 100644
--- a/clang/test/SemaCXX/enable_if.cpp
+++ b/clang/test/SemaCXX/enable_if.cpp
@@ -561,3 +561,15 @@ namespace IgnoreUnusedArgSideEffects {
   float &x = h();
 #endif
 }
+
+namespace DefaultArgs {
+  void f(int n = __builtin_LINE()) __attribute__((enable_if(n == 12345, "only callable on line 12345"))); // expected-note {{only callable on line 12345}}
+  void g() { f(); } // expected-error {{no matching function}}
+#line 12345
+  void h() { f(); }
+
+  template<typename T> void x(int n = T()) __attribute__((enable_if(n == 0, ""))) {} // expected-note {{candidate}}
+  void y() { x<int>(); }
+  struct Z { constexpr operator int() const { return 1; } };
+  void z() { x<Z>(); } // expected-error {{no matching function}}
+}

From 791c78f5e0ce84bcd4cfcccbbbbcc1259f4d7e0c Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Thu, 28 May 2020 15:36:17 -0700
Subject: [PATCH 443/770] [NFC,StackSafety] Add test flag

---
 llvm/lib/Analysis/StackSafetyAnalysis.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 1c59d5f2af966..f415facfbff48 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -31,6 +31,9 @@ using namespace llvm;
 static cl::opt<int> StackSafetyMaxIterations("stack-safety-max-iterations",
                                              cl::init(20), cl::Hidden);
 
+static cl::opt<int> StackSafetyPrint("stack-safety-print", cl::init(0),
+                                     cl::Hidden);
+
 namespace {
 
 /// Rewrite an SCEV expression for a memory access address to an expression that
@@ -681,6 +684,8 @@ const StackSafetyGlobalInfo::InfoTy &StackSafetyGlobalInfo::getInfo() const {
       }
     }
     Info.reset(new InfoTy{createGlobalStackSafetyInfo(std::move(Functions))});
+    if (StackSafetyPrint)
+      print(errs());
   }
   return *Info;
 }
@@ -689,7 +694,10 @@ StackSafetyGlobalInfo::StackSafetyGlobalInfo() = default;
 
 StackSafetyGlobalInfo::StackSafetyGlobalInfo(
     Module *M, std::function<const StackSafetyInfo &(Function &F)> GetSSI)
-    : M(M), GetSSI(GetSSI) {}
+    : M(M), GetSSI(GetSSI) {
+  if (StackSafetyPrint > 1)
+    getInfo();
+}
 
 StackSafetyGlobalInfo::StackSafetyGlobalInfo(StackSafetyGlobalInfo &&) =
     default;

From 57f1c43cda8a05920997cd50147ed01a124b4239 Mon Sep 17 00:00:00 2001
From: Ian Levesque <ianlevesque@fb.com>
Date: Thu, 28 May 2020 18:25:49 -0400
Subject: [PATCH 444/770] clang-format xray InstrumentationMap.cpp

---
 llvm/lib/XRay/InstrumentationMap.cpp | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/XRay/InstrumentationMap.cpp b/llvm/lib/XRay/InstrumentationMap.cpp
index cadaa4afeef14..9db9692c1f9eb 100644
--- a/llvm/lib/XRay/InstrumentationMap.cpp
+++ b/llvm/lib/XRay/InstrumentationMap.cpp
@@ -52,9 +52,9 @@ using RelocMap = DenseMap<uint64_t, uint64_t>;
 
 static Error
 loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
-          InstrumentationMap::SledContainer &Sleds,
-          InstrumentationMap::FunctionAddressMap &FunctionAddresses,
-          InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+        InstrumentationMap::SledContainer &Sleds,
+        InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+        InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
   InstrumentationMap Map;
 
   // Find the section named "xray_instr_map".
@@ -63,7 +63,8 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
         ObjFile.getBinary()->getArch() == Triple::ppc64le ||
         ObjFile.getBinary()->getArch() == Triple::aarch64))
     return make_error<StringError>(
-        "File format not supported (only does ELF and Mach-O little endian 64-bit).",
+        "File format not supported (only does ELF and Mach-O little endian "
+        "64-bit).",
         std::make_error_code(std::errc::not_supported));
 
   StringRef Contents = "";
@@ -94,11 +95,14 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
     uint32_t RelativeRelocation = [](object::ObjectFile *ObjFile) {
       if (const auto *ELFObj = dyn_cast<object::ELF32LEObjectFile>(ObjFile))
         return ELFObj->getELFFile()->getRelativeRelocationType();
-      else if (const auto *ELFObj = dyn_cast<object::ELF32BEObjectFile>(ObjFile))
+      else if (const auto *ELFObj =
+                   dyn_cast<object::ELF32BEObjectFile>(ObjFile))
         return ELFObj->getELFFile()->getRelativeRelocationType();
-      else if (const auto *ELFObj = dyn_cast<object::ELF64LEObjectFile>(ObjFile))
+      else if (const auto *ELFObj =
+                   dyn_cast<object::ELF64LEObjectFile>(ObjFile))
         return ELFObj->getELFFile()->getRelativeRelocationType();
-      else if (const auto *ELFObj = dyn_cast<object::ELF64BEObjectFile>(ObjFile))
+      else if (const auto *ELFObj =
+                   dyn_cast<object::ELF64BEObjectFile>(ObjFile))
         return ELFObj->getELFFile()->getRelativeRelocationType();
       else
         return static_cast<uint32_t>(0);
@@ -240,7 +244,8 @@ llvm::xray::loadInstrumentationMap(StringRef Filename) {
   if (!ObjectFileOrError) {
     auto E = ObjectFileOrError.takeError();
     // We try to load it as YAML if the ELF load didn't work.
-    Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
+    Expected<sys::fs::file_t> FdOrErr =
+        sys::fs::openNativeFileForRead(Filename);
     if (!FdOrErr) {
       // Report the ELF load error if YAML failed.
       consumeError(FdOrErr.takeError());
@@ -262,7 +267,7 @@ llvm::xray::loadInstrumentationMap(StringRef Filename) {
                           Map.FunctionAddresses, Map.FunctionIds))
       return std::move(E);
   } else if (auto E = loadObj(Filename, *ObjectFileOrError, Map.Sleds,
-                                Map.FunctionAddresses, Map.FunctionIds)) {
+                              Map.FunctionAddresses, Map.FunctionIds)) {
     return std::move(E);
   }
   return Map;

From e62d67f770d279d0ef7e6f1175e0a88aaffd58b0 Mon Sep 17 00:00:00 2001
From: Ian Levesque <ianlevesque@fb.com>
Date: Tue, 19 May 2020 01:38:14 -0400
Subject: [PATCH 445/770] [xray] Add llvm-xray extract support for 32 bit ARM

Summary:
XRay works on 32-bit ARM, but `llvm-xray extract` didn't support it.

See also another previous attempt in D77858.

Reviewers: MaskRay, dberris, johnislarry

Subscribers: kristof.beyls, hiraditya, danielkiss, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80185
---
 llvm/lib/XRay/InstrumentationMap.cpp          |  33 ++--
 .../tools/llvm-xray/ARM/extract-instrmap.test | 150 ++++++++++++++++++
 .../llvm-xray/X86/bad-instrmap-sizes.txt      |   2 +-
 .../tools/llvm-xray/X86/unsupported-elf32.txt |   2 +-
 4 files changed, 176 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/tools/llvm-xray/ARM/extract-instrmap.test

diff --git a/llvm/lib/XRay/InstrumentationMap.cpp b/llvm/lib/XRay/InstrumentationMap.cpp
index 9db9692c1f9eb..de0a9e60a5111 100644
--- a/llvm/lib/XRay/InstrumentationMap.cpp
+++ b/llvm/lib/XRay/InstrumentationMap.cpp
@@ -61,6 +61,7 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
   if ((!ObjFile.getBinary()->isELF() && !ObjFile.getBinary()->isMachO()) ||
       !(ObjFile.getBinary()->getArch() == Triple::x86_64 ||
         ObjFile.getBinary()->getArch() == Triple::ppc64le ||
+        ObjFile.getBinary()->getArch() == Triple::arm ||
         ObjFile.getBinary()->getArch() == Triple::aarch64))
     return make_error<StringError>(
         "File format not supported (only does ELF and Mach-O little endian "
@@ -115,7 +116,14 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
 
     for (const object::SectionRef &Section : Sections) {
       for (const object::RelocationRef &Reloc : Section.relocations()) {
-        if (SupportsRelocation && SupportsRelocation(Reloc.getType())) {
+        if (ObjFile.getBinary()->getArch() == Triple::arm) {
+          if (SupportsRelocation && SupportsRelocation(Reloc.getType())) {
+            Expected<uint64_t> ValueOrErr = Reloc.getSymbol()->getValue();
+            if (!ValueOrErr)
+              return ValueOrErr.takeError();
+            Relocs.insert({Reloc.getOffset(), Resolver(Reloc, *ValueOrErr, 0)});
+          }
+        } else if (SupportsRelocation && SupportsRelocation(Reloc.getType())) {
           auto AddendOrErr = object::ELFRelocationRef(Reloc).getAddend();
           auto A = AddendOrErr ? *AddendOrErr : 0;
           Expected<uint64_t> ValueOrErr = Reloc.getSymbol()->getValue();
@@ -133,12 +141,13 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
 
   // Copy the instrumentation map data into the Sleds data structure.
   auto C = Contents.bytes_begin();
-  static constexpr size_t ELF64SledEntrySize = 32;
+  bool Is32Bit = ObjFile.getBinary()->makeTriple().isArch32Bit();
+  size_t ELFSledEntrySize = Is32Bit ? 16 : 32;
 
-  if ((C - Contents.bytes_end()) % ELF64SledEntrySize != 0)
+  if ((C - Contents.bytes_end()) % ELFSledEntrySize != 0)
     return make_error<StringError>(
         Twine("Instrumentation map entries not evenly divisible by size of "
-              "an XRay sled entry in ELF64."),
+              "an XRay sled entry."),
         std::make_error_code(std::errc::executable_format_error));
 
   auto RelocateOrElse = [&](uint64_t Offset, uint64_t Address) {
@@ -151,20 +160,26 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
     return Address;
   };
 
-  const int WordSize = 8;
+  const int WordSize = Is32Bit ? 4 : 8;
   int32_t FuncId = 1;
   uint64_t CurFn = 0;
-  for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) {
+  for (; C != Contents.bytes_end(); C += ELFSledEntrySize) {
     DataExtractor Extractor(
-        StringRef(reinterpret_cast<const char *>(C), ELF64SledEntrySize), true,
+        StringRef(reinterpret_cast<const char *>(C), ELFSledEntrySize), true,
         8);
     Sleds.push_back({});
     auto &Entry = Sleds.back();
     uint64_t OffsetPtr = 0;
     uint64_t AddrOff = OffsetPtr;
-    Entry.Address = RelocateOrElse(AddrOff, Extractor.getU64(&OffsetPtr));
+    if (Is32Bit)
+      Entry.Address = RelocateOrElse(AddrOff, Extractor.getU32(&OffsetPtr));
+    else
+      Entry.Address = RelocateOrElse(AddrOff, Extractor.getU64(&OffsetPtr));
     uint64_t FuncOff = OffsetPtr;
-    Entry.Function = RelocateOrElse(FuncOff, Extractor.getU64(&OffsetPtr));
+    if (Is32Bit)
+      Entry.Function = RelocateOrElse(FuncOff, Extractor.getU32(&OffsetPtr));
+    else
+      Entry.Function = RelocateOrElse(FuncOff, Extractor.getU64(&OffsetPtr));
     auto Kind = Extractor.getU8(&OffsetPtr);
     static constexpr SledEntry::FunctionKinds Kinds[] = {
         SledEntry::FunctionKinds::ENTRY, SledEntry::FunctionKinds::EXIT,
diff --git a/llvm/test/tools/llvm-xray/ARM/extract-instrmap.test b/llvm/test/tools/llvm-xray/ARM/extract-instrmap.test
new file mode 100644
index 0000000000000..79f743f1dbdef
--- /dev/null
+++ b/llvm/test/tools/llvm-xray/ARM/extract-instrmap.test
@@ -0,0 +1,150 @@
+## This test makes sure we can extract the instrumentation map from an
+## XRay-instrumented shared object.
+
+## Generated from the following source:
+## __attribute__((xray_always_instrument)) void foo() {}
+## __attribute__((xray_always_instrument)) void bar() {}
+## __attribute__((xray_always_instrument)) void jar() {}
+## Built with the following arguments:
+## -target armv7a-linux-androideabi -nostdlib -fxray-instrument -fPIC -shared
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS32
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_ARM
+  Flags:           [ EF_ARM_SOFT_FLOAT, EF_ARM_EABI_VER5 ]
+  Entry:           0x00000000000012B0
+ProgramHeaders:
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    Sections:
+      - Section:         .rel.dyn
+    Align:           0x0000000000001000
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    Sections:
+      - Section:         .text
+    VAddr:           0x00000000000012B0
+    Align:           0x0000000000001000
+  - Type:            PT_LOAD
+    Flags:           [ PF_W, PF_R ]
+    Sections:
+      - Section:         xray_instr_map
+      - Section:         xray_fn_idx
+    VAddr:           0x00000000000033CC
+    Align:           0x0000000000001000
+Sections:
+  - Name:            .rel.dyn
+    Type:            SHT_REL
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x0000000000000200
+    Link:            .dynsym
+    AddressAlign:    0x0000000000000004
+    Relocations:
+      - Offset:          0x00000000000033CC
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x00000000000033DC
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x00000000000033EC
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x00000000000033FC
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x000000000000340C
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x000000000000341C
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x0000000000003430
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x0000000000003434
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x0000000000003438
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x000000000000343C
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x0000000000003440
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x0000000000003444
+        Type:            R_ARM_RELATIVE
+      - Offset:          0x00000000000033F0
+        Symbol:          _Z3barv
+        Type:            R_ARM_ABS32
+      - Offset:          0x0000000000003400
+        Symbol:          _Z3barv
+        Type:            R_ARM_ABS32
+      - Offset:          0x00000000000033D0
+        Symbol:          _Z3foov
+        Type:            R_ARM_ABS32
+      - Offset:          0x00000000000033E0
+        Symbol:          _Z3foov
+        Type:            R_ARM_ABS32
+      - Offset:          0x0000000000003410
+        Symbol:          _Z3jarv
+        Type:            R_ARM_ABS32
+      - Offset:          0x0000000000003420
+        Symbol:          _Z3jarv
+        Type:            R_ARM_ABS32
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x00000000000012B0
+    AddressAlign:    0x0000000000000004
+    Size:            180
+  - Name:            xray_instr_map
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC, SHF_LINK_ORDER ]
+    Address:         0x00000000000033CC
+    Link:            .text
+    AddressAlign:    0x0000000000000001
+    Content:         B0120000000000000001000000000000CC120000000000000101000000000000EC120000000000000001000000000000081300000000000001010000000000002813000000000000000100000000000044130000000000000101000000000000
+  - Name:            xray_fn_idx
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_WRITE, SHF_ALLOC, SHF_LINK_ORDER ]
+    Address:         0x0000000000003430
+    Link:            .text
+    AddressAlign:    0x0000000000000008
+    Content:         CC330000EC330000EC3300000C3400000C3400002C340000
+DynamicSymbols:
+  - Name:            _Z3barv
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x00000000000012EC
+    Size:            0x000000000000003C
+  - Name:            _Z3foov
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x00000000000012B0
+    Size:            0x000000000000003C
+  - Name:            _Z3jarv
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x0000000000001328
+    Size:            0x000000000000003C
+...
+
+# RUN: yaml2obj %s -o %t.so
+# RUN: llvm-xray extract %t.so -s | FileCheck %s
+
+# CHECK:      ---
+# CHECK-NEXT: - { id: 1, address: 0x00000000000012B0, function: 0x00000000000012B0, kind: function-enter, always-instrument: true, function-name: 'foo()' }
+# CHECK-NEXT: - { id: 1, address: 0x00000000000012CC, function: 0x00000000000012B0, kind: function-exit, always-instrument: true, function-name: 'foo()' }
+# CHECK-NEXT: - { id: 2, address: 0x00000000000012EC, function: 0x00000000000012EC, kind: function-enter, always-instrument: true, function-name: 'bar()' }
+# CHECK-NEXT: - { id: 2, address: 0x0000000000001308, function: 0x00000000000012EC, kind: function-exit, always-instrument: true, function-name: 'bar()' }
+# CHECK-NEXT: - { id: 3, address: 0x0000000000001328, function: 0x0000000000001328, kind: function-enter, always-instrument: true, function-name: 'jar()' }
+# CHECK-NEXT: - { id: 3, address: 0x0000000000001344, function: 0x0000000000001328, kind: function-exit, always-instrument: true, function-name: 'jar()' }
+# CHECK-NEXT: ...
+
+# RUN: llvm-xray extract -s --no-demangle %t.so | FileCheck --check-prefix=MANGLED %s
+
+# MANGLED:      ---
+# MANGLED-NEXT: - { id: 1, address: 0x00000000000012B0, function: 0x00000000000012B0, kind: function-enter, always-instrument: true, function-name: _Z3foov }
+# MANGLED-NEXT: - { id: 1, address: 0x00000000000012CC, function: 0x00000000000012B0, kind: function-exit, always-instrument: true, function-name: _Z3foov }
+# MANGLED-NEXT: - { id: 2, address: 0x00000000000012EC, function: 0x00000000000012EC, kind: function-enter, always-instrument: true, function-name: _Z3barv }
+# MANGLED-NEXT: - { id: 2, address: 0x0000000000001308, function: 0x00000000000012EC, kind: function-exit, always-instrument: true, function-name: _Z3barv }
+# MANGLED-NEXT: - { id: 3, address: 0x0000000000001328, function: 0x0000000000001328, kind: function-enter, always-instrument: true, function-name: _Z3jarv }
+# MANGLED-NEXT: - { id: 3, address: 0x0000000000001344, function: 0x0000000000001328, kind: function-exit, always-instrument: true, function-name: _Z3jarv }
+# MANGLED-NEXT: ...
diff --git a/llvm/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt b/llvm/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt
index 4ea33510e5dcc..0c574461fda1d 100644
--- a/llvm/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt
+++ b/llvm/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt
@@ -1,3 +1,3 @@
 ; RUN: not llvm-xray extract %S/Inputs/elf64-badentrysizes.bin 2>&1 | FileCheck %s
 ; CHECK: llvm-xray: Cannot extract instrumentation map from '{{.*}}elf64-badentrysizes.bin'.
-; CHECK-NEXT: Instrumentation map entries not evenly divisible by size of an XRay sled entry in ELF64.
+; CHECK-NEXT: Instrumentation map entries not evenly divisible by size of an XRay sled entry.
diff --git a/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt b/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt
index 1fc3a1a08287f..dc025d96956b4 100644
--- a/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt
+++ b/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt
@@ -1,3 +1,3 @@
 ; RUN: not llvm-xray extract %S/Inputs/elf32-noxray.bin 2>&1 | FileCheck %s
 ; CHECK: llvm-xray: Cannot extract instrumentation map from '{{.*}}elf32-noxray.bin'.
-; CHECK-NEXT: File format not supported (only does ELF and Mach-O little endian 64-bit).
+; CHECK-NEXT: File format not supported.  Supports: AArch64/ARM/ppc64le/x86-64.

From d79e51a711cbca80492e517eade2666b315907e4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 28 May 2020 15:40:06 -0700
Subject: [PATCH 446/770] [X86] Add test case to show fast-isel incorrectly
 emitting a 64-bit movabsq instruction in 32-bit mode when using constant
 pools with -code-model=large. NFC

-code-model=large isn't supposed to mean anything in 32-bit mode,
but nothing prevents passing it, so we shouldn't generate bad code
if someone does.
---
 llvm/test/CodeGen/X86/fast-isel-constpool.ll | 115 +++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/llvm/test/CodeGen/X86/fast-isel-constpool.ll b/llvm/test/CodeGen/X86/fast-isel-constpool.ll
index 4b8f387571e9b..706674c584baf 100644
--- a/llvm/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-constpool.ll
@@ -6,6 +6,10 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX
 ; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large -mattr=avx512f < %s | FileCheck %s --check-prefix=LARGE_AVX
 
+; The large code model shouldn't mean anything on 32-bit x86, but it currently
+; generates 64-bit-only instructions and will assert in the encoder.
+; RUN: llc -mtriple=i686-apple-darwin -fast-isel -code-model=large -mattr=sse2 < %s | FileCheck %s --check-prefix=X86-LARGE
+
 ; Make sure fast isel uses rip-relative addressing for the small code model.
 define float @constpool_float(float %x) {
 ; CHECK-LABEL: constpool_float:
@@ -31,6 +35,17 @@ define float @constpool_float(float %x) {
 ; LARGE_AVX-NEXT:    movabsq $LCPI0_0, %rax
 ; LARGE_AVX-NEXT:    vaddss (%rax), %xmm0, %xmm0
 ; LARGE_AVX-NEXT:    retq
+;
+; X86-LARGE-LABEL: constpool_float:
+; X86-LARGE:       ## %bb.0:
+; X86-LARGE-NEXT:    pushl %eax
+; X86-LARGE-NEXT:    .cfi_def_cfa_offset 8
+; X86-LARGE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-LARGE-NEXT:    addss LCPI0_0, %xmm0
+; X86-LARGE-NEXT:    movss %xmm0, (%esp)
+; X86-LARGE-NEXT:    flds (%esp)
+; X86-LARGE-NEXT:    popl %eax
+; X86-LARGE-NEXT:    retl
 
   %1 = fadd float %x, 16.50e+01
   ret float %1
@@ -60,7 +75,107 @@ define double @constpool_double(double %x) nounwind {
 ; LARGE_AVX-NEXT:    movabsq $LCPI1_0, %rax
 ; LARGE_AVX-NEXT:    vaddsd (%rax), %xmm0, %xmm0
 ; LARGE_AVX-NEXT:    retq
+;
+; X86-LARGE-LABEL: constpool_double:
+; X86-LARGE:       ## %bb.0:
+; X86-LARGE-NEXT:    subl $12, %esp
+; X86-LARGE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-LARGE-NEXT:    addsd LCPI1_0, %xmm0
+; X86-LARGE-NEXT:    movsd %xmm0, (%esp)
+; X86-LARGE-NEXT:    fldl (%esp)
+; X86-LARGE-NEXT:    addl $12, %esp
+; X86-LARGE-NEXT:    retl
 
   %1 = fadd double %x, 8.500000e-01
   ret double %1
 }
+
+define void @constpool_float_no_fp_args(float* %x) nounwind {
+; CHECK-LABEL: constpool_float_no_fp_args:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss (%rdi), %xmm0
+; CHECK-NEXT:    movss %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+;
+; LARGE-LABEL: constpool_float_no_fp_args:
+; LARGE:       ## %bb.0:
+; LARGE-NEXT:    movabsq $LCPI2_0, %rax
+; LARGE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; LARGE-NEXT:    addss (%rdi), %xmm0
+; LARGE-NEXT:    movss %xmm0, (%rdi)
+; LARGE-NEXT:    retq
+;
+; AVX-LABEL: constpool_float_no_fp_args:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; LARGE_AVX-LABEL: constpool_float_no_fp_args:
+; LARGE_AVX:       ## %bb.0:
+; LARGE_AVX-NEXT:    movabsq $LCPI2_0, %rax
+; LARGE_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; LARGE_AVX-NEXT:    vaddss (%rdi), %xmm0, %xmm0
+; LARGE_AVX-NEXT:    vmovss %xmm0, (%rdi)
+; LARGE_AVX-NEXT:    retq
+;
+; X86-LARGE-LABEL: constpool_float_no_fp_args:
+; X86-LARGE:       ## %bb.0:
+; X86-LARGE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-LARGE-NEXT:    movabsq $LCPI2_0, %rcx
+; X86-LARGE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-LARGE-NEXT:    addss (%eax), %xmm0
+; X86-LARGE-NEXT:    movss %xmm0, (%eax)
+; X86-LARGE-NEXT:    retl
+  %a = load float, float* %x
+  %b = fadd float %a, 16.50e+01
+  store float %b, float* %x
+  ret void
+}
+
+define void @constpool_double_no_fp_args(double* %x) nounwind {
+; CHECK-LABEL: constpool_double_no_fp_args:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    addsd (%rdi), %xmm0
+; CHECK-NEXT:    movsd %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+;
+; LARGE-LABEL: constpool_double_no_fp_args:
+; LARGE:       ## %bb.0:
+; LARGE-NEXT:    movabsq $LCPI3_0, %rax
+; LARGE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; LARGE-NEXT:    addsd (%rdi), %xmm0
+; LARGE-NEXT:    movsd %xmm0, (%rdi)
+; LARGE-NEXT:    retq
+;
+; AVX-LABEL: constpool_double_no_fp_args:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovsd %xmm0, (%rdi)
+; AVX-NEXT:    retq
+;
+; LARGE_AVX-LABEL: constpool_double_no_fp_args:
+; LARGE_AVX:       ## %bb.0:
+; LARGE_AVX-NEXT:    movabsq $LCPI3_0, %rax
+; LARGE_AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; LARGE_AVX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0
+; LARGE_AVX-NEXT:    vmovsd %xmm0, (%rdi)
+; LARGE_AVX-NEXT:    retq
+;
+; X86-LARGE-LABEL: constpool_double_no_fp_args:
+; X86-LARGE:       ## %bb.0:
+; X86-LARGE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-LARGE-NEXT:    movabsq $LCPI3_0, %rcx
+; X86-LARGE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-LARGE-NEXT:    addsd (%eax), %xmm0
+; X86-LARGE-NEXT:    movsd %xmm0, (%eax)
+; X86-LARGE-NEXT:    retl
+  %a = load double, double* %x
+  %b = fadd double %a, 8.500000e-01
+  store double %b, double* %x
+  ret void
+}

From e47e22642f9261c93a598e506bc57c717db1df7a Mon Sep 17 00:00:00 2001
From: Dan Liew <dan@su-root.co.uk>
Date: Thu, 28 May 2020 15:57:44 -0700
Subject: [PATCH 447/770] Disable `duplicate_os_log_reports.cpp` test.

It's not passing on the macOS green dragon bots. To get them green, just
disable it for now.

rdar://problem/62141527
---
 .../test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp b/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp
index b40d8fef26cb2..a8314d804331d 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp
@@ -1,4 +1,6 @@
 // UNSUPPORTED: ios
+// Don't re-enable until rdar://problem/62141527 is fixed.
+// REQUIRES: rdar_62141527
 // REQUIRES: shell
 // REQUIRES: darwin_log_cmd
 // RUN: %clangxx_asan -fsanitize-recover=address %s -o %t

From 46c177c92681791c99abde668f34931d84392cb2 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Thu, 28 May 2020 15:47:35 -0700
Subject: [PATCH 448/770] Test update for
 a7fa35a629e85a72b8cf07a8f95c7c09d9663808

---
 llvm/test/tools/gold/X86/thinlto.ll | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/test/tools/gold/X86/thinlto.ll b/llvm/test/tools/gold/X86/thinlto.ll
index 51609ebb7918c..6857778b55d5b 100644
--- a/llvm/test/tools/gold/X86/thinlto.ll
+++ b/llvm/test/tools/gold/X86/thinlto.ll
@@ -107,6 +107,7 @@
 ; BACKEND1-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; BACKEND1-NEXT: <COMBINED
 ; BACKEND1-NEXT: <COMBINED
+; BACKEND1-NEXT: <BLOCK_COUNT
 ; BACKEND1-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 ; The backend index for Input/thinlto.ll contains summaries from itself only,
@@ -119,6 +120,7 @@
 ; BACKEND2-NEXT: <FLAGS
 ; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
 ; BACKEND2-NEXT: <COMBINED
+; BACKEND2-NEXT: <BLOCK_COUNT
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 ; DIS1: ^0 = module: (path: "{{.*}}thinlto.ll.tmp.o", hash: (0, 0, 0, 0, 0))
@@ -140,6 +142,7 @@
 ; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: <COMBINED
+; COMBINED-NEXT: <BLOCK_COUNT
 ; COMBINED-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

From 8ae21fb8d2a2a5c55070b82dd57b345dd4716f56 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 28 May 2020 16:03:57 -0700
Subject: [PATCH 449/770] [lldb/CMake] Set both the BUILD and INSTALL RPATH on
 macOS

This is necessary when building the framework.
---
 lldb/source/API/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index b0ada3ef81451..ce6a7ec830fce 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -123,6 +123,7 @@ endif()
 
 if(PYTHON_RPATH)
   set_property(TARGET liblldb APPEND PROPERTY INSTALL_RPATH "${PYTHON_RPATH}")
+  set_property(TARGET liblldb APPEND PROPERTY BUILD_RPATH   "${PYTHON_RPATH}")
 endif()
 
 if (MSVC)

From fd1c894a4a3690b2e500bfdf71194e9cc3f1b399 Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Wed, 20 May 2020 21:35:18 -0700
Subject: [PATCH 450/770] [lld][WebAssembly] Convert some lld tests to assembly

When we originally wrote these tests we didn't have a stable and
fleshed-out assembly format.  Now that we do, we should prefer it
over LLVM IR for lld tests, to avoid pulling in more of LLVM
than necessary in order to run the tests.

This change converts just 30 out of about 130 test files. More to
come when I have some more time.

Differential Revision: https://reviews.llvm.org/D80361
---
 lld/test/wasm/Inputs/call-indirect.ll       | 20 -------
 lld/test/wasm/Inputs/call-indirect.s        | 28 +++++++++
 lld/test/wasm/Inputs/call-ret32.ll          | 11 ----
 lld/test/wasm/Inputs/call-ret32.s           | 16 +++++
 lld/test/wasm/Inputs/hello.ll               | 17 ------
 lld/test/wasm/Inputs/hello.s                | 18 ++++++
 lld/test/wasm/Inputs/hidden.ll              | 13 -----
 lld/test/wasm/Inputs/hidden.s               | 12 ++++
 lld/test/wasm/Inputs/import-attributes.ll   | 10 ----
 lld/test/wasm/Inputs/import-attributes.s    |  8 +++
 lld/test/wasm/Inputs/optional-symbol.ll     |  7 ---
 lld/test/wasm/Inputs/optional-symbol.s      |  7 +++
 lld/test/wasm/Inputs/ret32.ll               |  6 --
 lld/test/wasm/Inputs/ret32.s                |  6 ++
 lld/test/wasm/Inputs/ret64.ll               |  6 --
 lld/test/wasm/Inputs/ret64.s                |  6 ++
 lld/test/wasm/Inputs/start.ll               |  6 --
 lld/test/wasm/Inputs/start.s                |  4 ++
 lld/test/wasm/Inputs/strong-symbol.ll       |  6 --
 lld/test/wasm/Inputs/strong-symbol.s        |  6 ++
 lld/test/wasm/Inputs/weak-symbol1.ll        | 13 -----
 lld/test/wasm/Inputs/weak-symbol1.s         | 17 ++++++
 lld/test/wasm/Inputs/weak-symbol2.ll        | 13 -----
 lld/test/wasm/Inputs/weak-symbol2.s         | 17 ++++++
 lld/test/wasm/alias.ll                      | 65 ---------------------
 lld/test/wasm/alias.s                       | 64 ++++++++++++++++++++
 lld/test/wasm/archive-export.ll             |  2 +-
 lld/test/wasm/archive-no-index.ll           | 13 -----
 lld/test/wasm/archive-no-index.s            | 14 +++++
 lld/test/wasm/archive-weak-undefined.ll     |  4 +-
 lld/test/wasm/archive.ll                    |  2 +-
 lld/test/wasm/bss-only.ll                   | 14 -----
 lld/test/wasm/bss-only.s                    | 43 ++++++++++++++
 lld/test/wasm/call-indirect.ll              |  6 +-
 lld/test/wasm/compress-relocs.ll            |  6 +-
 lld/test/wasm/conflict.test                 |  2 +-
 lld/test/wasm/data-layout.ll                |  2 +-
 lld/test/wasm/demangle.ll                   | 19 ------
 lld/test/wasm/demangle.s                    | 18 ++++++
 lld/test/wasm/emit-relocs-fpic.s            |  2 +-
 lld/test/wasm/emit-relocs.ll                |  2 +-
 lld/test/wasm/entry-signature.ll            | 10 ----
 lld/test/wasm/entry-signature.s             |  8 +++
 lld/test/wasm/export-empty.test             |  2 +-
 lld/test/wasm/export-optional-lazy.ll       | 25 --------
 lld/test/wasm/export-optional-lazy.test     | 18 ++++++
 lld/test/wasm/export-table.test             |  2 +-
 lld/test/wasm/fatal-warnings.ll             |  2 +-
 lld/test/wasm/function-imports-first.ll     |  2 +-
 lld/test/wasm/function-imports.ll           |  2 +-
 lld/test/wasm/function-index.test           |  4 +-
 lld/test/wasm/global-base.test              |  2 +-
 lld/test/wasm/growable-table.test           |  2 +-
 lld/test/wasm/import-attribute-mismatch.ll  | 18 ------
 lld/test/wasm/import-attribute-mismatch.s   | 16 +++++
 lld/test/wasm/import-memory.test            |  2 +-
 lld/test/wasm/import-table.test             |  2 +-
 lld/test/wasm/invalid-stack-size.test       |  2 +-
 lld/test/wasm/large-memory.test             |  2 +-
 lld/test/wasm/load-undefined.test           |  6 +-
 lld/test/wasm/no-tls.ll                     | 48 +++++++++++++++
 lld/test/wasm/no-tls.test                   | 41 -------------
 lld/test/wasm/optional-symbol.ll            | 14 -----
 lld/test/wasm/optional-symbol.s             | 13 +++++
 lld/test/wasm/pic-static.ll                 |  2 +-
 lld/test/wasm/relocatable.ll                |  2 +-
 lld/test/wasm/responsefile.test             |  2 +-
 lld/test/wasm/shared-needed.ll              |  2 +-
 lld/test/wasm/signature-mismatch-export.ll  |  2 +-
 lld/test/wasm/signature-mismatch-unknown.ll |  4 +-
 lld/test/wasm/signature-mismatch-weak.ll    |  4 +-
 lld/test/wasm/signature-mismatch.ll         |  6 +-
 lld/test/wasm/stack-first.test              |  2 +-
 lld/test/wasm/strip-all.test                |  2 +-
 lld/test/wasm/strip-debug.test              |  2 +-
 lld/test/wasm/symbol-type-mismatch.ll       | 11 ----
 lld/test/wasm/symbol-type-mismatch.s        |  9 +++
 lld/test/wasm/trace-symbol.ll               |  2 +-
 lld/test/wasm/trace.test                    |  2 +-
 lld/test/wasm/undefined-entry.test          |  2 +-
 lld/test/wasm/version.ll                    | 15 -----
 lld/test/wasm/version.s                     | 13 +++++
 lld/test/wasm/visibility-hidden.ll          |  2 +-
 lld/test/wasm/weak-symbols.ll               |  4 +-
 lld/test/wasm/whole-archive.test            |  4 +-
 85 files changed, 461 insertions(+), 427 deletions(-)
 delete mode 100644 lld/test/wasm/Inputs/call-indirect.ll
 create mode 100644 lld/test/wasm/Inputs/call-indirect.s
 delete mode 100644 lld/test/wasm/Inputs/call-ret32.ll
 create mode 100644 lld/test/wasm/Inputs/call-ret32.s
 delete mode 100644 lld/test/wasm/Inputs/hello.ll
 create mode 100644 lld/test/wasm/Inputs/hello.s
 delete mode 100644 lld/test/wasm/Inputs/hidden.ll
 create mode 100644 lld/test/wasm/Inputs/hidden.s
 delete mode 100644 lld/test/wasm/Inputs/import-attributes.ll
 create mode 100644 lld/test/wasm/Inputs/import-attributes.s
 delete mode 100644 lld/test/wasm/Inputs/optional-symbol.ll
 create mode 100644 lld/test/wasm/Inputs/optional-symbol.s
 delete mode 100644 lld/test/wasm/Inputs/ret32.ll
 create mode 100644 lld/test/wasm/Inputs/ret32.s
 delete mode 100644 lld/test/wasm/Inputs/ret64.ll
 create mode 100644 lld/test/wasm/Inputs/ret64.s
 delete mode 100644 lld/test/wasm/Inputs/start.ll
 create mode 100644 lld/test/wasm/Inputs/start.s
 delete mode 100644 lld/test/wasm/Inputs/strong-symbol.ll
 create mode 100644 lld/test/wasm/Inputs/strong-symbol.s
 delete mode 100644 lld/test/wasm/Inputs/weak-symbol1.ll
 create mode 100644 lld/test/wasm/Inputs/weak-symbol1.s
 delete mode 100644 lld/test/wasm/Inputs/weak-symbol2.ll
 create mode 100644 lld/test/wasm/Inputs/weak-symbol2.s
 delete mode 100644 lld/test/wasm/alias.ll
 create mode 100644 lld/test/wasm/alias.s
 delete mode 100644 lld/test/wasm/archive-no-index.ll
 create mode 100644 lld/test/wasm/archive-no-index.s
 delete mode 100644 lld/test/wasm/bss-only.ll
 create mode 100644 lld/test/wasm/bss-only.s
 delete mode 100644 lld/test/wasm/demangle.ll
 create mode 100644 lld/test/wasm/demangle.s
 delete mode 100644 lld/test/wasm/entry-signature.ll
 create mode 100644 lld/test/wasm/entry-signature.s
 delete mode 100644 lld/test/wasm/export-optional-lazy.ll
 create mode 100644 lld/test/wasm/export-optional-lazy.test
 delete mode 100644 lld/test/wasm/import-attribute-mismatch.ll
 create mode 100644 lld/test/wasm/import-attribute-mismatch.s
 create mode 100644 lld/test/wasm/no-tls.ll
 delete mode 100644 lld/test/wasm/no-tls.test
 delete mode 100644 lld/test/wasm/optional-symbol.ll
 create mode 100644 lld/test/wasm/optional-symbol.s
 delete mode 100644 lld/test/wasm/symbol-type-mismatch.ll
 create mode 100644 lld/test/wasm/symbol-type-mismatch.s
 delete mode 100644 lld/test/wasm/version.ll
 create mode 100644 lld/test/wasm/version.s

diff --git a/lld/test/wasm/Inputs/call-indirect.ll b/lld/test/wasm/Inputs/call-indirect.ll
deleted file mode 100644
index 6afcf30c25159..0000000000000
--- a/lld/test/wasm/Inputs/call-indirect.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-@indirect_bar = internal local_unnamed_addr global i64 ()* @bar, align 4
-@indirect_foo = internal local_unnamed_addr global i32 ()* @foo, align 4
-
-declare i32 @foo() local_unnamed_addr
-
-define i64 @bar() {
-entry:
-  ret i64 1
-}
-
-define void @call_bar_indirect() local_unnamed_addr #1 {
-entry:
-  %0 = load i64 ()*, i64 ()** @indirect_bar, align 4
-  %1 = load i32 ()*, i32 ()** @indirect_foo, align 4
-  %call0 = tail call i64 %0() #2
-  %call1 = tail call i32 %1() #2
-  ret void
-}
diff --git a/lld/test/wasm/Inputs/call-indirect.s b/lld/test/wasm/Inputs/call-indirect.s
new file mode 100644
index 0000000000000..c181aa19ad6b5
--- /dev/null
+++ b/lld/test/wasm/Inputs/call-indirect.s
@@ -0,0 +1,28 @@
+  .globl  bar
+bar:
+  .functype bar () -> (i64)
+  i64.const 1
+  end_function
+
+  .globl  call_bar_indirect
+call_bar_indirect:
+  .functype call_bar_indirect () -> ()
+  i32.load  indirect_bar
+  call_indirect () -> (i64)
+  drop
+  i32.load  indirect_foo
+  call_indirect () -> (i32)
+  drop
+  end_function
+
+  .section  .data.indirect_bar,"",@
+indirect_bar:
+  .int32  bar
+  .size indirect_bar, 4
+
+  .section  .data.indirect_foo,"",@
+indirect_foo:
+  .int32  foo
+  .size indirect_foo, 4
+
+  .functype foo () -> (i32)
diff --git a/lld/test/wasm/Inputs/call-ret32.ll b/lld/test/wasm/Inputs/call-ret32.ll
deleted file mode 100644
index 90cbf9684573c..0000000000000
--- a/lld/test/wasm/Inputs/call-ret32.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-@ret32_address = global i32 (float)* @ret32, align 4
-
-define hidden i32* @call_ret32() {
-entry:
-  %call1 = call i32 @ret32(float 0.000000e+00)
-  ret i32* bitcast (i32 (float)** @ret32_address to i32*)
-}
-
-declare i32 @ret32(float)
diff --git a/lld/test/wasm/Inputs/call-ret32.s b/lld/test/wasm/Inputs/call-ret32.s
new file mode 100644
index 0000000000000..b0c8129059171
--- /dev/null
+++ b/lld/test/wasm/Inputs/call-ret32.s
@@ -0,0 +1,16 @@
+  .globl  call_ret32
+call_ret32:
+  .functype call_ret32 () -> (i32)
+  f32.const 0x0p0
+  call  ret32
+  drop
+  i32.const ret32_address
+  end_function
+
+  .section  .data.ret32_address,"",@
+  .globl ret32_address
+ret32_address:
+  .int32  ret32
+  .size ret32_address, 4
+
+  .functype ret32 (f32) -> (i32)
diff --git a/lld/test/wasm/Inputs/hello.ll b/lld/test/wasm/Inputs/hello.ll
deleted file mode 100644
index 675566861cc7e..0000000000000
--- a/lld/test/wasm/Inputs/hello.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-; Wasm module generated from the following C code:
-;   void puts(const char*);
-;   void hello() { puts("hello\n"); }
-
-@hello_str = unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
-
-; Function Attrs: nounwind
-define hidden void @hello() local_unnamed_addr #0 {
-entry:
-  tail call void @puts(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @hello_str, i32 0, i32 0))
-  ret void
-}
-
-; Function Attrs: nounwind
-declare void @puts(i8* nocapture readonly) local_unnamed_addr #1
diff --git a/lld/test/wasm/Inputs/hello.s b/lld/test/wasm/Inputs/hello.s
new file mode 100644
index 0000000000000..b245262f9eba4
--- /dev/null
+++ b/lld/test/wasm/Inputs/hello.s
@@ -0,0 +1,18 @@
+# asm generated by clang from the following C code:
+#   void puts(const char*);
+#   void hello() { puts("hello\n"); }
+
+  .globl  hello
+hello:
+  .functype hello () -> ()
+  i32.const hello_str
+  call  puts
+  end_function
+
+  .section  .rodata.hello_str,"",@
+  .globl  hello_str
+hello_str:
+  .asciz  "hello\n"
+  .size hello_str, 7
+
+  .functype puts (i32) -> ()
diff --git a/lld/test/wasm/Inputs/hidden.ll b/lld/test/wasm/Inputs/hidden.ll
deleted file mode 100644
index 4af16b3b99ed5..0000000000000
--- a/lld/test/wasm/Inputs/hidden.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-; Function Attrs: norecurse nounwind readnone
-define hidden i32 @archiveHidden() #0 {
-entry:
-    ret i32 0
-}
-
-; Function Attrs: norecurse nounwind readnone
-define i32 @archiveDefault() #1 {
-entry:
-    ret i32 0
-}
diff --git a/lld/test/wasm/Inputs/hidden.s b/lld/test/wasm/Inputs/hidden.s
new file mode 100644
index 0000000000000..58b9f5988bfae
--- /dev/null
+++ b/lld/test/wasm/Inputs/hidden.s
@@ -0,0 +1,12 @@
+  .hidden archiveHidden
+  .globl  archiveHidden
+archiveHidden:
+  .functype archiveHidden () -> (i32)
+  i32.const 0
+  end_function
+
+  .globl  archiveDefault
+archiveDefault:
+  .functype archiveDefault () -> (i32)
+  i32.const 0
+  end_function
diff --git a/lld/test/wasm/Inputs/import-attributes.ll b/lld/test/wasm/Inputs/import-attributes.ll
deleted file mode 100644
index 27ee2774f2210..0000000000000
--- a/lld/test/wasm/Inputs/import-attributes.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-define void @call_foo() {
-  call void @foo();
-  ret void
-}
-
-declare void @foo() #0
-
-attributes #0 = { "wasm-import-module"="baz" }
diff --git a/lld/test/wasm/Inputs/import-attributes.s b/lld/test/wasm/Inputs/import-attributes.s
new file mode 100644
index 0000000000000..681b54fb78f38
--- /dev/null
+++ b/lld/test/wasm/Inputs/import-attributes.s
@@ -0,0 +1,8 @@
+  .globl  call_foo
+call_foo:
+  .functype call_foo () -> ()
+  call  foo
+  end_function
+
+  .functype foo () -> ()
+  .import_module  foo, baz
diff --git a/lld/test/wasm/Inputs/optional-symbol.ll b/lld/test/wasm/Inputs/optional-symbol.ll
deleted file mode 100644
index d39a8a4db6376..0000000000000
--- a/lld/test/wasm/Inputs/optional-symbol.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-@__dso_handle = external global i8*
-
-define i8** @get_optional() {
-  ret i8** @__dso_handle
-}
diff --git a/lld/test/wasm/Inputs/optional-symbol.s b/lld/test/wasm/Inputs/optional-symbol.s
new file mode 100644
index 0000000000000..511a0d82a7965
--- /dev/null
+++ b/lld/test/wasm/Inputs/optional-symbol.s
@@ -0,0 +1,7 @@
+# __dso_handle is a linker-generated symbol that is included only when needed.
+
+  .globl  get_optional
+get_optional:
+  .functype get_optional () -> (i32)
+  i32.const __dso_handle
+  end_function
diff --git a/lld/test/wasm/Inputs/ret32.ll b/lld/test/wasm/Inputs/ret32.ll
deleted file mode 100644
index 674b34b66499e..0000000000000
--- a/lld/test/wasm/Inputs/ret32.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-define hidden i32 @ret32(float %arg) {
-entry:
-    ret i32 0
-}
diff --git a/lld/test/wasm/Inputs/ret32.s b/lld/test/wasm/Inputs/ret32.s
new file mode 100644
index 0000000000000..5233455917e67
--- /dev/null
+++ b/lld/test/wasm/Inputs/ret32.s
@@ -0,0 +1,6 @@
+  .hidden ret32
+  .globl  ret32
+ret32:
+  .functype ret32 (f32) -> (i32)
+  i32.const 0
+  end_function
diff --git a/lld/test/wasm/Inputs/ret64.ll b/lld/test/wasm/Inputs/ret64.ll
deleted file mode 100644
index c1dd5e55fee03..0000000000000
--- a/lld/test/wasm/Inputs/ret64.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-define hidden i64 @ret64(double %arg) {
-entry:
-    ret i64 1
-}
diff --git a/lld/test/wasm/Inputs/ret64.s b/lld/test/wasm/Inputs/ret64.s
new file mode 100644
index 0000000000000..4ce85495a0fdd
--- /dev/null
+++ b/lld/test/wasm/Inputs/ret64.s
@@ -0,0 +1,6 @@
+  .hidden ret64
+  .globl  ret64
+ret64:
+  .functype ret64 (f64) -> (i64)
+  i64.const 1
+  end_function
diff --git a/lld/test/wasm/Inputs/start.ll b/lld/test/wasm/Inputs/start.ll
deleted file mode 100644
index e2629659bf509..0000000000000
--- a/lld/test/wasm/Inputs/start.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-define void @_start() local_unnamed_addr {
-entry:
-  ret void
-}
diff --git a/lld/test/wasm/Inputs/start.s b/lld/test/wasm/Inputs/start.s
new file mode 100644
index 0000000000000..c793185ca1bb5
--- /dev/null
+++ b/lld/test/wasm/Inputs/start.s
@@ -0,0 +1,4 @@
+  .globl  _start
+_start:
+  .functype _start () -> ()
+  end_function
diff --git a/lld/test/wasm/Inputs/strong-symbol.ll b/lld/test/wasm/Inputs/strong-symbol.ll
deleted file mode 100644
index cc2aa8ab5d266..0000000000000
--- a/lld/test/wasm/Inputs/strong-symbol.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-define i64 @weakFn() #0 {
-entry:
-  ret i64 1
-}
diff --git a/lld/test/wasm/Inputs/strong-symbol.s b/lld/test/wasm/Inputs/strong-symbol.s
new file mode 100644
index 0000000000000..92200759e16c3
--- /dev/null
+++ b/lld/test/wasm/Inputs/strong-symbol.s
@@ -0,0 +1,6 @@
+  .globl  weakFn
+  .type weakFn,@function
+weakFn:
+  .functype weakFn () -> (i64)
+  i64.const 1
+  end_function
diff --git a/lld/test/wasm/Inputs/weak-symbol1.ll b/lld/test/wasm/Inputs/weak-symbol1.ll
deleted file mode 100644
index 6e394ff91d0cf..0000000000000
--- a/lld/test/wasm/Inputs/weak-symbol1.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-define weak i32 @weakFn() #0 {
-entry:
-  ret i32 1
-}
-
-define i32 @exportWeak1() {
-entry:
-    ret i32 ptrtoint (i32 ()* @weakFn to i32)
-}
-
-@weakGlobal = weak global i32 1
diff --git a/lld/test/wasm/Inputs/weak-symbol1.s b/lld/test/wasm/Inputs/weak-symbol1.s
new file mode 100644
index 0000000000000..eefdb248cc6b0
--- /dev/null
+++ b/lld/test/wasm/Inputs/weak-symbol1.s
@@ -0,0 +1,17 @@
+  .weak weakFn
+weakFn:
+  .functype weakFn () -> (i32)
+  i32.const 1
+  end_function
+
+  .globl  exportWeak1
+exportWeak1:
+  .functype exportWeak1 () -> (i32)
+  i32.const weakFn
+  end_function
+
+  .section  .data.weakGlobal,"",@
+  .weak weakGlobal
+weakGlobal:
+  .int32  1
+  .size weakGlobal, 4
diff --git a/lld/test/wasm/Inputs/weak-symbol2.ll b/lld/test/wasm/Inputs/weak-symbol2.ll
deleted file mode 100644
index e9c30c18f7dea..0000000000000
--- a/lld/test/wasm/Inputs/weak-symbol2.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-target triple = "wasm32-unknown-unknown"
-
-define weak i32 @weakFn() #0 {
-entry:
-  ret i32 2
-}
-
-define i32 @exportWeak2() {
-entry:
-    ret i32 ptrtoint (i32 ()* @weakFn to i32)
-}
-
-@weakGlobal = weak global i32 2
diff --git a/lld/test/wasm/Inputs/weak-symbol2.s b/lld/test/wasm/Inputs/weak-symbol2.s
new file mode 100644
index 0000000000000..d044b87b96dac
--- /dev/null
+++ b/lld/test/wasm/Inputs/weak-symbol2.s
@@ -0,0 +1,17 @@
+  .weak weakFn
+weakFn:
+  .functype weakFn () -> (i32)
+  i32.const 2
+  end_function
+
+  .globl  exportWeak2
+exportWeak2:
+  .functype exportWeak2 () -> (i32)
+  i32.const weakFn
+  end_function
+
+  .section  .data.weakGlobal,"",@
+  .weak weakGlobal
+weakGlobal:
+  .int32  2
+  .size weakGlobal, 4
diff --git a/lld/test/wasm/alias.ll b/lld/test/wasm/alias.ll
deleted file mode 100644
index 9927ba4c2225f..0000000000000
--- a/lld/test/wasm/alias.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: llc -filetype=obj -o %t.o %s
-; RUN: wasm-ld --export=start_alias %t.o -o %t.wasm
-; RUN: obj2yaml %t.wasm | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
-
-@start_alias = alias void (), void ()* @_start
-
-; Function Attrs: nounwind uwtable
-define void @_start() local_unnamed_addr #1 {
-entry:
-  ret void
-}
-
-; CHECK:      --- !WASM
-; CHECK-NEXT: FileHeader:
-; CHECK-NEXT:   Version:         0x00000001
-; CHECK-NEXT: Sections:
-; CHECK-NEXT:   - Type:            TYPE
-; CHECK-NEXT:     Signatures:
-; CHECK-NEXT:       - Index:           0
-; CHECK-NEXT:         ParamTypes:
-; CHECK-NEXT:         ReturnTypes:     []
-; CHECK-NEXT:   - Type:            FUNCTION
-; CHECK-NEXT:     FunctionTypes:   [ 0 ]
-; CHECK-NEXT:   - Type:            TABLE
-; CHECK-NEXT:     Tables:
-; CHECK-NEXT:       - ElemType:        FUNCREF
-; CHECK-NEXT:         Limits:
-; CHECK-NEXT:           Flags:           [ HAS_MAX ]
-; CHECK-NEXT:           Initial:         0x00000001
-; CHECK-NEXT:           Maximum:         0x00000001
-; CHECK-NEXT:   - Type:            MEMORY
-; CHECK-NEXT:     Memories:
-; CHECK-NEXT:       - Initial:         0x00000002
-; CHECK-NEXT:   - Type:            GLOBAL
-; CHECK-NEXT:     Globals:
-; CHECK-NEXT:       - Index:           0
-; CHECK-NEXT:         Type:            I32
-; CHECK-NEXT:         Mutable:         true
-; CHECK-NEXT:         InitExpr:
-; CHECK-NEXT:           Opcode:          I32_CONST
-; CHECK-NEXT:           Value:           66560
-; CHECK-NEXT:   - Type:            EXPORT
-; CHECK-NEXT:     Exports:
-; CHECK-NEXT:       - Name:            memory
-; CHECK-NEXT:         Kind:            MEMORY
-; CHECK-NEXT:         Index:           0
-; CHECK-NEXT:       - Name:            _start
-; CHECK-NEXT:         Kind:            FUNCTION
-; CHECK-NEXT:         Index:           0
-; CHECK-NEXT:       - Name:            start_alias
-; CHECK-NEXT:         Kind:            FUNCTION
-; CHECK-NEXT:         Index:           0
-; CHECK-NEXT:   - Type:            CODE
-; CHECK-NEXT:     Functions:
-; CHECK-NEXT:       - Index:           0
-; CHECK-NEXT:         Locals:
-; CHECK-NEXT:         Body:            0B
-; CHECK-NEXT:   - Type:            CUSTOM
-; CHECK-NEXT:     Name:            name
-; CHECK-NEXT:     FunctionNames:
-; CHECK-NEXT:       - Index:           0
-; CHECK-NEXT:         Name:            _start
-; CHECK-NEXT: ...
diff --git a/lld/test/wasm/alias.s b/lld/test/wasm/alias.s
new file mode 100644
index 0000000000000..e95cef1bb6a98
--- /dev/null
+++ b/lld/test/wasm/alias.s
@@ -0,0 +1,64 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld --export=start_alias %t.o -o %t.wasm
+# RUN: obj2yaml %t.wasm | FileCheck %s
+# start_alias is a .set alias of _start; the expected output exports both at function index 0.
+  .globl  _start
+_start:
+  .functype _start () -> ()
+  end_function
+
+  .globl start_alias
+  .type start_alias,@function
+.set start_alias, _start
+
+# CHECK:      --- !WASM
+# CHECK-NEXT: FileHeader:
+# CHECK-NEXT:   Version:         0x00000001
+# CHECK-NEXT: Sections:
+# CHECK-NEXT:   - Type:            TYPE
+# CHECK-NEXT:     Signatures:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         ParamTypes:
+# CHECK-NEXT:         ReturnTypes:     []
+# CHECK-NEXT:   - Type:            FUNCTION
+# CHECK-NEXT:     FunctionTypes:   [ 0 ]
+# CHECK-NEXT:   - Type:            TABLE
+# CHECK-NEXT:     Tables:
+# CHECK-NEXT:       - ElemType:        FUNCREF
+# CHECK-NEXT:         Limits:
+# CHECK-NEXT:           Flags:           [ HAS_MAX ]
+# CHECK-NEXT:           Initial:         0x00000001
+# CHECK-NEXT:           Maximum:         0x00000001
+# CHECK-NEXT:   - Type:            MEMORY
+# CHECK-NEXT:     Memories:
+# CHECK-NEXT:       - Initial:         0x00000002
+# CHECK-NEXT:   - Type:            GLOBAL
+# CHECK-NEXT:     Globals:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         Type:            I32
+# CHECK-NEXT:         Mutable:         true
+# CHECK-NEXT:         InitExpr:
+# CHECK-NEXT:           Opcode:          I32_CONST
+# CHECK-NEXT:           Value:           66560
+# CHECK-NEXT:   - Type:            EXPORT
+# CHECK-NEXT:     Exports:
+# CHECK-NEXT:       - Name:            memory
+# CHECK-NEXT:         Kind:            MEMORY
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:       - Name:            _start
+# CHECK-NEXT:         Kind:            FUNCTION
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:       - Name:            start_alias
+# CHECK-NEXT:         Kind:            FUNCTION
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:   - Type:            CODE
+# CHECK-NEXT:     Functions:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         Locals:
+# CHECK-NEXT:         Body:            0B
+# CHECK-NEXT:   - Type:            CUSTOM
+# CHECK-NEXT:     Name:            name
+# CHECK-NEXT:     FunctionNames:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         Name:            _start
+# CHECK-NEXT: ...
diff --git a/lld/test/wasm/archive-export.ll b/lld/test/wasm/archive-export.ll
index 664f7761e61fa..9a76d60d63d91 100644
--- a/lld/test/wasm/archive-export.ll
+++ b/lld/test/wasm/archive-export.ll
@@ -1,6 +1,6 @@
 Test that --export will also fetch lazy symbols from archives
 
-RUN: llc -filetype=obj %S/Inputs/start.ll -o %t.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
 RUN: llc -filetype=obj %S/Inputs/archive1.ll -o %t.a1.o
 RUN: llc -filetype=obj %S/Inputs/archive2.ll -o %t.a2.o
 RUN: rm -f %t.a
diff --git a/lld/test/wasm/archive-no-index.ll b/lld/test/wasm/archive-no-index.ll
deleted file mode 100644
index 237fd93dcd6c4..0000000000000
--- a/lld/test/wasm/archive-no-index.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; Tests error on archive file without a symbol table
-; RUN: llvm-as -o %t.o %s
-; RUN: llvm-as -o %t.archive.o %S/Inputs/archive1.ll
-; RUN: rm -f %t.a
-; RUN: llvm-ar crS %t.a %t.archive.o
-
-; RUN: not wasm-ld -o out.wasm %t.o %t.a 2>&1 | FileCheck %s
-
-define i32 @_start() {
-  ret i32 0
-}
-
-; CHECK: archive has no index; run ranlib to add one
diff --git a/lld/test/wasm/archive-no-index.s b/lld/test/wasm/archive-no-index.s
new file mode 100644
index 0000000000000..99ca5a367d3c6
--- /dev/null
+++ b/lld/test/wasm/archive-no-index.s
@@ -0,0 +1,14 @@
+# Tests error on archive file without a symbol table
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: llvm-as -o %t.archive.o %S/Inputs/archive1.ll
+# RUN: rm -f %t.a
+# RUN: llvm-ar crS %t.a %t.archive.o
+
+# RUN: not wasm-ld -o %t.wasm %t.o %t.a 2>&1 | FileCheck %s
+
+  .globl  _start
+_start:
+  .functype _start () -> ()
+  end_function
+
+# CHECK: archive has no index; run ranlib to add one
diff --git a/lld/test/wasm/archive-weak-undefined.ll b/lld/test/wasm/archive-weak-undefined.ll
index 25afccabaf7fc..530ff8aeb6183 100644
--- a/lld/test/wasm/archive-weak-undefined.ll
+++ b/lld/test/wasm/archive-weak-undefined.ll
@@ -1,7 +1,7 @@
 ; Test that weak undefined symbols do not fetch members from archive files.
 ; RUN: llc -filetype=obj %s -o %t.o
-; RUN: llc -filetype=obj %S/Inputs/ret32.ll -o %t.ret32.o
-; RUN: llc -filetype=obj %S/Inputs/hello.ll -o %t.hello.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/hello.s -o %t.hello.o
 ; RUN: rm -f %t.a
 ; RUN: llvm-ar rcs %t.a %t.ret32.o %t.hello.o
 
diff --git a/lld/test/wasm/archive.ll b/lld/test/wasm/archive.ll
index 84054536a92df..df72ec7813e22 100644
--- a/lld/test/wasm/archive.ll
+++ b/lld/test/wasm/archive.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -filetype=obj %S/Inputs/archive1.ll -o %t.a1.o
 ; RUN: llc -filetype=obj %S/Inputs/archive2.ll -o %t.a2.o
 ; RUN: llc -filetype=obj %S/Inputs/archive3.ll -o %t.a3.o
-; RUN: llc -filetype=obj %S/Inputs/hello.ll -o %t.hello.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/hello.s -o %t.hello.o
 ; RUN: rm -f %t.a
 ; RUN: llvm-ar rcs %t.a %t.a1.o %t.a2.o %t.a3.o %t.hello.o
 ; RUN: rm -f %t.imports
diff --git a/lld/test/wasm/bss-only.ll b/lld/test/wasm/bss-only.ll
deleted file mode 100644
index 23f94a1a3ebdb..0000000000000
--- a/lld/test/wasm/bss-only.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -filetype=obj %s -o %t.o
-; RUN: wasm-ld -no-gc-sections --no-entry %t.o -o %t.wasm
-; RUN: obj2yaml %t.wasm | FileCheck %s
-
-; Test that the data section is skipped entirely when there are only
-; bss segments
-
-target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
-
-@a = global [1000 x i8] zeroinitializer, align 1
-@b = global i32 0
-
-; CHECK-NOT: - Type:            DATA
diff --git a/lld/test/wasm/bss-only.s b/lld/test/wasm/bss-only.s
new file mode 100644
index 0000000000000..56963530a0b03
--- /dev/null
+++ b/lld/test/wasm/bss-only.s
@@ -0,0 +1,43 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld -no-gc-sections --no-entry --export=__data_end %t.o -o %t.wasm
+# RUN: obj2yaml %t.wasm | FileCheck %s
+
+# Test that the data section is skipped entirely when there are only
+# bss segments
+
+  .section  .bss.a,"",@
+  .globl  a
+a:
+  .skip 1000
+  .size a, 1000
+
+  .section  .bss.b,"",@
+  .globl  b
+b:
+  .int32  0
+  .size b, 4
+
+#      CHECK:   - Type:            GLOBAL
+# CHECK-NEXT:     Globals:
+# CHECK-NEXT:       - Index:           0
+# CHECK-NEXT:         Type:            I32
+# CHECK-NEXT:         Mutable:         true
+# CHECK-NEXT:         InitExpr:
+# CHECK-NEXT:           Opcode:          I32_CONST
+# CHECK-NEXT:           Value:           67568
+# CHECK-NEXT:       - Index:           1
+# CHECK-NEXT:         Type:            I32
+# CHECK-NEXT:         Mutable:         false
+# CHECK-NEXT:         InitExpr:
+# CHECK-NEXT:           Opcode:          I32_CONST
+# CHECK-NEXT:           Value:           2028
+# CHECK-NEXT:   - Type:            EXPORT
+# CHECK-NEXT:     Exports:
+# CHECK-NEXT:       - Name:            memory
+# CHECK-NEXT:         Kind:            MEMORY
+# CHECK-NEXT:         Index:           0
+# CHECK-NEXT:       - Name:            __data_end
+# CHECK-NEXT:         Kind:            GLOBAL
+# CHECK-NEXT:         Index:           1
+# A DATA section would be emitted after EXPORT, so its absence is verified last.
+# CHECK-NOT: - Type:            DATA
diff --git a/lld/test/wasm/call-indirect.ll b/lld/test/wasm/call-indirect.ll
index 2eb134a97bbb4..722385bdd350a 100644
--- a/lld/test/wasm/call-indirect.ll
+++ b/lld/test/wasm/call-indirect.ll
@@ -1,5 +1,5 @@
-; RUN: llc -filetype=obj %p/Inputs/call-indirect.ll -o %t2.o
 ; RUN: llc -filetype=obj %s -o %t.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-indirect.s -o %t2.o
 ; RUN: wasm-ld --export-dynamic -o %t.wasm %t2.o %t.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
 
@@ -121,9 +121,7 @@ define void @call_ptr(i64 (i64)* %arg) {
 ; CHECK-NEXT:         Body:            42010B
 ; CHECK-NEXT:       - Index:           1
 ; CHECK-NEXT:         Locals:
-; CHECK-NEXT:            - Type:            I32
-; CHECK-NEXT:              Count:           1
-; CHECK-NEXT:          Body:            4100280284888080002100410028028088808000118080808000001A2000118180808000001A0B
+; CHECK-NEXT:         Body:            28028088808000118080808000001A28028488808000118180808000001A0B
 ; CHECK-NEXT:       - Index:           2
 ; CHECK-NEXT:         Locals:
 ; CHECK-NEXT:         Body:            41020B
diff --git a/lld/test/wasm/compress-relocs.ll b/lld/test/wasm/compress-relocs.ll
index d14ea26a4c334..6c3533a108a53 100644
--- a/lld/test/wasm/compress-relocs.ll
+++ b/lld/test/wasm/compress-relocs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -filetype=obj %p/Inputs/call-indirect.ll -o %t2.o
 ; RUN: llc -filetype=obj %s -o %t.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-indirect.s -o %t2.o
 ; RUN: wasm-ld --export-dynamic -o %t.wasm %t2.o %t.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
 ; RUN: wasm-ld --export-dynamic -O2 -o %t-opt.wasm %t2.o %t.o
@@ -22,5 +22,5 @@ entry:
 
 ; ERROR: wasm-ld: error: --compress-relocations is incompatible with output debug information. Please pass --strip-debug or --strip-all
 
-; CHECK:    Body:            4100280284888080002100410028028088808000118080808000001A2000118180808000001A0B
-; COMPRESS: Body:            41002802840821004100280280081100001A20001101001A0B
+; CHECK:    Body:            28028088808000118080808000001A28028488808000118180808000001A0B
+; COMPRESS: Body:            280280081100001A280284081101001A0B
diff --git a/lld/test/wasm/conflict.test b/lld/test/wasm/conflict.test
index 9adc92ed1eda0..290a0319af532 100644
--- a/lld/test/wasm/conflict.test
+++ b/lld/test/wasm/conflict.test
@@ -1,4 +1,4 @@
-# RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 # RUN: not wasm-ld -o %t.wasm %t.ret32.o %t.ret32.o 2>&1 | FileCheck %s
 
 # CHECK:      duplicate symbol: ret32
diff --git a/lld/test/wasm/data-layout.ll b/lld/test/wasm/data-layout.ll
index 02ff0ecd8b97a..759c5440fe99a 100644
--- a/lld/test/wasm/data-layout.ll
+++ b/lld/test/wasm/data-layout.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/hello.ll -o %t.hello.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/hello.s -o %t.hello.o
 ; RUN: llc -filetype=obj %s -o %t.o
 
 target triple = "wasm32-unknown-unknown"
diff --git a/lld/test/wasm/demangle.ll b/lld/test/wasm/demangle.ll
deleted file mode 100644
index 64fa46ae4dce4..0000000000000
--- a/lld/test/wasm/demangle.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc -filetype=obj %s -o %t.o
-; RUN: not wasm-ld -o %t.wasm %t.o 2>&1 | FileCheck %s
-
-; CHECK: error: {{.*}}.o: undefined symbol: foo(int)
-
-; RUN: not wasm-ld --no-demangle \
-; RUN:     -o %t.wasm %t.o 2>&1 | FileCheck -check-prefix=CHECK-NODEMANGLE %s
-
-; CHECK-NODEMANGLE: error: {{.*}}.o: undefined symbol: _Z3fooi
-
-target triple = "wasm32-unknown-unknown"
-
-declare void @_Z3fooi(i32);
-
-define hidden void @_start() local_unnamed_addr {
-entry:
-    call void @_Z3fooi(i32 1)
-    ret void
-}
diff --git a/lld/test/wasm/demangle.s b/lld/test/wasm/demangle.s
new file mode 100644
index 0000000000000..7e1af46d64b89
--- /dev/null
+++ b/lld/test/wasm/demangle.s
@@ -0,0 +1,18 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: not wasm-ld -o %t.wasm %t.o 2>&1 | FileCheck %s
+
+# CHECK: error: {{.*}}.o: undefined symbol: foo(int)
+
+# RUN: not wasm-ld --no-demangle \
+# RUN:     -o %t.wasm %t.o 2>&1 | FileCheck -check-prefix=CHECK-NODEMANGLE %s
+
+# CHECK-NODEMANGLE: error: {{.*}}.o: undefined symbol: _Z3fooi
+
+  .globl  _start
+_start:
+  .functype _start () -> ()
+  i32.const 1
+  call  _Z3fooi
+  end_function
+
+.functype _Z3fooi (i32) -> ()
diff --git a/lld/test/wasm/emit-relocs-fpic.s b/lld/test/wasm/emit-relocs-fpic.s
index e1adede20a5f7..c70e1e6751098 100644
--- a/lld/test/wasm/emit-relocs-fpic.s
+++ b/lld/test/wasm/emit-relocs-fpic.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -o %t.o < %s
-# RUN: llc --relocation-model=pic -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 # RUN: wasm-ld -pie --export-all --no-gc-sections --no-entry --emit-relocs -o %t.wasm %t.o %t.ret32.o
 # RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/emit-relocs.ll b/lld/test/wasm/emit-relocs.ll
index cdf492ab8fbde..0317f9b0eb37a 100644
--- a/lld/test/wasm/emit-relocs.ll
+++ b/lld/test/wasm/emit-relocs.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -filetype=obj %s -o %t.o
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: wasm-ld --emit-relocs -o %t.wasm %t.o %t.ret32.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/entry-signature.ll b/lld/test/wasm/entry-signature.ll
deleted file mode 100644
index f7f3d481acfc6..0000000000000
--- a/lld/test/wasm/entry-signature.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; Verify that the entry point signature can be flexible.
-; RUN: llc -filetype=obj %s -o %t.o
-; RUN: wasm-ld -o %t1.wasm %t.o
-
-target triple = "wasm32-unknown-unknown-wasm"
-
-define hidden i32 @_start(i32, i64) local_unnamed_addr #0 {
-entry:
-  ret i32 0
-}
diff --git a/lld/test/wasm/entry-signature.s b/lld/test/wasm/entry-signature.s
new file mode 100644
index 0000000000000..b6f355c131934
--- /dev/null
+++ b/lld/test/wasm/entry-signature.s
@@ -0,0 +1,8 @@
+# Verify that the entry point signature can be flexible.
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld -o %t1.wasm %t.o
+
+  .globl  _start
+_start:
+  .functype _start (i64) -> (f32)
+  end_function
diff --git a/lld/test/wasm/export-empty.test b/lld/test/wasm/export-empty.test
index 1c0f7bbc779d9..03fc3d209c314 100644
--- a/lld/test/wasm/export-empty.test
+++ b/lld/test/wasm/export-empty.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
 RUN: not wasm-ld --export "" %t.o -o %t.wasm 2>&1 | FileCheck --match-full-lines %s
 
 CHECK: wasm-ld: error: symbol exported via --export not found: 
diff --git a/lld/test/wasm/export-optional-lazy.ll b/lld/test/wasm/export-optional-lazy.ll
deleted file mode 100644
index c37a3e5183eba..0000000000000
--- a/lld/test/wasm/export-optional-lazy.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; Optional linker-synthetic symbols are only created if they are undefined
-; in the final output.
-; This test is for a regression where an explicit --export of an lazy archive
-; symbol caused an undefined reference to an optional symbol to occur *after*
-; the optional symbols were created.
-
-; RUN: llc -filetype=obj %s -o %t.o
-; RUN: llc -filetype=obj %S/Inputs/optional-symbol.ll -o %t.a1.o
-; RUN: rm -f %t.a
-; RUN: llvm-ar rcs %t.a %t.a1.o
-; RUN: wasm-ld --export=get_optional %t.o %t.a -o %t.wasm
-; RUN: obj2yaml %t.wasm | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
-
-define void @_start() {
-entry:
-  ret void
-}
-
-; CHECK:      FunctionNames:
-; CHECK-NEXT:   - Index:           0
-; CHECK-NEXT:     Name:            _start
-; CHECK-NEXT:   - Index:           1
-; CHECK-NEXT:     Name:            get_optional
diff --git a/lld/test/wasm/export-optional-lazy.test b/lld/test/wasm/export-optional-lazy.test
new file mode 100644
index 0000000000000..6304d6bb2cd55
--- /dev/null
+++ b/lld/test/wasm/export-optional-lazy.test
@@ -0,0 +1,18 @@
+Optional linker-synthetic symbols are only created if they are undefined
+in the final output.
+This test is for a regression where an explicit --export of a lazy archive
+symbol caused an undefined reference to an optional symbol to occur *after*
+the optional symbols were created.
+
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/optional-symbol.s -o %t.a1.o
+RUN: rm -f %t.a
+RUN: llvm-ar rcs %t.a %t.a1.o
+RUN: wasm-ld --export=get_optional %t.o %t.a -o %t.wasm
+RUN: obj2yaml %t.wasm | FileCheck %s
+
+CHECK:      FunctionNames:
+CHECK-NEXT:   - Index:           0
+CHECK-NEXT:     Name:            _start
+CHECK-NEXT:   - Index:           1
+CHECK-NEXT:     Name:            get_optional
diff --git a/lld/test/wasm/export-table.test b/lld/test/wasm/export-table.test
index e2d05f00d5a7b..b218392652d68 100644
--- a/lld/test/wasm/export-table.test
+++ b/lld/test/wasm/export-table.test
@@ -1,4 +1,4 @@
-# RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.start.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.start.o
 # RUN: wasm-ld --export-table -o %t.wasm %t.start.o
 # RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/fatal-warnings.ll b/lld/test/wasm/fatal-warnings.ll
index d338420ef09c3..01a0137a2f454 100644
--- a/lld/test/wasm/fatal-warnings.ll
+++ b/lld/test/wasm/fatal-warnings.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -filetype=obj %s -o %t.main.o
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: wasm-ld -o %t.wasm %t.main.o %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-WARN
 ; RUN: not wasm-ld --fatal-warnings -o %t.wasm %t.main.o %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-FATAL
 
diff --git a/lld/test/wasm/function-imports-first.ll b/lld/test/wasm/function-imports-first.ll
index 73c113438f01a..b2751e047bd40 100644
--- a/lld/test/wasm/function-imports-first.ll
+++ b/lld/test/wasm/function-imports-first.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: llc -filetype=obj %s -o %t.o
 ; RUN: wasm-ld -o %t.wasm %t.o %t.ret32.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
diff --git a/lld/test/wasm/function-imports.ll b/lld/test/wasm/function-imports.ll
index 4fb1c64c72bf0..5f2d6f9ee611e 100644
--- a/lld/test/wasm/function-imports.ll
+++ b/lld/test/wasm/function-imports.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: llc -filetype=obj %s -o %t.o
 ; RUN: wasm-ld -o %t.wasm %t.ret32.o %t.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
diff --git a/lld/test/wasm/function-index.test b/lld/test/wasm/function-index.test
index fbcde6cd81685..0b32551c5af68 100644
--- a/lld/test/wasm/function-index.test
+++ b/lld/test/wasm/function-index.test
@@ -1,5 +1,5 @@
-# RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
-# RUN: llc -filetype=obj %p/Inputs/ret64.ll -o %t.ret64.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret64.s -o %t.ret64.o
 # RUN: wasm-ld -r -o %t.wasm %t.ret32.o %t.ret64.o
 # RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/global-base.test b/lld/test/wasm/global-base.test
index 723b0d79af11f..56efadd9d5889 100644
--- a/lld/test/wasm/global-base.test
+++ b/lld/test/wasm/global-base.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
 
 RUN: wasm-ld --export=__global_base --export=__data_end --allow-undefined -o %t.wasm %t.o
 RUN: obj2yaml %t.wasm | FileCheck %s  -check-prefix=CHECK-1024
diff --git a/lld/test/wasm/growable-table.test b/lld/test/wasm/growable-table.test
index cd52f2e1662ff..00cff3fc416ee 100644
--- a/lld/test/wasm/growable-table.test
+++ b/lld/test/wasm/growable-table.test
@@ -1,4 +1,4 @@
-# RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.start.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.start.o
 # RUN: wasm-ld --export-table --growable-table -o %t.wasm %t.start.o
 # RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/import-attribute-mismatch.ll b/lld/test/wasm/import-attribute-mismatch.ll
deleted file mode 100644
index d3ba294a212b0..0000000000000
--- a/lld/test/wasm/import-attribute-mismatch.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -filetype=obj %s -o %t1.o
-; RUN: llc -filetype=obj %S/Inputs/import-attributes.ll -o %t2.o
-; RUN: not wasm-ld --export call_foo --allow-undefined -o %t.wasm %t1.o %t2.o 2>&1 | FileCheck %s
-
-target triple = "wasm32-unknown-unknown-wasm"
-
-define void @_start() {
-  call void @foo();
-  ret void
-}
-
-declare void @foo() #0
-
-attributes #0 = { "wasm-import-module"="bar" }
-
-; CHECK: wasm-ld: error: import module mismatch for symbol: foo
-; CHECK: >>> defined as bar in {{.*}}1.o
-; CHECK: >>> defined as baz in {{.*}}2.o
diff --git a/lld/test/wasm/import-attribute-mismatch.s b/lld/test/wasm/import-attribute-mismatch.s
new file mode 100644
index 0000000000000..67aa1e39ef368
--- /dev/null
+++ b/lld/test/wasm/import-attribute-mismatch.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t1.o %s
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %S/Inputs/import-attributes.s -o %t2.o
+# RUN: not wasm-ld --export call_foo --allow-undefined -o %t.wasm %t1.o %t2.o 2>&1 | FileCheck %s
+
+  .globl  _start
+_start:
+  .functype _start () -> ()
+  call  foo
+  end_function
+
+.functype foo () -> ()
+.import_module  foo, bar
+
+# CHECK: wasm-ld: error: import module mismatch for symbol: foo
+# CHECK: >>> defined as bar in {{.*}}1.o
+# CHECK: >>> defined as baz in {{.*}}2.o
diff --git a/lld/test/wasm/import-memory.test b/lld/test/wasm/import-memory.test
index d7a257199dbbb..bc9e64fe578e5 100644
--- a/lld/test/wasm/import-memory.test
+++ b/lld/test/wasm/import-memory.test
@@ -1,4 +1,4 @@
-# RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.start.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.start.o
 # RUN: wasm-ld --import-memory -o %t.wasm %t.start.o
 # RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/import-table.test b/lld/test/wasm/import-table.test
index 440509b3483c4..4d1b4c9a78325 100644
--- a/lld/test/wasm/import-table.test
+++ b/lld/test/wasm/import-table.test
@@ -1,4 +1,4 @@
-# RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.start.o
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.start.o
 # RUN: wasm-ld --import-table -o %t.wasm %t.start.o
 # RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/invalid-stack-size.test b/lld/test/wasm/invalid-stack-size.test
index 90c9fda113e62..645e1a49f8d6b 100644
--- a/lld/test/wasm/invalid-stack-size.test
+++ b/lld/test/wasm/invalid-stack-size.test
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
 ; RUN: not wasm-ld -o %t.wasm -z stack-size=1 %t.o 2>&1 | FileCheck %s
 
 ; CHECK: error: stack size must be 16-byte aligned
diff --git a/lld/test/wasm/large-memory.test b/lld/test/wasm/large-memory.test
index 0713a8b02dd15..40270c5195181 100644
--- a/lld/test/wasm/large-memory.test
+++ b/lld/test/wasm/large-memory.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
 
 ; Verify we can parse large integers such as when we ask for 2G of total
 ; memory.
diff --git a/lld/test/wasm/load-undefined.test b/lld/test/wasm/load-undefined.test
index 1b8d259d5b8dd..3171d4a4a4aa0 100644
--- a/lld/test/wasm/load-undefined.test
+++ b/lld/test/wasm/load-undefined.test
@@ -1,9 +1,9 @@
 ; Verify that the -u / --undefined option is able to pull in symbols from
 ; an archive, and doesn't error when uses to pull in a symbol already loaded.
 ;
-; RUN: llc -filetype=obj %S/Inputs/ret64.ll -o %t.o
-; RUN: llc -filetype=obj %S/Inputs/ret32.ll -o %t2.o
-; RUN: llc -filetype=obj %S/Inputs/start.ll -o %t.start.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret64.s -o %t.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t2.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.start.o
 ; RUN: rm -f %t2.a
 ; RUN: llvm-ar rcs %t2.a %t2.o
 ; RUN: wasm-ld %t.start.o --no-gc-sections %t2.a %t.o -o %t.wasm -u ret32 --undefined ret64
diff --git a/lld/test/wasm/no-tls.ll b/lld/test/wasm/no-tls.ll
new file mode 100644
index 0000000000000..54bb9cee01699
--- /dev/null
+++ b/lld/test/wasm/no-tls.ll
@@ -0,0 +1,48 @@
+; Testing that __tls_size and __tls_align are correctly emitted when there are
+; no thread_local variables.
+
+; RUN: llc -mattr=+bulk-memory,+atomics -filetype=obj %s -o %t.o
+
+target triple = "wasm32-unknown-unknown"
+
+define void @_start() local_unnamed_addr {
+entry:
+  ret void
+}
+
+; RUN: wasm-ld -no-gc-sections --shared-memory --max-memory=131072 --allow-undefined -o %t.wasm %t.o
+; RUN: obj2yaml %t.wasm | FileCheck %s
+; CHECK:       - Type:            GLOBAL
+; CHECK-NEXT:    Globals:
+
+; __stack_pointer
+; CHECK-NEXT:      - Index:           0
+; CHECK-NEXT:        Type:            I32
+; CHECK-NEXT:        Mutable:         true
+; CHECK-NEXT:        InitExpr:
+; CHECK-NEXT:          Opcode:          I32_CONST
+; CHECK-NEXT:          Value:           66576
+
+; __tls_base
+; CHECK-NEXT:      - Index:           1
+; CHECK-NEXT:        Type:            I32
+; CHECK-NEXT:        Mutable:         true
+; CHECK-NEXT:        InitExpr:
+; CHECK-NEXT:          Opcode:          I32_CONST
+; CHECK-NEXT:          Value:           0
+
+; __tls_size
+; CHECK-NEXT:      - Index:           2
+; CHECK-NEXT:        Type:            I32
+; CHECK-NEXT:        Mutable:         false
+; CHECK-NEXT:        InitExpr:
+; CHECK-NEXT:          Opcode:          I32_CONST
+; CHECK-NEXT:          Value:           0
+
+; __tls_align
+; CHECK-NEXT:      - Index:           3
+; CHECK-NEXT:        Type:            I32
+; CHECK-NEXT:        Mutable:         false
+; CHECK-NEXT:        InitExpr:
+; CHECK-NEXT:          Opcode:          I32_CONST
+; CHECK-NEXT:          Value:           1
diff --git a/lld/test/wasm/no-tls.test b/lld/test/wasm/no-tls.test
deleted file mode 100644
index 225d78a968565..0000000000000
--- a/lld/test/wasm/no-tls.test
+++ /dev/null
@@ -1,41 +0,0 @@
-; Testing that __tls_size and __tls_align are correctly emitted when there are
-; no thread_local variables.
-
-RUN: llc -mattr=+bulk-memory,+atomics -filetype=obj %p/Inputs/start.ll -o %t.o
-
-RUN: wasm-ld -no-gc-sections --shared-memory --max-memory=131072 --allow-undefined -o %t.wasm %t.o
-RUN: obj2yaml %t.wasm | FileCheck %s
-CHECK:       - Type:            GLOBAL
-CHECK-NEXT:    Globals:
-
-; __stack_pointer
-CHECK-NEXT:      - Index:           0
-CHECK-NEXT:        Type:            I32
-CHECK-NEXT:        Mutable:         true
-CHECK-NEXT:        InitExpr:
-CHECK-NEXT:          Opcode:          I32_CONST
-CHECK-NEXT:          Value:           66576
-
-; __tls_base
-CHECK-NEXT:      - Index:           1
-CHECK-NEXT:        Type:            I32
-CHECK-NEXT:        Mutable:         true
-CHECK-NEXT:        InitExpr:
-CHECK-NEXT:          Opcode:          I32_CONST
-CHECK-NEXT:          Value:           0
-
-; __tls_size
-CHECK-NEXT:      - Index:           2
-CHECK-NEXT:        Type:            I32
-CHECK-NEXT:        Mutable:         false
-CHECK-NEXT:        InitExpr:
-CHECK-NEXT:          Opcode:          I32_CONST
-CHECK-NEXT:          Value:           0
-
-; __tls_align
-CHECK-NEXT:      - Index:           3
-CHECK-NEXT:        Type:            I32
-CHECK-NEXT:        Mutable:         false
-CHECK-NEXT:        InitExpr:
-CHECK-NEXT:          Opcode:          I32_CONST
-CHECK-NEXT:          Value:           1
diff --git a/lld/test/wasm/optional-symbol.ll b/lld/test/wasm/optional-symbol.ll
deleted file mode 100644
index ac1a4212fbf09..0000000000000
--- a/lld/test/wasm/optional-symbol.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -filetype=obj -o %t.o %s
-; RUN: wasm-ld --export=get_handle %t.o -o %t.wasm
-
-target triple = "wasm32-unknown-unknown"
-
-@__dso_handle = external global i8*
-
-define i8** @get_handle() {
-  ret i8** @__dso_handle
-}
-
-define void @_start() {
-  ret void
-}
diff --git a/lld/test/wasm/optional-symbol.s b/lld/test/wasm/optional-symbol.s
new file mode 100644
index 0000000000000..bac98838e6aee
--- /dev/null
+++ b/lld/test/wasm/optional-symbol.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld --export=get_handle %t.o -o %t.wasm
+# __dso_handle is referenced by get_handle but never defined here; the link must still succeed.
+  .globl  get_handle
+get_handle:
+  .functype get_handle () -> (i32)
+  i32.const __dso_handle
+  end_function
+
+  .globl  _start
+_start:
+  .functype _start () -> ()
+  end_function
diff --git a/lld/test/wasm/pic-static.ll b/lld/test/wasm/pic-static.ll
index 65b47175865ee..d9fe0eacc98b9 100644
--- a/lld/test/wasm/pic-static.ll
+++ b/lld/test/wasm/pic-static.ll
@@ -1,7 +1,7 @@
 ; Test that PIC code can be linked into static binaries.
 ; In this case the GOT entries will end up as internalized wasm globals with
 ; fixed values.
-; RUN: llc -relocation-model=pic -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: llc -relocation-model=pic -filetype=obj %s -o %t.o
 ; RUN: wasm-ld --allow-undefined --export-all -o %t.wasm %t.o %t.ret32.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
diff --git a/lld/test/wasm/relocatable.ll b/lld/test/wasm/relocatable.ll
index a43a06e4be581..1bf0391086148 100644
--- a/lld/test/wasm/relocatable.ll
+++ b/lld/test/wasm/relocatable.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/hello.ll -o %t.hello.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/hello.s -o %t.hello.o
 ; RUN: llc -filetype=obj %s -o %t.o
 ; RUN: wasm-ld -r -o %t.wasm %t.hello.o %t.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
diff --git a/lld/test/wasm/responsefile.test b/lld/test/wasm/responsefile.test
index 85ac41f93f2be..36209d48edda5 100644
--- a/lld/test/wasm/responsefile.test
+++ b/lld/test/wasm/responsefile.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj -o %t.o %p/Inputs/ret32.ll
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.o
 
 RUN: echo "%/t.o -o %/t.wasm -e ret32" > %t.rsp
 RUN: wasm-ld @%t.rsp --initial-memory=655360
diff --git a/lld/test/wasm/shared-needed.ll b/lld/test/wasm/shared-needed.ll
index f0afb10985019..00ecdb68202e9 100644
--- a/lld/test/wasm/shared-needed.ll
+++ b/lld/test/wasm/shared-needed.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -filetype=obj %s -o %t.o
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 
 ; RUN: wasm-ld -shared -o %t1.so %t.o
 ; RUN: obj2yaml %t1.so | FileCheck %s -check-prefix=SO1
diff --git a/lld/test/wasm/signature-mismatch-export.ll b/lld/test/wasm/signature-mismatch-export.ll
index 55ca66ba81bd5..80c05c33913fc 100644
--- a/lld/test/wasm/signature-mismatch-export.ll
+++ b/lld/test/wasm/signature-mismatch-export.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: llc -filetype=obj %s -o %t.main.o
 ; RUN: wasm-ld --export=ret32 -o %t.wasm %t.main.o %t.ret32.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
diff --git a/lld/test/wasm/signature-mismatch-unknown.ll b/lld/test/wasm/signature-mismatch-unknown.ll
index 9bbad0065dcdc..c78bff556eeec 100644
--- a/lld/test/wasm/signature-mismatch-unknown.ll
+++ b/lld/test/wasm/signature-mismatch-unknown.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: llc -filetype=obj %s -o %t.main.o
 ; RUN: wasm-ld --fatal-warnings -o %t.wasm %t.ret32.o %t.main.o
 ; RUN: wasm-ld --fatal-warnings -o %t.wasm %t.main.o %t.ret32.o
@@ -7,7 +7,7 @@
 ; references ret32:
 ; %t.main.o: Does not call ret32 directly; used the wrong signature.
 ; %t.call-ret32.o: Calls ret32 directly; uses the correct signature.
-; RUN: llc -filetype=obj %p/Inputs/call-ret32.ll -o %t.call-ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-ret32.s -o %t.call-ret32.o
 ; RUN: wasm-ld --export=call_ret32 --fatal-warnings -o %t.wasm %t.main.o %t.call-ret32.o %t.ret32.o
 ; RUN: wasm-ld --export=call_ret32 --fatal-warnings -o %t.wasm %t.call-ret32.o %t.main.o %t.ret32.o
 
diff --git a/lld/test/wasm/signature-mismatch-weak.ll b/lld/test/wasm/signature-mismatch-weak.ll
index 4d2b02cc9ed33..bf94d5361b386 100644
--- a/lld/test/wasm/signature-mismatch-weak.ll
+++ b/lld/test/wasm/signature-mismatch-weak.ll
@@ -1,5 +1,5 @@
-; RUN: llc -filetype=obj %p/Inputs/weak-symbol1.ll -o %t.weak.o
-; RUN: llc -filetype=obj %p/Inputs/strong-symbol.ll -o %t.strong.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/weak-symbol1.s -o %t.weak.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/strong-symbol.s -o %t.strong.o
 ; RUN: llc -filetype=obj %s -o %t.o
 ; RUN: wasm-ld -o %t.wasm %t.o %t.strong.o %t.weak.o 2>&1 | FileCheck %s
 
diff --git a/lld/test/wasm/signature-mismatch.ll b/lld/test/wasm/signature-mismatch.ll
index bb9204ea7e459..b797b013820b2 100644
--- a/lld/test/wasm/signature-mismatch.ll
+++ b/lld/test/wasm/signature-mismatch.ll
@@ -1,5 +1,5 @@
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
-; RUN: llc -filetype=obj %p/Inputs/call-ret32.ll -o %t.call.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-ret32.s -o %t.call.o
 ; RUN: llc -filetype=obj %s -o %t.main.o
 
 ; RUN: wasm-ld --export=call_ret32 --export=ret32 -o %t.wasm %t.main.o %t.ret32.o %t.call.o 2>&1 | FileCheck %s -check-prefix=WARN
@@ -76,7 +76,7 @@ declare i32 @ret32(i32, i64, i32) local_unnamed_addr
 ; RELOC-NEXT:       - Index:           3
 ; RELOC-NEXT:         Kind:            FUNCTION
 ; RELOC-NEXT:         Name:            call_ret32
-; RELOC-NEXT:         Flags:           [ VISIBILITY_HIDDEN ]
+; RELOC-NEXT:         Flags:           [ ]
 ; RELOC-NEXT:         Function:        3
 ; RELOC-NEXT:       - Index:           4
 ; RELOC-NEXT:         Kind:            DATA
diff --git a/lld/test/wasm/stack-first.test b/lld/test/wasm/stack-first.test
index 805acfb6fa6fe..9d7f077d58cf0 100644
--- a/lld/test/wasm/stack-first.test
+++ b/lld/test/wasm/stack-first.test
@@ -3,7 +3,7 @@
 ; stack size of 512.  This means (since the stack grows down) the stack pointer
 ; global should be initialized to 512.
 
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
 RUN: wasm-ld -z stack-size=512 --stack-first --export=__data_end --export=__heap_base -o %t.wasm %t.o
 RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/strip-all.test b/lld/test/wasm/strip-all.test
index 7b7c25963bc75..ae3314f30676c 100644
--- a/lld/test/wasm/strip-all.test
+++ b/lld/test/wasm/strip-all.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.start.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.start.o
 RUN: wasm-ld --strip-all -o %t.wasm %t.start.o
 RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/strip-debug.test b/lld/test/wasm/strip-debug.test
index 6ee27f8c8a776..12a9dc7816f57 100644
--- a/lld/test/wasm/strip-debug.test
+++ b/lld/test/wasm/strip-debug.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.start.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.start.o
 RUN: wasm-ld --strip-debug -o %t.wasm %t.start.o
 RUN: obj2yaml %t.wasm | FileCheck %s
 
diff --git a/lld/test/wasm/symbol-type-mismatch.ll b/lld/test/wasm/symbol-type-mismatch.ll
deleted file mode 100644
index 4738c4bd00b9f..0000000000000
--- a/lld/test/wasm/symbol-type-mismatch.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc -filetype=obj %s -o %t.o
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
-; RUN: not wasm-ld -o %t.wasm %t.o %t.ret32.o 2>&1 | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
-
-@ret32 = extern_weak global i32, align 4
-
-; CHECK: error: symbol type mismatch: ret32
-; CHECK: >>> defined as WASM_SYMBOL_TYPE_DATA in {{.*}}symbol-type-mismatch.ll.tmp.o
-; CHECK: >>> defined as WASM_SYMBOL_TYPE_FUNCTION in {{.*}}.ret32.o
diff --git a/lld/test/wasm/symbol-type-mismatch.s b/lld/test/wasm/symbol-type-mismatch.s
new file mode 100644
index 0000000000000..d68abc3d349f8
--- /dev/null
+++ b/lld/test/wasm/symbol-type-mismatch.s
@@ -0,0 +1,9 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
+# RUN: not wasm-ld -o %t.wasm %t.o %t.ret32.o 2>&1 | FileCheck %s
+
+.weak ret32
+
+# CHECK: error: symbol type mismatch: ret32
+# CHECK: >>> defined as WASM_SYMBOL_TYPE_DATA in {{.*}}symbol-type-mismatch.s.tmp.o
+# CHECK: >>> defined as WASM_SYMBOL_TYPE_FUNCTION in {{.*}}.ret32.o
diff --git a/lld/test/wasm/trace-symbol.ll b/lld/test/wasm/trace-symbol.ll
index e589de0f6d43e..25154004d6b0e 100644
--- a/lld/test/wasm/trace-symbol.ll
+++ b/lld/test/wasm/trace-symbol.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 ; RUN: llc -filetype=obj -o %t.start.o %s
 ; RUN: wasm-ld -o %t.wasm %t.start.o %t.ret32.o -y ret32 -y _start | FileCheck %s -check-prefix=BOTH
 ; RUN: wasm-ld -o %t.wasm %t.ret32.o %t.start.o -y ret32 -y _start | FileCheck %s -check-prefix=REVERSED
diff --git a/lld/test/wasm/trace.test b/lld/test/wasm/trace.test
index 023a2ccb2cc0d..ae6f6335c4e09 100644
--- a/lld/test/wasm/trace.test
+++ b/lld/test/wasm/trace.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.foo.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.foo.o
 
 # Check -t
 RUN: wasm-ld %t.foo.o -o %t.t.out.wasm -t 2>&1 | FileCheck %s
diff --git a/lld/test/wasm/undefined-entry.test b/lld/test/wasm/undefined-entry.test
index 3106a76510f46..a5bca16a0de0e 100644
--- a/lld/test/wasm/undefined-entry.test
+++ b/lld/test/wasm/undefined-entry.test
@@ -1,4 +1,4 @@
-RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 RUN: not wasm-ld -o %t.wasm %t.ret32.o 2>&1 | FileCheck %s
 RUN: not wasm-ld --allow-undefined -o %t.wasm %t.ret32.o 2>&1 | FileCheck %s
 RUN: not wasm-ld -entry=foo -o %t.wasm %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-CUSTOM
diff --git a/lld/test/wasm/version.ll b/lld/test/wasm/version.ll
deleted file mode 100644
index 84932b029c9e4..0000000000000
--- a/lld/test/wasm/version.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc -filetype=obj %s -o %t.o
-; RUN: wasm-ld -o %t.wasm %t.o
-; RUN: llvm-readobj --file-headers %t.wasm | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
-
-define hidden void @_start() local_unnamed_addr #0 {
-entry:
-    ret void
-}
-
-; CHECK: Format: WASM
-; CHECK: Arch: wasm32
-; CHECK: AddressSize: 32bit
-; CHECK: Version: 0x1
diff --git a/lld/test/wasm/version.s b/lld/test/wasm/version.s
new file mode 100644
index 0000000000000..2ce86f109fef2
--- /dev/null
+++ b/lld/test/wasm/version.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld -o %t.wasm %t.o
+# RUN: llvm-readobj --file-headers %t.wasm | FileCheck %s
+
+  .globl  _start
+_start:
+  .functype _start () -> ()
+  end_function
+
+# CHECK: Format: WASM
+# CHECK: Arch: wasm32
+# CHECK: AddressSize: 32bit
+# CHECK: Version: 0x1
diff --git a/lld/test/wasm/visibility-hidden.ll b/lld/test/wasm/visibility-hidden.ll
index 99acd5651f7ba..36c29a8e47385 100644
--- a/lld/test/wasm/visibility-hidden.ll
+++ b/lld/test/wasm/visibility-hidden.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -filetype=obj -o %t.o %s
-; RUN: llc -filetype=obj %S/Inputs/hidden.ll -o %t2.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/hidden.s -o %t2.o
 ; RUN: rm -f %t2.a
 ; RUN: llvm-ar rcs %t2.a %t2.o
 
diff --git a/lld/test/wasm/weak-symbols.ll b/lld/test/wasm/weak-symbols.ll
index 70357b23101de..43e9014ff322a 100644
--- a/lld/test/wasm/weak-symbols.ll
+++ b/lld/test/wasm/weak-symbols.ll
@@ -1,5 +1,5 @@
-; RUN: llc -filetype=obj %p/Inputs/weak-symbol1.ll -o %t1.o
-; RUN: llc -filetype=obj %p/Inputs/weak-symbol2.ll -o %t2.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/weak-symbol1.s -o %t1.o
+; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/weak-symbol2.s -o %t2.o
 ; RUN: llc -filetype=obj %s -o %t.o
 ; RUN: wasm-ld --export-dynamic -o %t.wasm %t.o %t1.o %t2.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s
diff --git a/lld/test/wasm/whole-archive.test b/lld/test/wasm/whole-archive.test
index 814acbf432e7d..34b5932aeec83 100644
--- a/lld/test/wasm/whole-archive.test
+++ b/lld/test/wasm/whole-archive.test
@@ -1,5 +1,5 @@
-RUN: llc -filetype=obj %p/Inputs/start.ll -o %t.o
-RUN: llc -filetype=obj %p/Inputs/ret32.ll -o %t.ret32.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/start.s -o %t.o
+RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o
 RUN: rm -f %t.a
 RUN: llvm-ar rcs %t.a %t.ret32.o
 

From 1285e8bcac2c54ddd924ffb813b2b187467ac2a6 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Wed, 27 May 2020 23:12:36 -0700
Subject: [PATCH 451/770] Run Coverage pass before other *San passes under new
 pass manager, round 2

Summary:
This was attempted once before in https://reviews.llvm.org/D79698, but
was reverted due to the coverage pass running in the wrong part of the
pipeline. This commit puts it in the same place as the other sanitizers.

This changes PassBuilder.OptimizerLastEPCallbacks to work on a
ModulePassManager instead of a FunctionPassManager. That is because
SanitizerCoverage cannot (easily) be split into a module pass and a
function pass like some of the other sanitizers since in its current
implementation it conditionally inserts module constructors based on
whether or not it successfully modified functions.

This fixes compiler-rt/test/msan/coverage-levels.cpp under the new pass
manager (last check-msan test).

Currently sanitizers + LTO don't work together under the new pass
manager, so I removed tests that checked that this combination works for
sancov.

Subscribers: hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D80692
---
 clang/lib/CodeGen/BackendUtil.cpp      | 50 +++++++++++++++++---------
 clang/test/CodeGen/sancov-new-pm.c     |  8 ++---
 llvm/include/llvm/Passes/PassBuilder.h |  4 +--
 llvm/lib/Passes/PassBuilder.cpp        |  6 ++--
 llvm/tools/opt/NewPMDriver.cpp         |  2 +-
 5 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index e746aef1a62ff..dd5016333920d 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/LTO/LTOBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -1001,6 +1002,15 @@ static void addSanitizersAtO0(ModulePassManager &MPM,
                               const Triple &TargetTriple,
                               const LangOptions &LangOpts,
                               const CodeGenOptions &CodeGenOpts) {
+  if (CodeGenOpts.SanitizeCoverageType ||
+      CodeGenOpts.SanitizeCoverageIndirectCalls ||
+      CodeGenOpts.SanitizeCoverageTraceCmp) {
+    auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
+    MPM.addPass(ModuleSanitizerCoveragePass(
+        SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
+        CodeGenOpts.SanitizeCoverageBlacklistFiles));
+  }
+
   auto ASanPass = [&](SanitizerMask Mask, bool CompileKernel) {
     MPM.addPass(RequireAnalysisPass<ASanGlobalsMetadataAnalysis, Module>());
     bool Recover = CodeGenOpts.SanitizeRecover.has(Mask);
@@ -1249,6 +1259,20 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
             [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
               FPM.addPass(BoundsCheckingPass());
             });
+
+      if (CodeGenOpts.SanitizeCoverageType ||
+          CodeGenOpts.SanitizeCoverageIndirectCalls ||
+          CodeGenOpts.SanitizeCoverageTraceCmp) {
+        PB.registerOptimizerLastEPCallback(
+            [this](ModulePassManager &MPM,
+                   PassBuilder::OptimizationLevel Level) {
+              auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
+              MPM.addPass(ModuleSanitizerCoveragePass(
+                  SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
+                  CodeGenOpts.SanitizeCoverageBlacklistFiles));
+            });
+      }
+
       if (LangOpts.Sanitize.has(SanitizerKind::Memory)) {
         int TrackOrigins = CodeGenOpts.SanitizeMemoryTrackOrigins;
         bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::Memory);
@@ -1257,17 +1281,19 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
               MPM.addPass(MemorySanitizerPass({TrackOrigins, Recover, false}));
             });
         PB.registerOptimizerLastEPCallback(
-            [TrackOrigins, Recover](FunctionPassManager &FPM,
+            [TrackOrigins, Recover](ModulePassManager &MPM,
                                     PassBuilder::OptimizationLevel Level) {
-              FPM.addPass(MemorySanitizerPass({TrackOrigins, Recover, false}));
+              MPM.addPass(createModuleToFunctionPassAdaptor(
+                  MemorySanitizerPass({TrackOrigins, Recover, false})));
             });
       }
       if (LangOpts.Sanitize.has(SanitizerKind::Thread)) {
         PB.registerPipelineStartEPCallback(
             [](ModulePassManager &MPM) { MPM.addPass(ThreadSanitizerPass()); });
         PB.registerOptimizerLastEPCallback(
-            [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
-              FPM.addPass(ThreadSanitizerPass());
+            [](ModulePassManager &MPM, PassBuilder::OptimizationLevel Level) {
+              MPM.addPass(
+                  createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
             });
       }
       if (LangOpts.Sanitize.has(SanitizerKind::Address)) {
@@ -1278,10 +1304,11 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
         bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::Address);
         bool UseAfterScope = CodeGenOpts.SanitizeAddressUseAfterScope;
         PB.registerOptimizerLastEPCallback(
-            [Recover, UseAfterScope](FunctionPassManager &FPM,
+            [Recover, UseAfterScope](ModulePassManager &MPM,
                                      PassBuilder::OptimizationLevel Level) {
-              FPM.addPass(AddressSanitizerPass(
-                  /*CompileKernel=*/false, Recover, UseAfterScope));
+              MPM.addPass(
+                  createModuleToFunctionPassAdaptor(AddressSanitizerPass(
+                      /*CompileKernel=*/false, Recover, UseAfterScope)));
             });
         bool ModuleUseAfterScope = asanUseGlobalsGC(TargetTriple, CodeGenOpts);
         bool UseOdrIndicator = CodeGenOpts.SanitizeAddressUseOdrIndicator;
@@ -1325,15 +1352,6 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
       }
     }
 
-    if (CodeGenOpts.SanitizeCoverageType ||
-        CodeGenOpts.SanitizeCoverageIndirectCalls ||
-        CodeGenOpts.SanitizeCoverageTraceCmp) {
-      auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
-      MPM.addPass(ModuleSanitizerCoveragePass(
-          SancovOpts, CodeGenOpts.SanitizeCoverageWhitelistFiles,
-          CodeGenOpts.SanitizeCoverageBlacklistFiles));
-    }
-
     if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) {
       bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::HWAddress);
       MPM.addPass(HWAddressSanitizerPass(
diff --git a/clang/test/CodeGen/sancov-new-pm.c b/clang/test/CodeGen/sancov-new-pm.c
index 06d9042bc70a8..87c836bd40c95 100644
--- a/clang/test/CodeGen/sancov-new-pm.c
+++ b/clang/test/CodeGen/sancov-new-pm.c
@@ -1,10 +1,6 @@
 // Test that SanitizerCoverage works under the new pass manager.
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=fuzzer %s -fexperimental-new-pass-manager -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O0
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=fuzzer %s -fexperimental-new-pass-manager -O2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O2
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=fuzzer %s -fexperimental-new-pass-manager -flto -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=fuzzer %s -fexperimental-new-pass-manager -flto -O2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O2
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=fuzzer %s -fexperimental-new-pass-manager -flto=thin -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=fuzzer %s -fexperimental-new-pass-manager -flto=thin -O2 -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O2,CHECK-O2-THINLTO
 
 extern void *memcpy(void *, const void *, unsigned long);
 extern int printf(const char *restrict, ...);
@@ -29,10 +25,10 @@ int LLVMFuzzerTestOneInput(const unsigned char *data, unsigned long size) {
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_cmp2(i16 zeroext, i16 zeroext)
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_cmp4(i32 zeroext, i32 zeroext)
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_cmp8(i64, i64)
-// CHECK-O2-THINLTO-NOT: declare void @__sanitizer_cov_trace_const_cmp1(i8 zeroext, i8 zeroext)
+// CHECK-O2-NOT: declare void @__sanitizer_cov_trace_const_cmp1(i8 zeroext, i8 zeroext)
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_const_cmp2(i16 zeroext, i16 zeroext)
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_const_cmp4(i32 zeroext, i32 zeroext)
-// CHECK-O2-THINLTO-NOT: declare void @__sanitizer_cov_trace_const_cmp8(i64, i64)
+// CHECK-O2-NOT: declare void @__sanitizer_cov_trace_const_cmp8(i64, i64)
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_div4(i32 zeroext)
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_div8(i64)
 // CHECK-O0-DAG: declare void @__sanitizer_cov_trace_gep(i64)
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 391d144d5dcdf..d5a70c2ae132d 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -600,7 +600,7 @@ class PassBuilder {
   /// is not triggered at O0. Extensions to the O0 pipeline should append their
   /// passes to the end of the overall pipeline.
   void registerOptimizerLastEPCallback(
-      const std::function<void(FunctionPassManager &, OptimizationLevel)> &C) {
+      const std::function<void(ModulePassManager &, OptimizationLevel)> &C) {
     OptimizerLastEPCallbacks.push_back(C);
   }
 
@@ -728,7 +728,7 @@ class PassBuilder {
       CGSCCOptimizerLateEPCallbacks;
   SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
       VectorizerStartEPCallbacks;
-  SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
+  SmallVector<std::function<void(ModulePassManager &, OptimizationLevel)>, 2>
       OptimizerLastEPCallbacks;
   // Module callbacks
   SmallVector<std::function<void(ModulePassManager &)>, 2>
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 0999f7872d12c..1b1701cbe2619 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1073,12 +1073,12 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
   if (PTO.Coroutines)
     OptimizePM.addPass(CoroCleanupPass());
 
-  for (auto &C : OptimizerLastEPCallbacks)
-    C(OptimizePM, Level);
-
   // Add the core optimizing pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
 
+  for (auto &C : OptimizerLastEPCallbacks)
+    C(MPM, Level);
+
   if (PTO.CallGraphProfile)
     MPM.addPass(CGProfilePass());
 
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 2e84ca49b6e0b..c99ad2f7b4dcf 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -194,7 +194,7 @@ static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
         });
   if (tryParsePipelineText<FunctionPassManager>(PB, OptimizerLastEPPipeline))
     PB.registerOptimizerLastEPCallback(
-        [&PB, VerifyEachPass, DebugLogging](FunctionPassManager &PM,
+        [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM,
                                             PassBuilder::OptimizationLevel) {
           ExitOnError Err("Unable to parse OptimizerLastEP pipeline: ");
           Err(PB.parsePassPipeline(PM, OptimizerLastEPPipeline, VerifyEachPass,

From 8c050070fb96b4e6b8608a120d102bac0c8ada1c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 28 May 2020 17:00:56 -0700
Subject: [PATCH 452/770] [X86] Fix a nullptr dereference in
 X86Subtarget::classifyLocalReference when compiling with -mcmodel=medium
 -fpic and using a constant pool

LowerConstantPool passes a nullptr into classifyLocalReference. The medium code model handling for PIC will try to dereference it using isa. This patch switches to isa_and_nonnull.

Differential Revision: https://reviews.llvm.org/D80763
---
 llvm/lib/Target/X86/X86Subtarget.cpp    |  4 ++-
 llvm/test/CodeGen/X86/code-model-elf.ll | 43 +++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 8f7185cc5fa62..975cbabb30fd5 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -88,7 +88,9 @@ X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
 
       // Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data.
       case CodeModel::Medium:
-        if (isa<Function>(GV))
+        // Constant pool and jump table handling pass a nullptr to this
+        // function so we need to use isa_and_nonnull.
+        if (isa_and_nonnull<Function>(GV))
           return X86II::MO_NO_FLAG; // All code is RIP-relative
         return X86II::MO_GOTOFF;    // Local symbols use GOTOFF.
       }
diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll
index f7ffd6ea1eb7c..82d0d1ef59131 100644
--- a/llvm/test/CodeGen/X86/code-model-elf.ll
+++ b/llvm/test/CodeGen/X86/code-model-elf.ll
@@ -439,6 +439,49 @@ define dso_local i32 @load_thread_data() #0 {
   ret i32 %1
 }
 
+define dso_local float @load_constant_pool(float %x) #0 {
+; SMALL-STATIC-LABEL: load_constant_pool:
+; SMALL-STATIC:       # %bb.0:
+; SMALL-STATIC-NEXT:    addss {{\.LCPI.*}}(%rip), %xmm0
+; SMALL-STATIC-NEXT:    retq
+;
+; MEDIUM-STATIC-LABEL: load_constant_pool:
+; MEDIUM-STATIC:       # %bb.0:
+; MEDIUM-STATIC-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; MEDIUM-STATIC-NEXT:    addss (%rax), %xmm0
+; MEDIUM-STATIC-NEXT:    retq
+;
+; LARGE-STATIC-LABEL: load_constant_pool:
+; LARGE-STATIC:       # %bb.0:
+; LARGE-STATIC-NEXT:    movabsq ${{\.LCPI.*}}, %rax
+; LARGE-STATIC-NEXT:    addss (%rax), %xmm0
+; LARGE-STATIC-NEXT:    retq
+;
+; SMALL-PIC-LABEL: load_constant_pool:
+; SMALL-PIC:       # %bb.0:
+; SMALL-PIC-NEXT:    addss {{\.LCPI.*}}(%rip), %xmm0
+; SMALL-PIC-NEXT:    retq
+;
+; MEDIUM-PIC-LABEL: load_constant_pool:
+; MEDIUM-PIC:       # %bb.0:
+; MEDIUM-PIC-NEXT:    leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
+; MEDIUM-PIC-NEXT:    movabsq ${{\.LCPI.*}}@GOTOFF, %rcx
+; MEDIUM-PIC-NEXT:    addss (%rax,%rcx), %xmm0
+; MEDIUM-PIC-NEXT:    retq
+;
+; LARGE-PIC-LABEL: load_constant_pool:
+; LARGE-PIC:       # %bb.0:
+; LARGE-PIC-NEXT:  .L11$pb:
+; LARGE-PIC-NEXT:    leaq .L11$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L11$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq ${{\.LCPI.*}}@GOTOFF, %rax
+; LARGE-PIC-NEXT:    addss (%rcx,%rax), %xmm0
+; LARGE-PIC-NEXT:    retq
+  %a = fadd float %x, 1.0
+  ret float %a
+}
+
 attributes #0 = { noinline nounwind uwtable }
 
 !llvm.module.flags = !{!0, !1, !2}

From e13c84c3be589c80edd2391664e136f54f0e3345 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 23 May 2020 18:28:12 -0400
Subject: [PATCH 453/770] GlobalISel: Work on improving stock set of legality
 predicates

I get confused by a lot of the predicate names here, since I would
assume they apply to vectors as well. Rename to reflect they only
apply to scalars.

Also add a few predicates AMDGPU uses that should be generally useful.
Also add any() to complement all. I've wanted to use this a few times
but then worked around it not being there.
---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h   | 36 ++++++++++++++++---
 .../CodeGen/GlobalISel/LegalityPredicates.cpp | 32 ++++++++++++++---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 29 +++------------
 3 files changed, 63 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index f913f5f41b8e8..49bc66a89a219 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -203,6 +203,20 @@ template<typename Predicate, typename... Args>
 Predicate all(Predicate P0, Predicate P1, Args... args) {
   return all(all(P0, P1), args...);
 }
+
+/// True iff at least one of P0 and P1 is true.
+template<typename Predicate>
+Predicate any(Predicate P0, Predicate P1) {
+  return [=](const LegalityQuery &Query) {
+    return P0(Query) || P1(Query);
+  };
+}
+/// True iff any of the given predicates is true.
+template<typename Predicate, typename... Args>
+Predicate any(Predicate P0, Predicate P1, Args... args) {
+  return any(any(P0, P1), args...);
+}
+
 /// True iff the given type index is the specified types.
 LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit);
 /// True iff the given type index is one of the specified types.
@@ -228,13 +242,16 @@ LegalityPredicate isPointer(unsigned TypeIdx);
 /// space.
 LegalityPredicate isPointer(unsigned TypeIdx, unsigned AddrSpace);
 
+/// True iff the type index is a vector with element type \p EltTy.
+LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy);
+
 /// True iff the specified type index is a scalar that's narrower than the given
 /// size.
-LegalityPredicate narrowerThan(unsigned TypeIdx, unsigned Size);
+LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size);
 
 /// True iff the specified type index is a scalar that's wider than the given
 /// size.
-LegalityPredicate widerThan(unsigned TypeIdx, unsigned Size);
+LegalityPredicate scalarWiderThan(unsigned TypeIdx, unsigned Size);
 
 /// True iff the specified type index is a scalar or vector with an element type
 /// that's narrower than the given size.
@@ -257,6 +274,15 @@ LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size);
 
 /// True iff the specified type indices are both the same bit size.
 LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1);
+
+/// True iff the first type index has a larger total bit size than second type
+/// index.
+LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1);
+
+/// True iff the first type index has a smaller total bit size than second type
+/// index.
+LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1);
+
 /// True iff the specified MMO index has a size that is not a power of 2
 LegalityPredicate memSizeInBytesNotPow2(unsigned MMOIdx);
 /// True iff the specified type index is a vector whose element count is not a
@@ -774,7 +800,7 @@ class LegalizeRuleSet {
     using namespace LegalityPredicates;
     using namespace LegalizeMutations;
     return actionIf(LegalizeAction::WidenScalar,
-                    narrowerThan(TypeIdx, Ty.getSizeInBits()),
+                    scalarNarrowerThan(TypeIdx, Ty.getSizeInBits()),
                     changeTo(typeIdx(TypeIdx), Ty));
   }
 
@@ -792,7 +818,7 @@ class LegalizeRuleSet {
     using namespace LegalityPredicates;
     using namespace LegalizeMutations;
     return actionIf(LegalizeAction::NarrowScalar,
-                    widerThan(TypeIdx, Ty.getSizeInBits()),
+                    scalarWiderThan(TypeIdx, Ty.getSizeInBits()),
                     changeTo(typeIdx(TypeIdx), Ty));
   }
 
@@ -806,7 +832,9 @@ class LegalizeRuleSet {
     return actionIf(
         LegalizeAction::NarrowScalar,
         [=](const LegalityQuery &Query) {
-          return widerThan(TypeIdx, Ty.getSizeInBits()) && Predicate(Query);
+          // scalarWiderThan() only builds a predicate; it must be invoked on
+          // Query, otherwise the width check is never actually evaluated.
+          return scalarWiderThan(TypeIdx, Ty.getSizeInBits())(Query) && Predicate(Query);
         },
         changeElementTo(typeIdx(TypeIdx), Ty));
   }
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index b6fb061a8334b..a83742f2138fc 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -80,22 +80,46 @@ LegalityPredicate LegalityPredicates::isPointer(unsigned TypeIdx,
   };
 }
 
-LegalityPredicate LegalityPredicates::narrowerThan(unsigned TypeIdx,
-                                                   unsigned Size) {
+LegalityPredicate LegalityPredicates::elementTypeIs(unsigned TypeIdx,
+                                                    LLT EltTy) {
+  return [=](const LegalityQuery &Query) {
+    const LLT QueryTy = Query.Types[TypeIdx];
+    return QueryTy.isVector() && QueryTy.getElementType() == EltTy;
+  };
+}
+
+LegalityPredicate LegalityPredicates::scalarNarrowerThan(unsigned TypeIdx,
+                                                         unsigned Size) {
   return [=](const LegalityQuery &Query) {
     const LLT QueryTy = Query.Types[TypeIdx];
     return QueryTy.isScalar() && QueryTy.getSizeInBits() < Size;
   };
 }
 
-LegalityPredicate LegalityPredicates::widerThan(unsigned TypeIdx,
-                                                unsigned Size) {
+LegalityPredicate LegalityPredicates::scalarWiderThan(unsigned TypeIdx,
+                                                      unsigned Size) {
   return [=](const LegalityQuery &Query) {
     const LLT QueryTy = Query.Types[TypeIdx];
     return QueryTy.isScalar() && QueryTy.getSizeInBits() > Size;
   };
 }
 
+LegalityPredicate LegalityPredicates::smallerThan(unsigned TypeIdx0,
+                                                  unsigned TypeIdx1) {
+  return [=](const LegalityQuery &Query) {
+    return Query.Types[TypeIdx0].getSizeInBits() <
+           Query.Types[TypeIdx1].getSizeInBits();
+  };
+}
+
+LegalityPredicate LegalityPredicates::largerThan(unsigned TypeIdx0,
+                                                  unsigned TypeIdx1) {
+  return [=](const LegalityQuery &Query) {
+    return Query.Types[TypeIdx0].getSizeInBits() >
+           Query.Types[TypeIdx1].getSizeInBits();
+  };
+}
+
 LegalityPredicate LegalityPredicates::scalarOrEltNarrowerThan(unsigned TypeIdx,
                                                               unsigned Size) {
   return [=](const LegalityQuery &Query) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 74e03e1d99199..2a546433a2454 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -158,13 +158,6 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
   };
 }
 
-static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
-  return [=](const LegalityQuery &Query) {
-    const LLT QueryTy = Query.Types[TypeIdx];
-    return QueryTy.isVector() && QueryTy.getElementType() == Type;
-  };
-}
-
 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT QueryTy = Query.Types[TypeIdx];
@@ -183,20 +176,6 @@ static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
   };
 }
 
-static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
-  return [=](const LegalityQuery &Query) {
-    return Query.Types[TypeIdx0].getSizeInBits() <
-           Query.Types[TypeIdx1].getSizeInBits();
-  };
-}
-
-static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
-  return [=](const LegalityQuery &Query) {
-    return Query.Types[TypeIdx0].getSizeInBits() >
-           Query.Types[TypeIdx1].getSizeInBits();
-  };
-}
-
 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                          const GCNTargetMachine &TM)
   :  ST(ST_) {
@@ -680,7 +659,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     // TODO: Should have same legality without v_perm_b32
     getActionDefinitionsBuilder(G_BSWAP)
       .legalFor({S32})
-      .lowerIf(narrowerThan(0, 32))
+      .lowerIf(scalarNarrowerThan(0, 32))
       // FIXME: Fixing non-power-of-2 before clamp is workaround for
       // narrowScalar limitation.
       .widenScalarToNextPow2(0)
@@ -707,7 +686,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       [](const LegalityQuery &Query) {
         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
       })
-    .narrowScalarIf(greaterThan(1, 0),
+    .narrowScalarIf(largerThan(1, 0),
       [](const LegalityQuery &Query) {
         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
       });
@@ -724,7 +703,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
       })
     .narrowScalarIf(
-      greaterThan(0, 1),
+      largerThan(0, 1),
       [](const LegalityQuery &Query) {
         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
       });
@@ -1238,7 +1217,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         })
       // Try to widen to s16 first for small types.
       // TODO: Only do this on targets with legal s16 shifts
-      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
+      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),

From 38727bab6f1337880861b7ffd5a02fcff581facc Mon Sep 17 00:00:00 2001
From: Valery N Dmitriev <valery.n.dmitriev@intel.com>
Date: Thu, 28 May 2020 08:54:04 -0700
Subject: [PATCH 454/770] [NFC][SLP] Add test case exposing SLP cost model bug.
 The bug is related to aggregate build cost model adjustment that adds a bias
 to cost triggering vectorization of actually unprofitable to vectorize tree.

Differential Revision: https://reviews.llvm.org/D80682
---
 .../SLPVectorizer/X86/vec_list_bias.ll        | 118 ++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
new file mode 100644
index 0000000000000..9ceea2b81ac9f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -S | FileCheck %s
+
+; Check that no vectorization is triggered with any portion of the
+; insertelement <8 x i32> instructions that build the entire vector.
+; Vectorization is triggered by a cost bias caused by subtracting
+; the cost of the entire "aggregate build" sequence while
+; building a vectorizable tree from only a portion of it.
+; FIXME: this is unprofitable to vectorize.
+
+
+define void @test(i32* nocapture %t2) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* [[T2:%.*]], align 4
+; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 7
+; CHECK-NEXT:    [[T5:%.*]] = load i32, i32* [[T4]], align 4
+; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 1
+; CHECK-NEXT:    [[T9:%.*]] = load i32, i32* [[T8]], align 4
+; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 6
+; CHECK-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4
+; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 2
+; CHECK-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4
+; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 5
+; CHECK-NEXT:    [[T17:%.*]] = load i32, i32* [[T16]], align 4
+; CHECK-NEXT:    [[T20:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 3
+; CHECK-NEXT:    [[T21:%.*]] = load i32, i32* [[T20]], align 4
+; CHECK-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T2]], i64 4
+; CHECK-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4
+; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
+; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
+; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
+; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
+; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
+; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
+; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
+; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
+; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
+; CHECK-NEXT:    [[T40:%.*]] = mul nsw i32 [[T39]], 9633
+; CHECK-NEXT:    [[T41:%.*]] = mul nsw i32 [[T25]], 2446
+; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
+; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
+; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[T15]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[T9]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 6270, i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
+; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
+; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
+; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[TMP15]], i32 3
+; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP12]], i32 4
+; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
+; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP15]], i32 7
+; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
+; CHECK-NEXT:    ret void
+;
+  %t3 = load i32, i32* %t2, align 4
+  %t4 = getelementptr inbounds i32, i32* %t2, i64 7
+  %t5 = load i32, i32* %t4, align 4
+  %t8 = getelementptr inbounds i32, i32* %t2, i64 1
+  %t9 = load i32, i32* %t8, align 4
+  %t10 = getelementptr inbounds i32, i32* %t2, i64 6
+  %t11 = load i32, i32* %t10, align 4
+  %t14 = getelementptr inbounds i32, i32* %t2, i64 2
+  %t15 = load i32, i32* %t14, align 4
+  %t16 = getelementptr inbounds i32, i32* %t2, i64 5
+  %t17 = load i32, i32* %t16, align 4
+  %t20 = getelementptr inbounds i32, i32* %t2, i64 3
+  %t21 = load i32, i32* %t20, align 4
+  %t22 = getelementptr inbounds i32, i32* %t2, i64 4
+  %t23 = load i32, i32* %t22, align 4
+  %t24 = add nsw i32 %t23, %t21
+  %t25 = sub nsw i32 %t21, %t23
+  %t27 = sub nsw i32 %t3, %t24
+  %t28 = add nsw i32 %t15, %t9
+  %t29 = sub nsw i32 %t9, %t15
+  %t30 = add nsw i32 %t27, %t29
+  %t31 = mul nsw i32 %t30, 4433
+  %t32 = mul nsw i32 %t27, 6270
+  %t34 = mul nsw i32 %t29, -15137
+  %t37 = add nsw i32 %t25, %t11
+  %t38 = add nsw i32 %t17, %t5
+  %t39 = add nsw i32 %t37, %t38
+  %t40 = mul nsw i32 %t39, 9633
+  %t41 = mul nsw i32 %t25, 2446
+  %t42 = mul nsw i32 %t17, 16819
+  %t47 = mul nsw i32 %t37, -16069
+  %t48 = mul nsw i32 %t38, -3196
+  %t49 = add nsw i32 %t40, %t47
+  %t50 = add nsw i32 %t40, %t48
+  %t65 = insertelement <8 x i32> undef, i32 %t28, i32 0
+  %t66 = insertelement <8 x i32> %t65, i32 %t50, i32 1
+  %t67 = insertelement <8 x i32> %t66, i32 %t32, i32 2
+  %t68 = insertelement <8 x i32> %t67, i32 %t49, i32 3
+  %t69 = insertelement <8 x i32> %t68, i32 %t28, i32 4
+  %t70 = insertelement <8 x i32> %t69, i32 %t50, i32 5
+  %t71 = insertelement <8 x i32> %t70, i32 %t34, i32 6
+  %t72 = insertelement <8 x i32> %t71, i32 %t49, i32 7
+  %t76 = shl <8 x i32> %t72, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %t79 = bitcast i32* %t2 to <8 x i32>*
+  store <8 x i32> %t76, <8 x i32>* %t79, align 4
+  ret void
+}

From b4668a268ddedea76bd406818fd0313b58e6f750 Mon Sep 17 00:00:00 2001
From: Tony <Tony.Tye@amd.com>
Date: Tue, 26 May 2020 23:44:10 -0400
Subject: [PATCH 455/770] [AMDGPU] DWARF Proposal For Heterogeneous Debugging

- Add introduction to DWARF Proposal For Heterogeneous Debugging.

Differential Revision: https://reviews.llvm.org/D70523
---
 ...DwarfProposalForHeterogeneousDebugging.rst | 73 ++++++++++++++++++-
 1 file changed, 72 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.rst
index 41ed0c57e62e7..c6868d675a88e 100644
--- a/llvm/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.rst
+++ b/llvm/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.rst
@@ -1,4 +1,4 @@
-.. _amdgpu-proposal-for-heterogeneous-debugging:
+.. _amdgpu-dwarf-proposal-for-heterogeneous-debugging:
 
 ******************************************
 DWARF Proposal For Heterogeneous Debugging
@@ -13,9 +13,76 @@ DWARF Proposal For Heterogeneous Debugging
    [:ref:`DWARF <amdgpu-dwarf-DWARF>`] to support heterogeneous debugging. It is
    not currently fully implemented and is subject to change.
 
+.. _amdgpu-dwarf-introduction:
+
 Introduction
 ============
 
+AMD [:ref:`AMD <amdgpu-dwarf-AMD>`] has been working on supporting heterogeneous
+computing through the AMD Radeon Open Compute Platform (ROCm) [:ref:`AMD-ROCm
+<amdgpu-dwarf-AMD-ROCm>`]. A heterogeneous computing program can be written in a
+high level language such as C++ or Fortran with OpenMP pragmas, OpenCL, or HIP
+(a portable C++ programming environment for heterogeneous computing [:ref:`HIP
+<amdgpu-dwarf-HIP>`]). A heterogeneous compiler and runtime allows a program to
+execute on multiple devices within the same native process. Devices could
+include CPUs, GPUs, DSPs, FPGAs, or other special purpose accelerators.
+Currently HIP programs execute on systems with CPUs and GPUs.
+
+ROCm is fully open sourced and includes contributions to open source projects
+such as LLVM for compilation [:ref:`LLVM <amdgpu-dwarf-LLVM>`] and GDB for
+debugging [:ref:`GDB <amdgpu-dwarf-GDB>`], as well as collaboration with other
+third party projects such as the GCC compiler [:ref:`GCC <amdgpu-dwarf-GCC>`]
+and the Perforce TotalView HPC debugger [:ref:`Perforce-TotalView
+<amdgpu-dwarf-Perforce-TotalView>`].
+
+To support debugging heterogeneous programs, several features that are not
+provided by current DWARF Version 5 [:ref:`DWARF <amdgpu-dwarf-DWARF>`] have
+been identified. This document contains a collection of proposals to address
+providing those features.
+
+The :ref:`amdgpu-dwarf-motivation` section describes the issues that are being
+addressed for heterogeneous computing. That is followed by the
+:ref:`amdgpu-dwarf-proposed-changes-relative-to-dwarf-version-5` section
+containing the proposed textual changes relative to the DWARF Version 5
+standard. Then there is an :ref:`amdgpu-dwarf-examples` section that links to
+the AMD GPU specific usage of the features in the proposal that includes an
+example. Finally, there is a :ref:`amdgpu-dwarf-references` section. There are a
+number of notes included that raise open questions, or provide alternative
+approaches considered. The draft proposal seeks to be general in nature and
+backwards compatible with DWARF Version 5. Its goal is to be applicable to
+meeting the needs of any heterogeneous system and not be vendor or architecture
+specific.
+
+A fundamental aspect of the draft proposal is that it allows DWARF expression
+location descriptions as stack elements. The draft proposal is based on DWARF
+Version 5 and maintains compatibility with DWARF Version 5. After attempting
+several alternatives, the current thinking is that such an addition to DWARF
+Version 5 is the simplest and cleanest way to support debugging optimized GPU
+code. It also appears to be generally useful and may be able to address other
+reported DWARF issues, as well as being helpful in providing better optimization
+support for non-GPU code.
+
+General feedback on this draft proposal is sought, together with suggestions on
+how to clarify, simplify, or organize it before submitting it as a formal DWARF
+proposal. The current draft proposal is large and may need to be split into
+separate proposals before formal submission. Any suggestions on how best to do
+that are appreciated. However, at the initial review stage it is believed there
+is value in presenting a unified proposal as there are mutual dependencies
+between the various parts that would not be as apparent if it was broken up into
+separate independent proposals.
+
+We are in the process of modifying LLVM and GDB to support this draft proposal,
+which is providing experience and insights. We plan to upstream the changes to
+those projects for any final form of the proposal.
+
+The author very much appreciates the input provided so far by many others which
+has been incorporated into this current version.
+
+.. _amdgpu-dwarf-motivation:
+
+Motivation
+==========
+
 This document proposes a set of backwards compatible extensions to DWARF Version
 5 [:ref:`DWARF <amdgpu-dwarf-DWARF>`] for consideration of inclusion into a
 future DWARF Version 6 standard to support heterogeneous debugging.
@@ -3753,12 +3820,16 @@ debugger information entries.
                                  * ``DW_AT_LLVM_lanes``
    ============================= =============================
 
+.. _amdgpu-dwarf-examples:
+
 Examples
 ========
 
 The AMD GPU specific usage of the features in the proposal, including examples,
 is available at :ref:`amdgpu-dwarf-debug-information`.
 
+.. _amdgpu-dwarf-references:
+
 References
 ==========
 

From 1bc73b02d6934c8586ca5740da416b7094c3c471 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitneyt@ca.ibm.com>
Date: Fri, 29 May 2020 01:11:50 +0000
Subject: [PATCH 456/770] [LoopUnroll] Support loops with exiting block that is
 neither header nor latch.

Summary: Remove the limitation in LoopUnrollPass that exiting block must
be either header or latch.
Reviewer: dmgreen, jdoerfert, Meinersbur, kbarton, bmahjour, etiotto,
fhahn, efriedma
Reviewed By: etiotto, fhahn, efriedma
Subscribers: efriedma, lkail, xbolva00, hiraditya, zzheng, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D80477
---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      | 113 +++++++-----------
 .../Transforms/LoopUnroll/nonlatchcondbr.ll   |  69 +++++++++++
 2 files changed, 109 insertions(+), 73 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index d9323e70bef60..4525681855ce4 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -81,8 +81,8 @@ using namespace llvm;
 // TODO: Should these be here or in LoopUnroll?
 STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
 STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
-STATISTIC(NumUnrolledWithHeader, "Number of loops unrolled without a "
-                                 "conditional latch (completely or otherwise)");
+STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
+                               "latch (completely or otherwise)");
 
 static cl::opt<bool>
 UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
@@ -304,48 +304,30 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     return LoopUnrollResult::Unmodified;
   }
 
-  // The current loop unroll pass can unroll loops with a single latch or header
-  // that's a conditional branch exiting the loop.
+  // The current loop unroll pass can unroll loops that have
+  // (1) single latch; and
+  // (2a) latch is an exiting block; or
+  // (2b) latch is unconditional and there exists a single exiting block.
   // FIXME: The implementation can be extended to work with more complicated
   // cases, e.g. loops with multiple latches.
   BasicBlock *Header = L->getHeader();
-  BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator());
-  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
-
-  // FIXME: Support loops without conditional latch and multiple exiting blocks.
-  if (!BI ||
-      (BI->isUnconditional() && (!HeaderBI || HeaderBI->isUnconditional() ||
-                                 L->getExitingBlock() != Header))) {
+  BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+  // A conditional branch which exits the loop, which can be optimized to an
+  // unconditional branch in the unrolled loop in some cases.
+  BranchInst *ExitingBI = nullptr;
+  bool LatchIsExiting = L->isLoopExiting(LatchBlock);
+  if (LatchIsExiting)
+    ExitingBI = LatchBI;
+  else if (BasicBlock *ExitingBlock = L->getExitingBlock())
+    ExitingBI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+  if (!LatchBI || !ExitingBI) {
     LLVM_DEBUG(dbgs() << "  Can't unroll; loop not terminated by a conditional "
-                         "branch in the latch or header.\n");
-    return LoopUnrollResult::Unmodified;
-  }
-
-  auto CheckLatchSuccessors = [&](unsigned S1, unsigned S2) {
-    return BI->isConditional() && BI->getSuccessor(S1) == Header &&
-           !L->contains(BI->getSuccessor(S2));
-  };
-
-  // If we have a conditional latch, it must exit the loop.
-  if (BI && BI->isConditional() && !CheckLatchSuccessors(0, 1) &&
-      !CheckLatchSuccessors(1, 0)) {
-    LLVM_DEBUG(
-        dbgs() << "Can't unroll; a conditional latch must exit the loop");
-    return LoopUnrollResult::Unmodified;
-  }
-
-  auto CheckHeaderSuccessors = [&](unsigned S1, unsigned S2) {
-    return HeaderBI && HeaderBI->isConditional() &&
-           L->contains(HeaderBI->getSuccessor(S1)) &&
-           !L->contains(HeaderBI->getSuccessor(S2));
-  };
-
-  // If we do not have a conditional latch, the header must exit the loop.
-  if (BI && !BI->isConditional() && HeaderBI && HeaderBI->isConditional() &&
-      !CheckHeaderSuccessors(0, 1) && !CheckHeaderSuccessors(1, 0)) {
-    LLVM_DEBUG(dbgs() << "Can't unroll; conditional header must exit the loop");
+                         "branch in latch or a single exiting block.\n");
     return LoopUnrollResult::Unmodified;
   }
+  LLVM_DEBUG(dbgs() << "  Exiting Block = " << ExitingBI->getParent()->getName()
+                    << "\n");
 
   if (Header->hasAddressTaken()) {
     // The loop-rotate pass can be helpful to avoid this in many cases.
@@ -534,17 +516,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       SE->forgetTopmostLoop(L);
   }
 
-  bool ContinueOnTrue;
-  bool LatchIsExiting = BI->isConditional();
-  BasicBlock *LoopExit = nullptr;
-  if (LatchIsExiting) {
-    ContinueOnTrue = L->contains(BI->getSuccessor(0));
-    LoopExit = BI->getSuccessor(ContinueOnTrue);
-  } else {
-    NumUnrolledWithHeader++;
-    ContinueOnTrue = L->contains(HeaderBI->getSuccessor(0));
-    LoopExit = HeaderBI->getSuccessor(ContinueOnTrue);
-  }
+  if (!LatchIsExiting)
+    ++NumUnrolledNotLatch;
+  bool ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
+  BasicBlock *LoopExit = ExitingBI->getSuccessor(ContinueOnTrue);
 
   // For the first iteration of the loop, we should use the precloned values for
   // PHI nodes.  Insert associations now.
@@ -555,21 +530,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   }
 
   std::vector<BasicBlock *> Headers;
-  std::vector<BasicBlock *> HeaderSucc;
+  std::vector<BasicBlock *> ExitingBlocks;
+  std::vector<BasicBlock *> ExitingSucc;
   std::vector<BasicBlock *> Latches;
   Headers.push_back(Header);
   Latches.push_back(LatchBlock);
-
-  if (!LatchIsExiting) {
-    auto *Term = cast<BranchInst>(Header->getTerminator());
-    if (Term->isUnconditional() || L->contains(Term->getSuccessor(0))) {
-      assert(L->contains(Term->getSuccessor(0)));
-      HeaderSucc.push_back(Term->getSuccessor(0));
-    } else {
-      assert(L->contains(Term->getSuccessor(1)));
-      HeaderSucc.push_back(Term->getSuccessor(1));
-    }
-  }
+  ExitingBlocks.push_back(ExitingBI->getParent());
+  ExitingSucc.push_back(ExitingBI->getSuccessor(!ContinueOnTrue));
 
   // The current on-the-fly SSA update requires blocks to be processed in
   // reverse postorder so that LastValueMap contains the correct value at each
@@ -660,12 +627,12 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       if (*BB == LatchBlock)
         Latches.push_back(New);
 
-      // Keep track of the successor of the new header in the current iteration.
-      for (auto *Pred : predecessors(*BB))
-        if (Pred == Header) {
-          HeaderSucc.push_back(New);
-          break;
-        }
+      // Keep track of the exiting block and its successor block contained in
+      // the loop for the current iteration.
+      if (*BB == ExitingBlocks[0])
+        ExitingBlocks.push_back(New);
+      if (*BB == ExitingSucc[0])
+        ExitingSucc.push_back(New);
 
       NewBlocks.push_back(New);
       UnrolledLoopBlocks.push_back(New);
@@ -784,7 +751,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   if (!LatchIsExiting) {
     // If the latch is not exiting, we may be able to simplify the conditional
     // branches in the unrolled exiting blocks.
-    for (unsigned i = 0, e = Headers.size(); i != e; ++i) {
+    for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
       // The branch destination.
       unsigned j = (i + 1) % e;
       bool NeedConditional = true;
@@ -807,7 +774,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       // already correct.
       if (NeedConditional)
         continue;
-      setDest(Headers[i], HeaderSucc[i], HeaderSucc[i], NeedConditional,
+      setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
               ContinueOnTrue, false);
     }
 
@@ -833,8 +800,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
           ChildrenToUpdate.push_back(ChildBB);
       }
       BasicBlock *NewIDom;
-      BasicBlock *&TermBlock = LatchIsExiting ? LatchBlock : Header;
-      auto &TermBlocks = LatchIsExiting ? Latches : Headers;
+      BasicBlock *&TermBlock = ExitingBlocks[0];
+      auto &TermBlocks = ExitingBlocks;
       if (BB == TermBlock) {
         // The latch is special because we emit unconditional branches in
         // some cases where the original loop contained a conditional branch.
@@ -843,8 +810,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         // must also be a latch.  Specifically, the dominator is the first
         // latch which ends in a conditional branch, or the last latch if
         // there is no such latch.
-        // For loops exiting from the header, we limit the supported loops
-        // to have a single exiting block.
+        // For loops exiting from a non-latch exiting block, we limit the
+        // supported loops to have a single exiting block.
         NewIDom = TermBlocks.back();
         for (BasicBlock *Iter : TermBlocks) {
           Instruction *Term = Iter->getTerminator();
diff --git a/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll b/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
new file mode 100644
index 0000000000000..547b05d1e186d
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-unroll -S | FileCheck %s
+; RUN: opt < %s -passes='require<opt-remark-emit>,unroll' -S | FileCheck %s
+
+define void @foo(i32* noalias %A) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
+; CHECK-NEXT:    br label [[FOR_HEADER:%.*]]
+; CHECK:       for.header:
+; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]]
+; CHECK:       for.body.for.body_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE]])
+; CHECK-NEXT:    br label [[FOR_BODY_1:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body.1:
+; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE_1:%.*]]
+; CHECK:       for.body.for.body_crit_edge.1:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    [[DOTPRE_1:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_1]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_1]])
+; CHECK-NEXT:    br label [[FOR_BODY_2:%.*]]
+; CHECK:       for.body.2:
+; CHECK-NEXT:    br label [[FOR_BODY_FOR_BODY_CRIT_EDGE_2:%.*]]
+; CHECK:       for.body.for.body_crit_edge.2:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
+; CHECK-NEXT:    br label [[FOR_BODY_3:%.*]]
+; CHECK:       for.body.3:
+; CHECK-NEXT:    br i1 false, label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.for.body_crit_edge.3:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %0 = load i32, i32* %A, align 4
+  call void @bar(i32 %0)
+  br label %for.header
+
+for.header:
+  %1 = phi i32 [ %0, %entry ], [ %.pre, %for.body.for.body_crit_edge ]
+  %i = phi i64 [ 0, %entry ], [ %inc, %for.body.for.body_crit_edge ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i
+  call void @bar(i32 %1)
+  br label %for.body
+
+for.body:
+  %inc = add nsw i64 %i, 1
+  %cmp = icmp slt i64 %inc, 4
+  br i1 %cmp, label %for.body.for.body_crit_edge, label %for.end
+
+for.body.for.body_crit_edge:
+  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %A, i64 %inc
+  %.pre = load i32, i32* %arrayidx.phi.trans.insert, align 4
+  br label %for.header
+
+for.end:
+  ret void
+}
+
+declare void @bar(i32)

From 59ba12994c07d03ac3b628c05c45a834774f9b17 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 28 May 2020 19:16:29 -0700
Subject: [PATCH 457/770] [llvm-xray][test] Fix unsupported-elf32.txt after
 D80185

---
 llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt b/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt
index dc025d96956b4..33fd844337135 100644
--- a/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt
+++ b/llvm/test/tools/llvm-xray/X86/unsupported-elf32.txt
@@ -1,3 +1,3 @@
 ; RUN: not llvm-xray extract %S/Inputs/elf32-noxray.bin 2>&1 | FileCheck %s
 ; CHECK: llvm-xray: Cannot extract instrumentation map from '{{.*}}elf32-noxray.bin'.
-; CHECK-NEXT: File format not supported.  Supports: AArch64/ARM/ppc64le/x86-64.
+; CHECK-NEXT: File format not supported (only does ELF and Mach-O little endian 64-bit)

From 3fe6ea4641b20c3406e2ef10c0f3782788585030 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Mon, 25 May 2020 01:34:43 -0700
Subject: [PATCH 458/770] [WebAssembly] Fix a bug in removing unnecessary
 branches

Summary:
One of the things `removeUnnecessaryInstrs()` in CFGStackify does is to
remove an unnecessary unconditional branch before an EH pad. When there
is an unconditional branch right before a catch instruction and it
branches to the end of `end_try` marker, we don't need the branch,
because if there is no exception, the control flow transfers to
that point anyway.
```
bb0:
  try
    ...
    br bb2      <- Not necessary
bb1:
  catch
    ...
bb2:
  end
```

This applies when we have a conditional branch followed by an
unconditional one, in which case we should only remove the unconditional
branch. For example:
```
bb0:
  try
    ...
    br_if someplace_else
    br bb2                 <- Not necessary
bb1:
  catch
    ...
bb2:
  end
```

But `TargetInstrInfo::removeBranch`, which we used, removes all existing
branches when there are multiple ones. This patch fixes it by only
deleting the last (= unconditional) branch manually.

Also fixes some `preds` comments in the test file.

Reviewers: dschuff

Subscribers: sbc100, jgravelle-google, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80572
---
 .../WebAssembly/WebAssemblyCFGStackify.cpp    | 22 ++++++++-
 .../CodeGen/WebAssembly/cfg-stackify-eh.ll    | 49 +++++++++++++++++--
 2 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 23a5aa61daa97..103fe97c6e934 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -670,9 +670,27 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
     MachineBasicBlock *EHPadLayoutPred = MBB.getPrevNode();
     MachineBasicBlock *Cont = BeginToEnd[EHPadToTry[&MBB]]->getParent();
     bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond);
+    // This condition means either
+    // 1. This BB ends with a single unconditional branch whose destination is
+    //    Cont.
+    // 2. This BB ends with a conditional branch followed by an unconditional
+    //    branch, and the unconditional branch's destination is Cont.
+    // In both cases, we want to remove the last (= unconditional) branch.
     if (Analyzable && ((Cond.empty() && TBB && TBB == Cont) ||
-                       (!Cond.empty() && FBB && FBB == Cont)))
-      TII.removeBranch(*EHPadLayoutPred);
+                       (!Cond.empty() && FBB && FBB == Cont))) {
+      bool ErasedUncondBr = false;
+      for (auto I = EHPadLayoutPred->end(), E = EHPadLayoutPred->begin();
+           I != E; --I) {
+        auto PrevI = std::prev(I);
+        if (PrevI->isTerminator()) {
+          assert(PrevI->getOpcode() == WebAssembly::BR);
+          PrevI->eraseFromParent();
+          ErasedUncondBr = true;
+          break;
+        }
+      }
+      assert(ErasedUncondBr && "Unconditional branch not erased!");
+    }
   }
 
   // When there are block / end_block markers that overlap with try / end_try
diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
index 188ad22c89fcc..fcc30466594e6 100644
--- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
+++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling | FileCheck %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling
 ; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -exception-model=wasm -mattr=+exception-handling | FileCheck %s --check-prefix=NOOPT
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT-STAT
@@ -856,11 +857,11 @@ define void @test16(i32* %p, i32 %a, i32 %b) personality i8* bitcast (i32 (...)*
 entry:
   br label %loop
 
-loop:
+loop:                                             ; preds = %try.cont, %entry
   invoke void @foo()
           to label %bb0 unwind label %catch.dispatch0
 
-bb0:
+bb0:                                              ; preds = %loop
   %cmp = icmp ne i32 %a, %b
   br i1 %cmp, label %bb1, label %last
 
@@ -886,10 +887,50 @@ catch.start1:                                     ; preds = %catch.dispatch1
   %7 = call i32 @llvm.wasm.get.ehselector(token %5)
   catchret from %5 to label %try.cont
 
-try.cont:                                         ; preds = %catch.start, %loop
+try.cont:                                         ; preds = %catch.start1, %catch.start0, %bb1
   br label %loop
 
-last:
+last:                                             ; preds = %bb0
+  ret void
+}
+
+; Tests if CFGStackify's removeUnnecessaryInstrs() removes unnecessary branches
+; correctly.
+; CHECK-LABEL: test17
+define void @test17(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @foo()
+          to label %for.body unwind label %catch.dispatch
+
+for.body:                                         ; preds = %for.end, %entry
+  %i = phi i32 [ %inc, %for.end ], [ 0, %entry ]
+  invoke void @foo()
+          to label %for.end unwind label %catch.dispatch
+
+; Before going to CFGStackify, this BB will have a conditional branch followed
+; by an unconditional branch. CFGStackify should remove only the unconditional
+; one.
+for.end:                                          ; preds = %for.body
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %try.cont, label %for.body
+; CHECK: br_if
+; CHECK-NOT: br
+; CHECK: end_loop
+; CHECK: catch
+
+catch.dispatch:                                   ; preds = %for.body, %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i8* @__cxa_begin_catch(i8* %2) #2 [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %catch.start, %for.end
   ret void
 }
 

From 4cd3f4b31b0bd19f3b63f53888a5a2afea68e109 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Mon, 25 May 2020 01:35:35 -0700
Subject: [PATCH 459/770] [WebAssembly] Fix a bug in finding matching EH pad

Summary:
`getMatchingEHPad()` in LateEHPrepare is a function to find the nearest
EH pad that dominates the given instruction. This intends to be
lightweight so it does not use full WebAssemblyException scope analysis
or dominator analysis. It simply does backward BFS to its predecessors
and stops at the first EH pad each search path encounters. All search
paths should end up at the same EH pad, and if not, it returns null.

But it didn't take into account that when there are inner scopes within
the current scope, some path in BFS can hit an inner EH pad first. For
example, in the given diagram, `Inst` belongs to the outer scope and
`getMatchingEHPad()` should return 'EHPad 1', but some search path can go
into the inner scope and end up with 'EHPad 2'. The search will return
null because different paths end up with different EH pads.
```
--- EHPad 1 ---
| - EHPad 2 - |
| |         | |
| ----------- |
|   Inst      |
---------------
```

So far this was OK because we haven't tested a case in which a given
instruction is far from its EH pad. Also, this bug does not happen when
the inner EH scope is a cleanup scope, because a cleanup scope ends with
a `cleanupret` whose successor is an EH pad, so the search encounters
that EH pad first before going into the child scope. But this can happen
when the child scope is a catch scope that ends with `catchret`. So this
patch, when doing backward BFS, does not search predecessors that end
with `catchret`. Because `catchret`s are replaced with `br`s during this
pass, this records BBs that have `catchret`s in the beginning, before
doing any other transformations.

Reviewers: dschuff

Subscribers: sbc100, jgravelle-google, hiraditya, sunfish, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80571
---
 .../WebAssembly/WebAssemblyLateEHPrepare.cpp  | 27 +++++++++++-
 llvm/test/CodeGen/WebAssembly/exception.ll    | 44 ++++++++++++++++---
 2 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index 54115849df182..2280ec38e5b90 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -32,12 +32,16 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+  void recordCatchRetBBs(MachineFunction &MF);
   bool addCatches(MachineFunction &MF);
   bool replaceFuncletReturns(MachineFunction &MF);
   bool removeUnnecessaryUnreachables(MachineFunction &MF);
   bool addExceptionExtraction(MachineFunction &MF);
   bool restoreStackPointer(MachineFunction &MF);
 
+  MachineBasicBlock *getMatchingEHPad(MachineInstr *MI);
+  SmallSet<MachineBasicBlock *, 8> CatchRetBBs;
+
 public:
   static char ID; // Pass identification, replacement for typeid
   WebAssemblyLateEHPrepare() : MachineFunctionPass(ID) {}
@@ -58,7 +62,8 @@ FunctionPass *llvm::createWebAssemblyLateEHPrepare() {
 // possible search paths should be the same.
 // Returns nullptr in case it does not find any EH pad in the search, or finds
 // multiple different EH pads.
-static MachineBasicBlock *getMatchingEHPad(MachineInstr *MI) {
+MachineBasicBlock *
+WebAssemblyLateEHPrepare::getMatchingEHPad(MachineInstr *MI) {
   MachineFunction *MF = MI->getParent()->getParent();
   SmallVector<MachineBasicBlock *, 2> WL;
   SmallPtrSet<MachineBasicBlock *, 2> Visited;
@@ -77,7 +82,9 @@ static MachineBasicBlock *getMatchingEHPad(MachineInstr *MI) {
     }
     if (MBB == &MF->front())
       return nullptr;
-    WL.append(MBB->pred_begin(), MBB->pred_end());
+    for (auto *Pred : MBB->predecessors())
+      if (!CatchRetBBs.count(Pred)) // We don't go into child scopes
+        WL.push_back(Pred);
   }
   return EHPad;
 }
@@ -111,6 +118,7 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
 
   bool Changed = false;
   if (MF.getFunction().hasPersonalityFn()) {
+    recordCatchRetBBs(MF);
     Changed |= addCatches(MF);
     Changed |= replaceFuncletReturns(MF);
   }
@@ -122,6 +130,21 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
   return Changed;
 }
 
+// Record which BB ends with 'CATCHRET' instruction, because this will be
+// replaced with BRs later. This set of 'CATCHRET' BBs is necessary in
+// 'getMatchingEHPad' function.
+void WebAssemblyLateEHPrepare::recordCatchRetBBs(MachineFunction &MF) {
+  CatchRetBBs.clear();
+  for (auto &MBB : MF) {
+    auto Pos = MBB.getFirstTerminator();
+    if (Pos == MBB.end())
+      continue;
+    MachineInstr *TI = &*Pos;
+    if (TI->getOpcode() == WebAssembly::CATCHRET)
+      CatchRetBBs.insert(&MBB);
+  }
+}
+
 // Add catch instruction to beginning of catchpads and cleanuppads.
 bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) {
   bool Changed = false;
diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception.ll
index bfd4e1720e281..d6a6360198386 100644
--- a/llvm/test/CodeGen/WebAssembly/exception.ll
+++ b/llvm/test/CodeGen/WebAssembly/exception.ll
@@ -74,7 +74,7 @@ rethrow:                                          ; preds = %catch.start
   call void @llvm.wasm.rethrow.in.catch() [ "funclet"(token %1) ]
   unreachable
 
-try.cont:                                         ; preds = %entry, %catch
+try.cont:                                         ; preds = %catch, %entry
   ret void
 }
 
@@ -169,7 +169,7 @@ invoke.cont1:                                     ; preds = %catch.start
   call void @__cxa_end_catch() [ "funclet"(token %1) ]
   catchret from %1 to label %try.cont
 
-try.cont:                                         ; preds = %entry, %invoke.cont1
+try.cont:                                         ; preds = %invoke.cont1, %entry
   ret void
 
 ehcleanup:                                        ; preds = %catch.start
@@ -262,7 +262,7 @@ rethrow:                                          ; preds = %catch.start
   call void @llvm.wasm.rethrow.in.catch() [ "funclet"(token %1) ]
   unreachable
 
-try.cont:                                         ; preds = %entry, %invoke.cont1
+try.cont:                                         ; preds = %invoke.cont1, %entry
   ret void
 
 ehcleanup:                                        ; preds = %catch
@@ -303,11 +303,11 @@ catch.start:                                      ; preds = %catch.dispatch
   %1 = catchpad within %0 [i8* null]
   %2 = call i8* @llvm.wasm.get.exception(token %1)
   %3 = call i32 @llvm.wasm.get.ehselector(token %1)
-  %4 = call i8* @__cxa_begin_catch(i8* %2) #2 [ "funclet"(token %1) ]
+  %4 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
   call void @__cxa_end_catch() [ "funclet"(token %1) ]
   catchret from %1 to label %try.cont
 
-try.cont:                                         ; preds = %entry, %catch.start
+try.cont:                                         ; preds = %catch.start, %entry
   ret void
 }
 
@@ -327,8 +327,40 @@ catch.start:                                      ; preds = %catch.dispatch
   %3 = call i32 @llvm.wasm.get.ehselector(token %1)
   catchret from %1 to label %try.cont
 
-try.cont:                                         ; preds = %entry, %catch.start
+try.cont:                                         ; preds = %catch.start, %entry
+  ret void
+}
+
+; Tests a case when a cleanup region (cleanuppad ~ cleanupret) contains another
+; catchpad
+define void @test_complex_cleanup_region() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @foo()
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
   ret void
+
+ehcleanup:                                        ; preds = %entry
+  %0 = cleanuppad within none []
+  invoke void @foo() [ "funclet"(token %0) ]
+          to label %ehcleanupret unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %ehcleanup
+  %1 = catchswitch within %0 [label %catch.start] unwind label %ehcleanup.1
+
+catch.start:                                      ; preds = %catch.dispatch
+  %2 = catchpad within %1 [i8* null]
+  %3 = call i8* @llvm.wasm.get.exception(token %2)
+  %4 = call i32 @llvm.wasm.get.ehselector(token %2)
+  catchret from %2 to label %ehcleanupret
+
+ehcleanup.1:                                      ; preds = %catch.dispatch
+  %5 = cleanuppad within %0 []
+  unreachable
+
+ehcleanupret:                                     ; preds = %catch.start, %ehcleanup
+  cleanupret from %0 unwind to caller
 }
 
 declare void @foo()

From 373e98a3a5f882661acf67c1f99e89b838aa2ad2 Mon Sep 17 00:00:00 2001
From: Xing GUO <higuoxing@gmail.com>
Date: Fri, 29 May 2020 10:51:37 +0800
Subject: [PATCH 460/770] [ObjectYAML][DWARF] Add DWARF entry in ELFYAML.

This patch adds a new DWARF entry in ELF YAML file.

Reviewed By: grimar

Differential Revision: https://reviews.llvm.org/D80203
---
 llvm/include/llvm/ObjectYAML/DWARFYAML.h      |   3 +
 llvm/include/llvm/ObjectYAML/ELFYAML.h        |   2 +
 llvm/lib/ObjectYAML/DWARFYAML.cpp             |   7 +
 llvm/lib/ObjectYAML/ELFEmitter.cpp            |  82 +++++-
 llvm/lib/ObjectYAML/ELFYAML.cpp               |   1 +
 .../tools/yaml2obj/ELF/DWARF/debug-str.yaml   | 238 ++++++++++++++++++
 6 files changed, 332 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/yaml2obj/ELF/DWARF/debug-str.yaml

diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
index 11b41e13b8e24..806dd13715e51 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_OBJECTYAML_DWARFYAML_H
 #define LLVM_OBJECTYAML_DWARFYAML_H
 
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Support/YAMLTraits.h"
@@ -171,6 +172,8 @@ struct Data {
   std::vector<LineTable> DebugLines;
 
   bool isEmpty() const;
+
+  SetVector<StringRef> getUsedSectionNames() const;
 };
 
 } // end namespace DWARFYAML
diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index 5d3384925631f..5b96283b786e5 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -16,6 +16,7 @@
 #define LLVM_OBJECTYAML_ELFYAML_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
 #include "llvm/ObjectYAML/YAML.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
@@ -529,6 +530,7 @@ struct Object {
   // being a single SHT_SYMTAB section are upheld.
   Optional<std::vector<Symbol>> Symbols;
   Optional<std::vector<Symbol>> DynamicSymbols;
+  Optional<DWARFYAML::Data> DWARF;
 
   std::vector<Section *> getSections() {
     std::vector<Section *> Ret;
diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp
index 7ba73783cf63b..4805f727e0ce9 100644
--- a/llvm/lib/ObjectYAML/DWARFYAML.cpp
+++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp
@@ -23,6 +23,13 @@ bool DWARFYAML::Data::isEmpty() const {
          DebugLines.empty();
 }
 
+SetVector<StringRef> DWARFYAML::Data::getUsedSectionNames() const {
+  SetVector<StringRef> SecNames;
+  if (!DebugStrings.empty())
+    SecNames.insert("debug_str");
+  return SecNames;
+}
+
 namespace yaml {
 
 void MappingTraits<DWARFYAML::Data>::mapping(IO &IO, DWARFYAML::Data &DWARF) {
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index 2b7bad674fa46..254cbef2f60ac 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -13,10 +13,13 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ObjectYAML/DWARFEmitter.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
 #include "llvm/ObjectYAML/ELFYAML.h"
 #include "llvm/ObjectYAML/yaml2obj.h"
 #include "llvm/Support/EndianStream.h"
@@ -149,6 +152,9 @@ template <class ELFT> class ELFState {
                                StringTableBuilder &STB,
                                ContiguousBlobAccumulator &CBA,
                                ELFYAML::Section *YAMLSec);
+  void initDWARFSectionHeader(Elf_Shdr &SHeader, StringRef Name,
+                              ContiguousBlobAccumulator &CBA,
+                              ELFYAML::Section *YAMLSec);
   void setProgramHeaderLayout(std::vector<Elf_Phdr> &PHeaders,
                               std::vector<Elf_Shdr> &SHeaders);
 
@@ -274,6 +280,11 @@ ELFState<ELFT>::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH)
     ImplicitSections.insert(ImplicitSections.end(), {".dynsym", ".dynstr"});
   if (Doc.Symbols)
     ImplicitSections.push_back(".symtab");
+  if (Doc.DWARF)
+    for (StringRef DebugSecName : Doc.DWARF->getUsedSectionNames()) {
+      std::string SecName = ("." + DebugSecName).str();
+      ImplicitSections.push_back(StringRef(SecName).copy(StringAlloc));
+    }
   ImplicitSections.insert(ImplicitSections.end(), {".strtab", ".shstrtab"});
 
   // Insert placeholders for implicit sections that are not
@@ -447,7 +458,13 @@ bool ELFState<ELFT>::initImplicitHeader(ContiguousBlobAccumulator &CBA,
     initSymtabSectionHeader(Header, SymtabType::Dynamic, CBA, YAMLSec);
   else if (SecName == ".dynstr")
     initStrtabSectionHeader(Header, SecName, DotDynstr, CBA, YAMLSec);
-  else
+  else if (SecName.startswith(".debug_")) {
+    // If a ".debug_*" section's type is a preserved one, e.g., SHT_DYNAMIC, we
+    // will not treat it as a debug section.
+    if (YAMLSec && !isa<ELFYAML::RawContentSection>(YAMLSec))
+      return false;
+    initDWARFSectionHeader(Header, SecName, CBA, YAMLSec);
+  } else
     return false;
 
   LocationCounter += Header.sh_size;
@@ -791,6 +808,69 @@ void ELFState<ELFT>::initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name,
   assignSectionAddress(SHeader, YAMLSec);
 }
 
+static bool shouldEmitDWARF(DWARFYAML::Data &DWARF, StringRef Name) {
+  SetVector<StringRef> DebugSecNames = DWARF.getUsedSectionNames();
+  return Name.consume_front(".") && DebugSecNames.count(Name);
+}
+
+template <class ELFT>
+uint64_t emitDWARF(typename ELFT::Shdr &SHeader, StringRef Name,
+                   const DWARFYAML::Data &DWARF, raw_ostream &OS) {
+  uint64_t BeginOffset = OS.tell();
+  if (Name == ".debug_str")
+    DWARFYAML::EmitDebugStr(OS, DWARF);
+  else
+    llvm_unreachable("unexpected emitDWARF() call");
+
+  return OS.tell() - BeginOffset;
+}
+
+template <class ELFT>
+void ELFState<ELFT>::initDWARFSectionHeader(Elf_Shdr &SHeader, StringRef Name,
+                                            ContiguousBlobAccumulator &CBA,
+                                            ELFYAML::Section *YAMLSec) {
+  zero(SHeader);
+  SHeader.sh_name = DotShStrtab.getOffset(ELFYAML::dropUniqueSuffix(Name));
+  SHeader.sh_type = YAMLSec ? YAMLSec->Type : ELF::SHT_PROGBITS;
+  SHeader.sh_addralign = YAMLSec ? (uint64_t)YAMLSec->AddressAlign : 1;
+  SHeader.sh_offset = alignToOffset(CBA, SHeader.sh_addralign,
+                                    YAMLSec ? YAMLSec->Offset : None);
+
+  ELFYAML::RawContentSection *RawSec =
+      dyn_cast_or_null<ELFYAML::RawContentSection>(YAMLSec);
+  if (Doc.DWARF && shouldEmitDWARF(*Doc.DWARF, Name)) {
+    if (RawSec && (RawSec->Content || RawSec->Size))
+      reportError("cannot specify section '" + Name +
+                  "' contents in the 'DWARF' entry and the 'Content' "
+                  "or 'Size' in the 'Sections' entry at the same time");
+    else
+      SHeader.sh_size = emitDWARF<ELFT>(SHeader, Name, *Doc.DWARF, CBA.getOS());
+  } else if (RawSec)
+    SHeader.sh_size = writeContent(CBA.getOS(), RawSec->Content, RawSec->Size);
+  else
+    llvm_unreachable("debug sections can only be initialized via the 'DWARF' "
+                     "entry or a RawContentSection");
+
+  if (YAMLSec && YAMLSec->EntSize)
+    SHeader.sh_entsize = *YAMLSec->EntSize;
+  else if (Name == ".debug_str")
+    SHeader.sh_entsize = 1;
+
+  if (RawSec && RawSec->Info)
+    SHeader.sh_info = *RawSec->Info;
+
+  if (YAMLSec && YAMLSec->Flags)
+    SHeader.sh_flags = *YAMLSec->Flags;
+  else if (Name == ".debug_str")
+    SHeader.sh_flags = ELF::SHF_MERGE | ELF::SHF_STRINGS;
+
+  unsigned Link = 0;
+  if (YAMLSec && !YAMLSec->Link.empty() && SN2I.lookup(YAMLSec->Link, Link))
+    SHeader.sh_link = Link;
+
+  assignSectionAddress(SHeader, YAMLSec);
+}
+
 template <class ELFT> void ELFState<ELFT>::reportError(const Twine &Msg) {
   ErrHandler(Msg);
   HasError = true;
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index d3e4d2ee3bd85..250f97d7bac5e 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -1653,6 +1653,7 @@ void MappingTraits<ELFYAML::Object>::mapping(IO &IO, ELFYAML::Object &Object) {
   IO.mapOptional("Sections", Object.Chunks);
   IO.mapOptional("Symbols", Object.Symbols);
   IO.mapOptional("DynamicSymbols", Object.DynamicSymbols);
+  IO.mapOptional("DWARF", Object.DWARF);
   IO.setContext(nullptr);
 }
 
diff --git a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-str.yaml b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-str.yaml
new file mode 100644
index 0000000000000..62d86639ad86b
--- /dev/null
+++ b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-str.yaml
@@ -0,0 +1,238 @@
+## Test that yaml2obj emits .debug_str section.
+
+## a) Generate the .debug_str section from the "DWARF" entry.
+
+# RUN: yaml2obj --docnum=1 %s -o %t1.o
+# RUN: llvm-readelf --string-dump=.debug_str %t1.o | FileCheck %s --check-prefix=DWARF-DEFAULT
+
+#      DWARF-DEFAULT: String dump of section '.debug_str':
+# DWARF-DEFAULT-NEXT: [     0] a
+# DWARF-DEFAULT-NEXT: [     2] b
+# DWARF-DEFAULT-NEXT: [     4] c
+
+## Check the default sh_type, sh_entsize, sh_info, sh_flags and sh_addralign of the
+## .debug_str section header.
+
+# RUN: llvm-readelf -S %t1.o | FileCheck %s --check-prefix=SHDRS-DEFAULT
+
+#                Name       Type     Address          Offset Size   ES Flg Lk Inf Al
+# SHDRS-DEFAULT: .debug_str PROGBITS 0000000000000000 000040 000006 01 MS  0  0   1
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+DWARF:
+  debug_str:
+    - a
+    - b
+    - c
+
+## b) Generate the .debug_str section from the raw section content.
+
+# RUN: yaml2obj --docnum=2 %s -o %t2.o
+# RUN: llvm-readelf --string-dump=.debug_str %t2.o | FileCheck %s --check-prefix=DWARF-DEFAULT
+# RUN: llvm-readelf -S %t2.o | FileCheck %s --check-prefix=SHDRS
+
+#        Name       Type     Address          Offset Size   ES Flg Lk Inf Al
+# SHDRS: .debug_str PROGBITS 0000000000000000 000040 000006 01 MS  0  0   0
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:    .debug_str
+    Type:    SHT_PROGBITS
+    Content: "610062006300"
+
+## c) Generate the .debug_str section when the "Size" is specified.
+
+# RUN: yaml2obj --docnum=3 %s -o %t3.o
+# RUN: llvm-readelf -S %t3.o | FileCheck %s --check-prefix=SIZE
+# RUN: llvm-readelf --hex-dump=.debug_str %t3.o | FileCheck %s --check-prefix=SIZE-CONTENT
+
+#       Name       Type     Address          Offset Size   ES Flg Lk Inf Al
+# SIZE: .debug_str PROGBITS 0000000000000000 000040 000010 01 MS  0  0   0
+
+#       SIZE-CONTENT: Hex dump of section '.debug_str':
+#  SIZE-CONTENT-NEXT: 0x00000000 00000000 00000000 00000000 00000000 ................
+# SIZE-CONTENT-EMPTY:
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:  .debug_str
+    Type:  SHT_PROGBITS
+    Size:  0x10
+
+## d) Test that yaml2obj emits an error message when both the "Size" and the
+## "debug_str" entry are specified at the same time.
+
+# RUN: not yaml2obj --docnum=4 %s -o %t4.o 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR: yaml2obj: error: cannot specify section '.debug_str' contents in the 'DWARF' entry and the 'Content' or 'Size' in the 'Sections' entry at the same time
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:  .debug_str
+    Type:  SHT_PROGBITS
+    Size:  0x10
+DWARF:
+  debug_str:
+    - a
+
+## e) Test that yaml2obj emits an error message when both the "Content" and the
+## "debug_str" entry are specified at the same time.
+
+# RUN: not yaml2obj --docnum=5 %s -o %t5.o 2>&1 | FileCheck %s --check-prefix=ERROR
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:    .debug_str
+    Type:    SHT_PROGBITS
+    Content: "6100"
+DWARF:
+  debug_str:
+    - a
+
+## f) Test that all the properties can be overridden by the section header when
+## the "debug_str" entry doesn't exist.
+
+# RUN: yaml2obj --docnum=6 %s -o %t6.o
+# RUN: llvm-readelf -S %t6.o | FileCheck %s --check-prefix=OVERRIDDEN
+
+#             Index Name       Type   Address          Offset Size   ES Flg Lk Inf Al
+# OVERRIDDEN: [ 1]  .sec       STRTAB 0000000000000000 000040 000000 00     0  0   0
+# OVERRIDDEN: [ 2]  .debug_str STRTAB 0000000000002020 000050 000006 02 A   1  1   2
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:         .sec               # Linked by .debug_str.
+    Type:         SHT_STRTAB
+  - Name:         .debug_str
+    Type:         SHT_STRTAB         # SHT_PROGBITS by default.
+    Flags:        [SHF_ALLOC]        # [SHF_STRINGS, SHF_MERGE] by default.
+    Link:         .sec               # 0 by default.
+    EntSize:      2                  # 1 by default.
+    Info:         1                  # 0 by default.
+    AddressAlign: 2                  # 0 by default.
+    Address:      0x0000000000002020 # 0x00 by default.
+    Offset:       0x00000050         # 0x40 for the first section.
+    Size:         6                  # Set the "Size" so that we can reuse the check tag "OVERRIDDEN"
+
+## g) Test that all the properties can be overridden by the section header when
+## the "debug_str" entry is used.
+
+# RUN: yaml2obj --docnum=7 %s -o %t7.o
+# RUN: llvm-readelf --string-dump=.debug_str %t7.o | FileCheck %s --check-prefix=DWARF-DEFAULT
+# RUN: llvm-readelf -S %t7.o | FileCheck %s --check-prefix=OVERRIDDEN
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name:         .sec               # Linked by .debug_str.
+    Type:         SHT_STRTAB
+  - Name:         .debug_str
+    Type:         SHT_STRTAB         # SHT_PROGBITS by default.
+    Flags:        [SHF_ALLOC]        # [SHF_STRINGS, SHF_MERGE] by default.
+    Link:         .sec               # 0 by default.
+    EntSize:      2                  # 1 by default.
+    Info:         1                  # 0 by default.
+    AddressAlign: 2                  # 1 by default.
+    Address:      0x0000000000002020 # 0x00 by default.
+    Offset:       0x00000050         # 0x40 for the first section.
+DWARF:
+  debug_str:
+    - a
+    - b
+    - c
+
+## h) Test that yaml2obj will not generate the .debug_str section when the "DWARF" entry exists
+## but the "debug_str" entry doesn't exist in the "DWARF" entry or the "Sections" entry.
+
+# RUN: yaml2obj --docnum=8 %s -o %t8.o
+# RUN: llvm-readelf -S %t8.o | FileCheck /dev/null --implicit-check-not=.debug_str
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+DWARF:
+
+## i) Test that if we try to initialize the .debug_str section with a preserved "Type", e.g.,
+## SHT_DYNAMIC, yaml2obj will treat it as a dynamic section and discard the content in the
+## "DWARF" entry.
+
+# RUN: yaml2obj --docnum=9 %s -o %t9.o
+# RUN: llvm-readelf -S %t9.o | FileCheck %s --check-prefix=DYN-SHDR
+# RUN: llvm-readelf -d %t9.o | FileCheck %s --check-prefix=DYNAMIC
+
+# RUN: yaml2obj --docnum=10 %s -o %t10.o
+# RUN: llvm-readelf -S %t10.o | FileCheck %s --check-prefix=DYN-SHDR
+# RUN: llvm-readelf -d %t10.o | FileCheck %s --check-prefix=DYNAMIC
+
+#           Name       Type    Address          Offset Size   ES Flg Lk Inf Al
+# DYN-SHDR: .debug_str DYNAMIC 0000000000000000 000040 000010 10      0   0  0
+
+#      DYNAMIC: Dynamic section at offset 0x40 contains 1 entries:
+# DYNAMIC-NEXT:  Tag                Type   Name/Value
+# DYNAMIC-NEXT:  0x0000000000000000 (NULL) 0x0
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name: .debug_str
+    Type: SHT_DYNAMIC
+    Entries:
+      - Tag:   DT_NULL
+        Value: 0
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name: .debug_str
+    Type: SHT_DYNAMIC
+    Entries:
+      - Tag:   DT_NULL
+        Value: 0
+DWARF:
+  debug_str:
+    - a

From 66e6b9afa833426032b1450cb707e6ad892aba00 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Thu, 28 May 2020 20:03:20 -0700
Subject: [PATCH 461/770] [Tests] Migrate more statepoint lowering tests to use
 operand bundles

Only 2 tests left after this.  They just happen to be the most annoying.
---
 .../statepoint-gctransition-call-lowering.ll  | 17 +----
 .../CodeGen/X86/statepoint-live-in-remat.ll   |  2 +-
 llvm/test/CodeGen/X86/statepoint-live-in.ll   |  2 +-
 llvm/test/CodeGen/X86/statepoint-regs.ll      | 62 +++++++------------
 .../CodeGen/X86/statepoint-stack-usage.ll     | 56 ++++++++---------
 llvm/test/CodeGen/X86/statepoint-vector.ll    |  2 +-
 6 files changed, 55 insertions(+), 86 deletions(-)

diff --git a/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll b/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
index c98badf682686..b1b266c5699f2 100644
--- a/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
+++ b/llvm/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
@@ -97,7 +97,7 @@ define i32 @test_transition_args() gc "statepoint-example" {
 ; CHECK: retq
 entry:
   %val = alloca i32
-  %safepoint_token = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* @return_i32, i32 0, i32 1, i32 2, i32* %val, i64 42, i32 0)
+  %safepoint_token = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* @return_i32, i32 0, i32 1, i32 0, i32 0) ["gc-transition" (i32* %val, i64 42)]
   %call1 = call i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
   ret i32 %call1
 }
@@ -108,21 +108,6 @@ define i32 @test_transition_args_2() gc "statepoint-example" {
 ; CHECK: callq return_i32
 ; CHECK: popq %rcx
 ; CHECK: retq
-entry:
-  %val = alloca i32
-  %arg = alloca i8
-  %safepoint_token = call token (i64, i32, i32 (i32, i8*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32i32p0i8f(i64 0, i32 0, i32 (i32, i8*)* @return_i32_with_args, i32 2, i32 1, i32 0, i8* %arg, i32 2, i32* %val, i64 42, i32 0)
-  %call1 = call i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
-  ret i32 %call1
-}
-
-; Same as test_transition_args_2 except using bundle format
-define i32 @test_bundle() gc "statepoint-example" {
-; CHECK-LABEL: test_bundle
-; CHECK: pushq %rax
-; CHECK: callq return_i32
-; CHECK: popq %rcx
-; CHECK: retq
 entry:
   %val = alloca i32
   %arg = alloca i8
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll b/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll
index c2028650e092c..4b09ad2dfe622 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll
@@ -123,7 +123,7 @@ entry:
   %x64 = zext i32 %x to i64
   %y64 = zext i32 %y to i64
   %z64 = zext i32 %z to i64
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt" (i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)]
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index acc12f8e6283d..c736394f662b7 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -17,7 +17,7 @@ define void @test1(i32 %a) gc "statepoint-example" {
 ; CHECK-NEXT:    retq
 entry:
 ; We expect the argument to be passed in an extra register to bar
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a)]
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
index b137b18e88e32..af237cdb3345f 100644
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -18,7 +18,7 @@ define void @test1(i32 %a) gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a)]
   ret void
 }
 
@@ -45,8 +45,8 @@ define void @test2(i32 %a, i32 %b) gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
 entry:
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 2, i32 %a, i32 %b)
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 2, i32 %b, i32 %a)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b)]
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %b, i32 %a)]
   ret void
 }
 
@@ -67,7 +67,7 @@ define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    addq $24, %rsp
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)]
   ret void
 }
 
@@ -91,7 +91,7 @@ define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    addq $24, %rsp
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
   ret void
 }
 
@@ -115,10 +115,10 @@ define  i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-ex
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
 entry:
-  %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p, i32 addrspace(1)* %p)
-  %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token,  i32 9, i32 9)
-  %token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p2, i32 addrspace(1)* %p2)
-  %p3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token2,  i32 9, i32 9)
+  %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %p, i32 addrspace(1)* %p) ["deopt"(i32 %a)]
+  %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token,  i32 8, i32 8)
+  %token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %p2, i32 addrspace(1)* %p2) ["deopt"(i32 %a)]
+  %p3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token2,  i32 8, i32 8)
   ret i32 addrspace(1)* %p3
 }
 
@@ -137,8 +137,8 @@ define void @test6(i32 %a) gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
 entry:
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a)]
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a)]
   ret void
 }
 
@@ -250,7 +250,7 @@ entry:
   %x64 = zext i32 %x to i64
   %y64 = zext i32 %y to i64
   %z64 = zext i32 %z to i64
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)]
   ret void
 }
 
@@ -354,7 +354,7 @@ entry:
   %x64 = zext i32 %x to i64
   %y64 = zext i32 %y to i64
   %z64 = zext i32 %z to i64
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i8 %a8, i8 %b8, i8 %c8, i8 %d8, i16 %e16, i16 %f16, i16 %g16, i16 %h16, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i8 %a8, i8 %b8, i8 %c8, i8 %d8, i16 %e16, i16 %f16, i16 %g16, i16 %h16, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)]
   ret void
 }
 
@@ -377,7 +377,7 @@ define void @test9(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    retq
 
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
   ret void
 }
 
@@ -429,8 +429,8 @@ define void @test10(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32
 ; CHECK-NEXT:    retq
 
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
-  %statepoint_token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
+  %statepoint_token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
   ret void
 }
 
@@ -591,7 +591,7 @@ entry:
   %x64 = zext i32 %x to i64
   %y64 = zext i32 %y to i64
   %z64 = zext i32 %z to i64
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 26, i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 0) ["deopt" (i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)]
   %addab = add i64 %a64, %b64
   %addc = add i64 %addab, %c64
   %addd = add i64 %addc, %d64
@@ -633,7 +633,7 @@ define void @addr_func() gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 3, void ()* @bar, void ()* @bar, void ()* @bar)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 0) ["deopt" (void ()* @bar, void ()* @bar, void ()* @bar)]
   ret void
 }
 
@@ -651,7 +651,7 @@ define void @addr_global() gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 3, i32* @G, i32* @G, i32* @G)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 0) ["deopt" (i32* @G, i32* @G, i32* @G)]
   ret void
 }
 
@@ -668,7 +668,7 @@ define void @addr_alloca(i32 %v) gc "statepoint-example" {
 entry:
   %a = alloca i32
   store i32 %v, i32* %a
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 3, i32* %a, i32* %a, i32* %a)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 0) ["deopt" (i32* %a, i32* %a, i32* %a)]
   ret void
 }
 
@@ -683,31 +683,15 @@ define i32 addrspace(1)*  @test_fpconst_deopt(i32 addrspace(1)* %in) gc "statepo
 ; CHECK-NEXT:    movq (%rsp), %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    retq
-    %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2, i32 5, void ()* nonnull @bar, i32 0, i32 0, i32 0, i32 20,
+    %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2, i32 5, void ()* nonnull @bar, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %in) ["deopt" (
     float 0x40421A1CA0000000, float 0x40459A1CA0000000, float 0x40401A1CA0000000, float 0x40479A1CA0000000, float 0x403C343940000000,
     float 0x403E343940000000, float 0x40469A1CA0000000, float 0x40489A1CA0000000, float 0x404A9A1CA0000000, float 0x40499A1CA0000000,
     float 0xC05FCD2F20000000, float 0xC05C0D2F20000000, float 0xC060269780000000, float 0xC05B8D2F20000000, float 0xC060669780000000,
-    float 0xC05B0D2F20000000, float 0xC060A69780000000, float 0xC05A8D2F20000000, float 0xC060E69780000000, float 0x40439A1CA0000000, i32 addrspace(1)* %in)
-    %out = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %statepoint_token, i32 27, i32 27)
+    float 0xC05B0D2F20000000, float 0xC060A69780000000, float 0xC05A8D2F20000000, float 0xC060E69780000000, float 0x40439A1CA0000000)]
+    %out = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %statepoint_token, i32 7, i32 7)
     ret i32 addrspace(1)* %out
 }
 
-; Same as test1, but using deopt bundle
-define void @test1b(i32 %a) gc "statepoint-example" {
-; CHECK-LABEL: test1b:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    callq _bar ## 4-byte Folded Reload
-; CHECK-NEXT:  Ltmp19:
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    retq
-entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a)]
-  ret void
-}
-
 ; CHECK-LABEL: __LLVM_StackMaps:
 ; CHECK: .long   Ltmp18-_test_fpconst_deopt
 ; CHECK-NEXT: .short	0
diff --git a/llvm/test/CodeGen/X86/statepoint-stack-usage.ll b/llvm/test/CodeGen/X86/statepoint-stack-usage.ll
index 73b0d6a180715..1841bd03d6d3c 100644
--- a/llvm/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/llvm/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -16,17 +16,17 @@ define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 a
 ; CHECK-DAG: movq	%rsi, (%rsp)
 ; There should be no more than three moves
 ; CHECK-NOT: movq
-  %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
-  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12)
-  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13)
-  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 14)
+  %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7)
+  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 8)
+  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 9)
 ; CHECK: callq
 ; This is the key check.  There should NOT be any memory moves here
 ; CHECK-NOT: movq
-  %safepoint_token2 = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
-  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 14)
-  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 13)
-  %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 12)
+  %safepoint_token2 = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
+  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 9)
+  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 8)
+  %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 7)
 ; CHECK: callq
   ret i32 1
 }
@@ -39,17 +39,17 @@ define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrsp
 ; CHECK-DAG: movq	%rdi, 16(%rsp)
 ; CHECK-DAG: movq	%rdx, 8(%rsp)
 ; CHECK-DAG: movq	%rsi, (%rsp)
-  %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
-  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12)
-  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13)
-  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 14)
+  %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7)
+  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 8)
+  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 9)
 ; CHECK: callq
 ; This is the key check.  There should NOT be any memory moves here
 ; CHECK-NOT: movq
-  %safepoint_token2 = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 addrspace(1)* %a1, i32 0, i32 addrspace(1)* %c1, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
-  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 14)
-  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 13)
-  %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 12)
+  %safepoint_token2 = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1) ["deopt" (i32 addrspace(1)* %a1, i32 0, i32 addrspace(1)* %c1, i32 0, i32 0)]
+  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 9)
+  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 8)
+  %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 7)
 ; CHECK: callq
   ret i32 1
 }
@@ -77,10 +77,10 @@ define i32 @back_to_back_deopt(i32 %a, i32 %b, i32 %c) #1
 ; CHECK-DAG: movl	%ebp, 8(%rsp)
 ; CHECK-DAG: movl	%r14d, 4(%rsp)
 ; CHECK: callq
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c)
-call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c)
-call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c)
-call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
+call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
+call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
+call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
   ret i32 1
 }
 
@@ -93,25 +93,25 @@ entry:
   ; CHECK-DAG: movq	%rdx, 8(%rsp)
   ; CHECK-DAG: movq	%rsi, (%rsp)
   ; CHECK: callq
-  %safepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+  %safepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
                    to label %normal_return unwind label %exceptional_return
 
 normal_return:
-  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12)
-  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13)
-  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 14)
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 7)
+  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 8)
+  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 7, i32 9)
   ; Should work even through bitcasts
   %c1.casted = bitcast i32 addrspace(1)* %c1 to i8 addrspace(1)*
   ; This is the key check.  There should NOT be any memory moves here
   ; CHECK-NOT: movq
   ; CHECK: callq
-  %safepoint_token2 = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i8 addrspace(1)* %c1.casted, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+  %safepoint_token2 = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %c1.casted, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
                     to label %normal_return2 unwind label %exceptional_return2
 
 normal_return2:
-  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 14)
-  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 12, i32 13)
-  %c2 = tail call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token2, i32 12, i32 12)
+  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 9)
+  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token2, i32 7, i32 8)
+  %c2 = tail call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token2, i32 7, i32 7)
   ret i32 1
 
 exceptional_return:
diff --git a/llvm/test/CodeGen/X86/statepoint-vector.ll b/llvm/test/CodeGen/X86/statepoint-vector.ll
index 30fb8027e715c..36532765192e8 100644
--- a/llvm/test/CodeGen/X86/statepoint-vector.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vector.ll
@@ -126,7 +126,7 @@ define void @test5() gc "statepoint-example" {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 entry:
-  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 1, i128 0)
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0) ["deopt" (i128 0)]
   ret void
 }
 

From 4e74541a928090b3604094e239884b0ea0a631bb Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitneyt@ca.ibm.com>
Date: Fri, 29 May 2020 03:02:27 +0000
Subject: [PATCH 462/770] [LoopUnroll] Fix not-rotated.ll by adding back a
 limitation that was unintentionally removed in https://reviews.llvm.org/D80477

---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 4525681855ce4..06e9f7ff2f533 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -326,6 +326,11 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                          "branch in latch or a single exiting block.\n");
     return LoopUnrollResult::Unmodified;
   }
+  if (LatchBI->isConditional() && LatchBI != ExitingBI) {
+    LLVM_DEBUG(
+        dbgs() << "Can't unroll; a conditional latch must exit the loop");
+    return LoopUnrollResult::Unmodified;
+  }
   LLVM_DEBUG(dbgs() << "  Exiting Block = " << ExitingBI->getParent()->getName()
                     << "\n");
 

From 9eafcbfca1b0ab4016bc7320e4d6646b29bf051a Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 28 May 2020 09:40:46 -0700
Subject: [PATCH 463/770] [JITLink] Fix 80-column rule violation.

---
 llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
index 156cd010e64bd..d68c9e6952836 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
@@ -90,7 +90,8 @@ Error registerMachOStubsAndGOT(Session &S, LinkGraph &G) {
   for (auto &Sec : G.sections()) {
     LLVM_DEBUG({
       dbgs() << "  Section \"" << Sec.getName() << "\": "
-             << (llvm::empty(Sec.symbols()) ? "empty. skipping." : "processing...")
+             << (llvm::empty(Sec.symbols()) ? "empty. skipping."
+                                            : "processing...")
              << "\n";
     });
 

From a6deaeec370ec5e34f9e5aa3fad3bc73770d4895 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 28 May 2020 09:02:58 -0700
Subject: [PATCH 464/770] [JITLink] Improve llvm-jitlink regression testing
 support for ELF.

This patch adds a jitlink pass, 'registerELFGraphInfo', that records section
and symbol information about each LinkGraph in the llvm-jitlink session object.
This allows symbols and sections to be referred to by name in llvm-jitlink
regression tests. This will enable a testcase to be written for
https://reviews.llvm.org/D80613.
---
 .../ExecutionEngine/JITLink/ELF_x86_64.cpp    |   3 +
 llvm/tools/llvm-jitlink/CMakeLists.txt        |   1 +
 llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp  | 100 ++++++++++++++++++
 .../tools/llvm-jitlink/llvm-jitlink-macho.cpp |   2 +-
 llvm/tools/llvm-jitlink/llvm-jitlink.cpp      |   7 +-
 llvm/tools/llvm-jitlink/llvm-jitlink.h        |   6 +-
 6 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp

diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
index ee219724ee469..a7118eb9b563f 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
@@ -344,6 +344,9 @@ void jitLink_ELF_x86_64(std::unique_ptr<JITLinkContext> Ctx) {
   else
     Config.PrePrunePasses.push_back(markAllSymbolsLive);
 
+  if (auto Err = Ctx->modifyPassConfig(TT, Config))
+    return Ctx->notifyFailed(std::move(Err));
+
   ELFJITLinker_x86_64::link(std::move(Ctx), std::move(Config));
 }
 
diff --git a/llvm/tools/llvm-jitlink/CMakeLists.txt b/llvm/tools/llvm-jitlink/CMakeLists.txt
index 5e022f1d2a576..bfe691d976ba7 100644
--- a/llvm/tools/llvm-jitlink/CMakeLists.txt
+++ b/llvm/tools/llvm-jitlink/CMakeLists.txt
@@ -14,6 +14,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_tool(llvm-jitlink
   llvm-jitlink.cpp
+  llvm-jitlink-elf.cpp
   llvm-jitlink-macho.cpp
   )
 
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
new file mode 100644
index 0000000000000..1b74f1016ae99
--- /dev/null
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
@@ -0,0 +1,100 @@
+//===---- llvm-jitlink-elf.cpp -- ELF parsing support for llvm-jitlink ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// ELF parsing support for llvm-jitlink.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-jitlink.h"
+
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
+
+#define DEBUG_TYPE "llvm_jitlink"
+
+using namespace llvm;
+using namespace llvm::jitlink;
+
+namespace llvm {
+
+Error registerELFGraphInfo(Session &S, LinkGraph &G) {
+  auto FileName = sys::path::filename(G.getName());
+  if (S.FileInfos.count(FileName)) {
+    return make_error<StringError>("When -check is passed, file names must be "
+                                   "distinct (duplicate: \"" +
+                                       FileName + "\")",
+                                   inconvertibleErrorCode());
+  }
+  // Record each non-empty section and its named symbols for this file.
+  auto &FileInfo = S.FileInfos[FileName];
+  LLVM_DEBUG({
+    dbgs() << "Registering ELF file info for \"" << FileName << "\"\n";
+  });
+  for (auto &Sec : G.sections()) {
+    LLVM_DEBUG({
+      dbgs() << "  Section \"" << Sec.getName() << "\": "
+             << (llvm::empty(Sec.symbols()) ? "empty. skipping."
+                                            : "processing...")
+             << "\n";
+    });
+
+    // Skip sections without symbols: there is nothing to register for them.
+    if (llvm::empty(Sec.symbols()))
+      continue;
+
+    if (FileInfo.SectionInfos.count(Sec.getName()))
+      return make_error<StringError>("Encountered duplicate section name \"" +
+                                         Sec.getName() + "\" in \"" + FileName +
+                                         "\"",
+                                     inconvertibleErrorCode());
+
+    bool SectionContainsContent = false;
+    bool SectionContainsZeroFill = false;
+    // Find the lowest- and highest-addressed symbols to bound the section.
+    auto *FirstSym = *Sec.symbols().begin();
+    auto *LastSym = FirstSym;
+    for (auto *Sym : Sec.symbols()) {
+      if (Sym->getAddress() < FirstSym->getAddress())
+        FirstSym = Sym;
+      if (Sym->getAddress() > LastSym->getAddress())
+        LastSym = Sym;
+      // Record named symbols so tests can refer to them by name.
+      if (Sym->hasName()) {
+        LLVM_DEBUG(dbgs() << "Symbol: " << Sym->getName() << "\n");
+        if (Sym->isSymbolZeroFill()) {
+          S.SymbolInfos[Sym->getName()] = {Sym->getSize(), Sym->getAddress()};
+          SectionContainsZeroFill = true;
+        } else {
+          S.SymbolInfos[Sym->getName()] = {Sym->getSymbolContent(),
+                                           Sym->getAddress()};
+          SectionContainsContent = true;
+        }
+      }
+    }
+    // The section spans from the first symbol to the end of the last block.
+    JITTargetAddress SecAddr = FirstSym->getAddress();
+    uint64_t SecSize =
+        (LastSym->getBlock().getAddress() + LastSym->getBlock().getSize()) -
+        SecAddr;
+
+    if (SectionContainsZeroFill && SectionContainsContent)
+      return make_error<StringError>("Mixed zero-fill and content sections not "
+                                     "supported yet",
+                                     inconvertibleErrorCode());
+    if (SectionContainsZeroFill)
+      FileInfo.SectionInfos[Sec.getName()] = {SecSize, SecAddr};
+    else
+      FileInfo.SectionInfos[Sec.getName()] = {
+          StringRef(FirstSym->getBlock().getContent().data(), SecSize),
+          SecAddr};
+  }
+
+  return Error::success();
+}
+
+} // end namespace llvm
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
index d68c9e6952836..18584e55d0f5c 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
@@ -74,7 +74,7 @@ static Expected<Symbol &> getMachOStubTarget(LinkGraph &G, Block &B) {
 
 namespace llvm {
 
-Error registerMachOStubsAndGOT(Session &S, LinkGraph &G) {
+Error registerMachOGraphInfo(Session &S, LinkGraph &G) {
   auto FileName = sys::path::filename(G.getName());
   if (S.FileInfos.count(FileName)) {
     return make_error<StringError>("When -check is passed, file names must be "
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 961cd77c0ecbd..b44a56e0ac925 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -467,8 +467,13 @@ void Session::modifyPassConfig(const Triple &FTT,
                                PassConfiguration &PassConfig) {
   if (!CheckFiles.empty())
     PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) {
+
+      if (TT.getObjectFormat() == Triple::ELF)
+        return registerELFGraphInfo(*this, G);
+
       if (TT.getObjectFormat() == Triple::MachO)
-        return registerMachOStubsAndGOT(*this, G);
+        return registerMachOGraphInfo(*this, G);
+
       return make_error<StringError>("Unsupported object format for GOT/stub "
                                      "registration",
                                      inconvertibleErrorCode());
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.h b/llvm/tools/llvm-jitlink/llvm-jitlink.h
index c888baec9adf4..5884e164a44d1 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.h
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.h
@@ -69,7 +69,11 @@ struct Session {
   Session(Triple TT, Error &Err);
 };
 
-Error registerMachOStubsAndGOT(Session &S, jitlink::LinkGraph &G);
+/// Record symbols, GOT entries, stubs, and sections for ELF file.
+Error registerELFGraphInfo(Session &S, jitlink::LinkGraph &G);
+
+/// Record symbols, GOT entries, stubs, and sections for MachO file.
+Error registerMachOGraphInfo(Session &S, jitlink::LinkGraph &G);
 
 } // end namespace llvm
 

From ff92d3c672e2bf0b885b67b0efebea691df9c5b9 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Thu, 28 May 2020 17:55:49 -0700
Subject: [PATCH 465/770] [ORC] Add debugging output for LLJIT construction.

This can be handy for checking whether the LLJIT instance you're constructing
matches your expectations.
---
 .../Orc/JITTargetMachineBuilder.h             |  9 +++
 .../Orc/JITTargetMachineBuilder.cpp           | 74 +++++++++++++++++++
 llvm/lib/ExecutionEngine/Orc/LLJIT.cpp        | 29 ++++++++
 3 files changed, 112 insertions(+)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h
index 3b6becd2853a9..c4109a8de82eb 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h
@@ -25,6 +25,9 @@
 #include <vector>
 
 namespace llvm {
+
+class raw_ostream;
+
 namespace orc {
 
 /// A utility class for building TargetMachines for JITs.
@@ -136,6 +139,12 @@ class JITTargetMachineBuilder {
   /// Access Triple.
   const Triple &getTargetTriple() const { return TT; }
 
+#ifndef NDEBUG
+  /// Debug-dump a JITTargetMachineBuilder.
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const JITTargetMachineBuilder &JTMB);
+#endif
+
 private:
   Triple TT;
   std::string CPU;
diff --git a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
index 3eda0fa1360c7..8cf66c9e759a3 100644
--- a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
@@ -10,6 +10,7 @@
 
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 namespace orc {
@@ -63,5 +64,78 @@ JITTargetMachineBuilder &JITTargetMachineBuilder::addFeatures(
   return *this;
 }
 
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const JITTargetMachineBuilder &JTMB) {
+  OS << "{ Triple = \"" << JTMB.TT.str() << "\", CPU = \"" << JTMB.CPU
+     << "\", Options = <not-printable>, Relocation Model = ";
+  // RM may be unset; print its enumerator name or "unspecified".
+  if (JTMB.RM) {
+    switch (*JTMB.RM) {
+    case Reloc::Static:
+      OS << "Static";
+      break;
+    case Reloc::PIC_:
+      OS << "PIC_";
+      break;
+    case Reloc::DynamicNoPIC:
+      OS << "DynamicNoPIC";
+      break;
+    case Reloc::ROPI:
+      OS << "ROPI";
+      break;
+    case Reloc::RWPI:
+      OS << "RWPI";
+      break;
+    case Reloc::ROPI_RWPI:
+      OS << "ROPI_RWPI";
+      break;
+    }
+  } else
+    OS << "unspecified";
+
+  OS << ", Code Model = ";
+  // CM may be unset as well; it is handled the same way as RM.
+  if (JTMB.CM) {
+    switch (*JTMB.CM) {
+    case CodeModel::Tiny:
+      OS << "Tiny";
+      break;
+    case CodeModel::Small:
+      OS << "Small";
+      break;
+    case CodeModel::Kernel:
+      OS << "Kernel";
+      break;
+    case CodeModel::Medium:
+      OS << "Medium";
+      break;
+    case CodeModel::Large:
+      OS << "Large";
+      break;
+    }
+  } else
+    OS << "unspecified";
+
+  OS << ", Optimization Level = ";
+  switch (JTMB.OptLevel) {
+  case CodeGenOpt::None:
+    OS << "None";
+    break;
+  case CodeGenOpt::Less:
+    OS << "Less";
+    break;
+  case CodeGenOpt::Default:
+    OS << "Default";
+    break;
+  case CodeGenOpt::Aggressive:
+    OS << "Aggressive";
+    break;
+  }
+
+  OS << " }";
+  return OS;
+}
+#endif // NDEBUG
+
 } // End namespace orc.
 } // End namespace llvm.
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 79e502775f799..0e51ba1dff352 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -927,13 +927,42 @@ LLJIT::PlatformSupport::~PlatformSupport() {}
 
 Error LLJITBuilderState::prepareForConstruction() {
 
+  LLVM_DEBUG(dbgs() << "Preparing to create LLJIT instance...\n");
+
   if (!JTMB) {
+    LLVM_DEBUG({
+      dbgs() << "  No explicitly set JITTargetMachineBuilder. "
+                "Detecting host...\n";
+    });
     if (auto JTMBOrErr = JITTargetMachineBuilder::detectHost())
       JTMB = std::move(*JTMBOrErr);
     else
       return JTMBOrErr.takeError();
   }
 
+  LLVM_DEBUG({
+    dbgs() << "  JITTargetMachineBuilder is " << JTMB << "\n"
+           << "  Pre-constructed ExecutionSession: " << (ES ? "Yes" : "No")
+           << "\n"
+           << "  DataLayout: ";
+    if (DL)
+      dbgs() << DL->getStringRepresentation() << "\n";
+    else
+      dbgs() << "None (will be created by JITTargetMachineBuilder)\n";
+
+    dbgs() << "  Custom object-linking-layer creator: "
+           << (CreateObjectLinkingLayer ? "Yes" : "No") << "\n"
+           << "  Custom compile-function creator: "
+           << (CreateCompileFunction ? "Yes" : "No") << "\n"
+           << "  Custom platform-setup function: "
+           << (SetUpPlatform ? "Yes" : "No") << "\n"
+           << "  Number of compile threads: " << NumCompileThreads;
+    if (!NumCompileThreads)
+      dbgs() << " (code will be compiled on the execution thread)\n";
+    else
+      dbgs() << "\n";
+  });
+
   // If the client didn't configure any linker options then auto-configure the
   // JIT linker.
   if (!CreateObjectLinkingLayer && JTMB->getCodeModel() == None &&

From 3ea38b5b4e6b0684b352a79191b55472aac09f13 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 29 May 2020 03:47:15 +0000
Subject: [PATCH 466/770] [gn build] Port a6deaeec370

---
 llvm/utils/gn/secondary/llvm/tools/llvm-jitlink/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-jitlink/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-jitlink/BUILD.gn
index 3df75c250ae47..ae26bb618043e 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-jitlink/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-jitlink/BUILD.gn
@@ -11,6 +11,7 @@ executable("llvm-jitlink") {
     "//llvm/lib/Target:TargetsToBuild",
   ]
   sources = [
+    "llvm-jitlink-elf.cpp",
     "llvm-jitlink-macho.cpp",
     "llvm-jitlink.cpp",
   ]

From a0c90b5b2ad6a81e8aded5ad59cc25ff58464bcf Mon Sep 17 00:00:00 2001
From: Amara Emerson <aemerson@apple.com>
Date: Fri, 22 May 2020 14:21:50 -0700
Subject: [PATCH 467/770] [AArch64][GlobalISel] Enable extending loads combines
 post-legalization.

During legalization we can end up with extends of loads, which in the case of
zexts causes us to not hit tablegen imported patterns.

The caveat here is that we don't want anyext load forming, since some variants
are illegal. This change also prevents the combine from creating any illegal
loads.

Differential Revision: https://reviews.llvm.org/D80458
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  5 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 21 ++++++--
 llvm/lib/Target/AArch64/AArch64Combine.td     |  5 +-
 .../AArch64/AArch64PostLegalizerCombiner.cpp  |  4 +-
 .../postlegalizercombiner-extending-loads.mir | 50 +++++++++++++++++++
 5 files changed, 78 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extending-loads.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index efcfbb88367d2..e09a81acef163 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -30,6 +30,7 @@ class MachineInstr;
 class MachineOperand;
 class GISelKnownBits;
 class MachineDominatorTree;
+class LegalizerInfo;
 
 struct PreferredTuple {
   LLT Ty;                // The result type of the extend.
@@ -56,11 +57,13 @@ class CombinerHelper {
   GISelChangeObserver &Observer;
   GISelKnownBits *KB;
   MachineDominatorTree *MDT;
+  const LegalizerInfo *LI;
 
 public:
   CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
                  GISelKnownBits *KB = nullptr,
-                 MachineDominatorTree *MDT = nullptr);
+                 MachineDominatorTree *MDT = nullptr,
+                 const LegalizerInfo *LI = nullptr);
 
   GISelKnownBits *getKnownBits() const {
     return KB;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 45b7d991ae727..a3291a6a9712a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -9,6 +9,7 @@
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -36,9 +37,10 @@ static cl::opt<bool>
 
 CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
                                MachineIRBuilder &B, GISelKnownBits *KB,
-                               MachineDominatorTree *MDT)
+                               MachineDominatorTree *MDT,
+                               const LegalizerInfo *LI)
     : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
-      KB(KB), MDT(MDT) {
+      KB(KB), MDT(MDT), LI(LI) {
   (void)this->KB;
 }
 
@@ -405,7 +407,20 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
   for (auto &UseMI : MRI.use_nodbg_instructions(LoadValue.getReg())) {
     if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
         UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
-        UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
+        (UseMI.getOpcode() == TargetOpcode::G_ANYEXT)) {
+      // Check for legality.
+      if (LI) {
+        LegalityQuery::MemDesc MMDesc;
+        const auto &MMO = **MI.memoperands_begin();
+        MMDesc.SizeInBits = MMO.getSizeInBits();
+        MMDesc.AlignInBits = MMO.getAlign().value() * 8;
+        MMDesc.Ordering = MMO.getOrdering();
+        LLT UseTy = MRI.getType(UseMI.getOperand(0).getReg());
+        LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+        if (LI->getAction({MI.getOpcode(), {UseTy, SrcTy}, {MMDesc}}).Action !=
+            LegalizeActions::Legal)
+          continue;
+      }
       Preferred = ChoosePreferredUse(Preferred,
                                      MRI.getType(UseMI.getOperand(0).getReg()),
                                      UseMI.getOpcode(), &UseMI);
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index fc2527c57514f..183e2a458883e 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -24,7 +24,8 @@ def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
   let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
 }
 
-def AArch64PostLegalizerCombinerHelper: GICombinerHelper<
-  "AArch64GenPostLegalizerCombinerHelper", [erase_undef_store]> {
+def AArch64PostLegalizerCombinerHelper
+    : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
+                       [erase_undef_store, combines_for_extload]> {
   let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
 }
diff --git a/llvm/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp
index 1516523bfb579..ce2e68bb0e040 100644
--- a/llvm/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp
@@ -61,7 +61,9 @@ class AArch64PostLegalizerCombinerInfo : public CombinerInfo {
 bool AArch64PostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                                MachineInstr &MI,
                                                MachineIRBuilder &B) const {
-  CombinerHelper Helper(Observer, B, KB, MDT);
+  const auto *LI =
+      MI.getParent()->getParent()->getSubtarget().getLegalizerInfo();
+  CombinerHelper Helper(Observer, B, KB, MDT, LI);
   return Generated.tryCombineAll(Observer, MI, B, Helper);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extending-loads.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extending-loads.mir
new file mode 100644
index 0000000000000..5ed7661f98f18
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extending-loads.mir
@@ -0,0 +1,50 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-postlegalizer-combiner -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--"
+  define void @test_zeroext(i8* %addr) {
+  entry:
+    ret void
+  }
+  define void @test_no_anyext(i8* %addr) {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_zeroext
+legalized:       true
+body: |
+  bb.0.entry:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_zeroext
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load 1 from %ir.addr)
+    ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
+    %0:_(p0) = COPY $x0
+    %1:_(s8) = G_LOAD %0 :: (load 1 from %ir.addr)
+    %2:_(s32) = G_ZEXT %1
+    $w0 = COPY %2
+...
+
+---
+name:            test_no_anyext
+legalized:       true
+body: |
+  bb.0.entry:
+    liveins: $x0
+    ; Check that we don't try to do an anyext combine. We don't want to do this
+    ; because an anyexting load like s64 = G_LOAD %p (load 4) isn't legal.
+    ; CHECK-LABEL: name: test_no_anyext
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 4 from %ir.addr)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; CHECK: $x0 = COPY [[ANYEXT]](s64)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.addr)
+    %2:_(s64) = G_ANYEXT %1
+    $x0 = COPY %2
+...

From 17ed6dcb0c96ac6a6fd5021b326213dbd5fef250 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Thu, 28 May 2020 23:39:54 -0700
Subject: [PATCH 468/770] [X86] Remove MMX isel patterns containing (x86mmx
 (scalar_to_vector (i32))).

I don't think we can make such a node. I don't think
x86_mmx is considered a vector for the check in getNode.
---
 llvm/lib/Target/X86/X86InstrMMX.td | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 0f4d4d764cc90..2880be6cb8f38 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -164,21 +164,17 @@ def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
 def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst,
-                         (x86mmx (scalar_to_vector GR32:$src)))]>,
+                         (x86mmx (MMX_X86movw2d GR32:$src)))]>,
                         Sched<[WriteVecMoveFromGpr]>;
 def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst,
-                        (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>,
+                          (x86mmx (MMX_X86movw2d (loadi32 addr:$src))))]>,
                         Sched<[WriteVecLoad]>;
 
 let Predicates = [HasMMX] in {
-  def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
-            (MMX_MOVD64rr GR32:$src)>;
   def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
             (MMX_SET0)>;
-  def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
-            (MMX_MOVD64rm addr:$src)>;
 }
 
 let mayStore = 1 in
@@ -272,14 +268,6 @@ def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                          [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>,
                          Sched<[SchedWriteVecMoveLSNT.MMX.MR]>;
 
-let Predicates = [HasMMX] in {
-  // movd to MMX register zero-extends
-  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
-            (MMX_MOVD64rr GR32:$src)>;
-  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
-            (MMX_MOVD64rm addr:$src)>;
-}
-
 // Arithmetic Instructions
 defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
                                      SchedWriteVecALU.MMX>;

From 0baf0e8cfc1845ef92d397c1ae43793bf9e6aaad Mon Sep 17 00:00:00 2001
From: Siva Chandra Reddy <sivachandra@google.com>
Date: Tue, 12 May 2020 16:01:28 -0700
Subject: [PATCH 469/770] [libc] Add implementation of call_once from
 threads.h.

Reviewers: abrachet, maskray

Differential Revision: https://reviews.llvm.org/D79828
---
 libc/config/linux/api.td                 |  19 ++++
 libc/lib/CMakeLists.txt                  |   1 +
 libc/spec/stdc.td                        |  19 +++-
 libc/src/threads/CMakeLists.txt          |   7 ++
 libc/src/threads/call_once.h             |  20 ++++
 libc/src/threads/linux/CMakeLists.txt    |  13 +++
 libc/src/threads/linux/call_once.cpp     |  58 ++++++++++++
 libc/test/src/threads/CMakeLists.txt     |  16 ++++
 libc/test/src/threads/call_once_test.cpp | 111 +++++++++++++++++++++++
 9 files changed, 263 insertions(+), 1 deletion(-)
 create mode 100644 libc/src/threads/call_once.h
 create mode 100644 libc/src/threads/linux/call_once.cpp
 create mode 100644 libc/test/src/threads/call_once_test.cpp

diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td
index cb070e1466ad4..d45be84fa0805 100644
--- a/libc/config/linux/api.td
+++ b/libc/config/linux/api.td
@@ -301,6 +301,12 @@ def SignalAPI : PublicAPI<"signal.h"> {
   ];
 }
 
+def OnceFlag : TypeDecl<"once_flag"> {
+  let Decl = [{
+    typedef unsigned int once_flag;
+  }];
+}
+
 def MtxT : TypeDecl<"mtx_t"> {
   let Decl = [{
     typedef struct {
@@ -314,8 +320,20 @@ def ThreadStartT : TypeDecl<"thrd_start_t"> {
   let Decl = "typedef int (*thrd_start_t)(void *);";
 }
 
+def CallOnceFuncT : TypeDecl<"__call_once_func_t"> {
+  let Decl = [{
+    typedef void(*__call_once_func_t)(void);
+  }];
+}
+
 def ThreadsAPI : PublicAPI<"threads.h"> {
+  let Macros = [
+    SimpleMacroDef<"ONCE_FLAG_INIT", "0">,
+  ];
+
   let TypeDeclarations = [
+    OnceFlag,
+    CallOnceFuncT,
     MtxT,
     ThreadStartT,
   ];
@@ -332,6 +350,7 @@ def ThreadsAPI : PublicAPI<"threads.h"> {
   ];
 
   let Functions = [
+    "call_once",
     "mtx_init",
     "mtx_lock",
     "mtx_unlock",
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 1c500ba3e5dac..51f587a2a70ac 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -35,6 +35,7 @@ add_entrypoint_library(
     libc.src.sys.mman.munmap
 
     # threads.h entrypoints
+    libc.src.threads.call_once
     libc.src.threads.mtx_init
     libc.src.threads.mtx_lock
     libc.src.threads.mtx_unlock
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 139a1af84c587..4e6bfbfac1600 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -8,6 +8,11 @@ def StdC : StandardSpec<"stdc"> {
   RestrictedPtrType CharRestrictedPtr = RestrictedPtrType<CharType>;
   ConstType ConstCharRestrictedPtr = ConstType<CharRestrictedPtr>;
 
+  NamedType OnceFlagType = NamedType<"once_flag">;
+  PtrType OnceFlagTypePtr = PtrType<OnceFlagType>;
+  // TODO(sivachandra): Remove this non-standard type when a formal
+  // way to describe callable types is available.
+  NamedType CallOnceFuncType = NamedType<"__call_once_func_t">;
   NamedType MtxTType = NamedType<"mtx_t">;
   PtrType MtxTTypePtr = PtrType<MtxTType>;
   NamedType ThrdStartTType = NamedType<"thrd_start_t">;
@@ -267,8 +272,12 @@ def StdC : StandardSpec<"stdc"> {
 
   HeaderSpec Threads = HeaderSpec<
       "threads.h",
-      [], // Macros
       [
+          Macro<"ONCE_FLAG_INIT">,
+      ],
+      [
+          OnceFlagType,
+          CallOnceFuncType,
           MtxTType,
           ThrdStartTType,
           ThrdTType,
@@ -284,6 +293,14 @@ def StdC : StandardSpec<"stdc"> {
           EnumeratedNameValue<"thrd_nomem">,
       ],
       [
+          FunctionSpec<
+              "call_once",
+              RetValSpec<VoidType>,
+              [
+                  ArgSpec<OnceFlagTypePtr>,
+                  ArgSpec<CallOnceFuncType>,
+              ]
+          >,
           FunctionSpec<
               "mtx_init",
               RetValSpec<IntType>,
diff --git a/libc/src/threads/CMakeLists.txt b/libc/src/threads/CMakeLists.txt
index 966e41ec9d551..276aa51cfbd51 100644
--- a/libc/src/threads/CMakeLists.txt
+++ b/libc/src/threads/CMakeLists.txt
@@ -2,6 +2,13 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_subdirectory(${LIBC_TARGET_OS})
 endif()
 
+add_entrypoint_object(
+  call_once
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.call_once
+)
+
 add_entrypoint_object(
   thrd_create
   ALIAS
diff --git a/libc/src/threads/call_once.h b/libc/src/threads/call_once.h
new file mode 100644
index 0000000000000..f6602df68197c
--- /dev/null
+++ b/libc/src/threads/call_once.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for call_once function ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_THREADS_CALL_ONCE_H
+#define LLVM_LIBC_SRC_THREADS_CALL_ONCE_H
+
+#include "include/threads.h"
+
+namespace __llvm_libc {
+// Declares call_once; the Linux definition is in linux/call_once.cpp.
+void call_once(once_flag *flag, __call_once_func_t func);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_THREADS_CALL_ONCE_H
diff --git a/libc/src/threads/linux/CMakeLists.txt b/libc/src/threads/linux/CMakeLists.txt
index 63e956ec15000..08a04c604a36a 100644
--- a/libc/src/threads/linux/CMakeLists.txt
+++ b/libc/src/threads/linux/CMakeLists.txt
@@ -8,6 +8,19 @@ add_gen_header(
     ${LIBC_TARGET_MACHINE}/thread_start_args.h.in
 )
 
+add_entrypoint_object(
+  call_once
+  SRCS
+    call_once.cpp
+  HDRS
+    ../call_once.h
+  DEPENDS
+    .threads_utils
+    libc.config.linux.linux_syscall_h
+    libc.include.sys_syscall
+    libc.include.threads
+)
+
 add_header_library(
   threads_utils
   HDRS
diff --git a/libc/src/threads/linux/call_once.cpp b/libc/src/threads/linux/call_once.cpp
new file mode 100644
index 0000000000000..058f3700ef7eb
--- /dev/null
+++ b/libc/src/threads/linux/call_once.cpp
@@ -0,0 +1,58 @@
+//===-- Linux implementation of the call_once function --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "config/linux/syscall.h" // For syscall functions.
+#include "include/sys/syscall.h"  // For syscall numbers.
+#include "include/threads.h"      // For call_once related type definition.
+#include "src/__support/common.h"
+#include "src/threads/linux/thread_utils.h"
+
+#include <limits.h>
+#include <linux/futex.h>
+#include <stdatomic.h>
+
+namespace __llvm_libc {
+
+static constexpr unsigned START = 0x11;
+static constexpr unsigned WAITING = 0x22;
+static constexpr unsigned FINISH = 0x33;
+
+void LLVM_LIBC_ENTRYPOINT(call_once)(once_flag *flag, __call_once_func_t func) {
+  FutexData *futex_word = reinterpret_cast<FutexData *>(flag);
+  unsigned int not_called = ONCE_FLAG_INIT;
+
+  // The C standard wording says:
+  //
+  //     The completion of the function func synchronizes with all
+  //     previous or subsequent calls to call_once with the same
+  //     flag variable.
+  //
+  // What this means is that a call_once call can return only after the
+  // called function |func| returns. So, we use futexes to synchronize
+  // calls with the same flag value.
+  if (::atomic_compare_exchange_strong(futex_word, &not_called, START)) {
+    func();
+    if (::atomic_exchange(futex_word, FINISH) == WAITING) {
+      __llvm_libc::syscall(SYS_futex, futex_word, FUTEX_WAKE_PRIVATE,
+                           INT_MAX, // Wake all waiters.
+                           0, 0, 0);
+    }
+    return;
+  }
+
+  unsigned int status = START;
+  if (::atomic_compare_exchange_strong(futex_word, &status, WAITING) ||
+      status == WAITING) {
+    // FUTEX_WAIT can return spuriously; wait until the word leaves |WAITING|.
+    while (::atomic_load(futex_word) == WAITING)
+      __llvm_libc::syscall(SYS_futex, futex_word, FUTEX_WAIT_PRIVATE, WAITING,
+                           0, 0, 0);
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/threads/CMakeLists.txt b/libc/test/src/threads/CMakeLists.txt
index 178323e92714e..6511efe2c33b0 100644
--- a/libc/test/src/threads/CMakeLists.txt
+++ b/libc/test/src/threads/CMakeLists.txt
@@ -1,5 +1,21 @@
 add_libc_testsuite(libc_threads_unittests)
 
+add_libc_unittest(
+  call_once_test
+  SUITE
+    libc_threads_unittests
+  SRCS
+    call_once_test.cpp
+  DEPENDS
+    libc.include.threads
+    libc.src.threads.call_once
+    libc.src.threads.mtx_init
+    libc.src.threads.mtx_lock
+    libc.src.threads.mtx_unlock
+    libc.src.threads.thrd_create
+    libc.src.threads.thrd_join
+)
+
 add_libc_unittest(
   thrd_test
   SUITE
diff --git a/libc/test/src/threads/call_once_test.cpp b/libc/test/src/threads/call_once_test.cpp
new file mode 100644
index 0000000000000..bb5f14899d9d6
--- /dev/null
+++ b/libc/test/src/threads/call_once_test.cpp
@@ -0,0 +1,111 @@
+//===-- Unittests for call_once -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/threads.h"
+#include "src/threads/call_once.h"
+#include "src/threads/mtx_init.h"
+#include "src/threads/mtx_lock.h"
+#include "src/threads/mtx_unlock.h"
+#include "src/threads/thrd_create.h"
+#include "src/threads/thrd_join.h"
+#include "utils/UnitTest/Test.h"
+
+#include <stdatomic.h>
+
+static constexpr unsigned int num_threads = 5;
+static atomic_uint thread_count;
+
+static unsigned int call_count;
+static void call_once_func() { ++call_count; }
+
+static int func(void *) {
+  static once_flag flag = ONCE_FLAG_INIT; // Shared by every call to func.
+  __llvm_libc::call_once(&flag, call_once_func);
+
+  ++thread_count; // This is an atomic update.
+
+  return 0;
+}
+
+TEST(CallOnceTest, CallFrom5Threads) {
+  // Ensure the call count and thread count are 0 to begin with.
+  call_count = 0;
+  thread_count = 0;
+
+  thrd_t threads[num_threads];
+  for (unsigned int i = 0; i < num_threads; ++i) {
+    ASSERT_EQ(__llvm_libc::thrd_create(threads + i, func, nullptr),
+              static_cast<int>(thrd_success));
+  }
+
+  for (unsigned int i = 0; i < num_threads; ++i) {
+    int retval;
+    ASSERT_EQ(__llvm_libc::thrd_join(threads + i, &retval),
+              static_cast<int>(thrd_success));
+    ASSERT_EQ(retval, 0);
+  }
+
+  EXPECT_EQ(static_cast<unsigned int>(thread_count), num_threads);
+  EXPECT_EQ(call_count, 1U); // The once func must have run exactly once.
+}
+
+static mtx_t once_func_blocker;
+static void blocking_once_func() {
+  __llvm_libc::mtx_lock(&once_func_blocker);
+  __llvm_libc::mtx_unlock(&once_func_blocker);
+}
+
+static atomic_uint start_count;
+static atomic_uint done_count;
+static int once_func_caller(void *) {
+  static once_flag flag = ONCE_FLAG_INIT; // One flag shared by all callers.
+  ++start_count; // Announce that this thread is about to enter call_once.
+  __llvm_libc::call_once(&flag, blocking_once_func);
+  ++done_count; // Reached only after call_once has returned.
+  return 0;
+}
+
+// Test the synchronization aspect of the call_once function.
+// This is not a foolproof test, but something which might be
+// useful when we add a flakiness detection scheme to UnitTest.
+TEST(CallOnceTest, TestSynchronization) {
+  start_count = 0;
+  done_count = 0;
+
+  ASSERT_EQ(__llvm_libc::mtx_init(&once_func_blocker, mtx_plain),
+            static_cast<int>(thrd_success));
+  // Lock the blocking mutex so that the once func blocks.
+  ASSERT_EQ(__llvm_libc::mtx_lock(&once_func_blocker),
+            static_cast<int>(thrd_success));
+
+  thrd_t t1, t2;
+  ASSERT_EQ(__llvm_libc::thrd_create(&t1, once_func_caller, nullptr),
+            static_cast<int>(thrd_success));
+  ASSERT_EQ(__llvm_libc::thrd_create(&t2, once_func_caller, nullptr),
+            static_cast<int>(thrd_success));
+
+  while (start_count != 2)
+    ; // Spin until both threads start.
+
+  // Since the once func is blocked, the threads should not be done yet.
+  EXPECT_EQ(static_cast<unsigned int>(done_count), 0U);
+
+  // Unlock the blocking mutex so that the once func can proceed.
+  ASSERT_EQ(__llvm_libc::mtx_unlock(&once_func_blocker),
+            static_cast<int>(thrd_success));
+
+  int retval;
+  ASSERT_EQ(__llvm_libc::thrd_join(&t1, &retval),
+            static_cast<int>(thrd_success));
+  ASSERT_EQ(retval, 0);
+  ASSERT_EQ(__llvm_libc::thrd_join(&t2, &retval),
+            static_cast<int>(thrd_success));
+  ASSERT_EQ(retval, 0);
+
+  ASSERT_EQ(static_cast<unsigned int>(done_count), 2U);
+}

From b147b88c8432cdc14a3238925dbfb8d55be32932 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Wed, 6 May 2020 17:14:15 +0100
Subject: [PATCH 470/770] [CodeGen] Add support for extracting elements of
 scalable vectors

I have tried to ensure that SelectionDAG and DAGCombiner do
sensible things for scalable vectors, and added support for a
limited number of simple folds. Codegen support for the vector
extract patterns has also been added to the AArch64 backend.

New vector extract tests have been added here:

  CodeGen/AArch64/sve-extract-element.ll

and I have also added new folds using inserts and extracts here:

  CodeGen/AArch64/sve-insert-element.ll

Differential Revision: https://reviews.llvm.org/D80208
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  24 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  39 ++-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  40 +++
 .../CodeGen/AArch64/sve-extract-element.ll    | 247 ++++++++++++++++++
 .../CodeGen/AArch64/sve-insert-element.ll     | 206 ++++++++++-----
 5 files changed, 481 insertions(+), 75 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-extract-element.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7e41b2fffeda1..9216151272851 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17429,16 +17429,21 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
 
   // extract_vector_elt of out-of-bounds element -> UNDEF
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  unsigned NumElts = VecVT.getVectorNumElements();
-  unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
-  if (IndexC && IndexC->getAPIntValue().uge(NumElts))
+  if (IndexC && VecVT.isFixedLengthVector() &&
+      IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
     return DAG.getUNDEF(ScalarVT);
 
   // extract_vector_elt (build_vector x, y), 1 -> y
-  if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR &&
+  if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
+       VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
       TLI.isTypeLegal(VecVT) &&
       (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
-    SDValue Elt = VecOp.getOperand(IndexC->getZExtValue());
+    assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
+            VecVT.isFixedLengthVector()) &&
+           "BUILD_VECTOR used for scalable vectors");
+    unsigned IndexVal =
+        VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
+    SDValue Elt = VecOp.getOperand(IndexVal);
     EVT InEltVT = Elt.getValueType();
 
     // Sometimes build_vector's scalar input types do not match result type.
@@ -17449,6 +17454,15 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // converts.
   }
 
+  if (VecVT.isScalableVector())
+    return SDValue();
+
+  // All the code from this point onwards assumes fixed width vectors, but it's
+  // possible that some of the combinations could be made to work for scalable
+  // vectors too.
+  unsigned NumElts = VecVT.getVectorNumElements();
+  unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
+
   // TODO: These transforms should not require the 'hasOneUse' restriction, but
   // there are regressions on multiple targets without it. We can end up with a
   // mess of scalar and vector code if we reduce only part of the DAG to scalar.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index cfb15d6ca9d7c..2f277eee84956 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -38,6 +38,7 @@
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -5362,15 +5363,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (N1.isUndef() || N2.isUndef())
       return getUNDEF(VT);
 
-    // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
-    if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
+    // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF for fixed length
+    // vectors. For scalable vectors we will provide appropriate support for
+    // dealing with arbitrary indices.
+    if (N2C && N1.getValueType().isFixedLengthVector() &&
+        N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
       return getUNDEF(VT);
 
     // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
-    // expanding copies of large vectors from registers.
-    if (N2C &&
-        N1.getOpcode() == ISD::CONCAT_VECTORS &&
-        N1.getNumOperands() > 0) {
+    // expanding copies of large vectors from registers. This only works for
+    // fixed length vectors, since we need to know the exact number of
+    // elements.
+    if (N2C && N1.getOperand(0).getValueType().isFixedLengthVector() &&
+        N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) {
       unsigned Factor =
         N1.getOperand(0).getValueType().getVectorNumElements();
       return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
@@ -5378,10 +5383,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                      getVectorIdxConstant(N2C->getZExtValue() % Factor, DL));
     }
 
-    // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
-    // expanding large vector constants.
-    if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
-      SDValue Elt = N1.getOperand(N2C->getZExtValue());
+    // EXTRACT_VECTOR_ELT of BUILD_VECTOR or SPLAT_VECTOR is often formed while
+    // lowering is expanding large vector constants.
+    if (N2C && (N1.getOpcode() == ISD::BUILD_VECTOR ||
+                N1.getOpcode() == ISD::SPLAT_VECTOR)) {
+      assert((N1.getOpcode() != ISD::BUILD_VECTOR ||
+              N1.getValueType().isFixedLengthVector()) &&
+             "BUILD_VECTOR used for scalable vectors");
+      unsigned Index =
+          N1.getOpcode() == ISD::BUILD_VECTOR ? N2C->getZExtValue() : 0;
+      SDValue Elt = N1.getOperand(Index);
 
       if (VT != Elt.getValueType())
         // If the vector element type is not legal, the BUILD_VECTOR operands
@@ -5415,8 +5426,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
 
     // EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
     // when vector types are scalarized and v1iX is legal.
-    // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
+    // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx).
+    // Here we are completely ignoring the extract element index (N2),
+    // which is fine for fixed width vectors, since any index other than 0
+    // is undefined anyway. However, this cannot be ignored for scalable
+    // vectors - in theory we could support this, but we don't want to do this
+    // without a profitability check.
     if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N1.getValueType().isFixedLengthVector() &&
         N1.getValueType().getVectorNumElements() == 1) {
       return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
                      N1.getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 54a764337324c..f5b983ac757fc 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1897,6 +1897,46 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
                                        (INDEX_II_D 0, 1),
                                        (DUP_ZR_D $index)),
                         $src)>;
+
+  // Extract element from vector with immediate index
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+
+  // Extract element from vector with scalar index
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+                         ZPR:$vec)>;
+
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+                         ZPR:$vec)>;
 }
 
 let Predicates = [HasSVE, HasMatMulInt8] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
new file mode 100644
index 0000000000000..4cb3103ec55b9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, b0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> %a, i32 0
+  ret i8 %b
+}
+
+define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_lane0_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x i16> %a, i32 0
+  ret i16 %b
+}
+
+define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_lane0_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x i32> %a, i32 0
+  ret i32 %b
+}
+
+define i64 @test_lane0_2xi64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: test_lane0_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x i64> %a, i32 0
+  ret i64 %b
+}
+
+define double @test_lane0_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane0_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x double> %a, i32 0
+  ret double %b
+}
+
+define float @test_lane0_4xf32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: test_lane0_4xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x float> %a, i32 0
+  ret float %b
+}
+
+define half @test_lane0_8xf16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: test_lane0_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x half> %a, i32 0
+  ret half %b
+}
+
+define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.b, xzr, x8
+; CHECK-NEXT:    lastb w0, p0, z0.b
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> %a, i32 %x
+  ret i8 %b
+}
+
+define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.h, xzr, x8
+; CHECK-NEXT:    lastb w0, p0, z0.h
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x i16> %a, i32 %x
+  ret i16 %b
+}
+
+define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.s, xzr, x8
+; CHECK-NEXT:    lastb w0, p0, z0.s
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x i32> %a, i32 %x
+  ret i32 %b
+}
+
+define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.d, xzr, x8
+; CHECK-NEXT:    lastb x0, p0, z0.d
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x i64> %a, i32 %x
+  ret i64 %b
+}
+
+define double @test_lanex_2xf64(<vscale x 2 x double> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.d, xzr, x8
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x double> %a, i32 %x
+  ret double %b
+}
+
+define float @test_lanex_4xf32(<vscale x 4 x float> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_4xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.s, xzr, x8
+; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x float> %a, i32 %x
+  ret float %b
+}
+
+define half @test_lanex_8xf16(<vscale x 8 x half> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.h, xzr, x8
+; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x half> %a, i32 %x
+  ret half %b
+}
+
+; Deliberately choose an index that is out-of-bounds
+define i8 @test_lane64_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane64_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    whilels p0.b, xzr, x8
+; CHECK-NEXT:    lastb w0, p0, z0.b
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> %a, i32 64
+  ret i8 %b
+}
+
+define double @test_lane9_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane9_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    whilels p0.d, xzr, x8
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x double> %a, i32 9
+  ret double %b
+}
+
+; Deliberately choose an index that is undefined
+define i32 @test_lane64_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_lane64_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x i32> %a, i32 undef
+  ret i32 %b
+}
+
+define i8 @extract_of_insert_undef_16xi8(i8 %a) {
+; CHECK-LABEL: extract_of_insert_undef_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
+  %c = extractelement <vscale x 16 x i8> %b, i32 0
+  ret i8 %c
+}
+
+define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract0_of_insert0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
+  %d = extractelement <vscale x 16 x i8> %c, i32 0
+  ret i8 %d
+}
+
+define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract64_of_insert64_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 64
+  %d = extractelement <vscale x 16 x i8> %c, i32 64
+  ret i8 %d
+}
+
+define i8 @extract_of_insert_diff_lanes_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract_of_insert_diff_lanes_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, z0.b[3]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
+  %d = extractelement <vscale x 16 x i8> %c, i32 3
+  ret i8 %d
+}
+
+define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_zero_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> zeroinitializer, i32 0
+  ret i8 %b
+}
+
+; The DAG combiner should fold the extract of a splat to give element zero
+; of the splat, i.e. %x. If the index is beyond the end of the scalable
+; vector the result is undefined anyway.
+define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) {
+; CHECK-LABEL: test_lanex_splat_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %a = insertelement <vscale x 2 x i64> undef, i64 %x, i32 0
+  %b = shufflevector <vscale x 2 x i64> %a, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %c = extractelement <vscale x 2 x i64> %b, i32 %y
+  ret i64 %c
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
index 90acf8cf3d0a7..daaaa6d347c0f 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -1,106 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane0_16xi8
-; CHECK:       mov [[REG:.*]], #30
-; CHECK:       mov z0.b, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 0
   ret <vscale x 16 x i8> %b
 }
 
 define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
-; CHECK-LABEL: test_lane0_8xi16
-; CHECK:       mov [[REG:.*]], #30
-; CHECK:       mov z0.h, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.h, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> %a, i16 30, i32 0
   ret <vscale x 8 x i16> %b
 }
 
 define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: test_lane0_4xi32
-; CHECK:       mov [[REG:.*]], #30
-; CHECK:       mov z0.s, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 4 x i32> %a, i32 30, i32 0
   ret <vscale x 4 x i32> %b
 }
 
 define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: test_lane0_2xi64
-; CHECK:       mov w[[REG:.*]], #30
-; CHECK:       mov z0.d, p{{[0-7]}}/m, x[[REG]]
+; CHECK-LABEL: test_lane0_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.d, p0/m, x8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 0
   ret <vscale x 2 x i64> %b
 }
 
 define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
-; CHECK-LABEL: test_lane0_2xf64
-; CHECK:       fmov d[[REG:[0-9]+]], #1.00000000
-; CHECK:       mov z0.d, p{{[0-7]}}/m, z[[REG]].d
+; CHECK-LABEL: test_lane0_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d1, #1.00000000
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x double> %a, double 1.0, i32 0
   ret <vscale x 2 x double> %b
 }
 
 define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
-; CHECK-LABEL: test_lane0_4xf32
-; CHECK:       fmov s[[REG:[0-9]+]], #1.00000000
-; CHECK:       mov z0.s, p{{[0-7]}}/m, z[[REG]].s
+; CHECK-LABEL: test_lane0_4xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, #1.00000000
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 4 x float> %a, float 1.0, i32 0
   ret <vscale x 4 x float> %b
 }
 
 define <vscale x 8 x half> @test_lane0_8xf16(<vscale x 8 x half> %a) {
-; CHECK-LABEL: test_lane0_8xf16
-; CHECK:       fmov h[[REG:[0-9]+]], #1.00000000
-; CHECK:       mov z0.h, p{{[0-7]}}/m, z[[REG]].h
+; CHECK-LABEL: test_lane0_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov h1, #1.00000000
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 0
   ret <vscale x 8 x half> %b
 }
 
 ; Undefined lane insert
 define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: test_lane4_2xi64
-; CHECK:       mov w[[IDXREG:.*]], #4
-; CHECK:       index z[[CMPVEC:[0-9]+]].d, #0, #1
-; CHECK:       mov z[[IDXVEC:[0-9]+]].d, x[[IDXREG]]
-; CHECK:       cmpeq p[[PRED:[0-9]+]].d, p{{[0-7]}}/z, z[[CMPVEC]].d, z[[IDXVEC]].d
-; CHECK:       mov w[[VALREG:.*]], #30
-; CHECK:       mov z0.d, p[[PRED]]/m, x[[VALREG]]
+; CHECK-LABEL: test_lane4_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.d, p0/m, x8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
   ret <vscale x 2 x i64> %b
 }
 
 ; Undefined lane insert
 define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
-; CHECK-LABEL: test_lane9_8xf16
-; CHECK:       mov w[[IDXREG:.*]], #9
-; CHECK:       index z[[CMPVEC:[0-9]+]].h, #0, #1
-; CHECK:       mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
-; CHECK:       cmpeq p[[PRED:[0-9]+]].h, p{{[0-7]}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
-; CHECK:       fmov h[[VALREG:[0-9]+]], #1.00000000
-; CHECK:       mov z0.h, p[[PRED]]/m, h[[VALREG]]
+; CHECK-LABEL: test_lane9_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    index z1.h, #0, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fmov h1, #1.00000000
+; CHECK-NEXT:    mov z0.h, p0/m, h1
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 9
   ret <vscale x 8 x half> %b
 }
 
 define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane1_16xi8
-; CHECK:       mov w[[IDXREG:.*]], #1
-; CHECK:       index z[[CMPVEC:[0-9]+]].b, #0, #1
-; CHECK:       mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
-; CHECK:       cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
-; CHECK:       mov w[[VALREG:.*]], #30
-; CHECK:       mov z0.b, p[[PRED]]/m, w[[VALREG]]
+; CHECK-LABEL: test_lane1_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 1
   ret <vscale x 16 x i8> %b
 }
 
 define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_16xi8
-; CHECK:       index z[[CMPVEC:[0-9]+]].b, #0, #1
-; CHECK:       mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
-; CHECK:       cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
-; CHECK:       mov w[[VALREG:.*]], #30
-; CHECK:       mov z0.b, p[[PRED]]/m, w[[VALREG]]
+; CHECK-LABEL: test_lanex_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 %x
   ret <vscale x 16 x i8> %b
 }
@@ -108,28 +144,80 @@ define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
 
 ; Redundant lane insert
 define <vscale x 4 x i32> @extract_insert_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: extract_insert_4xi32
-; CHECK-NOT:   mov w{{.*}}, #30
-; CHECK-NOT:   mov z0.d
+; CHECK-LABEL: extract_insert_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
   %b = extractelement <vscale x 4 x i32> %a, i32 2
   %c = insertelement <vscale x 4 x i32> %a, i32 %b, i32 2
   ret <vscale x 4 x i32> %c
 }
 
 define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
-; CHECK-LABEL: test_lane6_undef_8xi16
-; CHECK:       mov w[[IDXREG:.*]], #6
-; CHECK:       index z[[CMPVEC:.*]].h, #0, #1
-; CHECK:       mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
-; CHECK:       cmpeq p[[PRED:.*]].h, p{{.*}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
-; CHECK:       mov z0.h, p[[PRED]]/m, w0
+; CHECK-LABEL: test_lane6_undef_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p0/m, w0
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
   ret <vscale x 8 x i16> %b
 }
 
 define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
-; CHECK-LABEL: test_lane0_undef_16xi8
-; CHECK:       fmov s0, w0
+; CHECK-LABEL: test_lane0_undef_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
   ret <vscale x 16 x i8> %b
 }
+
+define <vscale x 16 x i8> @test_insert0_of_extract0_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert0_of_extract0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, b1
+; CHECK-NEXT:    ptrue p0.b, vl1
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
+  %c = extractelement <vscale x 16 x i8> %b, i32 0
+  %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 0
+  ret <vscale x 16 x i8> %d
+}
+
+define <vscale x 16 x i8> @test_insert64_of_extract64_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert64_of_extract64_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    mov z3.b, w8
+; CHECK-NEXT:    lastb w8, p1, z1.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
+  %c = extractelement <vscale x 16 x i8> %b, i32 64
+  %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 64
+  ret <vscale x 16 x i8> %d
+}
+
+define <vscale x 16 x i8> @test_insert3_of_extract1_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert3_of_extract1_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, z1.b[1]
+; CHECK-NEXT:    mov w8, #3
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z1.b, w8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
+; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    ret
+  %c = extractelement <vscale x 16 x i8> %b, i32 1
+  %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 3
+  ret <vscale x 16 x i8> %d
+}

From 9c0ef044beb4850ad9626cb81a1ede4f3bbda4a7 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Wed, 27 May 2020 15:21:48 +0100
Subject: [PATCH 471/770] [SVE] Fix warnings in SelectInst::areInvalidOperands

We should be comparing the element counts rather than the
numbers of elements.

Differential Revision: https://reviews.llvm.org/D80634
---
 llvm/lib/IR/Instructions.cpp                |  2 +-
 llvm/test/CodeGen/AArch64/sve-bad-select.ll | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-bad-select.ll

diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 957db32d6085a..3c7b79512908d 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -81,7 +81,7 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
     VectorType *ET = dyn_cast<VectorType>(Op1->getType());
     if (!ET)
       return "selected values for vector select must be vectors";
-    if (ET->getNumElements() != VT->getNumElements())
+    if (ET->getElementCount() != VT->getElementCount())
       return "vector select requires selected vectors to have "
                    "the same vector length as select condition";
   } else if (Op0->getType() != Type::getInt1Ty(Op0->getContext())) {
diff --git a/llvm/test/CodeGen/AArch64/sve-bad-select.ll b/llvm/test/CodeGen/AArch64/sve-bad-select.ll
new file mode 100644
index 0000000000000..2dbc4ea8c7cde
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bad-select.ll
@@ -0,0 +1,10 @@
+; RUN: not llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>&1 | FileCheck %s
+
+define <vscale x 16 x i8> @badsel1_nxv16i8(<16 x i1> %p,
+                                           <vscale x 16 x i8> %dst,
+                                           <vscale x 16 x i8> %a) {
+  %sel = select <16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %dst
+  ret <vscale x 16 x i8> %sel
+}
+
+; CHECK: error: vector select requires selected vectors to have the same vector length as select condition

From f254f1d94e8d0070b2d006a3d1e7ee6eeae0aaa7 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Tue, 26 May 2020 16:07:46 +0100
Subject: [PATCH 472/770] [SVE] Remove getNumElements() warnings in
 InstCombiner::visitBitCast

Whilst trying to compile this test to assembly:

  CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c

I discovered some warnings were firing in InstCombiner::visitBitCast
due to calls to getNumElements() for scalable vector types. These
calls only really made sense for fixed width vectors so I have fixed
up the code appropriately.

Differential Revision: https://reviews.llvm.org/D80559
---
 .../lib/Transforms/InstCombine/InstCombineCasts.cpp |  4 ++--
 .../Transforms/InstCombine/AArch64/sve-bitcast.ll   | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-bitcast.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 714d1ae8aaec3..a2b75848ea028 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2534,7 +2534,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     }
   }
 
-  if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
+  if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
     // Beware: messing with this target-specific oddity may cause trouble.
     if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
       Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
@@ -2563,7 +2563,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     }
   }
 
-  if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
+  if (FixedVectorType *SrcVTy = dyn_cast<FixedVectorType>(SrcTy)) {
     if (SrcVTy->getNumElements() == 1) {
       // If our destination is not a vector, then make this a straight
       // scalar-scalar cast.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-bitcast.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-bitcast.ll
new file mode 100644
index 0000000000000..8049cad596b50
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-bitcast.ll
@@ -0,0 +1,13 @@
+; RUN: opt -instcombine -mtriple=aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+
+; We shouldn't fold bitcast(insert <vscale x 1 x iX> .., iX %val, i32 0)
+; into bitcast(iX %val) for scalable vectors.
+define <vscale x 2 x i8> @bitcast_of_insert_i8_i16(i16 %val) #0 {
+; CHECK-LABEL: @bitcast_of_insert_i8_i16(
+; CHECK-NOT:   bitcast i16 %val to <vscale x 2 x i8>
+; CHECK:       bitcast <vscale x 1 x i16> %op2 to <vscale x 2 x i8>
+entry:
+  %op2 = insertelement <vscale x 1 x i16> undef, i16 %val, i32 0
+  %0 = bitcast <vscale x 1 x i16> %op2 to <vscale x 2 x i8>
+  ret <vscale x 2 x i8> %0
+}

From ca467542eecfc621eea7fefb3c7e3849c6b43ac7 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Fri, 29 May 2020 09:13:08 +0200
Subject: [PATCH 473/770] [CMake] Pass CLANG_VENDOR variables into later stages

We are already passing CLANG_VERSION_* & PACKAGE_VENDOR
---
 clang/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 7dadc5f6e917a..5a5e34aacbebb 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -711,6 +711,7 @@ if (CLANG_ENABLE_BOOTSTRAP)
     CLANG_VERSION_MAJOR
     CLANG_VERSION_MINOR
     CLANG_VERSION_PATCHLEVEL
+    CLANG_VENDOR
     LLVM_VERSION_SUFFIX
     LLVM_BINUTILS_INCDIR
     CLANG_REPOSITORY_STRING

From a3418631e8aa0941b8b57ec2fc3b8d0c7db493be Mon Sep 17 00:00:00 2001
From: davidak <davidak@users.noreply.github.com>
Date: Thu, 28 May 2020 22:33:14 +0200
Subject: [PATCH 474/770] libclc: update website url

old link is dead
---
 libclc/README.TXT | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libclc/README.TXT b/libclc/README.TXT
index 00ae6bfa40a17..57b5242b9bbec 100644
--- a/libclc/README.TXT
+++ b/libclc/README.TXT
@@ -49,4 +49,4 @@ $ DESTDIR=/path/for/staged/install ninja install
 Website
 -------
 
-http://www.pcc.me.uk/~peter/libclc/
+https://libclc.llvm.org/

From 7fb8a40e5220d6d4efa14c15f92b6f28ba1b18f7 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Fri, 29 May 2020 08:27:22 +0100
Subject: [PATCH 475/770] New intrinsic @llvm.get.active.lane.mask()

This is split off from D79100 and:
- adds an intrinsic description/definition for @llvm.get.active.lane.mask(), and
- describes its semantics in LangRef.

As described (in more detail) in its LangRef section, it is semantically
equivalent to an icmp with the vector induction variable and the back-edge
taken count, and generates a mask of active/inactive vector lanes.

It will have several use cases. First, it will be used by the
ExpandVectorPredication pass for the VP intrinsics, to expand VP intrinsics for
scalable vectors on targets that do not support the `%evl` parameter, see
D78203.

Also, this is part of, and essential for our ARM MVE tail-predication story:
- this intrinsic will be emitted by the LoopVectorizer in D79100, when
  the scalar epilogue is tail-folded into the vector body. This new intrinsic
  will generate the predicate for the masked loads/stores, and it takes the
  back-edge taken count as an argument. The back-edge taken count represents the
  number of elements processed by the loop, which we need to set up MVE
  tail-predication.
- Emitting the intrinsic is controlled by a new TTI hook, see D80597.
- We pick up this new intrinsic in an ARM MVETailPredication backend pass, see
  D79175, and convert it to a MVE target specific intrinsic/instruction to
  create a tail-predicated loop.

Differential Revision: https://reviews.llvm.org/D80596
---
 llvm/docs/LangRef.rst                      | 75 ++++++++++++++++++++++
 llvm/include/llvm/IR/Intrinsics.td         |  4 ++
 llvm/lib/IR/Verifier.cpp                   |  8 +++
 llvm/test/Verifier/get-active-lane-mask.ll | 21 ++++++
 4 files changed, 108 insertions(+)
 create mode 100644 llvm/test/Verifier/get-active-lane-mask.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 0e18dcc9f99e8..db19c649c2fca 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16366,6 +16366,81 @@ Examples:
       %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
 
 
+.. _int_get_active_lane_mask:
+
+'``llvm.get.active.lane.mask.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %base, i32 %n)
+      declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %base, i64 %n)
+      declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %base, i64 %n)
+      declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %base, i64 %n)
+
+
+Overview:
+"""""""""
+
+Create a mask representing active and inactive vector lanes.
+
+
+Arguments:
+""""""""""
+
+Both operands have the same scalar integer type. The result is a vector with
+the i1 element type.
+
+Semantics:
+""""""""""
+
+The '``llvm.get.active.lane.mask.*``' intrinsics are semantically equivalent
+to:
+
+::
+
+      %m[i] = icmp ule (%base + i), %n
+
+where ``%m`` is a vector (mask) of active/inactive lanes with its elements
+indexed by ``i``,  and ``%base``, ``%n`` are the two arguments to
+``llvm.get.active.lane.mask.*``, ``icmp`` is an integer compare and ``ule``
+the unsigned less-than-equal comparison operator.  Overflow cannot occur in
+``(%base + i)`` and its comparison against ``%n`` as it is performed in integer
+numbers and not in machine numbers.  The above is equivalent to:
+
+::
+
+      %m = @llvm.get.active.lane.mask(%base, %n)
+
+This can, for example, be emitted by the loop vectorizer. Then, ``%base`` is
+the first element of the vector induction variable (VIV), and ``%n`` is the
+Back-edge Taken Count (BTC). Thus, these intrinsics perform an element-wise
+less than or equal comparison of VIV with BTC, producing a mask of true/false
+values representing active/inactive vector lanes, except if the VIV overflows
+in which case they return false in the lanes where the VIV overflows.  The
+arguments are scalar types to accommodate scalable vector types, for which it is
+unknown what the type of the step vector needs to be that enumerates its
+lanes without overflow.
+
+This mask ``%m`` can e.g. be used in masked load/store instructions. These
+intrinsics provide a hint to the backend. I.e., for a vector loop, the
+back-edge taken count of the original scalar loop is explicit as the second
+argument.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %elem0, i64 429)
+      %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+
+
 .. _int_mload_mstore:
 
 Masked Vector Load and Store Intrinsics
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 78409df8f816a..40d4bc5ede3af 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1294,6 +1294,10 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
 
 }
 
+def int_get_active_lane_mask:
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyint_ty, LLVMMatchType<1>],
+            [IntrNoMem, IntrNoSync, IntrWillReturn]>;
 
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e0d28b35efddf..cb96c7ae515a3 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4812,6 +4812,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
            "eh.exceptionpointer argument must be a catchpad", Call);
     break;
   }
+  case Intrinsic::get_active_lane_mask: {
+    Assert(Call.getType()->isVectorTy(), "get_active_lane_mask: must return a "
+           "vector", Call);
+    auto *ElemTy = Call.getType()->getScalarType();
+    Assert(ElemTy->isIntegerTy(1), "get_active_lane_mask: element type is not "
+           "i1", Call);
+    break;
+  }
   case Intrinsic::masked_load: {
     Assert(Call.getType()->isVectorTy(), "masked_load: must return a vector",
            Call);
diff --git a/llvm/test/Verifier/get-active-lane-mask.ll b/llvm/test/Verifier/get-active-lane-mask.ll
new file mode 100644
index 0000000000000..94d819b5c75b0
--- /dev/null
+++ b/llvm/test/Verifier/get-active-lane-mask.ll
@@ -0,0 +1,21 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32, i32)
+
+define <4 x i32> @t1(i32 %IV, i32 %BTC) {
+; CHECK:      get_active_lane_mask: element type is not i1
+; CHECK-NEXT: %res = call <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32 %IV, i32 %BTC)
+
+  %res = call <4 x i32> @llvm.get.active.lane.mask.v4i32.i32(i32 %IV, i32 %BTC)
+  ret <4 x i32> %res
+}
+
+declare i32 @llvm.get.active.lane.mask.i32.i32(i32, i32)
+
+define i32 @t2(i32 %IV, i32 %BTC) {
+; CHECK:      Intrinsic has incorrect return type!
+; CHECK-NEXT: i32 (i32, i32)* @llvm.get.active.lane.mask.i32.i32
+
+  %res = call i32 @llvm.get.active.lane.mask.i32.i32(i32 %IV, i32 %BTC)
+  ret i32 %res
+}

From 82bb57c11d8ccb4e1b0f420f4388dd6553bbc57a Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Tue, 19 May 2020 18:05:15 +0200
Subject: [PATCH 476/770] [AST][RecoveryExpr] Make DeduceAutoType fail if the
 auto is deduced from recovery exprs.

Summary:
With recovery-ast, we will get an undeduced `auto` return type for the
"auto foo()->undef()" function declaration while the function decl still stays
valid; this is dangerous, breaks assumptions in clang, and leads to crashes.

This patch invalidates these functions if we deduce autos from the
return expression, which is similar to auto VarDecl.

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80221
---
 clang/lib/Sema/SemaStmt.cpp                   |  1 +
 clang/lib/Sema/SemaTemplateDeduction.cpp      |  2 ++
 .../ast-dump-invalid-auto-return-funcs.cpp    | 28 +++++++++++++++++++
 clang/test/AST/ast-dump-recovery.cpp          |  8 ++----
 clang/test/Sema/invalid-bitwidth-expr.mm      |  9 +++++-
 5 files changed, 41 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/AST/ast-dump-invalid-auto-return-funcs.cpp

diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 0cb600fb46d14..dda0d3486e0e6 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -3309,6 +3309,7 @@ Sema::ActOnCapScopeReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) {
     assert(AT && "lost auto type from lambda return type");
     if (DeduceFunctionTypeFromReturnExpr(FD, ReturnLoc, RetValExp, AT)) {
       FD->setInvalidDecl();
+      // FIXME: preserve the ill-formed return expression.
       return StmtError();
     }
     CurCap->ReturnType = FnRetType = FD->getReturnType();
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 19f8248db6bfd..877020ed4dcf9 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -4655,6 +4655,8 @@ Sema::DeduceAutoResult
 Sema::DeduceAutoType(TypeLoc Type, Expr *&Init, QualType &Result,
                      Optional<unsigned> DependentDeductionDepth,
                      bool IgnoreConstraints) {
+  if (Init->containsErrors())
+    return DAR_FailedAlreadyDiagnosed;
   if (Init->getType()->isNonOverloadPlaceholderType()) {
     ExprResult NonPlaceholder = CheckPlaceholderExpr(Init);
     if (NonPlaceholder.isInvalid())
diff --git a/clang/test/AST/ast-dump-invalid-auto-return-funcs.cpp b/clang/test/AST/ast-dump-invalid-auto-return-funcs.cpp
new file mode 100644
index 0000000000000..b77d5335c6619
--- /dev/null
+++ b/clang/test/AST/ast-dump-invalid-auto-return-funcs.cpp
@@ -0,0 +1,28 @@
+// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -fcxx-exceptions -frecovery-ast -std=gnu++17 -ast-dump %s | FileCheck -strict-whitespace %s
+
+// CHECK: FunctionDecl {{.*}} s1 'auto ()'
+auto s1(); // valid
+// FIXME: why are we finding int as the return type? Is int used as a fallback type?
+// CHECK: FunctionDecl {{.*}} invalid s2 'auto () -> int'
+auto s2() -> undef();
+// CHECK: FunctionDecl {{.*}} invalid s3 'auto () -> int'
+auto s3() -> decltype(undef());
+// CHECK: FunctionDecl {{.*}} invalid s4 'auto ()'
+auto s4() {
+  return undef();
+}
+// CHECK: FunctionDecl {{.*}} s5 'void ()'
+auto s5() {} // valid, no return stmt, fallback to void
+
+class Foo {
+  // CHECK: CXXMethodDecl {{.*}} foo1 'auto ()'
+  auto foo1(); // valid
+  // CHECK: CXXMethodDecl {{.*}} invalid foo2 'auto () -> int'
+  auto foo2() -> undef();
+  // CHECK: CXXMethodDecl {{.*}} invalid foo3 'auto () -> int'
+  auto foo3() -> decltype(undef());
+  // CHECK: CXXMethodDecl {{.*}} invalid foo4 'auto ()'
+  auto foo4() { return undef(); }
+  // CHECK: CXXMethodDecl {{.*}} foo5 'void ()'
+  auto foo5() {} // valid, no return stmt, fallback to void.
+};
diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp
index b63483fba4168..9b13f4d3e06b1 100644
--- a/clang/test/AST/ast-dump-recovery.cpp
+++ b/clang/test/AST/ast-dump-recovery.cpp
@@ -178,10 +178,6 @@ void InitializerForAuto() {
   auto unresolved_typo = gned.*[] {};
 }
 
-// CHECK:      `-TypeAliasDecl {{.*}} Escape 'decltype([] {
-// CHECK-NEXT:   return <recovery-expr>(undef);
-// CHECK-NEXT: }())'
-// CHECK-NEXT:   `-DecltypeType {{.*}} 'decltype([] {
-// CHECK-NEXT:     return <recovery-expr>(undef);
-// CHECK-NEXT:   }())' dependent
+// Verified that the generated call operator is invalid.
+// CHECK: |-CXXMethodDecl {{.*}} invalid operator() 'auto () const -> auto'
 using Escape = decltype([] { return undef(); }());
diff --git a/clang/test/Sema/invalid-bitwidth-expr.mm b/clang/test/Sema/invalid-bitwidth-expr.mm
index fe93cac683ae8..63aced1a3bf4d 100644
--- a/clang/test/Sema/invalid-bitwidth-expr.mm
+++ b/clang/test/Sema/invalid-bitwidth-expr.mm
@@ -1,5 +1,4 @@
 // RUN: %clang_cc1 -fobjc-runtime=gcc -frecovery-ast -verify %s
-// RUN: %clang_cc1 -fobjc-runtime=gcc -fno-recovery-ast -verify %s
 
 @interface Ivar
 {
@@ -11,3 +10,11 @@ @interface Ivar
 
 constexpr int s = sizeof(Ivar);
 constexpr int ss = sizeof(X);
+
+auto func() {
+  return undef(); // expected-error {{use of undeclared identifier}}
+}
+struct Y {
+  int X : func();
+};
+constexpr int sss = sizeof(Y);

From 7480ccbfc9d258a38eb99bbfccc6511e6ae10b70 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <sjoerd.meijer@arm.com>
Date: Fri, 29 May 2020 09:05:41 +0100
Subject: [PATCH 477/770] [TTI] New target hook emitGetActiveLaneMask

This is split off from D79100 and adds a new target hook emitGetActiveLaneMask
that can be queried to check if the intrinsic @llvm.get.active.lane.mask() is
supported by the backend and if it should be emitted for a given loop.

See also commit rG7fb8a40e5220 and its commit message for more details/context
on this new intrinsic.

Differential Revision: https://reviews.llvm.org/D80597
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h     | 11 +++++++++++
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |  5 +++++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h             |  5 +++++
 llvm/lib/Analysis/TargetTransformInfo.cpp            |  5 +++++
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp       |  9 ++++++++-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h         |  3 +++
 6 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 51aa1cb1cb1ec..9cb388c18e8c5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -524,6 +524,11 @@ class TargetTransformInfo {
                                    DominatorTree *DT,
                                    const LoopAccessInfo *LAI) const;
 
+  /// Query the target whether lowering of the llvm.get.active.lane.mask
+  /// intrinsic is supported and if emitting it is desired for this loop.
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFolded) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -1251,6 +1256,8 @@ class TargetTransformInfo::Concept {
   preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                               AssumptionCache &AC, TargetLibraryInfo *TLI,
                               DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
+  virtual bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                     bool TailFolded) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1530,6 +1537,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                    const LoopAccessInfo *LAI) override {
     return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFolded) override {
+    return Impl.emitGetActiveLaneMask(L, LI, SE, TailFolded);
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 0e8fc5dd6cfa2..a297675ca3d6f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -140,6 +140,11 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFold) const {
+    return false;
+  }
+
   void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                TTI::UnrollingPreferences &) {}
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c751c3703ba7f..48f56de0747de 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -462,6 +462,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
 
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFold) {
+    return BaseT::emitGetActiveLaneMask(L, LI, SE, TailFold);
+  }
+
   int getInstructionLatency(const Instruction *I) {
     if (isa<LoadInst>(I))
       return getST()->getSchedModel().DefaultLoadLatency;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 0c34050a66288..0eb46f43a0785 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -312,6 +312,11 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
   return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
 }
 
+bool TargetTransformInfo::emitGetActiveLaneMask(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, bool TailFolded) const {
+  return TTIImpl->emitGetActiveLaneMask(L, LI, SE, TailFolded);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c1af19727ba2b..7874047d0ec85 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1393,7 +1393,14 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
   return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
 
-
+bool ARMTTIImpl::emitGetActiveLaneMask(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, bool TailFolded) const {
+  // TODO: if this loop is tail-folded, we want to emit the
+  // llvm.get.active.lane.mask intrinsic so that this can be picked up in the
+  // MVETailPredication pass that needs to know the number of elements
+  // processed by this vector loop.
+  return false;
+}
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index d6efc6e7ae9e1..72243efb866cc 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -250,6 +250,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFolded) const;
+
   bool shouldBuildLookupTablesForConstant(Constant *C) const {
     // In the ROPI and RWPI relocation models we can't have pointers to global
     // variables or functions in constant data, so don't convert switches to

From 0e0907fa0e257ba63d6f820eafbf2079502153ed Mon Sep 17 00:00:00 2001
From: "Kazushi (Jam) Marukawa" <marukawa@nec.com>
Date: Fri, 29 May 2020 10:50:06 +0200
Subject: [PATCH 478/770] [VE] Implements minimum MC layer for VE (4/4)

Summary:
This patch includes the following items.

 - Adds AsmParser and minimum AsmBackend/ELFObjectWriter/MCCodeEmitter to
   support only LEA instruction in order to reduce the size of this patch.
 - Adds regression test of MC layer for a LEA instruction.
 - Relocations are not supported this time to reduce the size of this patch.

Differential Revision: https://reviews.llvm.org/D79546
---
 llvm/lib/Target/VE/AsmParser/CMakeLists.txt   |   3 +
 llvm/lib/Target/VE/AsmParser/LLVMBuild.txt    |  22 +
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp  | 692 ++++++++++++++++++
 llvm/lib/Target/VE/CMakeLists.txt             |   3 +
 llvm/lib/Target/VE/LLVMBuild.txt              |   4 +-
 .../lib/Target/VE/MCTargetDesc/CMakeLists.txt |   3 +
 .../Target/VE/MCTargetDesc/VEAsmBackend.cpp   | 120 +++
 .../VE/MCTargetDesc/VEELFObjectWriter.cpp     |  53 ++
 .../VE/MCTargetDesc/VEMCCodeEmitter.cpp       | 122 +++
 .../Target/VE/MCTargetDesc/VEMCTargetDesc.cpp |   6 +
 .../Target/VE/MCTargetDesc/VEMCTargetDesc.h   |   9 +-
 llvm/lib/Target/VE/VE.td                      |   8 +
 llvm/lib/Target/VE/VEInstrInfo.td             |  27 +-
 llvm/test/MC/VE/LEA.s                         |  29 +
 llvm/test/MC/VE/lit.local.cfg                 |   2 +
 15 files changed, 1082 insertions(+), 21 deletions(-)
 create mode 100644 llvm/lib/Target/VE/AsmParser/CMakeLists.txt
 create mode 100644 llvm/lib/Target/VE/AsmParser/LLVMBuild.txt
 create mode 100644 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
 create mode 100644 llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
 create mode 100644 llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
 create mode 100644 llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
 create mode 100644 llvm/test/MC/VE/LEA.s
 create mode 100644 llvm/test/MC/VE/lit.local.cfg

diff --git a/llvm/lib/Target/VE/AsmParser/CMakeLists.txt b/llvm/lib/Target/VE/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000000000..1dc76426de673
--- /dev/null
+++ b/llvm/lib/Target/VE/AsmParser/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_component_library(LLVMVEAsmParser
+  VEAsmParser.cpp
+  )
diff --git a/llvm/lib/Target/VE/AsmParser/LLVMBuild.txt b/llvm/lib/Target/VE/AsmParser/LLVMBuild.txt
new file mode 100644
index 0000000000000..b55015219bf36
--- /dev/null
+++ b/llvm/lib/Target/VE/AsmParser/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Target/VE/AsmParser/LLVMBuild.txt ------------------*- Conf -*--===;
+;
+; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = VEAsmParser
+parent = VE
+required_libraries = MC MCParser VEDesc VEInfo Support
+add_to_library_groups = VE
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
new file mode 100644
index 0000000000000..24147d0cd8c06
--- /dev/null
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -0,0 +1,692 @@
+//===-- VEAsmParser.cpp - Parse VE assembly to MCInst instructions --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/VEMCExpr.h"
+#include "MCTargetDesc/VEMCTargetDesc.h"
+#include "TargetInfo/VETargetInfo.h"
+#include "VE.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ve-asmparser"
+
+namespace {
+
+class VEOperand;
+
+class VEAsmParser : public MCTargetAsmParser {
+  MCAsmParser &Parser;
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "VEGenAsmMatcher.inc"
+
+  /// }
+
+  // public interface of the MCTargetAsmParser.
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  int parseRegisterName(unsigned (*matchFn)(StringRef));
+  OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                        SMLoc &EndLoc) override;
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  // Custom parse functions for VE specific operands.
+  OperandMatchResultTy parseMEMOperand(OperandVector &Operands);
+  OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
+  OperandMatchResultTy parseVEAsmOperand(std::unique_ptr<VEOperand> &Operand);
+
+public:
+  VEAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+              const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, sti, MII), Parser(parser) {
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+  }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+/// VEOperand - Instances of this class represent a parsed VE machine
+/// instruction operand.
+class VEOperand : public MCParsedAsmOperand {
+private:
+  enum KindTy {
+    k_Token,
+    k_Register,
+    k_Immediate,
+    // SX-Aurora ASX form is disp(index, base).
+    k_MemoryRegRegImm,  // base=reg, index=reg, disp=imm
+    k_MemoryRegImmImm,  // base=reg, index=imm, disp=imm
+    k_MemoryZeroRegImm, // base=0, index=reg, disp=imm
+    k_MemoryZeroImmImm, // base=0, index=imm, disp=imm
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned Base;
+    unsigned IndexReg;
+    const MCExpr *Index;
+    const MCExpr *Offset;
+  };
+
+  union {
+    struct Token Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+public:
+  VEOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  bool isToken() const override { return Kind == k_Token; }
+  bool isReg() const override { return Kind == k_Register; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isMem() const override {
+    return isMEMrri() || isMEMrii() || isMEMzri() || isMEMzii() || isMEMri() ||
+           isMEMzi();
+  }
+  bool isMEMrri() const { return Kind == k_MemoryRegRegImm; }
+  bool isMEMrii() const { return Kind == k_MemoryRegImmImm; }
+  bool isMEMzri() const { return Kind == k_MemoryZeroRegImm; }
+  bool isMEMzii() const { return Kind == k_MemoryZeroImmImm; }
+  // isMEMri and isMEMzi will be implemented later.
+  bool isMEMri() const { return false; }
+  bool isMEMzi() const { return false; }
+  /// Returns true if this operand is an immediate whose value is a constant
+  /// that fits in a signed 7-bit field; non-constant expressions are rejected.
+  bool isSImm7() const {
+    if (!isImm())
+      return false;
+
+    // Only a constant expression can be range-checked here.
+    if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val))
+      return isInt<7>(ConstExpr->getValue());
+    return false;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const override {
+    assert((Kind == k_Register) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate) && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getMemBase() const {
+    assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm) &&
+           "Invalid access!");
+    return Mem.Base;
+  }
+
+  unsigned getMemIndexReg() const {
+    assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryZeroRegImm) &&
+           "Invalid access!");
+    return Mem.IndexReg;
+  }
+
+  const MCExpr *getMemIndex() const {
+    assert((Kind == k_MemoryRegImmImm || Kind == k_MemoryZeroImmImm) &&
+           "Invalid access!");
+    return Mem.Index;
+  }
+
+  const MCExpr *getMemOffset() const {
+    assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm ||
+            Kind == k_MemoryZeroImmImm || Kind == k_MemoryZeroRegImm) &&
+           "Invalid access!");
+    return Mem.Offset;
+  }
+
+  void setMemOffset(const MCExpr *off) {
+    assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm ||
+            Kind == k_MemoryZeroImmImm || Kind == k_MemoryZeroRegImm) &&
+           "Invalid access!");
+    Mem.Offset = off;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case k_Token:
+      OS << "Token: " << getToken() << "\n";
+      break;
+    case k_Register:
+      OS << "Reg: #" << getReg() << "\n";
+      break;
+    case k_Immediate:
+      OS << "Imm: " << *getImm() << "\n"; // Expression, not its address.
+      break;
+    case k_MemoryRegRegImm:
+      assert(getMemOffset() != nullptr);
+      OS << "Mem: #" << getMemBase() << "+#" << getMemIndexReg() << "+"
+         << *getMemOffset() << "\n";
+      break;
+    case k_MemoryRegImmImm:
+      assert(getMemIndex() != nullptr && getMemOffset() != nullptr);
+      OS << "Mem: #" << getMemBase() << "+" << *getMemIndex() << "+"
+         << *getMemOffset() << "\n";
+      break;
+    case k_MemoryZeroRegImm:
+      assert(getMemOffset() != nullptr);
+      OS << "Mem: 0+#" << getMemIndexReg() << "+" << *getMemOffset() << "\n";
+      break;
+    case k_MemoryZeroImmImm:
+      assert(getMemIndex() != nullptr && getMemOffset() != nullptr);
+      OS << "Mem: 0+" << *getMemIndex() << "+" << *getMemOffset() << "\n";
+      break;
+    }
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
+  void addSImm7Operands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediate when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addMEMrriOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getMemBase()));
+    Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+    addExpr(Inst, getMemOffset());
+  }
+
+  void addMEMriiOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getMemBase()));
+    addExpr(Inst, getMemIndex());
+    addExpr(Inst, getMemOffset());
+  }
+
+  void addMEMzriOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createImm(0));
+    Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+    addExpr(Inst, getMemOffset());
+  }
+
+  void addMEMziiOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createImm(0));
+    addExpr(Inst, getMemIndex());
+    addExpr(Inst, getMemOffset());
+  }
+
+  void addMEMriOperands(MCInst &Inst, unsigned N) const {
+    // FIXME: implement
+  }
+
+  void addMEMziOperands(MCInst &Inst, unsigned N) const {
+    // FIXME: implement
+  }
+
+  static std::unique_ptr<VEOperand> CreateToken(StringRef Str, SMLoc S) {
+    auto Op = std::make_unique<VEOperand>(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<VEOperand> CreateReg(unsigned RegNum, SMLoc S,
+                                              SMLoc E) {
+    auto Op = std::make_unique<VEOperand>(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<VEOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                              SMLoc E) {
+    auto Op = std::make_unique<VEOperand>(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<VEOperand>
+  MorphToMEMrri(unsigned Base, unsigned Index, std::unique_ptr<VEOperand> Op) {
+    const MCExpr *Imm = Op->getImm();
+    Op->Kind = k_MemoryRegRegImm;
+    Op->Mem.Base = Base;
+    Op->Mem.IndexReg = Index;
+    Op->Mem.Index = nullptr;
+    Op->Mem.Offset = Imm;
+    return Op;
+  }
+
+  static std::unique_ptr<VEOperand>
+  MorphToMEMrii(unsigned Base, const MCExpr *Index,
+                std::unique_ptr<VEOperand> Op) {
+    const MCExpr *Imm = Op->getImm();
+    Op->Kind = k_MemoryRegImmImm;
+    Op->Mem.Base = Base;
+    Op->Mem.IndexReg = 0;
+    Op->Mem.Index = Index;
+    Op->Mem.Offset = Imm;
+    return Op;
+  }
+
+  static std::unique_ptr<VEOperand>
+  MorphToMEMzri(unsigned Index, std::unique_ptr<VEOperand> Op) {
+    const MCExpr *Imm = Op->getImm();
+    Op->Kind = k_MemoryZeroRegImm;
+    Op->Mem.Base = 0;
+    Op->Mem.IndexReg = Index;
+    Op->Mem.Index = nullptr;
+    Op->Mem.Offset = Imm;
+    return Op;
+  }
+
+  static std::unique_ptr<VEOperand>
+  MorphToMEMzii(const MCExpr *Index, std::unique_ptr<VEOperand> Op) {
+    const MCExpr *Imm = Op->getImm();
+    Op->Kind = k_MemoryZeroImmImm;
+    Op->Mem.Base = 0;
+    Op->Mem.IndexReg = 0;
+    Op->Mem.Index = Index;
+    Op->Mem.Offset = Imm;
+    return Op;
+  }
+};
+
+} // end anonymous namespace
+
+/// Matches the parsed operands against the instruction table and emits the
+/// resulting MCInst to \p Out; returns false on success, else reports an error.
+bool VEAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                          OperandVector &Operands,
+                                          MCStreamer &Out, uint64_t &ErrorInfo,
+                                          bool MatchingInlineAsm) {
+  MCInst Inst;
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+  case Match_Success:
+    Inst.setLoc(IDLoc);
+    Out.emitInstruction(Inst, getSTI());
+    return false;
+
+  case Match_MissingFeature:
+    return Error(IDLoc,
+                 "instruction requires a CPU feature not currently enabled");
+
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = Operands[ErrorInfo]->getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction mnemonic");
+  }
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool VEAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                SMLoc &EndLoc) {
+  if (tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success)
+    return Error(StartLoc, "invalid register name");
+  return false;
+}
+
+/// Looks up the current token's spelling with \p matchFn and returns the
+/// result, retrying with the lower-cased spelling if the first lookup misses.
+int VEAsmParser::parseRegisterName(unsigned (*matchFn)(StringRef)) {
+  StringRef Name = Parser.getTok().getString();
+
+  int RegNum = matchFn(Name);
+
+  // GCC supports case-insensitive register names. All VE register names are
+  // lower case, so retry the lookup with the lower-cased spelling.
+  if (RegNum == VE::NoRegister) {
+    RegNum = matchFn(Name.lower());
+  }
+
+  return RegNum;
+}
+
+/// Maps from the set of all register names to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterName(StringRef Name);
+
+/// Maps from the set of all alternative registernames to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterAltName(StringRef Name);
+
+OperandMatchResultTy
+VEAsmParser::tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  const AsmToken Tok = Parser.getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  RegNo = 0;
+  if (getLexer().getKind() != AsmToken::Percent)
+    return MatchOperand_NoMatch;
+  Parser.Lex();
+
+  RegNo = parseRegisterName(&MatchRegisterName);
+  if (RegNo == VE::NoRegister)
+    RegNo = parseRegisterName(&MatchRegisterAltName);
+
+  if (RegNo != VE::NoRegister) {
+    Parser.Lex();
+    return MatchOperand_Success;
+  }
+
+  getLexer().UnLex(Tok);
+  return MatchOperand_NoMatch;
+}
+
+bool VEAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                   SMLoc NameLoc, OperandVector &Operands) {
+
+  // First operand in MCInst is instruction mnemonic.
+  Operands.push_back(VEOperand::CreateToken(Name, NameLoc));
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Name) != MatchOperand_Success) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token");
+    }
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Name) != MatchOperand_Success) {
+        SMLoc Loc = getLexer().getLoc();
+        return Error(Loc, "unexpected token");
+      }
+    }
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    return Error(Loc, "unexpected token");
+  }
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool VEAsmParser::ParseDirective(AsmToken DirectiveID) {
+  // No VE-specific directives are parsed here; let the MC layer handle them.
+  return true;
+}
+
+OperandMatchResultTy VEAsmParser::parseMEMOperand(OperandVector &Operands) {
+  LLVM_DEBUG(dbgs() << "parseMEMOperand\n");
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  SMLoc E = Tok.getEndLoc();
+  // Parse ASX format
+  //   disp
+  //   disp(, base)
+  //   disp(index)
+  //   disp(index, base)
+  //   (, base)
+  //   (index)
+  //   (index, base)
+
+  std::unique_ptr<VEOperand> Offset;
+  switch (getLexer().getKind()) {
+  default:
+    return MatchOperand_NoMatch;
+
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot: {
+    const MCExpr *EVal;
+    if (!getParser().parseExpression(EVal, E))
+      Offset = VEOperand::CreateImm(EVal, S, E);
+    else
+      return MatchOperand_NoMatch;
+    break;
+  }
+  case AsmToken::LParen:
+    // empty disp (= 0)
+    Offset =
+        VEOperand::CreateImm(MCConstantExpr::create(0, getContext()), S, E);
+    break;
+  }
+
+  switch (getLexer().getKind()) {
+  default:
+    return MatchOperand_ParseFail;
+
+  case AsmToken::EndOfStatement:
+    Operands.push_back(VEOperand::MorphToMEMzii(
+        MCConstantExpr::create(0, getContext()), std::move(Offset)));
+    return MatchOperand_Success;
+
+  case AsmToken::LParen:
+    Parser.Lex(); // Eat the (
+    break;
+  }
+
+  const MCExpr *IndexValue = nullptr;
+  unsigned IndexReg = 0;
+
+  switch (getLexer().getKind()) {
+  default:
+    if (ParseRegister(IndexReg, S, E))
+      return MatchOperand_ParseFail;
+    break;
+
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot:
+    if (getParser().parseExpression(IndexValue, E))
+      return MatchOperand_ParseFail;
+    break;
+
+  case AsmToken::Comma:
+    // empty index
+    IndexValue = MCConstantExpr::create(0, getContext());
+    break;
+  }
+
+  switch (getLexer().getKind()) {
+  default:
+    return MatchOperand_ParseFail;
+
+  case AsmToken::RParen:
+    Parser.Lex(); // Eat the )
+    Operands.push_back(
+        IndexValue ? VEOperand::MorphToMEMzii(IndexValue, std::move(Offset))
+                   : VEOperand::MorphToMEMzri(IndexReg, std::move(Offset)));
+    return MatchOperand_Success;
+
+  case AsmToken::Comma:
+    Parser.Lex(); // Eat the ,
+    break;
+  }
+
+  unsigned BaseReg = 0;
+  if (ParseRegister(BaseReg, S, E))
+    return MatchOperand_ParseFail;
+
+  if (!Parser.getTok().is(AsmToken::RParen))
+    return MatchOperand_ParseFail;
+
+  Parser.Lex(); // Eat the )
+  Operands.push_back(
+      IndexValue
+          ? VEOperand::MorphToMEMrii(BaseReg, IndexValue, std::move(Offset))
+          : VEOperand::MorphToMEMrri(BaseReg, IndexReg, std::move(Offset)));
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy VEAsmParser::parseOperand(OperandVector &Operands,
+                                               StringRef Mnemonic) {
+  LLVM_DEBUG(dbgs() << "parseOperand\n");
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If a custom operand parser matched (successfully or with a hard failure),
+  // return its result as-is. Only when there was no custom match do we fall
+  // through to the generic operand parsing below.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+    return ResTy;
+
+  switch (getLexer().getKind()) {
+  case AsmToken::LParen:
+    // FIXME: Parsing "(" + %vreg + ", " + %vreg + ")"
+    // FALLTHROUGH
+  default: {
+    std::unique_ptr<VEOperand> Op;
+    ResTy = parseVEAsmOperand(Op);
+    if (ResTy != MatchOperand_Success || !Op)
+      return MatchOperand_ParseFail;
+
+    // Push the parsed operand into the list of operands.
+    Operands.push_back(std::move(Op));
+
+    if (!Parser.getTok().is(AsmToken::LParen))
+      break;
+
+    // FIXME: Parsing %vec-reg + "(" + %scalar-reg/number + ")"
+    break;
+  }
+  }
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+VEAsmParser::parseVEAsmOperand(std::unique_ptr<VEOperand> &Op) {
+  LLVM_DEBUG(dbgs() << "parseVEAsmOperand\n");
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *EVal;
+
+  Op = nullptr;
+  switch (getLexer().getKind()) {
+  default:
+    break;
+
+  case AsmToken::Percent:
+    unsigned RegNo;
+    if (tryParseRegister(RegNo, S, E) == MatchOperand_Success)
+      Op = VEOperand::CreateReg(RegNo, S, E);
+    break;
+
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot:
+    if (!getParser().parseExpression(EVal, E))
+      Op = VEOperand::CreateImm(EVal, S, E);
+    break;
+
+  case AsmToken::Identifier: {
+    StringRef Identifier;
+    if (!getParser().parseIdentifier(Identifier)) {
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+
+      const MCExpr *Res =
+          MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+      Op = VEOperand::CreateImm(Res, S, E);
+    }
+    break;
+  }
+  }
+  return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
+}
+
+// Force static initialization.
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() {
+  RegisterMCAsmParser<VEAsmParser> A(getTheVETarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "VEGenAsmMatcher.inc"
diff --git a/llvm/lib/Target/VE/CMakeLists.txt b/llvm/lib/Target/VE/CMakeLists.txt
index 89c946d87e198..4b9169da63c06 100644
--- a/llvm/lib/Target/VE/CMakeLists.txt
+++ b/llvm/lib/Target/VE/CMakeLists.txt
@@ -2,7 +2,9 @@ set(LLVM_TARGET_DEFINITIONS VE.td)
 
 tablegen(LLVM VEGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM VEGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM VEGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM VEGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM VEGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM VEGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM VEGenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM VEGenCallingConv.inc -gen-callingconv)
@@ -21,5 +23,6 @@ add_llvm_target(VECodeGen
   VETargetMachine.cpp
   )
 
+add_subdirectory(AsmParser)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/llvm/lib/Target/VE/LLVMBuild.txt b/llvm/lib/Target/VE/LLVMBuild.txt
index eb74a9a387b02..f3169930771cf 100644
--- a/llvm/lib/Target/VE/LLVMBuild.txt
+++ b/llvm/lib/Target/VE/LLVMBuild.txt
@@ -15,13 +15,13 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = MCTargetDesc TargetInfo
+subdirectories = AsmParser MCTargetDesc TargetInfo
 
 [component_0]
 type = TargetGroup
 name = VE
 parent = Target
-has_asmparser = 0
+has_asmparser = 1
 has_asmprinter = 1
 
 [component_1]
diff --git a/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt
index 9bca0ceeb69bf..4d8787ff86099 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt
@@ -1,6 +1,9 @@
 add_llvm_component_library(LLVMVEDesc
+  VEAsmBackend.cpp
+  VEELFObjectWriter.cpp
   VEInstPrinter.cpp
   VEMCAsmInfo.cpp
+  VEMCCodeEmitter.cpp
   VEMCExpr.cpp
   VEMCTargetDesc.cpp
   VETargetStreamer.cpp
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
new file mode 100644
index 0000000000000..dcc9fe8f7e4ad
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -0,0 +1,120 @@
+//===-- VEAsmBackend.cpp - VE Assembler Backend ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/VEFixupKinds.h"
+#include "MCTargetDesc/VEMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  }
+}
+
+namespace {
+class VEAsmBackend : public MCAsmBackend {
+protected:
+  const Target &TheTarget;
+
+public:
+  VEAsmBackend(const Target &T) : MCAsmBackend(support::little), TheTarget(T) {}
+
+  unsigned getNumFixupKinds() const override { return VE::NumTargetFixupKinds; }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    // FIXME.
+    llvm_unreachable("getFixupKindInfo() unimplemented");
+  }
+
+  bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+                             const MCValue &Target) override {
+    // FIXME.
+    return false;
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst,
+                         const MCSubtargetInfo &STI) const override {
+    // FIXME.
+    return false;
+  }
+
+  /// fixupNeedsRelaxation - Target specific predicate for whether a given
+  /// fixup requires the associated instruction to be relaxed.
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    // FIXME.
+    llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+    return false;
+  }
+  void relaxInstruction(MCInst &Inst,
+                        const MCSubtargetInfo &STI) const override {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
+    if ((Count % 8) != 0)
+      return false;
+
+    for (uint64_t i = 0; i < Count; i += 8)
+      support::endian::write<uint64_t>(OS, 0x7900000000000000ULL,
+                                       support::little);
+
+    return true;
+  }
+};
+
+class ELFVEAsmBackend : public VEAsmBackend {
+  Triple::OSType OSType;
+
+public:
+  ELFVEAsmBackend(const Target &T, Triple::OSType OSType)
+      : VEAsmBackend(T), OSType(OSType) {}
+
+  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                  const MCValue &Target, MutableArrayRef<char> Data,
+                  uint64_t Value, bool IsResolved,
+                  const MCSubtargetInfo *STI) const override {
+    Value = adjustFixupValue(Fixup.getKind(), Value);
+    if (!Value)
+      return; // Doesn't change encoding.
+
+    // FIXME.
+    llvm_unreachable("applyFixup() unimplemented");
+  }
+
+  std::unique_ptr<MCObjectTargetWriter>
+  createObjectTargetWriter() const override {
+    uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+    return createVEELFObjectWriter(OSABI);
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createVEAsmBackend(const Target &T,
+                                       const MCSubtargetInfo &STI,
+                                       const MCRegisterInfo &MRI,
+                                       const MCTargetOptions &Options) {
+  return new ELFVEAsmBackend(T, STI.getTargetTriple().getOS());
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
new file mode 100644
index 0000000000000..77ac97979cef6
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -0,0 +1,53 @@
+//===-- VEELFObjectWriter.cpp - VE ELF Writer -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "VEFixupKinds.h"
+#include "VEMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+/// ELF object target writer for VE (64-bit, EM_VE, relocations with addends).
+class VEELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  VEELFObjectWriter(uint8_t OSABI)
+      : MCELFObjectTargetWriter(/* Is64Bit */ true, OSABI, ELF::EM_VE,
+                                /* HasRelocationAddend */ true) {}
+  ~VEELFObjectWriter() override = default;
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+
+  bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                               unsigned Type) const override;
+};
+} // namespace
+
+unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+                                         const MCFixup &Fixup,
+                                         bool IsPCRel) const {
+  // FIXME: Not implemented yet; every fixup currently maps to R_VE_NONE.
+  return ELF::R_VE_NONE;
+}
+
+bool VEELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                unsigned Type) const {
+  // FIXME: Not implemented yet; currently always answers false.
+  return false;
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createVEELFObjectWriter(uint8_t OSABI) {
+  return std::make_unique<VEELFObjectWriter>(OSABI);
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
new file mode 100644
index 0000000000000..b8328c6cb58dd
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
@@ -0,0 +1,122 @@
+//===-- VEMCCodeEmitter.cpp - Convert VE code to machine code -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VEMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/VEFixupKinds.h"
+#include "VE.h"
+#include "VEMCExpr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+
+class VEMCCodeEmitter : public MCCodeEmitter {
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+
+public:
+  VEMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+      : MCII(mcii), Ctx(ctx) {}
+  VEMCCodeEmitter(const VEMCCodeEmitter &) = delete;
+  VEMCCodeEmitter &operator=(const VEMCCodeEmitter &) = delete;
+  ~VEMCCodeEmitter() override = default;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+private:
+  FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+  void
+  verifyInstructionPredicates(const MCInst &MI,
+                              const FeatureBitset &AvailableFeatures) const;
+};
+
+} // end anonymous namespace
+
+void VEMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+  support::endian::write<uint64_t>(OS, Bits, support::little);
+
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                            const MCOperand &MO,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr());
+  const MCExpr *Expr = MO.getExpr();
+  if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) {
+    MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind();
+    Fixups.push_back(MCFixup::create(0, Expr, Kind));
+    return 0;
+  }
+
+  int64_t Res;
+  if (Expr->evaluateAsAbsolute(Res))
+    return Res;
+
+  llvm_unreachable("Unhandled expression!");
+  return 0;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "VEGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createVEMCCodeEmitter(const MCInstrInfo &MCII,
+                                           const MCRegisterInfo &MRI,
+                                           MCContext &Ctx) {
+  return new VEMCCodeEmitter(MCII, Ctx);
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index bf964782515cc..a39cffc8f4a65 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -94,6 +94,12 @@ extern "C" void LLVMInitializeVETargetMC() {
     // Register the MC subtarget info.
     TargetRegistry::RegisterMCSubtargetInfo(*T, createVEMCSubtargetInfo);
 
+    // Register the MC Code Emitter.
+    TargetRegistry::RegisterMCCodeEmitter(*T, createVEMCCodeEmitter);
+
+    // Register the asm backend.
+    TargetRegistry::RegisterMCAsmBackend(*T, createVEAsmBackend);
+
     // Register the object target streamer.
     TargetRegistry::RegisterObjectTargetStreamer(*T,
                                                  createObjectTargetStreamer);
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
index 79acc509b3c88..7fb8a556aa749 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
@@ -22,7 +22,7 @@ class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
 class MCRegisterInfo;
 class MCSubtargetInfo;
 class MCTargetOptions;
@@ -31,6 +31,13 @@ class Triple;
 class StringRef;
 class raw_pwrite_stream;
 class raw_ostream;
+
+MCCodeEmitter *createVEMCCodeEmitter(const MCInstrInfo &MCII,
+                                     const MCRegisterInfo &MRI, MCContext &Ctx);
+MCAsmBackend *createVEAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+                                 const MCRegisterInfo &MRI,
+                                 const MCTargetOptions &Options);
+std::unique_ptr<MCObjectTargetWriter> createVEELFObjectWriter(uint8_t OSABI);
 } // namespace llvm
 
 // Defines symbolic names for VE registers.  This defines a mapping from
diff --git a/llvm/lib/Target/VE/VE.td b/llvm/lib/Target/VE/VE.td
index 7404321b1a063..617a6ea458b68 100644
--- a/llvm/lib/Target/VE/VE.td
+++ b/llvm/lib/Target/VE/VE.td
@@ -29,6 +29,13 @@ include "VEInstrInfo.td"
 
 def VEInstrInfo : InstrInfo;
 
+def VEAsmParser : AsmParser {
+  // Use both the VE register name matcher to accept "S0~S63" register names
+  // and the default register matcher to accept other registers.
+  let AllowDuplicateRegisterNames = 1;
+  let ShouldEmitMatchRegisterAltName = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // VE processors supported.
 //===----------------------------------------------------------------------===//
@@ -51,6 +58,7 @@ def VEAsmWriter : AsmWriter {
 def VE : Target {
   // Pull in Instruction Info:
   let InstructionSet = VEInstrInfo;
+  let AssemblyParsers = [VEAsmParser];
   let AssemblyWriters = [VEAsmWriter];
   let AllowRegisterRenaming = 1;
 }
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index c7815efb8c71e..0a37e52e0240e 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -263,20 +263,6 @@ def fcond2ccSwap : SDNodeXForm<cond, [{
   return CurDAG->getTargetConstant(cc, SDLoc(N), MVT::i32);
 }]>;
 
-// Addressing modes.
-def ADDRri : ComplexPattern<iPTR, 2, "selectADDRri", [frameindex], []>;
-
-def MEMri : Operand<iPTR> {
-  let PrintMethod = "printMemASOperandASX";
-  let MIOperandInfo = (ops ptr_rc, i64imm);
-}
-
-// AS format of memory address
-def MEMASri : Operand<iPTR> {
-  let PrintMethod = "printMemASOperand";
-  let MIOperandInfo = (ops ptr_rc, i64imm);
-}
-
 // Addressing modes.
 // SX-Aurora has following fields.
 //    sz: register or 0
@@ -307,8 +293,10 @@ def ADDRrri : ComplexPattern<iPTR, 3, "selectADDRrri", [frameindex], []>;
 def ADDRrii : ComplexPattern<iPTR, 3, "selectADDRrii", [frameindex], []>;
 def ADDRzri : ComplexPattern<iPTR, 3, "selectADDRzri", [], []>;
 def ADDRzii : ComplexPattern<iPTR, 3, "selectADDRzii", [], []>;
+// AS format:
+def ADDRri : ComplexPattern<iPTR, 2, "selectADDRri", [frameindex], []>;
 //
-// ASX assembly instrcution format:
+// ASX assembly instruction format:
 def VEMEMrriAsmOperand : AsmOperandClass {
   let Name = "MEMrri";
   let ParserMethod = "parseMEMOperand";
@@ -345,14 +333,12 @@ def MEMzii : Operand<iPTR> {
   let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i32imm);
   let ParserMatchClass = VEMEMziiAsmOperand;
 }
-// AS assembly instrcution format:
+// AS assembly instruction format:
 def VEMEMriAsmOperand : AsmOperandClass {
   let Name = "MEMri";
-  let ParserMethod = "parseMEMAsOperand";
 }
 def VEMEMziAsmOperand : AsmOperandClass {
   let Name = "MEMzi";
-  let ParserMethod = "parseMEMAsOperand";
 }
 // AS generic assembly instruction format:
 def MEMriASX : Operand<iPTR> {
@@ -365,6 +351,11 @@ def MEMziASX : Operand<iPTR> {
   let MIOperandInfo = (ops i32imm /* = 0 */, i32imm);
   let ParserMatchClass = VEMEMziAsmOperand;
 }
+def MEMASri : Operand<iPTR> {
+  let PrintMethod = "printMemASOperand";
+  let MIOperandInfo = (ops ptr_rc, i32imm);
+  let ParserMatchClass = VEMEMriAsmOperand;
+}
 
 // Branch targets have OtherVT type.
 def brtarget32 : Operand<OtherVT> {
diff --git a/llvm/test/MC/VE/LEA.s b/llvm/test/MC/VE/LEA.s
new file mode 100644
index 0000000000000..fca99d59f57c0
--- /dev/null
+++ b/llvm/test/MC/VE/LEA.s
@@ -0,0 +1,29 @@
+# RUN: llvm-mc -triple ve-unknown-unknown --show-encoding %s | FileCheck %s
+
+# CHECK: lea %s11, 23
+# CHECK: encoding: [0x17,0x00,0x00,0x00,0x00,0x00,0x0b,0x06]
+lea %s11, 23
+
+# CHECK: lea %s63, 324(, %s11)
+# CHECK: encoding: [0x44,0x01,0x00,0x00,0x8b,0x00,0x3f,0x06]
+lea %s63, 324(,%s11)
+
+# CHECK: lea %s11, 324(%s10)
+# CHECK: encoding: [0x44,0x01,0x00,0x00,0x00,0x8a,0x0b,0x06]
+lea %s11, 324(%s10  )
+
+# CHECK: lea %s11, 324(%s13, %s11)
+# CHECK: encoding: [0x44,0x01,0x00,0x00,0x8b,0x8d,0x0b,0x06]
+lea %s11, 324 (%s13,%s11)
+
+# CHECK: lea %s11, (%s10)
+# CHECK: encoding: [0x00,0x00,0x00,0x00,0x00,0x8a,0x0b,0x06]
+lea %s11, (%s10)
+
+# CHECK: lea %s11, (, %s12)
+# CHECK: encoding: [0x00,0x00,0x00,0x00,0x8c,0x00,0x0b,0x06]
+lea %s11, (,%s12)
+
+# CHECK: lea.sl %s11, -1(%s13, %s11)
+# CHECK: encoding: [0xff,0xff,0xff,0xff,0x8b,0x8d,0x8b,0x06]
+lea.sl %s11, -1(%s13, %s11)
diff --git a/llvm/test/MC/VE/lit.local.cfg b/llvm/test/MC/VE/lit.local.cfg
new file mode 100644
index 0000000000000..b6366779272df
--- /dev/null
+++ b/llvm/test/MC/VE/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'VE' in config.root.targets:
+    config.unsupported = True

From 036d4b0dbfd1bb5f960c620ff899669d85e62e08 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Fri, 29 May 2020 09:34:29 +0100
Subject: [PATCH 479/770] [AMDGPU] Use numbers::pi instead of M_PI. NFC.

---
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 +--------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp      | 7 +------
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 2a546433a2454..d2ce11340a0f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -11,13 +11,6 @@
 /// \todo This should be generated by TableGen.
 //===----------------------------------------------------------------------===//
 
-#if defined(_MSC_VER) || defined(__MINGW32__)
-// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
-// from the Visual C++ cmath / math.h headers:
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
-#define _USE_MATH_DEFINES
-#endif
-
 #include "AMDGPULegalizerInfo.h"
 
 #include "AMDGPU.h"
@@ -1877,7 +1870,7 @@ bool AMDGPULegalizerInfo::legalizeSinCos(
   unsigned Flags = MI.getFlags();
 
   Register TrigVal;
-  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
+  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / numbers::pi);
   if (ST.hasTrigReducedRange()) {
     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3b8930c433a3a..ace28d5c9dcb2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11,11 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if defined(_MSC_VER) || defined(__MINGW32__)
-// Provide M_PI.
-#define _USE_MATH_DEFINES
-#endif
-
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
@@ -8230,7 +8225,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 
   // TODO: Should this propagate fast-math-flags?
 
-  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
+  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / numbers::pi, DL, VT);
 
   if (Subtarget->hasTrigReducedRange()) {
     SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);

From b28d038ff34dd54ce8eb9fe83506cc3742e6b85a Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Fri, 29 May 2020 09:42:00 +0100
Subject: [PATCH 480/770] [AMDGPU] Better use of llvm::numbers

Tweak a few constant expressions involving numbers::pi etc to avoid
rounding errors. NFCI though it's possible some of these will now be
more accurate in the last bit.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp  | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 ++--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d1891e25e5f29..2db9af885a6c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1240,7 +1240,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::FROUND: return LowerFROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::FLOG:
-    return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
+    return LowerFLOG(Op, DAG, numbers::ln2f);
   case ISD::FLOG10:
     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
   case ISD::FEXP:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index d2ce11340a0f7..308659f652009 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1384,7 +1384,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
   case TargetOpcode::G_ATOMIC_CMPXCHG:
     return legalizeAtomicCmpXChg(MI, MRI, B);
   case TargetOpcode::G_FLOG:
-    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
+    return legalizeFlog(MI, B, numbers::ln2f);
   case TargetOpcode::G_FLOG10:
     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
   case TargetOpcode::G_FEXP:
@@ -1870,7 +1870,7 @@ bool AMDGPULegalizerInfo::legalizeSinCos(
   unsigned Flags = MI.getFlags();
 
   Register TrigVal;
-  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / numbers::pi);
+  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
   if (ST.hasTrigReducedRange()) {
     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ace28d5c9dcb2..43588c7de45a4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8225,7 +8225,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 
   // TODO: Should this propagate fast-math-flags?
 
-  SDValue OneOver2Pi = DAG.getConstantFP(0.5 / numbers::pi, DL, VT);
+  SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
 
   if (Subtarget->hasTrigReducedRange()) {
     SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);

From 7a3a2535854c84b1c8f6b0a2f2677e89b0e1a250 Mon Sep 17 00:00:00 2001
From: Ehsan Toosi <ehsan.nadjaran_toosi@dfki.de>
Date: Wed, 20 May 2020 18:23:43 +0200
Subject: [PATCH 481/770] [MLIR][BufferPlacement] Support functions that return
 Memref typed results

Buffer placement can now operate on functions that return buffers. These
buffers escape from the deallocation phase of buffer placement.

Differential Revision: https://reviews.llvm.org/D80696
---
 .../include/mlir/Transforms/BufferPlacement.h | 89 +++++++++---------
 .../Linalg/Transforms/TensorsToBuffers.cpp    |  2 +-
 mlir/lib/Transforms/BufferPlacement.cpp       | 33 ++++---
 .../buffer-placement-preparation.mlir         | 28 ++++--
 mlir/test/Transforms/buffer-placement.mlir    | 29 ++++++
 .../lib/Transforms/TestBufferPlacement.cpp    | 93 +++++++++++--------
 6 files changed, 170 insertions(+), 104 deletions(-)

diff --git a/mlir/include/mlir/Transforms/BufferPlacement.h b/mlir/include/mlir/Transforms/BufferPlacement.h
index 030b87599d06a..10949160fcbd6 100644
--- a/mlir/include/mlir/Transforms/BufferPlacement.h
+++ b/mlir/include/mlir/Transforms/BufferPlacement.h
@@ -76,11 +76,23 @@ class BufferAssignmentOpConversionPattern
   TypeConverter *converter;
 };
 
-/// Converts the signature of the function using the type converter.
-/// It adds an extra argument for each illegally-typed function
-/// result to the function arguments. `BufferAssignmentTypeConverter`
-/// is a helper `TypeConverter` for this purpose. All the non-shaped types
-/// of the input function will be converted to memref.
+/// A helper type converter class for using inside Buffer Assignment operation
+/// conversion patterns. The default constructor keeps all the types intact
+/// except for the ranked-tensor types which are converted to memref types.
+class BufferAssignmentTypeConverter : public TypeConverter {
+public:
+  BufferAssignmentTypeConverter();
+
+  /// A helper function to check if `type` has been converted from non-memref
+  /// type to memref.
+  static bool isConvertedMemref(Type type, Type before);
+};
+
+/// Converts the signature of the function using the type converter. It adds an
+/// extra argument for each function result type which is going to be a memref
+/// type after type conversion. The other function result types remain
+/// unchanged. `BufferAssignmentTypeConverter` is a helper `TypeConverter` for
+/// this purpose.
 class FunctionAndBlockSignatureConverter
     : public BufferAssignmentOpConversionPattern<FuncOp> {
 public:
@@ -93,12 +105,14 @@ class FunctionAndBlockSignatureConverter
                   ConversionPatternRewriter &rewriter) const final;
 };
 
-/// Converts the source `ReturnOp` to target `ReturnOp`, removes all
-/// the buffer operands from the operands list, and inserts `CopyOp`s
-/// for all buffer operands instead.
+/// Rewrites the `ReturnOp` to conform with the changed function signature.
+/// Operands that correspond to return values that have been rewritten from
+/// tensor results to memref arguments are dropped. In their place, a
+/// corresponding copy operation from the operand to the new function argument
+/// is inserted.
 template <typename ReturnOpSourceTy, typename ReturnOpTargetTy,
           typename CopyOpTy>
-class NoBufferOperandsReturnOpConverter
+class BufferAssignmentReturnOpConverter
     : public BufferAssignmentOpConversionPattern<ReturnOpSourceTy> {
 public:
   using BufferAssignmentOpConversionPattern<
@@ -108,50 +122,41 @@ class NoBufferOperandsReturnOpConverter
   LogicalResult
   matchAndRewrite(ReturnOpSourceTy returnOp, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const final {
+    // Split the operands into two groups depending on whether they are
+    // converted memrefs or not.
+    SmallVector<Value, 2> needCopyOperands, newOperands;
+    unsigned operandsSize = operands.size();
+    needCopyOperands.reserve(operandsSize);
+    newOperands.reserve(operandsSize);
+    for (auto operand : llvm::enumerate(operands))
+      if (BufferAssignmentTypeConverter::isConvertedMemref(
+              operand.value().getType(),
+              returnOp.getOperand(operand.index()).getType()))
+        needCopyOperands.push_back(operand.value());
+      else
+        newOperands.push_back(operand.value());
+
     Block &entryBlock = returnOp.getParentRegion()->front();
     unsigned numFuncArgs = entryBlock.getNumArguments();
-    Location loc = returnOp.getLoc();
-
-    // The target `ReturnOp` should not contain any memref operands.
-    SmallVector<Value, 2> newOperands(operands.begin(), operands.end());
-    llvm::erase_if(newOperands, [](Value operand) {
-      return operand.getType().isa<MemRefType>();
-    });
 
     // Find the index of the first destination buffer.
-    unsigned numBufferOperands = operands.size() - newOperands.size();
-    unsigned destArgNum = numFuncArgs - numBufferOperands;
-
+    assert(needCopyOperands.size() <= numFuncArgs &&
+           "The number of operands of return operation is more than the "
+           "number of function arguments.");
+    unsigned destArgNum = numFuncArgs - needCopyOperands.size();
     rewriter.setInsertionPoint(returnOp);
-    // Find the corresponding destination buffer for each memref operand.
-    for (Value operand : operands)
-      if (operand.getType().isa<MemRefType>()) {
-        assert(destArgNum < numFuncArgs &&
-               "The number of operands of return operation is more than the "
-               "number of function argument.");
-
-        // For each memref type operand of the source `ReturnOp`, a new `CopyOp`
-        // is inserted that copies the buffer content from the operand to the
-        // target.
-        rewriter.create<CopyOpTy>(loc, operand,
-                                  entryBlock.getArgument(destArgNum));
-        ++destArgNum;
-      }
+    for (Value operand : needCopyOperands) {
+      // Insert a `CopyOp` for each converted memref-type operand.
+      rewriter.create<CopyOpTy>(returnOp.getLoc(), operand,
+                                entryBlock.getArgument(destArgNum));
+      ++destArgNum;
+    }
 
     // Insert the new target Return operation.
     rewriter.replaceOpWithNewOp<ReturnOpTargetTy>(returnOp, newOperands);
     return success();
   }
 };
-
-/// A helper type converter class for using inside Buffer Assignment operation
-/// conversion patterns. The default constructor keeps all the types intact
-/// except for the ranked-tensor types which is converted to memref types.
-class BufferAssignmentTypeConverter : public TypeConverter {
-public:
-  BufferAssignmentTypeConverter();
-};
-
 } // end namespace mlir
 
 #endif // MLIR_TRANSFORMS_BUFFERPLACEMENT_H
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp
index 9b5855dff0ceb..c663eb6017e5b 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp
@@ -21,7 +21,7 @@
 
 using namespace mlir;
 using ReturnOpConverter =
-    NoBufferOperandsReturnOpConverter<mlir::ReturnOp, mlir::ReturnOp,
+    BufferAssignmentReturnOpConverter<mlir::ReturnOp, mlir::ReturnOp,
                                       linalg::CopyOp>;
 
 namespace {
diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferPlacement.cpp
index cd0641c1ac325..60f49d4e305c4 100644
--- a/mlir/lib/Transforms/BufferPlacement.cpp
+++ b/mlir/lib/Transforms/BufferPlacement.cpp
@@ -389,7 +389,13 @@ struct BufferPlacementPass
 
       // If there is an existing dealloc, move it to the right place.
       Operation *nextOp = positions.getDeallocPosition()->getNextNode();
-      assert(nextOp && "Invalid Dealloc operation position");
+      // If the Dealloc position is at the terminator operation of the block,
+      // then the value should escape from a deallocation.
+      if (!nextOp) {
+        assert(deallocs.size() == 0 &&
+               "There should be no dealloc for the returned buffer");
+        continue;
+      }
       if (deallocs.size()) {
         (*deallocs.begin())->moveBefore(nextOp);
       } else {
@@ -431,11 +437,6 @@ LogicalResult FunctionAndBlockSignatureConverter::matchAndRewrite(
     return failure();
   }
   auto funcType = funcOp.getType();
-  TypeRange resultTypes = funcType.getResults();
-  if (llvm::any_of(resultTypes,
-                   [](Type type) { return type.isa<MemRefType>(); }))
-    return funcOp.emitError("BufferAssignmentPlacer doesn't currently support "
-                            "functions which return memref typed values");
 
   // Convert function arguments using the provided TypeConverter.
   TypeConverter::SignatureConversion conversion(funcType.getNumInputs());
@@ -443,17 +444,16 @@ LogicalResult FunctionAndBlockSignatureConverter::matchAndRewrite(
     conversion.addInputs(argType.index(),
                          converter->convertType(argType.value()));
 
-  // Adding a function argument for each function result which is going to be a
-  // memref type after type conversion.
+  // If a function result type is not a memref but it would be a memref after
+  // type conversion, a new argument should be appended to the function
+  // arguments list for this result. Otherwise, it remains unchanged as a
+  // function result.
   SmallVector<Type, 2> newResultTypes;
   newResultTypes.reserve(funcOp.getNumResults());
-  for (Type resType : resultTypes) {
+  for (Type resType : funcType.getResults()) {
     Type convertedType = converter->convertType(resType);
-
-    // If the result type is memref after the type conversion, a new argument
-    // should be appended to the function arguments list for this result.
-    // Otherwise, it remains unchanged as a function result.
-    if (convertedType.isa<MemRefType>())
+    if (BufferAssignmentTypeConverter::isConvertedMemref(convertedType,
+                                                         resType))
       conversion.addInputs(convertedType);
     else
       newResultTypes.push_back(convertedType);
@@ -482,6 +482,11 @@ BufferAssignmentTypeConverter::BufferAssignmentTypeConverter() {
   });
 }
 
+/// Checks if `type` has been converted from non-memref type to memref.
+bool BufferAssignmentTypeConverter::isConvertedMemref(Type type, Type before) {
+  return type.isa<MemRefType>() && !before.isa<MemRefType>();
+}
+
 //===----------------------------------------------------------------------===//
 // BufferPlacementPass construction
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Transforms/buffer-placement-preparation.mlir b/mlir/test/Transforms/buffer-placement-preparation.mlir
index ef7a2e328da5e..8458154e4985e 100644
--- a/mlir/test/Transforms/buffer-placement-preparation.mlir
+++ b/mlir/test/Transforms/buffer-placement-preparation.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-buffer-placement-preparation -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure
+// RUN: mlir-opt -test-buffer-placement-preparation -split-input-file %s | FileCheck %s -dump-input-on-failure
 
 // CHECK-LABEL: func @func_signature_conversion
 func @func_signature_conversion(%arg0: tensor<4x8xf32>) {
@@ -8,12 +8,28 @@ func @func_signature_conversion(%arg0: tensor<4x8xf32>) {
 
 // -----
 
-// expected-error @below {{BufferAssignmentPlacer doesn't currently support functions which return memref typed values}}
-// expected-error @below {{failed to legalize operation 'func'}}
-func @memref_in_function_results(%arg0: tensor<4x8xf32>) -> (tensor<4x8xf32>, memref<5xf32>) {
-  %0 = alloc() : memref<5xf32>
-  return %arg0, %0 : tensor<4x8xf32>, memref<5xf32>
+// Only tensor-typed function results should be converted to memref and moved to
+// the function arguments list. Memref-typed function results remain function
+// results.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @memref_in_function_results
+func @memref_in_function_results(%arg0: tensor<5xf32>, %arg1: memref<10xf32>) -> (tensor<5xf32>, memref<10xf32>, memref<15xf32>) {
+  %0 = alloc() : memref<15xf32>
+  %1 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg0 {
+    ^bb0(%gen1_arg0: f32):
+      %tmp1 = exp %gen1_arg0 : f32
+      linalg.yield %tmp1 : f32
+    }: tensor<5xf32> -> tensor<5xf32>
+  return %1, %arg1, %0 : tensor<5xf32>, memref<10xf32>, memref<15xf32>
 }
+//      CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, %[[RESULT:.*]]: memref<5xf32>)
+// CHECK-SAME: (memref<10xf32>, memref<15xf32>)
+//      CHECK: %[[FIRST_ALLOC:.*]] = alloc()
+//      CHECK: %[[LINALG_ALLOC:.*]] = alloc()
+//      CHECK: linalg.copy(%[[LINALG_ALLOC]], %[[RESULT]])
+//      CHECK: return %[[ARG1]], %[[FIRST_ALLOC]]
 
 // -----
 
diff --git a/mlir/test/Transforms/buffer-placement.mlir b/mlir/test/Transforms/buffer-placement.mlir
index afbf34ce43fbb..4b401cc841afe 100644
--- a/mlir/test/Transforms/buffer-placement.mlir
+++ b/mlir/test/Transforms/buffer-placement.mlir
@@ -457,3 +457,32 @@ func @nested_regions_and_cond_branch(%arg0: i1, %arg1: memref<2xf32>, %arg2: mem
 //      CHECK:  ^[[BB3:.*]]({{.*}}):
 //      CHECK:  linalg.copy
 // CHECK-NEXT:  dealloc %[[GENERIC1_ALLOC]]
+
+// -----
+
+// Test Case: buffer deallocation escaping
+// BufferPlacement Expected Behaviour: It must not dealloc %arg1 and %x
+// since they are operands of return operation and should escape from
+// deallocating. It should dealloc %y after linalg.copy.
+
+#map0 = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: func @memref_in_function_results
+func @memref_in_function_results(%arg0: memref<5xf32>, %arg1: memref<10xf32>, %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) {
+  %x = alloc() : memref<15xf32>
+  %y = alloc() : memref<5xf32>
+  linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg0, %y {
+  ^bb0(%arg3: f32, %arg4: f32):
+    %2 = exp %arg3 : f32
+    linalg.yield %2 : f32
+  }: memref<5xf32>, memref<5xf32>
+  linalg.copy(%y, %arg2) : memref<5xf32>, memref<5xf32>
+  return %arg1, %x : memref<10xf32>, memref<15xf32>
+}
+// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, %[[RESULT:.*]]: memref<5xf32>)
+// CHECK: %[[X:.*]] = alloc()
+// CHECK: %[[Y:.*]] = alloc()
+// CHECK: linalg.copy
+// CHECK: dealloc %[[Y]]
+// CHECK: return %[[ARG1]], %[[X]]
+
diff --git a/mlir/test/lib/Transforms/TestBufferPlacement.cpp b/mlir/test/lib/Transforms/TestBufferPlacement.cpp
index 2d781e64cdfa7..6152a9b854359 100644
--- a/mlir/test/lib/Transforms/TestBufferPlacement.cpp
+++ b/mlir/test/lib/Transforms/TestBufferPlacement.cpp
@@ -23,7 +23,7 @@ using namespace mlir;
 namespace {
 /// This pass tests the computeAllocPosition helper method and two provided
 /// operation converters, FunctionAndBlockSignatureConverter and
-/// NoBufferOperandsReturnOpConverter. Furthermore, this pass converts linalg
+/// BufferAssignmentReturnOpConverter. Furthermore, this pass converts linalg
 /// operations on tensors to linalg operations on buffers to prepare them for
 /// the BufferPlacement pass that can be applied afterwards.
 struct TestBufferPlacementPreparationPass
@@ -41,16 +41,18 @@ struct TestBufferPlacementPreparationPass
     LogicalResult
     matchAndRewrite(linalg::GenericOp op, ArrayRef<Value> operands,
                     ConversionPatternRewriter &rewriter) const final {
-      auto loc = op.getLoc();
-      SmallVector<Value, 4> args(operands.begin(), operands.end());
+      Location loc = op.getLoc();
+      ResultRange results = op.getOperation()->getResults();
+      SmallVector<Value, 2> newArgs, newResults;
+      newArgs.reserve(operands.size() + results.size());
+      newArgs.append(operands.begin(), operands.end());
+      newResults.reserve(results.size());
 
       // Update all types to memref types.
-      auto results = op.getOperation()->getResults();
       for (auto result : results) {
-        auto type = result.getType().cast<ShapedType>();
-        if (!type)
-          op.emitOpError()
-              << "tensor to buffer conversion expects ranked results";
+        ShapedType type = result.getType().cast<ShapedType>();
+        assert(type && "Generic operations with non-shaped typed results are "
+                       "not currently supported.");
         if (!type.hasStaticShape())
           return rewriter.notifyMatchFailure(
               op, "dynamic shapes not currently supported");
@@ -62,27 +64,39 @@ struct TestBufferPlacementPreparationPass
         rewriter.restoreInsertionPoint(
             bufferAssignment->computeAllocPosition(result));
         auto alloc = rewriter.create<AllocOp>(loc, memrefType);
-        result.replaceAllUsesWith(alloc);
-        args.push_back(alloc);
+        newArgs.push_back(alloc);
+        newResults.push_back(alloc);
       }
 
       // Generate a new linalg operation that works on buffers.
       auto linalgOp = rewriter.create<linalg::GenericOp>(
-          loc, llvm::None, args, rewriter.getI64IntegerAttr(operands.size()),
+          loc, llvm::None, newArgs, rewriter.getI64IntegerAttr(operands.size()),
           rewriter.getI64IntegerAttr(results.size()), op.indexing_maps(),
           op.iterator_types(), op.docAttr(), op.library_callAttr());
 
-      // Move regions from the old operation to the new one.
-      auto &region = linalgOp.region();
-      rewriter.inlineRegionBefore(op.region(), region, region.end());
-
-      // TODO: verify the internal memref-based linalg functionality.
-      auto &entryBlock = region.front();
-      for (auto result : results) {
-        auto type = result.getType().cast<ShapedType>();
-        entryBlock.addArgument(type.getElementType());
-      }
-      rewriter.eraseOp(op);
+      // Create a new block in the region of the new Generic Op.
+      Block &oldBlock = op.getRegion().front();
+      Region &newRegion = linalgOp.region();
+      Block *newBlock = rewriter.createBlock(&newRegion, newRegion.begin(),
+                                             oldBlock.getArgumentTypes());
+
+      // Map the old block arguments to the new ones.
+      BlockAndValueMapping mapping;
+      mapping.map(oldBlock.getArguments(), newBlock->getArguments());
+
+      // Add the result arguments to the new block.
+      for (auto result : newResults)
+        newBlock->addArgument(
+            result.getType().cast<ShapedType>().getElementType());
+
+      // Clone the body of the old block to the new block.
+      rewriter.setInsertionPointToEnd(newBlock);
+      for (auto &op : oldBlock.getOperations())
+        rewriter.clone(op, mapping);
+
+      // Replace the results of the old Generic Op with the results of the new
+      // one.
+      rewriter.replaceOp(op, newResults);
       return success();
     }
   };
@@ -94,34 +108,33 @@ struct TestBufferPlacementPreparationPass
     patterns->insert<
                    FunctionAndBlockSignatureConverter,
                    GenericOpConverter,
-                   NoBufferOperandsReturnOpConverter<
+                   BufferAssignmentReturnOpConverter<
                       ReturnOp, ReturnOp, linalg::CopyOp>
     >(context, placer, converter);
     // clang-format on
   }
 
   void runOnOperation() override {
-    auto &context = getContext();
+    MLIRContext &context = getContext();
     ConversionTarget target(context);
     BufferAssignmentTypeConverter converter;
+
+    // Mark all Standard operations legal.
     target.addLegalDialect<StandardOpsDialect>();
 
-    // Make all linalg operations illegal as long as they work on tensors.
+    // Mark all Linalg operations illegal as long as they work on tensors.
+    auto isIllegalType = [&](Type type) { return !converter.isLegal(type); };
+    auto isLegalOperation = [&](Operation *op) {
+      return llvm::none_of(op->getOperandTypes(), isIllegalType) &&
+             llvm::none_of(op->getResultTypes(), isIllegalType);
+    };
     target.addDynamicallyLegalDialect<linalg::LinalgDialect>(
         Optional<ConversionTarget::DynamicLegalityCallbackFn>(
-            [&](Operation *op) {
-              auto isIllegalType = [&](Type type) {
-                return !converter.isLegal(type);
-              };
-              return llvm::none_of(op->getOperandTypes(), isIllegalType) &&
-                     llvm::none_of(op->getResultTypes(), isIllegalType);
-            }));
-
-    // Mark std.ReturnOp illegal as long as an operand is tensor or buffer.
+            isLegalOperation));
+
+    // Mark Standard Return operations illegal as long as one operand is tensor.
     target.addDynamicallyLegalOp<mlir::ReturnOp>([&](mlir::ReturnOp returnOp) {
-      return llvm::none_of(returnOp.getOperandTypes(), [&](Type type) {
-        return type.isa<MemRefType>() || !converter.isLegal(type);
-      });
+      return llvm::none_of(returnOp.getOperandTypes(), isIllegalType);
     });
 
     // Mark the function whose arguments are in tensor-type illegal.
@@ -130,16 +143,14 @@ struct TestBufferPlacementPreparationPass
     });
 
     // Walk over all the functions to apply buffer assignment.
-    getOperation().walk([&](FuncOp function) {
+    getOperation().walk([&](FuncOp function) -> WalkResult {
       OwningRewritePatternList patterns;
       BufferAssignmentPlacer placer(function);
       populateTensorLinalgToBufferLinalgConversionPattern(
           &context, &placer, &converter, &patterns);
 
       // Applying full conversion
-      return failed(applyFullConversion(function, target, patterns, &converter))
-                 ? WalkResult::interrupt()
-                 : WalkResult::advance();
+      return applyFullConversion(function, target, patterns, &converter);
     });
   };
 };

From 058f5f6fd813d1ee1480497394d6fd44e65ec62b Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Fri, 29 May 2020 01:02:36 -0700
Subject: [PATCH 482/770] Avoid O_CLOEXEC to allow building on older Linux
 (RHEL5)

Summary:
See https://github.com/google/sanitizers/issues/1253.

Small patch to enable compilation on (ancient) Red Hat Enterprise Linux 5.

Reviewers: kcc, vitalybuka

Reviewed By: vitalybuka

Tags: #sanitizers

Differential Revision: https://reviews.llvm.org/D80648
---
 compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
index d890a3a317737..e21661b42f8d2 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.cpp
@@ -347,9 +347,17 @@ int GetNamedMappingFd(const char *name, uptr size, int *flags) {
   CHECK(internal_strlen(name) < sizeof(shmname) - 10);
   internal_snprintf(shmname, sizeof(shmname), "/dev/shm/%zu [%s]",
                     internal_getpid(), name);
+  int o_cloexec = 0;
+#if defined(O_CLOEXEC)
+  o_cloexec = O_CLOEXEC;
+#endif
   int fd = ReserveStandardFds(
-      internal_open(shmname, O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRWXU));
+      internal_open(shmname, O_RDWR | O_CREAT | O_TRUNC | o_cloexec, S_IRWXU));
   CHECK_GE(fd, 0);
+  if (!o_cloexec) {
+    int res = fcntl(fd, F_SETFD, FD_CLOEXEC);
+    CHECK_EQ(0, res);
+  }
   int res = internal_ftruncate(fd, size);
   CHECK_EQ(0, res);
   res = internal_unlink(shmname);

From 5f0267984792429d7b9ac574c17566b98527576e Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Fri, 29 May 2020 11:19:14 +0200
Subject: [PATCH 483/770] [lldb][NFC] Remove a std::string->C string->StringRef
 conversion in ClangUserExpression

---
 .../Plugins/ExpressionParser/Clang/ClangUserExpression.cpp      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
index d649f226b6b81..f01357f101152 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
@@ -420,7 +420,7 @@ void ClangUserExpression::CreateSourceCode(
     m_transformed_text = m_expr_text;
   } else {
     m_source_code.reset(ClangExpressionSourceCode::CreateWrapped(
-        m_filename, prefix.c_str(), m_expr_text.c_str()));
+        m_filename, prefix, m_expr_text));
 
     if (!m_source_code->GetText(m_transformed_text, m_expr_lang,
                                 m_in_static_method, exe_ctx, !m_ctx_obj,

From 2c22c1473569352efa9326aa165cc451d39da015 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Mon, 25 May 2020 14:37:04 +0200
Subject: [PATCH 484/770] [lldb] Make "inline" tests more configurable

Summary:
This patch adds two new arguments to the MakeInlineTest function. The
main motivation is a follow-up patch I'm preparing, but they seem
generally useful.

The first argument allows the user to specify the "build dictionary".
With this argument one can avoid the need to provide a custom Makefile
if all he needs is to override a couple of make variables. This hooks in
neatly into the existing dictionary support for non-inline tests.

The second argument specifies the name of the test. This could be used
to provide better names to the generated test classes, but it's mainly
useful in conjunction with the first argument: now that we can specify a
custom build dictionary, it may sometimes make sense to run the same
test twice with different build configurations. To achieve that, we need
to give the two tests different names, and this argument achieves that.

The usage of the arguments is demonstrated via TestBasicEntryValues.py.

Reviewers: vsk, JDevlieghere

Subscribers: lldb-commits

Tags: #lldb

Differential Revision: https://reviews.llvm.org/D80518
---
 .../Python/lldbsuite/test/lldbinline.py        | 18 ++++++++++--------
 .../basic_entry_values/Makefile                |  3 ---
 .../basic_entry_values/TestBasicEntryValues.py | 10 ++++++----
 3 files changed, 16 insertions(+), 15 deletions(-)
 delete mode 100644 lldb/test/API/functionalities/param_entry_vals/basic_entry_values/Makefile

diff --git a/lldb/packages/Python/lldbsuite/test/lldbinline.py b/lldb/packages/Python/lldbsuite/test/lldbinline.py
index 5ef7aaac42f79..71143ce3f16aa 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbinline.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbinline.py
@@ -120,7 +120,7 @@ def BuildMakefile(self):
 
     def _test(self):
         self.BuildMakefile()
-        self.build()
+        self.build(dictionary=self._build_dict)
         self.do_test()
 
     def execute_user_command(self, __command):
@@ -190,24 +190,26 @@ def ApplyDecoratorsToFunction(func, decorators):
     return tmp
 
 
-def MakeInlineTest(__file, __globals, decorators=None):
+def MakeInlineTest(__file, __globals, decorators=None, name=None,
+        build_dict=None):
     # Adjust the filename if it ends in .pyc.  We want filenames to
     # reflect the source python file, not the compiled variant.
     if __file is not None and __file.endswith(".pyc"):
         # Strip the trailing "c"
         __file = __file[0:-1]
 
-    # Derive the test name from the current file name
-    file_basename = os.path.basename(__file)
-
-    test_name, _ = os.path.splitext(file_basename)
+    if name is None:
+        # Derive the test name from the current file name
+        file_basename = os.path.basename(__file)
+        name, _ = os.path.splitext(file_basename)
 
     test_func = ApplyDecoratorsToFunction(InlineTest._test, decorators)
     # Build the test case
-    test_class = type(test_name, (InlineTest,), dict(test=test_func, name=test_name))
+    test_class = type(name, (InlineTest,), dict(test=test_func,
+        name=name, _build_dict=build_dict))
 
     # Add the test case to the globals, and hide InlineTest
-    __globals.update({test_name: test_class})
+    __globals.update({name: test_class})
 
     # Keep track of the original test filename so we report it
     # correctly in test results.
diff --git a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/Makefile b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/Makefile
deleted file mode 100644
index ab505a6841262..0000000000000
--- a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-CXX_SOURCES := main.cpp
-CXXFLAGS_EXTRAS := -O2 -glldb
-include Makefile.rules
diff --git a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/TestBasicEntryValues.py b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/TestBasicEntryValues.py
index 94689d14f990f..0d6c5e32948e1 100644
--- a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/TestBasicEntryValues.py
+++ b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/TestBasicEntryValues.py
@@ -3,9 +3,11 @@
 from lldbsuite.test import lldbplatformutil
 
 supported_archs = ["x86_64", "aarch64"]
-
-lldbinline.MakeInlineTest(__file__, globals(),
-        [skipIf(archs=no_match(supported_archs)),
+decorators = [skipIf(archs=no_match(supported_archs)),
          skipIf(compiler="clang", compiler_version=['<', '10.0']),
          skipUnlessHasCallSiteInfo,
-         skipIf(dwarf_version=['<', '4'])])
+         skipIf(dwarf_version=['<', '4'])]
+
+lldbinline.MakeInlineTest(__file__, globals(), decorators=decorators,
+        name="BasicEntryValues_V5",
+        build_dict=dict(CXXFLAGS_EXTRAS="-O2 -glldb"))

From 7ff2de4f0c60c5d13880440e85ef8edc78482a2f Mon Sep 17 00:00:00 2001
From: Emre Kultursay <emrekultursay@google.com>
Date: Fri, 29 May 2020 11:18:26 +0200
Subject: [PATCH 485/770] Do not list adb devices when a device id is given

Summary:
On Android, this method gets called twice: first when establishing
a host-server connection, then when attaching to a process id.

Each call takes several seconds to finish (especially slower on Windows)
and eliminating the call for the typical case improves latency significantly.

Reviewed By: labath

Differential Revision: https://reviews.llvm.org/D79586
---
 .../Plugins/Platform/Android/AdbClient.cpp    | 24 ++++-----
 .../Platform/Android/AdbClientTest.cpp        | 51 +++++++++++++++++++
 .../unittests/Platform/Android/CMakeLists.txt |  8 +++
 lldb/unittests/Platform/CMakeLists.txt        |  2 +
 4 files changed, 71 insertions(+), 14 deletions(-)
 create mode 100644 lldb/unittests/Platform/Android/AdbClientTest.cpp
 create mode 100644 lldb/unittests/Platform/Android/CMakeLists.txt

diff --git a/lldb/source/Plugins/Platform/Android/AdbClient.cpp b/lldb/source/Plugins/Platform/Android/AdbClient.cpp
index 14d97ebe7c3ca..17707118d9c91 100644
--- a/lldb/source/Plugins/Platform/Android/AdbClient.cpp
+++ b/lldb/source/Plugins/Platform/Android/AdbClient.cpp
@@ -94,11 +94,7 @@ Status ReadAllBytes(Connection &conn, void *buffer, size_t size) {
 
 Status AdbClient::CreateByDeviceID(const std::string &device_id,
                                    AdbClient &adb) {
-  DeviceIDList connect_devices;
-  auto error = adb.GetDevices(connect_devices);
-  if (error.Fail())
-    return error;
-
+  Status error;
   std::string android_serial;
   if (!device_id.empty())
     android_serial = device_id;
@@ -106,18 +102,18 @@ Status AdbClient::CreateByDeviceID(const std::string &device_id,
     android_serial = env_serial;
 
   if (android_serial.empty()) {
-    if (connect_devices.size() != 1)
+    DeviceIDList connected_devices;
+    error = adb.GetDevices(connected_devices);
+    if (error.Fail())
+      return error;
+
+    if (connected_devices.size() != 1)
       return Status("Expected a single connected device, got instead %zu - try "
                     "setting 'ANDROID_SERIAL'",
-                    connect_devices.size());
-    adb.SetDeviceID(connect_devices.front());
+                    connected_devices.size());
+    adb.SetDeviceID(connected_devices.front());
   } else {
-    auto find_it = std::find(connect_devices.begin(), connect_devices.end(),
-                             android_serial);
-    if (find_it == connect_devices.end())
-      return Status("Device \"%s\" not found", android_serial.c_str());
-
-    adb.SetDeviceID(*find_it);
+    adb.SetDeviceID(android_serial);
   }
   return error;
 }
diff --git a/lldb/unittests/Platform/Android/AdbClientTest.cpp b/lldb/unittests/Platform/Android/AdbClientTest.cpp
new file mode 100644
index 0000000000000..0808b96f69fc8
--- /dev/null
+++ b/lldb/unittests/Platform/Android/AdbClientTest.cpp
@@ -0,0 +1,51 @@
+//===-- AdbClientTest.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "Plugins/Platform/Android/AdbClient.h"
+#include <cstdlib>
+
+static void set_env(const char *var, const char *value) { // Portable setenv.
+#ifdef _WIN32
+  _putenv_s(var, value); // Windows CRT branch.
+#else
+  setenv(var, value, true); // true: overwrite an existing value.
+#endif
+}
+
+using namespace lldb;
+using namespace lldb_private;
+
+namespace lldb_private {
+namespace platform_android {
+
+class AdbClientTest : public ::testing::Test { // Isolates ANDROID_SERIAL.
+public:
+  void SetUp() override { set_env("ANDROID_SERIAL", ""); } // Start clean.
+
+  void TearDown() override { set_env("ANDROID_SERIAL", ""); } // Don't leak.
+};
+
+TEST_F(AdbClientTest, CreateByDeviceId) { // TEST_F so SetUp/TearDown run.
+  AdbClient adb;
+  Status error = AdbClient::CreateByDeviceID("device1", adb);
+  EXPECT_TRUE(error.Success());
+  EXPECT_EQ("device1", adb.GetDeviceID());
+}
+
+TEST_F(AdbClientTest, CreateByDeviceId_ByEnvVar) { // Env var is the fallback.
+  set_env("ANDROID_SERIAL", "device2"); // Reset again by TearDown().
+
+  AdbClient adb;
+  Status error = AdbClient::CreateByDeviceID("", adb);
+  EXPECT_TRUE(error.Success());
+  EXPECT_EQ("device2", adb.GetDeviceID());
+}
+
+} // end namespace platform_android
+} // end namespace lldb_private
diff --git a/lldb/unittests/Platform/Android/CMakeLists.txt b/lldb/unittests/Platform/Android/CMakeLists.txt
new file mode 100644
index 0000000000000..3de2a2d120164
--- /dev/null
+++ b/lldb/unittests/Platform/Android/CMakeLists.txt
@@ -0,0 +1,8 @@
+include_directories(${LLDB_SOURCE_DIR}/source/Plugins/Platform/Android)
+
+add_lldb_unittest(AdbClientTest
+  AdbClientTest.cpp
+
+  LINK_LIBS
+    lldbPluginPlatformAndroid
+  )
diff --git a/lldb/unittests/Platform/CMakeLists.txt b/lldb/unittests/Platform/CMakeLists.txt
index 3362ca08d60c0..eb7f0a6ca3c41 100644
--- a/lldb/unittests/Platform/CMakeLists.txt
+++ b/lldb/unittests/Platform/CMakeLists.txt
@@ -6,3 +6,5 @@ add_lldb_unittest(LLDBPlatformTests
   LINK_COMPONENTS
     Support
   )
+
+add_subdirectory(Android)

From 1bfc58e6557cd3f3c310de214f316469c20c1476 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Wed, 27 May 2020 17:56:25 +0300
Subject: [PATCH 486/770] [llvm-readobj][test] - unwind.test: add comments,
 document the current behavior.

Here I've added comments, added testing for llvm-readelf and documented
the behavior that we already have.

It was discussed in the D80380 thread that we want to improve the
"p_memsz does not match p_filesz for GNU_EH_FRAME" message reported
(and probably convert error to a warning). This patch is a preparation
for that.

Differential revision: https://reviews.llvm.org/D80635
---
 llvm/test/tools/llvm-readobj/ELF/unwind.test | 52 +++++++++++++++++---
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/llvm/test/tools/llvm-readobj/ELF/unwind.test b/llvm/test/tools/llvm-readobj/ELF/unwind.test
index 2fe673d806e2e..e3fdbcc0f5ecc 100644
--- a/llvm/test/tools/llvm-readobj/ELF/unwind.test
+++ b/llvm/test/tools/llvm-readobj/ELF/unwind.test
@@ -1,5 +1,16 @@
-# RUN: yaml2obj --docnum=1 %s -o %t1.exe
-# RUN: llvm-readobj --unwind %t1.exe | FileCheck %s
+## In this test we check how the unwind information is dumped with the use of --unwind.
+
+## First, check that both llvm-readobj and llvm-readelf are able to dump valid unwind information.
+## Check that the output is the same for these tools.
+## The memory size of the PT_GNU_EH_FRAME equals its file size and they both are equal to 0x3c.
+## 0x3c is the size of the .eh_frame_hdr section.
+# RUN: yaml2obj --docnum=1 %s -DMEMSIZE=0x3c -DFILESIZE=0x3c -o %t1.valid
+# RUN: llvm-readobj --unwind %t1.valid | FileCheck %s
+# RUN: llvm-readelf --sections --unwind %t1.valid | FileCheck %s --check-prefixes=SIZE,CHECK
+
+## Validate the size of the .eh_frame_hdr section.
+# SIZE: [Nr] Name          Type     Address          Off    Size
+# SIZE: [ 2] .eh_frame_hdr PROGBITS 00000000004013c0 0000bc 00003c
 
 # CHECK:      EHFrameHeader {
 # CHECK-NEXT:  Address: 0x4013c0
@@ -202,19 +213,46 @@ Symbols:
     Value:           0x0000000000400000
     Binding:         STB_GLOBAL
 ProgramHeaders:
-  - Type: PT_LOAD
-    Flags: [ PF_X, PF_R ]
-    VAddr: 0x00400000
-    PAddr: 0x00400000
+  - Type:     PT_LOAD
+    Flags:    [ PF_X, PF_R ]
+    VAddr:    0x00400000
+    PAddr:    0x00400000
     Sections:
       - Section: .text
   - Type: PT_GNU_EH_FRAME
     Flags: [ PF_X, PF_R ]
     VAddr: 0x004013C0
     PAddr: 0x004013C0
+    MemSize:  [[MEMSIZE]]
+    FileSize: [[FILESIZE]]
     Sections:
       - Section: .eh_frame_hdr
-...
+
+## Document that we report an error when the memory size of the PT_GNU_EH_FRAME does not match its file size.
+## TODO: we want to report a warning and continue dumping instead.
+# RUN: yaml2obj --docnum=1 %s -DMEMSIZE=0x3b -DFILESIZE=0x3c -o %t1.size.mismatch
+# RUN: not llvm-readobj --unwind %t1.size.mismatch 2>&1 | \
+# RUN:   FileCheck -DFILE=%t1.size.mismatch %s --check-prefix=SIZE-MISMATCH
+# RUN: not llvm-readelf --unwind %t1.size.mismatch 2>&1 | \
+# RUN:   FileCheck -DFILE=%t1.size.mismatch %s --check-prefix=SIZE-MISMATCH
+
+# SIZE-MISMATCH: error: '[[FILE]]': p_memsz does not match p_filesz for GNU_EH_FRAME
+
+## Check we partially dump the unwind information when the PT_GNU_EH_FRAME segment
+## points to truncated data.
+# RUN: yaml2obj --docnum=1 %s -DMEMSIZE=0x1 -DFILESIZE=0x1 -o %t1.truncated
+# RUN: not llvm-readobj --unwind %t1.truncated 2>&1 | FileCheck -DFILE=%t1.truncated %s --check-prefix=TRUNCATED
+# RUN: not llvm-readelf --unwind %t1.truncated 2>&1 | FileCheck -DFILE=%t1.truncated %s --check-prefix=TRUNCATED
+
+# TRUNCATED:      EHFrameHeader {
+# TRUNCATED-NEXT:   Address: 0x4013c0
+# TRUNCATED-NEXT:   Offset: 0xbc
+# TRUNCATED-NEXT:   Size: 0x1
+# TRUNCATED-NEXT:   Corresponding Section: .eh_frame_hdr
+# TRUNCATED-NEXT:   Header {
+# TRUNCATED-NEXT:     version: 1
+# TRUNCATED-NEXT:     eh_frame_ptr_enc: 0x0
+# TRUNCATED-NEXT: error: '[[FILE]]': unexpected encoding eh_frame_ptr_enc
 
 ## Check we report an error when the tool is unable to parse .eh_frame section.
 # RUN: yaml2obj --docnum=2 %s -o %t2.exe

From d0fcdcd28f95d699b27d2026ede964a7f9cff9dd Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne@apple.com>
Date: Fri, 15 May 2020 15:58:19 -0400
Subject: [PATCH 487/770] [libc++] Fix the
 LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT setting

When the __config_site header is generated, but LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
wasn't specified, _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT would be defined
to 0, which was the NonUnique RTTI comparison implementation. The intent
was to use the Unique RTTI comparison implementation in that case, which
caused https://llvm.org/PR45549.

Instead, use a proper "switch" to select the RTTI comparison implementation.
Note that 0 can't be used as a value, because that is treated the same
by CMake as a variable that is just not defined.

Differential Revision: https://reviews.llvm.org/D80037
---
 libcxx/CMakeLists.txt                         | 17 +++++++-------
 libcxx/cmake/caches/Apple.cmake               |  2 +-
 libcxx/docs/BuildingLibcxx.rst                | 22 ++++++++++++-------
 libcxx/include/__config                       |  6 ++---
 libcxx/include/__config_site.in               |  4 ++--
 libcxx/include/typeinfo                       | 11 ++++++----
 ...ype_info.comparison.apple.compile.pass.cpp |  8 +++----
 .../type_info.comparison.merged.sh.cpp        |  6 ++---
 .../type_info.comparison.unmerged.sh.cpp      |  6 ++---
 9 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index bdb2d56da8539..1a7e0a0bc759c 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -154,14 +154,13 @@ option(LIBCXX_ABI_FORCE_ITANIUM "Ignore auto-detection and force use of the Itan
 option(LIBCXX_ABI_FORCE_MICROSOFT "Ignore auto-detection and force use of the Microsoft ABI.")
 
 
-set(LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT  "" CACHE STRING
-  "Whether typeinfo names are expected to be unique. Defining this option overrides the default configuration in the library.")
-set(MERGED_TYPEINFO_VALUES ";ON;OFF")
-set_property(CACHE LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT PROPERTY STRINGS ${MERGED_TYPEINFO_DEFAULTS})
-list(FIND MERGED_TYPEINFO_VALUES "${LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT}" IS_VALID_DEFAULT)
+set(LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION "" CACHE STRING "The implementation of typeinfo comparison to use.")
+set(TYPEINFO_COMPARISON_VALUES ";1;2")
+set_property(CACHE LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION PROPERTY STRINGS ${TYPEINFO_COMPARISON_VALUES})
+list(FIND TYPEINFO_COMPARISON_VALUES "${LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION}" IS_VALID_DEFAULT)
 if (${IS_VALID_DEFAULT} EQUAL -1)
-  message(FATAL_ERROR "Value '${LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT}' is not a valid value for
-          LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT")
+  message(FATAL_ERROR "Value '${LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION}' is not a valid value for
+          LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION")
 endif()
 
 option(LIBCXX_HIDE_FROM_ABI_PER_TU_BY_DEFAULT "Enable per TU ABI insulation by default. To be used by vendors." OFF)
@@ -840,8 +839,8 @@ config_define_if_not(LIBCXX_ENABLE_STDOUT _LIBCPP_HAS_NO_STDOUT)
 config_define_if_not(LIBCXX_ENABLE_THREADS _LIBCPP_HAS_NO_THREADS)
 config_define_if_not(LIBCXX_ENABLE_MONOTONIC_CLOCK _LIBCPP_HAS_NO_MONOTONIC_CLOCK)
 config_define_if_not(LIBCXX_ENABLE_THREAD_UNSAFE_C_FUNCTIONS _LIBCPP_HAS_NO_THREAD_UNSAFE_C_FUNCTIONS)
-if (NOT LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT STREQUAL "")
-  config_define("${LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT}" _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT)
+if (NOT LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION STREQUAL "")
+  config_define("${LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION}" _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION)
 endif()
 
 config_define_if(LIBCXX_HAS_PTHREAD_API _LIBCPP_HAS_THREAD_API_PTHREAD)
diff --git a/libcxx/cmake/caches/Apple.cmake b/libcxx/cmake/caches/Apple.cmake
index 26985c9f335e8..622a3af84f2bc 100644
--- a/libcxx/cmake/caches/Apple.cmake
+++ b/libcxx/cmake/caches/Apple.cmake
@@ -7,7 +7,7 @@ set(LIBCXX_ABI_VERSION "1" CACHE STRING "")
 set(LIBCXX_ENABLE_EXPERIMENTAL_LIBRARY OFF CACHE BOOL "")
 set(LIBCXX_ENABLE_STATIC OFF CACHE BOOL "")
 set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "")
-set(LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT ON CACHE STRING "")
+set(LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION "1" CACHE STRING "")
 set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "")
 set(LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS OFF CACHE BOOL "")
 set(LIBCXX_HIDE_FROM_ABI_PER_TU_BY_DEFAULT ON CACHE BOOL "")
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index 262558e79add0..0fbb6693099cc 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -380,18 +380,24 @@ The following options allow building libc++ for a different ABI version.
   See ``include/__config`` for the list of ABI macros.
 
 
-.. option:: LIBCXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
+.. option:: LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION
 
-  **Default**: ``None``. When defined this option overrides the libraries default configuration
-  for whether merged type info names are present.
+  **Default**: ``None``, which lets the library figure out which implementation
+  to use based on the object format.
 
+  This setting defines what implementation to use for comparing typeinfo objects.
+  There are two main implementations, which differ on whether we make the assumption
+  that type info names for a type have been fully merged and are unique across the entire
+  program. This may not be the case for libraries built with ``-Bsymbolic`` or due to
+  compiler or linker bugs (Ex. llvm.org/PR37398).
 
-  Build ``std::type_info`` with the assumption that type info names for a type have been fully
-  merged are unique across the entire program. This may not be the case for libraries built with
-  ``-Bsymbolic`` or due to compiler or linker bugs (Ex. llvm.org/PR37398).
 
-  When the value is ``ON`` typeinfo comparisons compare only the pointer value, otherwise ``strcmp``
-  is used as a fallback.
+  When the value is set to ``1``, we assume that typeinfos are unique across the
+  whole program, and typeinfo comparisons compare only the pointer value.
+
+  When the value is set to ``2``, we do not assume that typeinfos are unique across
+  the whole program. We first compare the pointers, and then use ``strcmp`` on the
+  typeinfo names as a fallback.
 
 
 .. _LLVM-specific variables:
diff --git a/libcxx/include/__config b/libcxx/include/__config
index bfde9e9c9895c..cf596a7872abd 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -758,13 +758,13 @@ typedef __char32_t char32_t;
 #  endif
 #endif
 
-#ifndef _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
+#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION
 #  ifdef _LIBCPP_OBJECT_FORMAT_COFF // Windows binaries can't merge typeinfos.
-#    define _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 0
+#    define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 2
 #  else
      // TODO: This isn't strictly correct on ELF platforms due to llvm.org/PR37398
      // And we should consider defaulting to OFF.
-#    define _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 1
+#    define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 1
 #  endif
 #endif
 
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index 4da11a9e67445..a6984b2eefc11 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -27,8 +27,8 @@
 #cmakedefine _LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL
 #cmakedefine _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS
 #cmakedefine _LIBCPP_NO_VCRUNTIME
-#ifndef _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
-#cmakedefine01 _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
+#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION
+#cmakedefine _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION @_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION@
 #endif
 #cmakedefine _LIBCPP_ABI_NAMESPACE @_LIBCPP_ABI_NAMESPACE@
 #cmakedefine _LIBCPP_HAS_PARALLEL_ALGORITHMS
diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo
index 8b86c61b974d4..7e76da5387967 100644
--- a/libcxx/include/typeinfo
+++ b/libcxx/include/typeinfo
@@ -121,6 +121,7 @@ public:
 // ========================================================================== //
 // ------------------------------------------------------------------------- //
 //                               Unique
+//               (_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION = 1)
 // ------------------------------------------------------------------------- //
 // This implementation of type_info assumes a unique copy of the RTTI for a
 // given type inside a program. This is a valid assumption when abiding to
@@ -130,6 +131,7 @@ public:
 // a deep string comparison.
 // -------------------------------------------------------------------------- //
 //                             NonUnique
+//               (_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION = 2)
 // -------------------------------------------------------------------------- //
 // This implementation of type_info does not assume there is always a unique
 // copy of the RTTI for a given type inside a program. For various reasons
@@ -139,6 +141,7 @@ public:
 // comparison is equal.
 // -------------------------------------------------------------------------- //
 //                          NonUniqueARMRTTIBit
+// (selected on ARM64 regardless of _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION)
 // -------------------------------------------------------------------------- //
 // This implementation of type_info does not assume always a unique copy of
 // the RTTI for a given type inside a program. It packs the pointer to the
@@ -256,12 +259,12 @@ struct __type_info_implementations {
   typedef
 #if defined(__APPLE__) && defined(__LP64__) && !defined(__x86_64__)
     __non_unique_arm_rtti_bit_impl
-#elif _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT == 0
-    __non_unique_impl
-#elif _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT == 1
+#elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 1
     __unique_impl
+#elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 2
+    __non_unique_impl
 #else
-#   error invalid configuration for _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT
+#   error invalid configuration for _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION
 #endif
      __impl;
 };
diff --git a/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.apple.compile.pass.cpp b/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.apple.compile.pass.cpp
index d6fd0fba31d37..68066416fd367 100644
--- a/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.apple.compile.pass.cpp
+++ b/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.apple.compile.pass.cpp
@@ -18,10 +18,10 @@
 
 #include <typeinfo>
 
-#if !defined(_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT)
-#   error "_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT should be defined on Apple platforms"
+#if !defined(_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION)
+#   error "_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION should be defined on Apple platforms"
 #endif
 
-#if _LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT != 1
-#   error "_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT should be 1 (assume RTTI is merged) on Apple platforms"
+#if _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION != 1
+#   error "_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION should be 1 (assume RTTI is merged) on Apple platforms"
 #endif
diff --git a/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.merged.sh.cpp b/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.merged.sh.cpp
index a69d53fc94146..08e3eb67af8b7 100644
--- a/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.merged.sh.cpp
+++ b/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.merged.sh.cpp
@@ -9,9 +9,9 @@
 // UNSUPPORTED: -fno-rtti
 
 // FILE_DEPENDENCIES: %t.exe
-// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu1.o -DTU1 -D_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT=1
-// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu2.o -DTU2 -D_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT=1
-// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.main.o -DMAIN -D_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT=1
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu1.o -DTU1 -D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=1
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu2.o -DTU2 -D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=1
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.main.o -DMAIN -D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=1
 // RUN: %{cxx} %t.tu1.o %t.tu2.o %t.main.o %{flags} %{link_flags} -o %t.exe
 // RUN: %{exec} %t.exe
 
diff --git a/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.unmerged.sh.cpp b/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.unmerged.sh.cpp
index c7213b7748c82..e81a1b0e34378 100644
--- a/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.unmerged.sh.cpp
+++ b/libcxx/test/libcxx/language.support/support.rtti/type.info/type_info.comparison.unmerged.sh.cpp
@@ -9,9 +9,9 @@
 // UNSUPPORTED: -fno-rtti
 
 // FILE_DEPENDENCIES: %t.exe
-// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu1.o -DTU1 -D_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT=0
-// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu2.o -DTU2 -D_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT=0
-// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.main.o -DMAIN -D_LIBCPP_HAS_MERGED_TYPEINFO_NAMES_DEFAULT=0
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu1.o -DTU1 -D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=2
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.tu2.o -DTU2 -D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=2
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -o %t.main.o -DMAIN -D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=2
 // RUN: %{cxx} %t.tu1.o %t.tu2.o %t.main.o %{flags} %{link_flags} -o %t.exe
 // RUN: %{exec} %t.exe
 

From b9bb3ad3ed3b13607d15472a0b881da9fb00fc03 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Fri, 29 May 2020 12:18:25 +0200
Subject: [PATCH 488/770] Unbreak the build of mlir-cuda-runner

---
 mlir/tools/mlir-cuda-runner/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/tools/mlir-cuda-runner/CMakeLists.txt b/mlir/tools/mlir-cuda-runner/CMakeLists.txt
index f669cfbea57b2..5488262d7ee7e 100644
--- a/mlir/tools/mlir-cuda-runner/CMakeLists.txt
+++ b/mlir/tools/mlir-cuda-runner/CMakeLists.txt
@@ -51,6 +51,7 @@ if(MLIR_CUDA_RUNNER_ENABLED)
     MLIRParser
     MLIRSupport
     MLIRTargetLLVMIR
+    MLIRTargetNVVMIR
     MLIRTransforms
     MLIRTranslation
     ${CUDA_RUNTIME_LIBRARY}

From b742eaa321219fa3444e3bcd33eda441bea6b73a Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Thu, 2 Apr 2020 10:53:45 +0200
Subject: [PATCH 489/770] [clangd] Handle additional includes while parsing
 ASTs

Summary:
Enables building ASTs with stale preambles by handling additional preamble
includes. Sets the correct location information for those imaginary includes so
that features like gotodef/documentlink keep functioning properly.

Reviewers: sammccall

Subscribers: ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D77644
---
 clang-tools-extra/clangd/CodeComplete.cpp     |  13 ++-
 clang-tools-extra/clangd/ParsedAST.cpp        |  19 ++-
 clang-tools-extra/clangd/Preamble.cpp         |  34 +++++-
 clang-tools-extra/clangd/Preamble.h           |  22 +++-
 .../clangd/unittests/FindSymbolsTests.cpp     |  15 ++-
 .../clangd/unittests/ParsedASTTests.cpp       | 109 ++++++++++++++++++
 .../clangd/unittests/PreambleTests.cpp        |  94 ++++++++++-----
 7 files changed, 252 insertions(+), 54 deletions(-)

diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index f2bdadb0ad9dd..d2ee41f36cf4e 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1031,7 +1031,7 @@ struct SemaCompleteInput {
   PathRef FileName;
   const tooling::CompileCommand &Command;
   const PreambleData &Preamble;
-  const PreamblePatch &Patch;
+  llvm::Optional<const PreamblePatch> Patch;
   llvm::StringRef Contents;
   size_t Offset;
   llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS;
@@ -1105,7 +1105,8 @@ bool semaCodeComplete(std::unique_ptr<CodeCompleteConsumer> Consumer,
   PreambleBounds PreambleRegion =
       ComputePreambleBounds(*CI->getLangOpts(), ContentsBuffer.get(), 0);
   bool CompletingInPreamble = PreambleRegion.Size > Input.Offset;
-  Input.Patch.apply(*CI);
+  if (Input.Patch)
+    Input.Patch->apply(*CI);
   // NOTE: we must call BeginSourceFile after prepareCompilerInstance. Otherwise
   // the remapped buffers do not get freed.
   auto Clang = prepareCompilerInstance(
@@ -1767,7 +1768,8 @@ codeComplete(PathRef FileName, const tooling::CompileCommand &Command,
              : std::move(Flow).run({FileName, Command, *Preamble,
                                     // We want to serve code completions with
                                     // low latency, so don't bother patching.
-                                    PreamblePatch(), Contents, *Offset, VFS});
+                                    /*PreamblePatch=*/llvm::None, Contents,
+                                    *Offset, VFS});
 }
 
 SignatureHelp signatureHelp(PathRef FileName,
@@ -1792,10 +1794,11 @@ SignatureHelp signatureHelp(PathRef FileName,
   PI.CompileCommand = Command;
   PI.Contents = Contents.str();
   PI.FS = std::move(VFS);
-  auto PP = PreamblePatch::create(FileName, PI, Preamble);
   semaCodeComplete(
       std::make_unique<SignatureHelpCollector>(Options, Index, Result), Options,
-      {FileName, Command, Preamble, PP, Contents, *Offset, std::move(PI.FS)});
+      {FileName, Command, Preamble,
+       PreamblePatch::create(FileName, PI, Preamble), Contents, *Offset,
+       std::move(PI.FS)});
   return Result;
 }
 
diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index e7678f3d69e99..660932a7b259e 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -14,6 +14,7 @@
 #include "Diagnostics.h"
 #include "Headers.h"
 #include "IncludeFixer.h"
+#include "Preamble.h"
 #include "SourceCode.h"
 #include "index/CanonicalIncludes.h"
 #include "index/Index.h"
@@ -48,6 +49,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <memory>
+#include <vector>
 
 // Force the linker to link in Clang-tidy modules.
 // clangd doesn't support the static analyzer.
@@ -268,6 +270,11 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs,
 
   StoreDiags ASTDiags;
 
+  llvm::Optional<PreamblePatch> Patch;
+  if (Preamble) {
+    Patch = PreamblePatch::create(Filename, Inputs, *Preamble);
+    Patch->apply(*CI);
+  }
   auto Clang = prepareCompilerInstance(
       std::move(CI), PreamblePCH,
       llvm::MemoryBuffer::getMemBufferCopy(Inputs.Contents, Filename), VFS,
@@ -369,12 +376,14 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs,
     Clang->setExternalSemaSource(FixIncludes->unresolvedNameRecorder());
   }
 
-  // Copy over the includes from the preamble, then combine with the
-  // non-preamble includes below.
-  auto Includes = Preamble ? Preamble->Includes : IncludeStructure{};
-  // Replay the preamble includes so that clang-tidy checks can see them.
-  if (Preamble)
+  IncludeStructure Includes;
+  // If we are using a preamble, copy existing includes.
+  if (Preamble) {
+    Includes = Preamble->Includes;
+    Includes.MainFileIncludes = Patch->preambleIncludes();
+    // Replay the preamble includes so that clang-tidy checks can see them.
     ReplayPreamble::attach(Includes, *Clang, Preamble->Preamble.getBounds());
+  }
   // Important: collectIncludeStructure is registered *after* ReplayPreamble!
   // Otherwise we would collect the replayed includes again...
   // (We can't *just* use the replayed includes, they don't have Resolved path).
diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp
index d3eaa92d4c1ac..959b241dac120 100644
--- a/clang-tools-extra/clangd/Preamble.cpp
+++ b/clang-tools-extra/clangd/Preamble.cpp
@@ -24,6 +24,8 @@
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
@@ -274,6 +276,7 @@ void escapeBackslashAndQuotes(llvm::StringRef Text, llvm::raw_ostream &OS) {
 PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
                                     const ParseInputs &Modified,
                                     const PreambleData &Baseline) {
+  assert(llvm::sys::path::is_absolute(FileName) && "relative FileName!");
   // First scan the include directives in Baseline and Modified. These will be
   // used to figure out newly added directives in Modified. Scanning can fail,
   // the code just bails out and creates an empty patch in such cases, as:
@@ -301,7 +304,7 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
   }
   // No patch needed if includes are equal.
   if (*BaselineIncludes == *ModifiedIncludes)
-    return {};
+    return PreamblePatch::unmodified(Baseline);
 
   PreamblePatch PP;
   // This shouldn't coincide with any real file name.
@@ -312,10 +315,15 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
 
   // We are only interested in newly added includes, record the ones in Baseline
   // for exclusion.
-  llvm::DenseSet<std::pair<tok::PPKeywordKind, llvm::StringRef>>
+  llvm::DenseMap<std::pair<tok::PPKeywordKind, llvm::StringRef>,
+                 /*Resolved=*/llvm::StringRef>
       ExistingIncludes;
+  for (const auto &Inc : Baseline.Includes.MainFileIncludes)
+    ExistingIncludes[{Inc.Directive, Inc.Written}] = Inc.Resolved;
+  // There might be includes coming from disabled regions, record these for
+  // exclusion too. note that we don't have resolved paths for those.
   for (const auto &Inc : *BaselineIncludes)
-    ExistingIncludes.insert({Inc.Directive, Inc.Written});
+    ExistingIncludes.try_emplace({Inc.Directive, Inc.Written});
   // Calculate extra includes that needs to be inserted.
   llvm::raw_string_ostream Patch(PP.PatchContents);
   // Set default filename for subsequent #line directives
@@ -324,9 +332,15 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
   // might lead to problems on windows especially.
   escapeBackslashAndQuotes(FileName, Patch);
   Patch << "\"\n";
-  for (const auto &Inc : *ModifiedIncludes) {
-    if (ExistingIncludes.count({Inc.Directive, Inc.Written}))
+  for (auto &Inc : *ModifiedIncludes) {
+    auto It = ExistingIncludes.find({Inc.Directive, Inc.Written});
+    // Include already present in the baseline preamble. Set resolved path and
+    // put into preamble includes.
+    if (It != ExistingIncludes.end()) {
+      Inc.Resolved = It->second.str();
+      PP.PreambleIncludes.push_back(Inc);
       continue;
+    }
     // Include is new in the modified preamble. Inject it into the patch and use
     // #line to set the presumed location to where it is spelled.
     auto LineCol = offsetToClangLineColumn(Modified.Contents, Inc.HashOffset);
@@ -356,5 +370,15 @@ void PreamblePatch::apply(CompilerInvocation &CI) const {
   PPOpts.Includes.push_back(PatchFileName);
 }
 
+std::vector<Inclusion> PreamblePatch::preambleIncludes() const {
+  return PreambleIncludes;
+}
+
+PreamblePatch PreamblePatch::unmodified(const PreambleData &Preamble) {
+  PreamblePatch PP;
+  PP.PreambleIncludes = Preamble.Includes.MainFileIncludes;
+  return PP;
+}
+
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/Preamble.h b/clang-tools-extra/clangd/Preamble.h
index f2e1ba98f3326..c88688a51736e 100644
--- a/clang-tools-extra/clangd/Preamble.h
+++ b/clang-tools-extra/clangd/Preamble.h
@@ -91,12 +91,14 @@ bool isPreambleCompatible(const PreambleData &Preamble,
                           const CompilerInvocation &CI);
 
 /// Stores information required to parse a TU using a (possibly stale) Baseline
-/// preamble. Updates compiler invocation to approximately reflect additions to
-/// the preamble section of Modified contents, e.g. new include directives.
+/// preamble. Later on this information can be injected into the main file by
+/// updating compiler invocation with \c apply. This injected section
+/// approximately reflects additions to the preamble in Modified contents, e.g.
+/// new include directives.
 class PreamblePatch {
 public:
-  // With an empty patch, the preamble is used verbatim.
-  PreamblePatch() = default;
+  /// \p Preamble is used verbatim.
+  static PreamblePatch unmodified(const PreambleData &Preamble);
   /// Builds a patch that contains new PP directives introduced to the preamble
   /// section of \p Modified compared to \p Baseline.
   /// FIXME: This only handles include directives, we should at least handle
@@ -109,9 +111,21 @@ class PreamblePatch {
   /// \p CI that contains new directives calculated in create.
   void apply(CompilerInvocation &CI) const;
 
+  /// Returns #include directives from the \c Modified preamble that were
+  /// resolved using the \c Baseline preamble. This covers the new locations of
+  /// inclusions that were moved around, but not inclusions of new files. Those
+  /// will be recorded when parsing the main file: the includes in the injected
+  /// section will be resolved back to their spelled positions in the main file
+  /// using the presumed-location mechanism.
+  std::vector<Inclusion> preambleIncludes() const;
+
 private:
+  PreamblePatch() = default;
   std::string PatchContents;
   std::string PatchFileName;
+  /// Includes that are present in both \p Baseline and \p Modified. Used for
+  /// patching includes of baseline preamble.
+  std::vector<Inclusion> PreambleIncludes;
 };
 
 } // namespace clangd
diff --git a/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp b/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
index f9343458841a7..2b014ac0a69b4 100644
--- a/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
@@ -666,7 +666,8 @@ TEST_F(DocumentSymbolsTest, UsingDirectives) {
 }
 
 TEST_F(DocumentSymbolsTest, TempSpecs) {
-  addFile("foo.cpp", R"cpp(
+  std::string FilePath = testPath("foo.cpp");
+  addFile(FilePath, R"cpp(
       template <typename T, typename U, int X = 5> class Foo {};
       template <typename T> class Foo<int, T> {};
       template <> class Foo<bool, int> {};
@@ -674,7 +675,7 @@ TEST_F(DocumentSymbolsTest, TempSpecs) {
       )cpp");
   // Foo is higher ranked because of exact name match.
   EXPECT_THAT(
-      getSymbols("foo.cpp"),
+      getSymbols(FilePath),
       UnorderedElementsAre(
           AllOf(WithName("Foo"), WithKind(SymbolKind::Class)),
           AllOf(WithName("Foo<int, T>"), WithKind(SymbolKind::Class)),
@@ -683,7 +684,8 @@ TEST_F(DocumentSymbolsTest, TempSpecs) {
 }
 
 TEST_F(DocumentSymbolsTest, Qualifiers) {
-  addFile("foo.cpp", R"cpp(
+  std::string FilePath = testPath("foo.cpp");
+  addFile(FilePath, R"cpp(
     namespace foo { namespace bar {
       struct Cls;
 
@@ -706,7 +708,7 @@ TEST_F(DocumentSymbolsTest, Qualifiers) {
   )cpp");
 
   // All the qualifiers should be preserved exactly as written.
-  EXPECT_THAT(getSymbols("foo.cpp"),
+  EXPECT_THAT(getSymbols(FilePath),
               UnorderedElementsAre(
                   WithName("foo"), WithName("foo::bar::Cls"),
                   WithName("foo::bar::func1"), WithName("::foo::bar::func2"),
@@ -715,7 +717,8 @@ TEST_F(DocumentSymbolsTest, Qualifiers) {
 }
 
 TEST_F(DocumentSymbolsTest, QualifiersWithTemplateArgs) {
-  addFile("foo.cpp", R"cpp(
+  std::string FilePath = testPath("foo.cpp");
+  addFile(FilePath, R"cpp(
       template <typename T, typename U = double> class Foo;
 
       template <>
@@ -738,7 +741,7 @@ TEST_F(DocumentSymbolsTest, QualifiersWithTemplateArgs) {
       int Foo_type::method3() { return 30; }
       )cpp");
   EXPECT_THAT(
-      getSymbols("foo.cpp"),
+      getSymbols(FilePath),
       UnorderedElementsAre(WithName("Foo"), WithName("Foo<int, double>"),
                            WithName("int_type"),
                            WithName("Foo<int_type, double>::method1"),
diff --git a/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp b/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
index a2bc996be4f14..d86f741d9e72e 100644
--- a/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
@@ -17,7 +17,9 @@
 #include "Annotations.h"
 #include "Compiler.h"
 #include "Diagnostics.h"
+#include "Headers.h"
 #include "ParsedAST.h"
+#include "Preamble.h"
 #include "SourceCode.h"
 #include "TestFS.h"
 #include "TestTU.h"
@@ -28,6 +30,7 @@
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/Token.h"
 #include "clang/Tooling/Syntax/Tokens.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "gmock/gmock-matchers.h"
@@ -82,6 +85,13 @@ MATCHER_P(RangeIs, R, "") {
   return arg.beginOffset() == R.Begin && arg.endOffset() == R.End;
 }
 
+MATCHER(EqInc, "") {
+  Inclusion Actual = testing::get<0>(arg);
+  Inclusion Expected = testing::get<1>(arg);
+  return std::tie(Actual.HashLine, Actual.Written) ==
+         std::tie(Expected.HashLine, Expected.Written);
+}
+
 TEST(ParsedASTTest, TopLevelDecls) {
   TestTU TU;
   TU.HeaderCode = R"(
@@ -431,6 +441,105 @@ TEST(ParsedASTTest, ReplayPreambleForTidyCheckers) {
   }
 }
 
+TEST(ParsedASTTest, PatchesAdditionalIncludes) {
+  llvm::StringLiteral ModifiedContents = R"cpp(
+    #include "baz.h"
+    #include "foo.h"
+    #include "sub/aux.h"
+    void bar() {
+      foo();
+      baz();
+      aux();
+    })cpp";
+  // Build expected ast with symbols coming from headers.
+  TestTU TU;
+  TU.Filename = "foo.cpp";
+  TU.AdditionalFiles["foo.h"] = "void foo();";
+  TU.AdditionalFiles["sub/baz.h"] = "void baz();";
+  TU.AdditionalFiles["sub/aux.h"] = "void aux();";
+  TU.ExtraArgs = {"-I" + testPath("sub")};
+  TU.Code = ModifiedContents.str();
+  auto ExpectedAST = TU.build();
+
+  // Build preamble with no includes.
+  TU.Code = "";
+  StoreDiags Diags;
+  auto Inputs = TU.inputs();
+  auto CI = buildCompilerInvocation(Inputs, Diags);
+  auto EmptyPreamble =
+      buildPreamble(testPath("foo.cpp"), *CI, Inputs, true, nullptr);
+  ASSERT_TRUE(EmptyPreamble);
+  EXPECT_THAT(EmptyPreamble->Includes.MainFileIncludes, testing::IsEmpty());
+
+  // Now build an AST using empty preamble and ensure patched includes worked.
+  TU.Code = ModifiedContents.str();
+  Inputs = TU.inputs();
+  auto PatchedAST = ParsedAST::build(testPath("foo.cpp"), Inputs, std::move(CI),
+                                     {}, EmptyPreamble);
+  ASSERT_TRUE(PatchedAST);
+  ASSERT_TRUE(PatchedAST->getDiagnostics().empty());
+
+  // Ensure source location information is correct, including resolved paths.
+  EXPECT_THAT(PatchedAST->getIncludeStructure().MainFileIncludes,
+              testing::Pointwise(
+                  EqInc(), ExpectedAST.getIncludeStructure().MainFileIncludes));
+  auto StringMapToVector = [](const llvm::StringMap<unsigned> SM) {
+    std::vector<std::pair<std::string, unsigned>> Res;
+    for (const auto &E : SM)
+      Res.push_back({E.first().str(), E.second});
+    llvm::sort(Res);
+    return Res;
+  };
+  // Ensure file proximity signals are correct.
+  EXPECT_EQ(StringMapToVector(PatchedAST->getIncludeStructure().includeDepth(
+                testPath("foo.cpp"))),
+            StringMapToVector(ExpectedAST.getIncludeStructure().includeDepth(
+                testPath("foo.cpp"))));
+}
+
+TEST(ParsedASTTest, PatchesDeletedIncludes) {
+  TestTU TU;
+  TU.Filename = "foo.cpp";
+  TU.Code = "";
+  auto ExpectedAST = TU.build();
+
+  // Build preamble with no includes.
+  TU.Code = R"cpp(#include <foo.h>)cpp";
+  StoreDiags Diags;
+  auto Inputs = TU.inputs();
+  auto CI = buildCompilerInvocation(Inputs, Diags);
+  auto BaselinePreamble =
+      buildPreamble(testPath("foo.cpp"), *CI, Inputs, true, nullptr);
+  ASSERT_TRUE(BaselinePreamble);
+  EXPECT_THAT(BaselinePreamble->Includes.MainFileIncludes,
+              ElementsAre(testing::Field(&Inclusion::Written, "<foo.h>")));
+
+  // Now build an AST using additional includes and check that locations are
+  // correctly parsed.
+  TU.Code = "";
+  Inputs = TU.inputs();
+  auto PatchedAST = ParsedAST::build(testPath("foo.cpp"), Inputs, std::move(CI),
+                                     {}, BaselinePreamble);
+  ASSERT_TRUE(PatchedAST);
+
+  // Ensure source location information is correct.
+  EXPECT_THAT(PatchedAST->getIncludeStructure().MainFileIncludes,
+              testing::Pointwise(
+                  EqInc(), ExpectedAST.getIncludeStructure().MainFileIncludes));
+  auto StringMapToVector = [](const llvm::StringMap<unsigned> SM) {
+    std::vector<std::pair<std::string, unsigned>> Res;
+    for (const auto &E : SM)
+      Res.push_back({E.first().str(), E.second});
+    llvm::sort(Res);
+    return Res;
+  };
+  // Ensure file proximity signals are correct.
+  EXPECT_EQ(StringMapToVector(PatchedAST->getIncludeStructure().includeDepth(
+                testPath("foo.cpp"))),
+            StringMapToVector(ExpectedAST.getIncludeStructure().includeDepth(
+                testPath("foo.cpp"))));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/unittests/PreambleTests.cpp b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
index db615e6e66e13..bb471ab30175f 100644
--- a/clang-tools-extra/clangd/unittests/PreambleTests.cpp
+++ b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
@@ -8,6 +8,7 @@
 
 #include "Annotations.h"
 #include "Compiler.h"
+#include "Headers.h"
 #include "Preamble.h"
 #include "TestFS.h"
 #include "TestTU.h"
@@ -21,6 +22,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include <clang/Frontend/FrontendActions.h>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -30,51 +32,58 @@ namespace clang {
 namespace clangd {
 namespace {
 
+MATCHER_P2(Distance, File, D, "") {
+  return arg.first() == File && arg.second == D;
+}
+
+std::shared_ptr<const PreambleData>
+createPreamble(llvm::StringRef Contents = "") {
+  auto TU = TestTU::withCode(Contents);
+  // ms-compatibility changes meaning of #import, make sure it is turned off.
+  TU.ExtraArgs = {"-fno-ms-compatibility"};
+  TU.Filename = "preamble.cpp";
+  auto PI = TU.inputs();
+  IgnoreDiagnostics Diags;
+  auto CI = buildCompilerInvocation(PI, Diags);
+  if (!CI) {
+    ADD_FAILURE() << "failed to build compiler invocation";
+    return nullptr;
+  }
+  if (auto Preamble = buildPreamble(TU.Filename, *CI, PI, true, nullptr))
+    return Preamble;
+  ADD_FAILURE() << "failed to build preamble";
+  return nullptr;
+}
+
 // Builds a preamble for BaselineContents, patches it for ModifiedContents and
 // returns the includes in the patch.
 IncludeStructure
 collectPatchedIncludes(llvm::StringRef ModifiedContents,
                        llvm::StringRef BaselineContents,
                        llvm::StringRef MainFileName = "main.cpp") {
-  std::string MainFile = testPath(MainFileName);
-  ParseInputs PI;
-  PI.FS = new llvm::vfs::InMemoryFileSystem;
-  MockCompilationDatabase CDB;
+  auto BaselinePreamble = createPreamble(BaselineContents);
+  // Create the patch.
+  auto TU = TestTU::withCode(ModifiedContents);
+  TU.Filename = MainFileName.str();
   // ms-compatibility changes meaning of #import, make sure it is turned off.
-  CDB.ExtraClangFlags.push_back("-fno-ms-compatibility");
-  PI.CompileCommand = CDB.getCompileCommand(MainFile).getValue();
-  // Create invocation
+  TU.ExtraArgs = {"-fno-ms-compatibility"};
+  auto PI = TU.inputs();
+  auto PP = PreamblePatch::create(testPath(TU.Filename), PI, *BaselinePreamble);
+  // Collect patch contents.
   IgnoreDiagnostics Diags;
   auto CI = buildCompilerInvocation(PI, Diags);
-  assert(CI && "failed to create compiler invocation");
-  // Build baseline preamble.
-  PI.Contents = BaselineContents.str();
-  PI.Version = "baseline preamble";
-  auto BaselinePreamble = buildPreamble(MainFile, *CI, PI, true, nullptr);
-  assert(BaselinePreamble && "failed to build baseline preamble");
-  // Create the patch.
-  PI.Contents = ModifiedContents.str();
-  PI.Version = "modified contents";
-  auto PP = PreamblePatch::create(MainFile, PI, *BaselinePreamble);
-  // Collect patch contents.
   PP.apply(*CI);
-  llvm::StringRef PatchContents;
-  for (const auto &Rempaped : CI->getPreprocessorOpts().RemappedFileBuffers) {
-    if (Rempaped.first == testPath("__preamble_patch__.h")) {
-      PatchContents = Rempaped.second->getBuffer();
-      break;
-    }
-  }
-  // Run preprocessor over the modified contents with patched Invocation to and
-  // BaselinePreamble to collect includes in the patch. We trim the input to
-  // only preamble section to not collect includes in the mainfile.
+  // Run preprocessor over the modified contents with patched Invocation. We
+  // provide a preamble and trim contents to ensure only the implicit header
+  // introduced by the patch is parsed and nothing else.
+  // We don't run PP directly over the patch cotents to test production
+  // behaviour.
   auto Bounds = Lexer::ComputePreamble(ModifiedContents, *CI->getLangOpts());
   auto Clang =
       prepareCompilerInstance(std::move(CI), &BaselinePreamble->Preamble,
                               llvm::MemoryBuffer::getMemBufferCopy(
                                   ModifiedContents.slice(0, Bounds.Size).str()),
                               PI.FS, Diags);
-  Clang->getPreprocessorOpts().ImplicitPCHInclude.clear();
   PreprocessOnlyAction Action;
   if (!Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0])) {
     ADD_FAILURE() << "failed begin source file";
@@ -163,6 +172,33 @@ TEST(PreamblePatchTest, MainFileIsEscaped) {
   EXPECT_THAT(Includes, ElementsAre(AllOf(Field(&Inclusion::Written, "<a.h>"),
                                           Field(&Inclusion::HashLine, 0))));
 }
+
+TEST(PreamblePatchTest, PatchesPreambleIncludes) {
+  IgnoreDiagnostics Diags;
+  auto TU = TestTU::withCode(R"cpp(
+    #include "a.h"
+    #include "c.h"
+  )cpp");
+  TU.AdditionalFiles["a.h"] = "#include \"b.h\"";
+  TU.AdditionalFiles["b.h"] = "";
+  TU.AdditionalFiles["c.h"] = "";
+  auto PI = TU.inputs();
+  auto BaselinePreamble = buildPreamble(
+      TU.Filename, *buildCompilerInvocation(PI, Diags), PI, true, nullptr);
+  // We drop c.h from modified and add a new header. Since the latter is patched
+  // we should only get a.h in preamble includes.
+  TU.Code = R"cpp(
+    #include "a.h"
+    #include "b.h"
+  )cpp";
+  auto PP = PreamblePatch::create(testPath(TU.Filename), TU.inputs(),
+                                  *BaselinePreamble);
+  // Only a.h should exists in the preamble, as c.h has been dropped and b.h was
+  // newly introduced.
+  EXPECT_THAT(PP.preambleIncludes(),
+              ElementsAre(AllOf(Field(&Inclusion::Written, "\"a.h\""),
+                                Field(&Inclusion::Resolved, testPath("a.h")))));
+}
 } // namespace
 } // namespace clangd
 } // namespace clang

From 478f6fb2001698eb102ddce9500ff0885eaaeaab Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Thu, 14 May 2020 12:23:21 +0200
Subject: [PATCH 490/770] [clangd] Add buildPreamble to TestTU

Summary: Depends on D77644.

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D79930
---
 .../clangd/unittests/PreambleTests.cpp        | 26 +++----------------
 clang-tools-extra/clangd/unittests/TestTU.cpp | 16 +++++++++---
 clang-tools-extra/clangd/unittests/TestTU.h   |  1 +
 3 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/clang-tools-extra/clangd/unittests/PreambleTests.cpp b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
index bb471ab30175f..75aad728280a1 100644
--- a/clang-tools-extra/clangd/unittests/PreambleTests.cpp
+++ b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
@@ -36,37 +36,19 @@ MATCHER_P2(Distance, File, D, "") {
   return arg.first() == File && arg.second == D;
 }
 
-std::shared_ptr<const PreambleData>
-createPreamble(llvm::StringRef Contents = "") {
-  auto TU = TestTU::withCode(Contents);
-  // ms-compatibility changes meaning of #import, make sure it is turned off.
-  TU.ExtraArgs = {"-fno-ms-compatibility"};
-  TU.Filename = "preamble.cpp";
-  auto PI = TU.inputs();
-  IgnoreDiagnostics Diags;
-  auto CI = buildCompilerInvocation(PI, Diags);
-  if (!CI) {
-    ADD_FAILURE() << "failed to build compiler invocation";
-    return nullptr;
-  }
-  if (auto Preamble = buildPreamble(TU.Filename, *CI, PI, true, nullptr))
-    return Preamble;
-  ADD_FAILURE() << "failed to build preamble";
-  return nullptr;
-}
-
 // Builds a preamble for BaselineContents, patches it for ModifiedContents and
 // returns the includes in the patch.
 IncludeStructure
 collectPatchedIncludes(llvm::StringRef ModifiedContents,
                        llvm::StringRef BaselineContents,
                        llvm::StringRef MainFileName = "main.cpp") {
-  auto BaselinePreamble = createPreamble(BaselineContents);
-  // Create the patch.
-  auto TU = TestTU::withCode(ModifiedContents);
+  auto TU = TestTU::withCode(BaselineContents);
   TU.Filename = MainFileName.str();
   // ms-compatibility changes meaning of #import, make sure it is turned off.
   TU.ExtraArgs = {"-fno-ms-compatibility"};
+  auto BaselinePreamble = TU.preamble();
+  // Create the patch.
+  TU = TestTU::withCode(ModifiedContents);
   auto PI = TU.inputs();
   auto PP = PreamblePatch::create(testPath(TU.Filename), PI, *BaselinePreamble);
   // Collect patch contents.
diff --git a/clang-tools-extra/clangd/unittests/TestTU.cpp b/clang-tools-extra/clangd/unittests/TestTU.cpp
index 824c4cc8ff143..b4781425943c6 100644
--- a/clang-tools-extra/clangd/unittests/TestTU.cpp
+++ b/clang-tools-extra/clangd/unittests/TestTU.cpp
@@ -66,14 +66,24 @@ ParseInputs TestTU::inputs() const {
   return Inputs;
 }
 
+std::shared_ptr<const PreambleData> TestTU::preamble() const {
+  auto Inputs = inputs();
+  IgnoreDiagnostics Diags;
+  auto CI = buildCompilerInvocation(Inputs, Diags);
+  assert(CI && "Failed to build compilation invocation.");
+  return clang::clangd::buildPreamble(testPath(Filename), *CI, Inputs,
+                                      /*StoreInMemory=*/true,
+                                      /*PreambleCallback=*/nullptr);
+}
+
 ParsedAST TestTU::build() const {
   auto Inputs = inputs();
   StoreDiags Diags;
   auto CI = buildCompilerInvocation(Inputs, Diags);
   assert(CI && "Failed to build compilation invocation.");
-  auto Preamble =
-      buildPreamble(testPath(Filename), *CI, Inputs,
-                    /*StoreInMemory=*/true, /*PreambleCallback=*/nullptr);
+  auto Preamble = clang::clangd::buildPreamble(testPath(Filename), *CI, Inputs,
+                                               /*StoreInMemory=*/true,
+                                               /*PreambleCallback=*/nullptr);
   auto AST = ParsedAST::build(testPath(Filename), Inputs, std::move(CI),
                               Diags.take(), Preamble);
   if (!AST.hasValue()) {
diff --git a/clang-tools-extra/clangd/unittests/TestTU.h b/clang-tools-extra/clangd/unittests/TestTU.h
index 2be294f78e7ec..57017f4a91753 100644
--- a/clang-tools-extra/clangd/unittests/TestTU.h
+++ b/clang-tools-extra/clangd/unittests/TestTU.h
@@ -68,6 +68,7 @@ struct TestTU {
   // By default, build() will report Error diagnostics as GTest errors.
   // Suppress this behavior by adding an 'error-ok' comment to the code.
   ParsedAST build() const;
+  std::shared_ptr<const PreambleData> preamble() const;
   ParseInputs inputs() const;
   SymbolSlab headerSymbols() const;
   RefSlab headerRefs() const;

From c68ee6da283c5697e935d1f9b7401c086cb18e03 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Tue, 19 May 2020 17:29:45 +0300
Subject: [PATCH 491/770] [llvm-readelf] - --elf-hash-histogram: do not crash
 when the .gnu.hash goes past the EOF.

llvm-readelf might crash when the .gnu.hash table goes past the EOF.

This patch splits the bounds-checking code out of `printGnuHashTable` into a
helper function `checkGNUHashTable` (similar to `checkHashTable`), updates it,
and uses it to fix the issue.

Differential revision: https://reviews.llvm.org/D80215
---
 llvm/test/tools/llvm-readobj/ELF/gnuhash.test |  5 ++
 .../llvm-readobj/ELF/hash-histogram.test      | 52 ++++++++++++++++
 llvm/tools/llvm-readobj/ELFDumper.cpp         | 59 ++++++++++++-------
 3 files changed, 95 insertions(+), 21 deletions(-)

diff --git a/llvm/test/tools/llvm-readobj/ELF/gnuhash.test b/llvm/test/tools/llvm-readobj/ELF/gnuhash.test
index 1ba746cd9d989..415c3b4f4b872 100644
--- a/llvm/test/tools/llvm-readobj/ELF/gnuhash.test
+++ b/llvm/test/tools/llvm-readobj/ELF/gnuhash.test
@@ -301,3 +301,8 @@ ProgramHeaders:
 # ERR-NEXT:   Shift Count: 2
 # ERR-NEXT: warning: '[[FILE]]': unable to dump the SHT_GNU_HASH section at 0x78: it goes past the end of the file
 # ERR-NEXT: }
+
+## Check we report a single warning about the broken GNU hash table when both
+## --gnu-hash-table and --elf-hash-histogram options are requested.
+# RUN: llvm-readelf --gnu-hash-table --elf-hash-histogram %t.err.nbuckets 2>&1 | \
+# RUN:   FileCheck %s -DFILE=%t.err.nbuckets -DMASKWORDS=2 -DNBUCKETS=4294967295 --check-prefix=ERR --implicit-check-not=warning
diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test
index 5447bffceec30..4ec772a18f820 100644
--- a/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test
+++ b/llvm/test/tools/llvm-readobj/ELF/hash-histogram.test
@@ -266,3 +266,55 @@ ProgramHeaders:
       - Section: .hash
       - Section: .gnu.hash
       - Section: .dynamic
+
+## Check we report a proper warning when the GNU hash table goes past the end of the file.
+
+## Case A: the 'maskwords' field is set so that the GNU hash table goes past the end of the file.
+##         The value of 1 for the NBUCKETS is no-op.
+# RUN: yaml2obj --docnum=6 -D MASKWORDS=4294967295 -D NBUCKETS=1 %s -o %t7
+# RUN: llvm-readelf --elf-hash-histogram %t7 2>&1 | \
+# RUN:   FileCheck %s -DFILE=%t7 --check-prefix=ERR5 --implicit-check-not="Histogram"
+
+# ERR5: warning: '[[FILE]]': unable to dump the SHT_GNU_HASH section at 0x78: it goes past the end of the file
+
+## Case B: the 'nbuckets' field is set so that the GNU hash table goes past the end of the file.
+##         The value of 1 for the MASKWORDS is no-op.
+# RUN: yaml2obj --docnum=6 -D MASKWORDS=1 -D NBUCKETS=4294967295 %s -o %t8
+# RUN: llvm-readelf --elf-hash-histogram %t8 2>&1 | \
+# RUN:   FileCheck %s -DFILE=%t8 --check-prefix=ERR5 --implicit-check-not="Histogram"
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_DYN
+  Machine: EM_X86_64
+Sections:
+  - Name:  .gnu.hash
+    Type:  SHT_GNU_HASH
+    Flags: [ SHF_ALLOC ]
+    Header:
+      SymNdx: 0x0
+      Shift2: 0x0
+## The number of words in the Bloom filter.
+      MaskWords: [[MASKWORDS]]
+## The number of hash buckets.
+      NBuckets:  [[NBUCKETS]]
+    BloomFilter: [ 0x0 ]
+    HashBuckets: [ 0x0 ]
+    HashValues:  [ 0x0 ]
+  - Name:  .dynamic
+    Type:  SHT_DYNAMIC
+    Flags: [ SHF_ALLOC ]
+    Link:  .dynstr
+    Entries:
+      - Tag:   DT_GNU_HASH
+        Value: 0x0
+      - Tag:   DT_NULL
+        Value: 0x0
+DynamicSymbols: []
+ProgramHeaders:
+  - Type: PT_LOAD
+    Sections:
+      - Section: .gnu.hash
+      - Section: .dynamic
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 84a68b17b298f..eebe65f8400f3 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -2666,6 +2666,28 @@ static bool checkHashTable(const ELFFile<ELFT> *Obj,
   return true;
 }
 
+// Returns an error if the GNU hash table extends past the end of the file.
+template <class ELFT>
+static Error checkGNUHashTable(const ELFFile<ELFT> *Obj,
+                               const typename ELFT::GnuHash *GnuHashTable,
+                               bool *IsHeaderValid = nullptr) {
+  const uint8_t *TableData = reinterpret_cast<const uint8_t *>(GnuHashTable);
+  assert(TableData >= Obj->base() &&
+         TableData < Obj->base() + Obj->getBufSize() &&
+         "GnuHashTable must always point to a location inside the file");
+  uint64_t TableOffset = TableData - Obj->base();
+  if (IsHeaderValid)
+    *IsHeaderValid = TableOffset + /*Header size:*/ 16 < Obj->getBufSize();
+  // Widen to 64 bits before multiplying: the 32-bit counts could wrap.
+  if (TableOffset + 16 + uint64_t(GnuHashTable->nbuckets) * 4 +
+          uint64_t(GnuHashTable->maskwords) * sizeof(typename ELFT::Off) >=
+      Obj->getBufSize())
+    return createError("unable to dump the SHT_GNU_HASH section at 0x" +
+                       Twine::utohexstr(TableOffset) +
+                       ": it goes past the end of the file");
+  return Error::success();
+}
+
 template <typename ELFT> void ELFDumper<ELFT>::printHashTable() {
   DictScope D(W, "HashTable");
   if (!HashTable ||
@@ -2682,27 +2704,19 @@ void ELFDumper<ELFT>::printGnuHashTable(const object::ObjectFile *Obj) {
   DictScope D(W, "GnuHashTable");
   if (!GnuHashTable)
     return;
-  W.printNumber("Num Buckets", GnuHashTable->nbuckets);
-  W.printNumber("First Hashed Symbol Index", GnuHashTable->symndx);
-  W.printNumber("Num Mask Words", GnuHashTable->maskwords);
-  W.printNumber("Shift Count", GnuHashTable->shift2);
-
-  MemoryBufferRef File = Obj->getMemoryBufferRef();
-  const char *TableData = reinterpret_cast<const char *>(GnuHashTable);
-  assert(TableData >= File.getBufferStart() &&
-         TableData < File.getBufferEnd() &&
-         "GnuHashTable must always point to a location inside the file");
 
-  uint64_t TableOffset = TableData - File.getBufferStart();
-  if (TableOffset +
-          /*Header size:*/ 16 + GnuHashTable->nbuckets * 4 +
-          GnuHashTable->maskwords * sizeof(typename ELFT::Off) >=
-      File.getBufferSize()) {
-    reportWarning(createError("unable to dump the SHT_GNU_HASH "
-                              "section at 0x" +
-                              Twine::utohexstr(TableOffset) +
-                              ": it goes past the end of the file"),
-                  ObjF->getFileName());
+  bool IsHeaderValid;
+  Error Err =
+      checkGNUHashTable<ELFT>(ObjF->getELFFile(), GnuHashTable, &IsHeaderValid);
+  if (IsHeaderValid) {
+    W.printNumber("Num Buckets", GnuHashTable->nbuckets);
+    W.printNumber("First Hashed Symbol Index", GnuHashTable->symndx);
+    W.printNumber("Num Mask Words", GnuHashTable->maskwords);
+    W.printNumber("Shift Count", GnuHashTable->shift2);
+  }
+
+  if (Err) {
+    reportUniqueWarning(std::move(Err));
     return;
   }
 
@@ -4680,7 +4694,10 @@ void GNUStyle<ELFT>::printHashHistograms(const ELFFile<ELFT> *Obj) {
 
   // Print histogram for the .gnu.hash section.
   if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable())
-    printGnuHashHistogram(*GnuHashTable);
+    if (Error E = checkGNUHashTable<ELFT>(Obj, GnuHashTable))
+      this->reportUniqueWarning(std::move(E));
+    else
+      printGnuHashHistogram(*GnuHashTable);
 }
 
 template <class ELFT>

From 1772adb0594bf0d8684fe8b63609352ad4a1ccf0 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Fri, 29 May 2020 12:35:16 +0200
Subject: [PATCH 492/770] [clangd] Preserve extra args in
 PreambleTests::IncludeParsing to fix windows build bots

---
 clang-tools-extra/clangd/unittests/PreambleTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/clangd/unittests/PreambleTests.cpp b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
index 75aad728280a1..b1548ef4732b4 100644
--- a/clang-tools-extra/clangd/unittests/PreambleTests.cpp
+++ b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
@@ -48,7 +48,7 @@ collectPatchedIncludes(llvm::StringRef ModifiedContents,
   TU.ExtraArgs = {"-fno-ms-compatibility"};
   auto BaselinePreamble = TU.preamble();
   // Create the patch.
-  TU = TestTU::withCode(ModifiedContents);
+  TU.Code = ModifiedContents.str();
   auto PI = TU.inputs();
   auto PP = PreamblePatch::create(testPath(TU.Filename), PI, *BaselinePreamble);
   // Collect patch contents.

From 85de54f8066aa73ddee14868b089d191d8b73280 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 May 2020 15:50:13 +0100
Subject: [PATCH 493/770] VirtualFileSystem.h - reduce Twine.h include to
 forward declaration. NFC.

---
 llvm/include/llvm/Support/VirtualFileSystem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index 28fd3298e47e7..af09c21085c5e 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -19,7 +19,6 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
 #include "llvm/Support/Chrono.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
@@ -38,6 +37,7 @@
 namespace llvm {
 
 class MemoryBuffer;
+class Twine;
 
 namespace vfs {
 

From fabf4afe0554303a73beeaec33541495e4cd2539 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 29 May 2020 11:17:55 +0100
Subject: [PATCH 494/770] IPDBInjectedSource.h - remove unused includes and
 forward declarations. NFC.

---
 llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h b/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h
index d5b36f9846b52..6ee6c7cc8fc1d 100644
--- a/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h
+++ b/llvm/include/llvm/DebugInfo/PDB/IPDBInjectedSource.h
@@ -9,15 +9,11 @@
 #ifndef LLVM_DEBUGINFO_PDB_IPDBINJECTEDSOURCE_H
 #define LLVM_DEBUGINFO_PDB_IPDBINJECTEDSOURCE_H
 
-#include "llvm/Support/raw_ostream.h"
-#include <memory>
+#include <cstdint>
 #include <string>
 
 namespace llvm {
-class raw_ostream;
-
 namespace pdb {
-
 /// IPDBInjectedSource defines an interface used to represent source files
 /// which were injected directly into the PDB file during the compilation
 /// process.  This is used, for example, to add natvis files to a PDB, but

From 9ab7215846f6b1bb6b6b0bcf19bd34ab2742e100 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 29 May 2020 11:23:40 +0100
Subject: [PATCH 495/770] IPDBLineNumber.h - remove unused includes. NFC.

---
 llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h b/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
index 77e88999497e5..80d8a68bd032d 100644
--- a/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
+++ b/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_IPDBLINENUMBER_H
 #define LLVM_DEBUGINFO_PDB_IPDBLINENUMBER_H
 
-#include "PDBTypes.h"
+#include <cstdint>
 
 namespace llvm {
 namespace pdb {

From fcde3d5b04b612ebc4164fe8f3e83f93cd1fce53 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Thu, 14 May 2020 12:20:33 +0200
Subject: [PATCH 496/770] [clangd] Patch PP directives to use stale preambles
 while building ASTs

Summary:
Depends on D79930.

This enables more accurate parsing of the AST, by making new macro
definitions in preamble section visible. This is handled by injecting
define directives into preamble patch.

This patch doesn't handle any location mappings yet, so features like go-to-def,
go-to-refs and hover might not work as expected. These will be addressed in a
follow-up patch.

Reviewers: sammccall

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D79992
---
 clang-tools-extra/clangd/Preamble.cpp         | 184 +++++++++++++-----
 clang-tools-extra/clangd/Preamble.h           |   3 +
 .../clangd/unittests/PreambleTests.cpp        | 105 ++++++++++
 3 files changed, 241 insertions(+), 51 deletions(-)

diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp
index 959b241dac120..dcfafc914d107 100644
--- a/clang-tools-extra/clangd/Preamble.cpp
+++ b/clang-tools-extra/clangd/Preamble.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
@@ -106,14 +107,70 @@ class CppFilePreambleCallbacks : public PreambleCallbacks {
   const SourceManager *SourceMgr = nullptr;
 };
 
-/// Gets the includes in the preamble section of the file by running
-/// preprocessor over \p Contents. Returned includes do not contain resolved
-/// paths. \p VFS and \p Cmd is used to build the compiler invocation, which
-/// might stat/read files.
-llvm::Expected<std::vector<Inclusion>>
-scanPreambleIncludes(llvm::StringRef Contents,
-                     llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-                     const tooling::CompileCommand &Cmd) {
+// Represents directives other than includes, where basic textual information is
+// enough.
+struct TextualPPDirective {
+  unsigned DirectiveLine;
+  // Full text that's representing the directive, including the `#`.
+  std::string Text;
+
+  bool operator==(const TextualPPDirective &RHS) const {
+    return std::tie(DirectiveLine, Text) ==
+           std::tie(RHS.DirectiveLine, RHS.Text);
+  }
+};
+
+// Collects #define directives inside the main file.
+struct DirectiveCollector : public PPCallbacks {
+  DirectiveCollector(const Preprocessor &PP,
+                     std::vector<TextualPPDirective> &TextualDirectives)
+      : LangOpts(PP.getLangOpts()), SM(PP.getSourceManager()),
+        TextualDirectives(TextualDirectives) {}
+
+  void FileChanged(SourceLocation Loc, FileChangeReason Reason,
+                   SrcMgr::CharacteristicKind FileType,
+                   FileID PrevFID) override {
+    InMainFile = SM.isWrittenInMainFile(Loc);
+  }
+
+  void MacroDefined(const Token &MacroNameTok,
+                    const MacroDirective *MD) override {
+    if (!InMainFile)
+      return;
+    TextualDirectives.emplace_back();
+    TextualPPDirective &TD = TextualDirectives.back();
+
+    auto DecompLoc = SM.getDecomposedLoc(MacroNameTok.getLocation());
+    TD.DirectiveLine = SM.getLineNumber(DecompLoc.first, DecompLoc.second);
+
+    SourceRange DefRange(
+        MD->getMacroInfo()->getDefinitionLoc(),
+        Lexer::getLocForEndOfToken(MD->getMacroInfo()->getDefinitionEndLoc(), 0,
+                                   SM, LangOpts));
+    llvm::raw_string_ostream OS(TD.Text);
+    OS << "#define " << toSourceCode(SM, DefRange);
+  }
+
+private:
+  bool InMainFile = true;
+  const LangOptions &LangOpts;
+  const SourceManager &SM;
+  std::vector<TextualPPDirective> &TextualDirectives;
+};
+
+struct ScannedPreamble {
+  std::vector<Inclusion> Includes;
+  std::vector<TextualPPDirective> TextualDirectives;
+};
+
+/// Scans the preprocessor directives in the preamble section of the file by
+/// running preprocessor over \p Contents. Returned includes do not contain
+/// resolved paths. \p VFS and \p Cmd is used to build the compiler invocation,
+/// which might stat/read files.
+llvm::Expected<ScannedPreamble>
+scanPreamble(llvm::StringRef Contents,
+             llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
+             const tooling::CompileCommand &Cmd) {
   // Build and run Preprocessor over the preamble.
   ParseInputs PI;
   PI.Contents = Contents.str();
@@ -147,14 +204,18 @@ scanPreambleIncludes(llvm::StringRef Contents,
   if (!Action.BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0]))
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "failed BeginSourceFile");
+  const auto &SM = Clang->getSourceManager();
   Preprocessor &PP = Clang->getPreprocessor();
   IncludeStructure Includes;
+  PP.addPPCallbacks(collectIncludeStructureCallback(SM, &Includes));
+  ScannedPreamble SP;
   PP.addPPCallbacks(
-      collectIncludeStructureCallback(Clang->getSourceManager(), &Includes));
+      std::make_unique<DirectiveCollector>(PP, SP.TextualDirectives));
   if (llvm::Error Err = Action.Execute())
     return std::move(Err);
   Action.EndSourceFile();
-  return Includes.MainFileIncludes;
+  SP.Includes = std::move(Includes.MainFileIncludes);
+  return SP;
 }
 
 const char *spellingForIncDirective(tok::PPKeywordKind IncludeDirective) {
@@ -277,7 +338,7 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
                                     const ParseInputs &Modified,
                                     const PreambleData &Baseline) {
   assert(llvm::sys::path::is_absolute(FileName) && "relative FileName!");
-  // First scan the include directives in Baseline and Modified. These will be
+  // First scan preprocessor directives in Baseline and Modified. These will be
   // used to figure out newly added directives in Modified. Scanning can fail,
   // the code just bails out and creates an empty patch in such cases, as:
   // - If scanning for Baseline fails, no knowledge of existing includes hence
@@ -285,25 +346,28 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
   //   whole preamble, which is terribly slow.
   // - If scanning for Modified fails, cannot figure out newly added ones so
   //   there's nothing to do but generate an empty patch.
-  auto BaselineIncludes = scanPreambleIncludes(
+  auto BaselineScan = scanPreamble(
       // Contents needs to be null-terminated.
       Baseline.Preamble.getContents().str(),
       Baseline.StatCache->getConsumingFS(Modified.FS), Modified.CompileCommand);
-  if (!BaselineIncludes) {
-    elog("Failed to scan includes for baseline of {0}: {1}", FileName,
-         BaselineIncludes.takeError());
-    return {};
+  if (!BaselineScan) {
+    elog("Failed to scan baseline of {0}: {1}", FileName,
+         BaselineScan.takeError());
+    return PreamblePatch::unmodified(Baseline);
   }
-  auto ModifiedIncludes = scanPreambleIncludes(
+  auto ModifiedScan = scanPreamble(
       Modified.Contents, Baseline.StatCache->getConsumingFS(Modified.FS),
       Modified.CompileCommand);
-  if (!ModifiedIncludes) {
-    elog("Failed to scan includes for modified contents of {0}: {1}", FileName,
-         ModifiedIncludes.takeError());
-    return {};
+  if (!ModifiedScan) {
+    elog("Failed to scan modified contents of {0}: {1}", FileName,
+         ModifiedScan.takeError());
+    return PreamblePatch::unmodified(Baseline);
   }
-  // No patch needed if includes are equal.
-  if (*BaselineIncludes == *ModifiedIncludes)
+
+  bool IncludesChanged = BaselineScan->Includes != ModifiedScan->Includes;
+  bool DirectivesChanged =
+      BaselineScan->TextualDirectives != ModifiedScan->TextualDirectives;
+  if (!IncludesChanged && !DirectivesChanged)
     return PreamblePatch::unmodified(Baseline);
 
   PreamblePatch PP;
@@ -313,18 +377,6 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
                           "__preamble_patch__.h");
   PP.PatchFileName = PatchName.str().str();
 
-  // We are only interested in newly added includes, record the ones in Baseline
-  // for exclusion.
-  llvm::DenseMap<std::pair<tok::PPKeywordKind, llvm::StringRef>,
-                 /*Resolved=*/llvm::StringRef>
-      ExistingIncludes;
-  for (const auto &Inc : Baseline.Includes.MainFileIncludes)
-    ExistingIncludes[{Inc.Directive, Inc.Written}] = Inc.Resolved;
-  // There might be includes coming from disabled regions, record these for
-  // exclusion too. note that we don't have resolved paths for those.
-  for (const auto &Inc : *BaselineIncludes)
-    ExistingIncludes.try_emplace({Inc.Directive, Inc.Written});
-  // Calculate extra includes that needs to be inserted.
   llvm::raw_string_ostream Patch(PP.PatchContents);
   // Set default filename for subsequent #line directives
   Patch << "#line 0 \"";
@@ -332,25 +384,55 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
   // might lead to problems on windows especially.
   escapeBackslashAndQuotes(FileName, Patch);
   Patch << "\"\n";
-  for (auto &Inc : *ModifiedIncludes) {
-    auto It = ExistingIncludes.find({Inc.Directive, Inc.Written});
-    // Include already present in the baseline preamble. Set resolved path and
-    // put into preamble includes.
-    if (It != ExistingIncludes.end()) {
-      Inc.Resolved = It->second.str();
-      PP.PreambleIncludes.push_back(Inc);
-      continue;
+
+  if (IncludesChanged) {
+    // We are only interested in newly added includes, record the ones in
+    // Baseline for exclusion.
+    llvm::DenseMap<std::pair<tok::PPKeywordKind, llvm::StringRef>,
+                   /*Resolved=*/llvm::StringRef>
+        ExistingIncludes;
+    for (const auto &Inc : Baseline.Includes.MainFileIncludes)
+      ExistingIncludes[{Inc.Directive, Inc.Written}] = Inc.Resolved;
+    // There might be includes coming from disabled regions, record these for
+    // exclusion too. note that we don't have resolved paths for those.
+    for (const auto &Inc : BaselineScan->Includes)
+      ExistingIncludes.try_emplace({Inc.Directive, Inc.Written});
+    // Calculate extra includes that needs to be inserted.
+    for (auto &Inc : ModifiedScan->Includes) {
+      auto It = ExistingIncludes.find({Inc.Directive, Inc.Written});
+      // Include already present in the baseline preamble. Set resolved path and
+      // put into preamble includes.
+      if (It != ExistingIncludes.end()) {
+        Inc.Resolved = It->second.str();
+        PP.PreambleIncludes.push_back(Inc);
+        continue;
+      }
+      // Include is new in the modified preamble. Inject it into the patch and
+      // use #line to set the presumed location to where it is spelled.
+      auto LineCol = offsetToClangLineColumn(Modified.Contents, Inc.HashOffset);
+      Patch << llvm::formatv("#line {0}\n", LineCol.first);
+      Patch << llvm::formatv(
+          "#{0} {1}\n", spellingForIncDirective(Inc.Directive), Inc.Written);
     }
-    // Include is new in the modified preamble. Inject it into the patch and use
-    // #line to set the presumed location to where it is spelled.
-    auto LineCol = offsetToClangLineColumn(Modified.Contents, Inc.HashOffset);
-    Patch << llvm::formatv("#line {0}\n", LineCol.first);
-    Patch << llvm::formatv("#{0} {1}\n", spellingForIncDirective(Inc.Directive),
-                           Inc.Written);
   }
-  Patch.flush();
 
-  // FIXME: Handle more directives, e.g. define/undef.
+  if (DirectivesChanged) {
+    // We need to patch all the directives, since they are order dependent. e.g:
+    // #define BAR(X) NEW(X) // Newly introduced in Modified
+    // #define BAR(X) OLD(X) // Exists in the Baseline
+    //
+    // If we've patched only the first directive, the macro definition would've
+    // been wrong for the rest of the file, since patch is applied after the
+    // baseline preamble.
+    //
+    // Note that we deliberately ignore conditional directives and undefs to
+    // reduce complexity. The former might cause problems because scanning is
+    // imprecise and might pick directives from disabled regions.
+    for (const auto &TD : ModifiedScan->TextualDirectives)
+      Patch << TD.Text << '\n';
+  }
+  dlog("Created preamble patch: {0}", Patch.str());
+  Patch.flush();
   return PP;
 }
 
diff --git a/clang-tools-extra/clangd/Preamble.h b/clang-tools-extra/clangd/Preamble.h
index c88688a51736e..f9816611153ff 100644
--- a/clang-tools-extra/clangd/Preamble.h
+++ b/clang-tools-extra/clangd/Preamble.h
@@ -119,6 +119,9 @@ class PreamblePatch {
   /// using the presumed-location mechanism.
   std::vector<Inclusion> preambleIncludes() const;
 
+  /// Returns textual patch contents.
+  llvm::StringRef text() const { return PatchContents; }
+
 private:
   PreamblePatch() = default;
   std::string PatchContents;
diff --git a/clang-tools-extra/clangd/unittests/PreambleTests.cpp b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
index b1548ef4732b4..b69efe75beec1 100644
--- a/clang-tools-extra/clangd/unittests/PreambleTests.cpp
+++ b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
@@ -26,7 +26,9 @@
 #include <string>
 #include <vector>
 
+using testing::Contains;
 using testing::Field;
+using testing::MatchesRegex;
 
 namespace clang {
 namespace clangd {
@@ -181,6 +183,109 @@ TEST(PreamblePatchTest, PatchesPreambleIncludes) {
               ElementsAre(AllOf(Field(&Inclusion::Written, "\"a.h\""),
                                 Field(&Inclusion::Resolved, testPath("a.h")))));
 }
+
+llvm::Optional<ParsedAST> createPatchedAST(llvm::StringRef Baseline,
+                                           llvm::StringRef Modified) {
+  auto BaselinePreamble = TestTU::withCode(Baseline).preamble();
+  if (!BaselinePreamble) {
+    ADD_FAILURE() << "Failed to build baseline preamble";
+    return llvm::None;
+  }
+
+  IgnoreDiagnostics Diags;
+  auto TU = TestTU::withCode(Modified);
+  auto CI = buildCompilerInvocation(TU.inputs(), Diags);
+  if (!CI) {
+    ADD_FAILURE() << "Failed to build compiler invocation";
+    return llvm::None;
+  }
+  return ParsedAST::build(testPath("main.cpp"), TU.inputs(), std::move(CI), {},
+                          BaselinePreamble);
+}
+
+std::string getPreamblePatch(llvm::StringRef Baseline,
+                             llvm::StringRef Modified) {
+  auto BaselinePreamble = TestTU::withCode(Baseline).preamble();
+  if (!BaselinePreamble) {
+    ADD_FAILURE() << "Failed to build baseline preamble";
+    return "";
+  }
+  auto TU = TestTU::withCode(Modified);
+  return PreamblePatch::create(testPath("main.cpp"), TU.inputs(),
+                               *BaselinePreamble)
+      .text()
+      .str();
+}
+
+TEST(PreamblePatchTest, Define) {
+  // BAR should be defined while parsing the AST.
+  struct {
+    llvm::StringLiteral Contents;
+    llvm::StringLiteral ExpectedPatch;
+  } Cases[] = {
+      {
+          R"cpp(
+        #define BAR
+        [[BAR]])cpp",
+          R"cpp(#line 0 ".*main.cpp"
+#define BAR
+)cpp",
+      },
+      // multiline macro
+      {
+          R"cpp(
+        #define BAR \
+
+        [[BAR]])cpp",
+          R"cpp(#line 0 ".*main.cpp"
+#define BAR
+)cpp",
+      },
+      // multiline macro
+      {
+          R"cpp(
+        #define \
+                BAR
+        [[BAR]])cpp",
+          R"cpp(#line 0 ".*main.cpp"
+#define BAR
+)cpp",
+      },
+  };
+
+  for (const auto &Case : Cases) {
+    SCOPED_TRACE(Case.Contents);
+    Annotations Modified(Case.Contents);
+    EXPECT_THAT(getPreamblePatch("", Modified.code()),
+                MatchesRegex(Case.ExpectedPatch.str()));
+
+    auto AST = createPatchedAST("", Modified.code());
+    ASSERT_TRUE(AST);
+    EXPECT_THAT(AST->getDiagnostics(),
+                Not(Contains(Field(&Diag::Range, Modified.range()))));
+  }
+}
+
+TEST(PreamblePatchTest, OrderingPreserved) {
+  llvm::StringLiteral Baseline = "#define BAR(X) X";
+  Annotations Modified(R"cpp(
+    #define BAR(X, Y) X Y
+    #define BAR(X) X
+    [[BAR]](int y);
+  )cpp");
+
+  llvm::StringLiteral ExpectedPatch(R"cpp(#line 0 ".*main.cpp"
+#define BAR\(X, Y\) X Y
+#define BAR\(X\) X
+)cpp");
+  EXPECT_THAT(getPreamblePatch(Baseline, Modified.code()),
+              MatchesRegex(ExpectedPatch.str()));
+
+  auto AST = createPatchedAST(Baseline, Modified.code());
+  ASSERT_TRUE(AST);
+  EXPECT_THAT(AST->getDiagnostics(),
+              Not(Contains(Field(&Diag::Range, Modified.range()))));
+}
 } // namespace
 } // namespace clangd
 } // namespace clang

From 538c2753f3ec818eae57a5c5dfbe1f05af57ee37 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Thu, 14 May 2020 12:26:47 +0200
Subject: [PATCH 497/770] [clangd] locateMacroAt handles patched macros

Summary: Depends on D79992.

This patch changes locateMacroAt to perform #line directive substitution
for macro identifier locations.

We first check whether a location is inside a file included through
built-in header. If so we check whether line directive maps it back to
the main file, and afterwards use TokenBuffers to find exact location of
the identifier on the line.

Instead of performing the mapping in locateMacroAt, we could also store
a mapping inside the ParsedAST whenever we use a patched preamble. But
that would imply adding more responsibility to ParsedAST and paying for
the mapping even when it is not going to be used.

====

Go-To-Definition:

Later on these locations are used for serving go-to-definition requests,
this enables jumping to definition inside the preamble section in
presence of patched macros.

=====

Go-To-Refs:

Macro references in main file are collected separately and stored as a
map from macro's symbol id to reference ranges. Those ranges are
computed inside PPCallbacks, hence we don't have access to TokenBuffer.

In presence of preamble patch, any reference to a macro inside the
preamble section will unfortunately have the wrong range. They'll point
into the patch rather than the main file. Hence during findReferences,
we won't get any ranges reported for those.

Fixing those requires:
- Lexing the preamble section to figure out "real range" of a patched
  macro definition
- Postponing range/location calculations until a later step in which we
  have access to tokenbuffers.

This patch trades some accuracy for lower code complexity. We don't do
any patching for references inside the preamble patch but get any
reference inside the main file for free.

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80198
---
 clang-tools-extra/clangd/Headers.cpp          |  19 +-
 clang-tools-extra/clangd/Hover.cpp            |  28 ++-
 clang-tools-extra/clangd/Preamble.cpp         |  91 +++++++-
 clang-tools-extra/clangd/Preamble.h           |   5 +
 clang-tools-extra/clangd/SourceCode.cpp       |   5 +-
 clang-tools-extra/clangd/SourceCode.h         |   4 +
 clang-tools-extra/clangd/XRefs.cpp            |   4 +-
 .../clangd/unittests/HeadersTests.cpp         |   4 +-
 .../clangd/unittests/PreambleTests.cpp        | 210 +++++++++++++++++-
 9 files changed, 321 insertions(+), 49 deletions(-)

diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp
index 2b9f8feb7db86..876645cd6a503 100644
--- a/clang-tools-extra/clangd/Headers.cpp
+++ b/clang-tools-extra/clangd/Headers.cpp
@@ -8,6 +8,7 @@
 
 #include "Headers.h"
 #include "Compiler.h"
+#include "Preamble.h"
 #include "SourceCode.h"
 #include "support/Logger.h"
 #include "clang/Basic/SourceLocation.h"
@@ -23,11 +24,6 @@ namespace clang {
 namespace clangd {
 namespace {
 
-bool isMainFile(llvm::StringRef FileName, const SourceManager &SM) {
-  auto FE = SM.getFileManager().getFile(FileName);
-  return FE && *FE == SM.getFileEntryForID(SM.getMainFileID());
-}
-
 class RecordHeaders : public PPCallbacks {
 public:
   RecordHeaders(const SourceManager &SM, IncludeStructure *Out)
@@ -44,17 +40,8 @@ class RecordHeaders : public PPCallbacks {
                           SrcMgr::CharacteristicKind FileKind) override {
     auto MainFID = SM.getMainFileID();
     // If an include is part of the preamble patch, translate #line directives.
-    if (InBuiltinFile) {
-      auto Presumed = SM.getPresumedLoc(HashLoc);
-      // Presumed locations will have an invalid file id when #line directive
-      // changes the filename.
-      if (Presumed.getFileID().isInvalid() &&
-          isMainFile(Presumed.getFilename(), SM)) {
-        // Now we'll hit the case below.
-        HashLoc = SM.translateLineCol(MainFID, Presumed.getLine(),
-                                      Presumed.getColumn());
-      }
-    }
+    if (InBuiltinFile)
+      HashLoc = translatePreamblePatchLocation(HashLoc, SM);
 
     // Record main-file inclusions (including those mapped from the preamble
     // patch).
diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp
index e2a3a0dd62f52..1cc564be5cf53 100644
--- a/clang-tools-extra/clangd/Hover.cpp
+++ b/clang-tools-extra/clangd/Hover.cpp
@@ -376,8 +376,7 @@ llvm::Optional<StringRef> fieldName(const Expr *E) {
   const auto *ME = llvm::dyn_cast<MemberExpr>(E->IgnoreCasts());
   if (!ME || !llvm::isa<CXXThisExpr>(ME->getBase()->IgnoreCasts()))
     return llvm::None;
-  const auto *Field =
-      llvm::dyn_cast<FieldDecl>(ME->getMemberDecl());
+  const auto *Field = llvm::dyn_cast<FieldDecl>(ME->getMemberDecl());
   if (!Field || !Field->getDeclName().isIdentifier())
     return llvm::None;
   return Field->getDeclName().getAsIdentifierInfo()->getName();
@@ -556,7 +555,14 @@ HoverInfo getHoverContents(const DefinedMacro &Macro, ParsedAST &AST) {
   // Try to get the full definition, not just the name
   SourceLocation StartLoc = Macro.Info->getDefinitionLoc();
   SourceLocation EndLoc = Macro.Info->getDefinitionEndLoc();
-  if (EndLoc.isValid()) {
+  // Ensure that EndLoc is a valid offset. For example it might come from
+  // preamble, and source file might've changed, in such a scenario EndLoc still
+  // stays valid, but getLocForEndOfToken will fail as it is no longer a valid
+  // offset.
+  // Note that this check is just to ensure there's text data inside the range.
+  // It will still succeed even when the data inside the range is irrelevant to
+  // macro definition.
+  if (SM.getPresumedLoc(EndLoc, /*UseLineDirectives=*/false).isValid()) {
     EndLoc = Lexer::getLocForEndOfToken(EndLoc, 0, SM, AST.getLangOpts());
     bool Invalid;
     StringRef Buffer = SM.getBufferData(SM.getFileID(StartLoc), &Invalid);
@@ -873,20 +879,20 @@ llvm::Optional<llvm::StringRef> getBacktickQuoteRange(llvm::StringRef Line,
   if (!Suffix.empty() && !AfterEndChars.contains(Suffix.front()))
     return llvm::None;
 
-  return Line.slice(Offset, Next+1);
+  return Line.slice(Offset, Next + 1);
 }
 
 void parseDocumentationLine(llvm::StringRef Line, markup::Paragraph &Out) {
   // Probably this is appendText(Line), but scan for something interesting.
   for (unsigned I = 0; I < Line.size(); ++I) {
     switch (Line[I]) {
-      case '`':
-        if (auto Range = getBacktickQuoteRange(Line, I)) {
-          Out.appendText(Line.substr(0, I));
-          Out.appendCode(Range->trim("`"), /*Preserve=*/true);
-          return parseDocumentationLine(Line.substr(I+Range->size()), Out);
-        }
-        break;
+    case '`':
+      if (auto Range = getBacktickQuoteRange(Line, I)) {
+        Out.appendText(Line.substr(0, I));
+        Out.appendCode(Range->trim("`"), /*Preserve=*/true);
+        return parseDocumentationLine(Line.substr(I + Range->size()), Out);
+      }
+      break;
     }
   }
   Out.appendText(Line).appendSpace();
diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp
index dcfafc914d107..667fa04ff3458 100644
--- a/clang-tools-extra/clangd/Preamble.cpp
+++ b/clang-tools-extra/clangd/Preamble.cpp
@@ -50,6 +50,7 @@
 namespace clang {
 namespace clangd {
 namespace {
+constexpr llvm::StringLiteral PreamblePatchHeaderName = "__preamble_patch__.h";
 
 bool compileCommandsAreEqual(const tooling::CompileCommand &LHS,
                              const tooling::CompileCommand &RHS) {
@@ -120,6 +121,49 @@ struct TextualPPDirective {
   }
 };
 
+// Formats a PP directive consisting of Prefix (e.g. "#define ") and Body ("X
+// 10"). The formatting is copied so that the tokens in Body have PresumedLocs
+// with correct columns and lines.
+std::string spellDirective(llvm::StringRef Prefix,
+                           CharSourceRange DirectiveRange,
+                           const LangOptions &LangOpts, const SourceManager &SM,
+                           unsigned &DirectiveLine) {
+  std::string SpelledDirective;
+  llvm::raw_string_ostream OS(SpelledDirective);
+  OS << Prefix;
+
+  // Make sure DirectiveRange is a char range and doesn't contain macro ids.
+  DirectiveRange = SM.getExpansionRange(DirectiveRange);
+  if (DirectiveRange.isTokenRange()) {
+    DirectiveRange.setEnd(
+        Lexer::getLocForEndOfToken(DirectiveRange.getEnd(), 0, SM, LangOpts));
+  }
+
+  auto DecompLoc = SM.getDecomposedLoc(DirectiveRange.getBegin());
+  DirectiveLine = SM.getLineNumber(DecompLoc.first, DecompLoc.second);
+  auto TargetColumn = SM.getColumnNumber(DecompLoc.first, DecompLoc.second) - 1;
+
+  // Pad with spaces before DirectiveRange to make sure it will be on right
+  // column when patched.
+  if (Prefix.size() <= TargetColumn) {
+    // There is enough space for Prefix and space before directive, use it.
+    // We try to squeeze the Prefix into the same line whenever we can, as
+    // putting onto a separate line won't work at the beginning of the file.
+    OS << std::string(TargetColumn - Prefix.size(), ' ');
+  } else {
+    // Prefix was longer than the space we had. We produce e.g.:
+    // #line N-1
+    // #define \
+    //    X 10
+    OS << "\\\n" << std::string(TargetColumn, ' ');
+    // Decrement because we put an additional line break before
+    // DirectiveRange.begin().
+    --DirectiveLine;
+  }
+  OS << toSourceCode(SM, DirectiveRange.getAsRange());
+  return OS.str();
+}
+
 // Collects #define directives inside the main file.
 struct DirectiveCollector : public PPCallbacks {
   DirectiveCollector(const Preprocessor &PP,
@@ -140,15 +184,12 @@ struct DirectiveCollector : public PPCallbacks {
     TextualDirectives.emplace_back();
     TextualPPDirective &TD = TextualDirectives.back();
 
-    auto DecompLoc = SM.getDecomposedLoc(MacroNameTok.getLocation());
-    TD.DirectiveLine = SM.getLineNumber(DecompLoc.first, DecompLoc.second);
-
-    SourceRange DefRange(
-        MD->getMacroInfo()->getDefinitionLoc(),
-        Lexer::getLocForEndOfToken(MD->getMacroInfo()->getDefinitionEndLoc(), 0,
-                                   SM, LangOpts));
-    llvm::raw_string_ostream OS(TD.Text);
-    OS << "#define " << toSourceCode(SM, DefRange);
+    const auto *MI = MD->getMacroInfo();
+    TD.Text =
+        spellDirective("#define ",
+                       CharSourceRange::getTokenRange(
+                           MI->getDefinitionLoc(), MI->getDefinitionEndLoc()),
+                       LangOpts, SM, TD.DirectiveLine);
   }
 
 private:
@@ -231,6 +272,13 @@ const char *spellingForIncDirective(tok::PPKeywordKind IncludeDirective) {
   }
   llvm_unreachable("not an include directive");
 }
+
+// Checks whether \p FileName is a valid spelling of main file.
+bool isMainFile(llvm::StringRef FileName, const SourceManager &SM) {
+  auto FE = SM.getFileManager().getFile(FileName);
+  return FE && *FE == SM.getFileEntryForID(SM.getMainFileID());
+}
+
 } // namespace
 
 PreambleData::PreambleData(const ParseInputs &Inputs,
@@ -374,7 +422,7 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
   // This shouldn't coincide with any real file name.
   llvm::SmallString<128> PatchName;
   llvm::sys::path::append(PatchName, llvm::sys::path::parent_path(FileName),
-                          "__preamble_patch__.h");
+                          PreamblePatchHeaderName);
   PP.PatchFileName = PatchName.str().str();
 
   llvm::raw_string_ostream Patch(PP.PatchContents);
@@ -428,8 +476,10 @@ PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
     // Note that we deliberately ignore conditional directives and undefs to
     // reduce complexity. The former might cause problems because scanning is
     // imprecise and might pick directives from disabled regions.
-    for (const auto &TD : ModifiedScan->TextualDirectives)
+    for (const auto &TD : ModifiedScan->TextualDirectives) {
+      Patch << "#line " << TD.DirectiveLine << '\n';
       Patch << TD.Text << '\n';
+    }
   }
   dlog("Created preamble patch: {0}", Patch.str());
   Patch.flush();
@@ -462,5 +512,24 @@ PreamblePatch PreamblePatch::unmodified(const PreambleData &Preamble) {
   return PP;
 }
 
+SourceLocation translatePreamblePatchLocation(SourceLocation Loc,
+                                              const SourceManager &SM) {
+  auto DefFile = SM.getFileID(Loc);
+  if (auto *FE = SM.getFileEntryForID(DefFile)) {
+    auto IncludeLoc = SM.getIncludeLoc(DefFile);
+    // Preamble patch is included inside the builtin file.
+    if (IncludeLoc.isValid() && SM.isWrittenInBuiltinFile(IncludeLoc) &&
+        FE->getName().endswith(PreamblePatchHeaderName)) {
+      auto Presumed = SM.getPresumedLoc(Loc);
+      // Check that line directive is pointing at main file.
+      if (Presumed.isValid() && Presumed.getFileID().isInvalid() &&
+          isMainFile(Presumed.getFilename(), SM)) {
+        Loc = SM.translateLineCol(SM.getMainFileID(), Presumed.getLine(),
+                                  Presumed.getColumn());
+      }
+    }
+  }
+  return Loc;
+}
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/Preamble.h b/clang-tools-extra/clangd/Preamble.h
index f9816611153ff..35c168573b90a 100644
--- a/clang-tools-extra/clangd/Preamble.h
+++ b/clang-tools-extra/clangd/Preamble.h
@@ -131,6 +131,11 @@ class PreamblePatch {
   std::vector<Inclusion> PreambleIncludes;
 };
 
+/// Translates locations inside preamble patch to their main-file equivalent
+/// using presumed locations. Returns \p Loc if it isn't inside preamble patch.
+SourceLocation translatePreamblePatchLocation(SourceLocation Loc,
+                                              const SourceManager &SM);
+
 } // namespace clangd
 } // namespace clang
 
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index 0d08bf8e0a1ae..f6f9371d436fe 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -8,6 +8,7 @@
 #include "SourceCode.h"
 
 #include "FuzzyMatch.h"
+#include "Preamble.h"
 #include "Protocol.h"
 #include "refactor/Tweak.h"
 #include "support/Context.h"
@@ -961,7 +962,9 @@ llvm::Optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok,
     Loc = Loc.getLocWithOffset(-1);
   MacroDefinition MacroDef = PP.getMacroDefinitionAtLoc(IdentifierInfo, Loc);
   if (auto *MI = MacroDef.getMacroInfo())
-    return DefinedMacro{IdentifierInfo->getName(), MI};
+    return DefinedMacro{
+        IdentifierInfo->getName(), MI,
+        translatePreamblePatchLocation(MI->getDefinitionLoc(), SM)};
   return None;
 }
 
diff --git a/clang-tools-extra/clangd/SourceCode.h b/clang-tools-extra/clangd/SourceCode.h
index c00cc17ac9bcf..73fd9398b71d5 100644
--- a/clang-tools-extra/clangd/SourceCode.h
+++ b/clang-tools-extra/clangd/SourceCode.h
@@ -292,6 +292,10 @@ EligibleRegion getEligiblePoints(llvm::StringRef Code,
 struct DefinedMacro {
   llvm::StringRef Name;
   const MacroInfo *Info;
+  /// Location of the identifier that names the macro.
+  /// Unlike Info->Location, this translates preamble-patch locations to
+  /// main-file locations.
+  SourceLocation NameLoc;
 };
 /// Gets the macro referenced by \p SpelledTok. It must be a spelled token
 /// aligned to the beginning of an identifier.
diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp
index 7de1dc53596e9..df2c7a7c72285 100644
--- a/clang-tools-extra/clangd/XRefs.cpp
+++ b/clang-tools-extra/clangd/XRefs.cpp
@@ -211,8 +211,8 @@ llvm::Optional<LocatedSymbol>
 locateMacroReferent(const syntax::Token &TouchedIdentifier, ParsedAST &AST,
                     llvm::StringRef MainFilePath) {
   if (auto M = locateMacroAt(TouchedIdentifier, AST.getPreprocessor())) {
-    if (auto Loc = makeLocation(AST.getASTContext(),
-                                M->Info->getDefinitionLoc(), MainFilePath)) {
+    if (auto Loc =
+            makeLocation(AST.getASTContext(), M->NameLoc, MainFilePath)) {
       LocatedSymbol Macro;
       Macro.Name = std::string(M->Name);
       Macro.PreferredDeclaration = *Loc;
diff --git a/clang-tools-extra/clangd/unittests/HeadersTests.cpp b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
index 2de0b5ca92a70..07ab835bdb673 100644
--- a/clang-tools-extra/clangd/unittests/HeadersTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
@@ -306,7 +306,7 @@ TEST(Headers, NoHeaderSearchInfo) {
 }
 
 TEST_F(HeadersTest, PresumedLocations) {
-  std::string HeaderFile = "implicit_include.h";
+  std::string HeaderFile = "__preamble_patch__.h";
 
   // Line map inclusion back to main file.
   std::string HeaderContents =
@@ -317,7 +317,7 @@ TEST_F(HeadersTest, PresumedLocations) {
   FS.Files[HeaderFile] = HeaderContents;
 
   // Including through non-builtin file has no effects.
-  FS.Files[MainFile] = "#include \"implicit_include.h\"\n\n";
+  FS.Files[MainFile] = "#include \"__preamble_patch__.h\"\n\n";
   EXPECT_THAT(collectIncludes().MainFileIncludes,
               Not(Contains(Written("<a.h>"))));
 
diff --git a/clang-tools-extra/clangd/unittests/PreambleTests.cpp b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
index b69efe75beec1..87a68ceea048a 100644
--- a/clang-tools-extra/clangd/unittests/PreambleTests.cpp
+++ b/clang-tools-extra/clangd/unittests/PreambleTests.cpp
@@ -9,14 +9,19 @@
 #include "Annotations.h"
 #include "Compiler.h"
 #include "Headers.h"
+#include "Hover.h"
 #include "Preamble.h"
+#include "SourceCode.h"
 #include "TestFS.h"
 #include "TestTU.h"
+#include "XRefs.h"
+#include "clang/Format/Format.h"
 #include "clang/Frontend/PrecompiledPreamble.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "gmock/gmock.h"
@@ -28,6 +33,7 @@
 
 using testing::Contains;
 using testing::Field;
+using testing::Matcher;
 using testing::MatchesRegex;
 
 namespace clang {
@@ -199,7 +205,7 @@ llvm::Optional<ParsedAST> createPatchedAST(llvm::StringRef Baseline,
     ADD_FAILURE() << "Failed to build compiler invocation";
     return llvm::None;
   }
-  return ParsedAST::build(testPath("main.cpp"), TU.inputs(), std::move(CI), {},
+  return ParsedAST::build(testPath(TU.Filename), TU.inputs(), std::move(CI), {},
                           BaselinePreamble);
 }
 
@@ -228,7 +234,8 @@ TEST(PreamblePatchTest, Define) {
         #define BAR
         [[BAR]])cpp",
           R"cpp(#line 0 ".*main.cpp"
-#define BAR
+#line 2
+#define         BAR
 )cpp",
       },
       // multiline macro
@@ -238,7 +245,8 @@ TEST(PreamblePatchTest, Define) {
 
         [[BAR]])cpp",
           R"cpp(#line 0 ".*main.cpp"
-#define BAR
+#line 2
+#define         BAR
 )cpp",
       },
       // multiline macro
@@ -248,7 +256,8 @@ TEST(PreamblePatchTest, Define) {
                 BAR
         [[BAR]])cpp",
           R"cpp(#line 0 ".*main.cpp"
-#define BAR
+#line 3
+#define         BAR
 )cpp",
       },
   };
@@ -275,8 +284,10 @@ TEST(PreamblePatchTest, OrderingPreserved) {
   )cpp");
 
   llvm::StringLiteral ExpectedPatch(R"cpp(#line 0 ".*main.cpp"
-#define BAR\(X, Y\) X Y
-#define BAR\(X\) X
+#line 2
+#define     BAR\(X, Y\) X Y
+#line 3
+#define     BAR\(X\) X
 )cpp");
   EXPECT_THAT(getPreamblePatch(Baseline, Modified.code()),
               MatchesRegex(ExpectedPatch.str()));
@@ -286,6 +297,193 @@ TEST(PreamblePatchTest, OrderingPreserved) {
   EXPECT_THAT(AST->getDiagnostics(),
               Not(Contains(Field(&Diag::Range, Modified.range()))));
 }
+
+TEST(PreamblePatchTest, LocateMacroAtWorks) {
+  struct {
+    llvm::StringLiteral Baseline;
+    llvm::StringLiteral Modified;
+  } Cases[] = {
+      // Addition of new directive
+      {
+          "",
+          R"cpp(
+            #define $def^FOO
+            $use^FOO)cpp",
+      },
+      // Available inside preamble section
+      {
+          "",
+          R"cpp(
+            #define $def^FOO
+            #undef $use^FOO)cpp",
+      },
+      // Available after undef, as we don't patch those
+      {
+          "",
+          R"cpp(
+            #define $def^FOO
+            #undef FOO
+            $use^FOO)cpp",
+      },
+      // Identifier on a different line
+      {
+          "",
+          R"cpp(
+            #define \
+              $def^FOO
+            $use^FOO)cpp",
+      },
+      // In presence of comment tokens
+      {
+          "",
+          R"cpp(
+            #\
+              define /* FOO */\
+              /* FOO */ $def^FOO
+            $use^FOO)cpp",
+      },
+      // Moved around
+      {
+          "#define FOO",
+          R"cpp(
+            #define BAR
+            #define $def^FOO
+            $use^FOO)cpp",
+      },
+  };
+  for (const auto &Case : Cases) {
+    SCOPED_TRACE(Case.Modified);
+    llvm::Annotations Modified(Case.Modified);
+    auto AST = createPatchedAST(Case.Baseline, Modified.code());
+    ASSERT_TRUE(AST);
+
+    const auto &SM = AST->getSourceManager();
+    auto *MacroTok = AST->getTokens().spelledTokenAt(
+        SM.getComposedLoc(SM.getMainFileID(), Modified.point("use")));
+    ASSERT_TRUE(MacroTok);
+
+    auto FoundMacro = locateMacroAt(*MacroTok, AST->getPreprocessor());
+    ASSERT_TRUE(FoundMacro);
+    EXPECT_THAT(FoundMacro->Name, "FOO");
+
+    auto MacroLoc = FoundMacro->NameLoc;
+    EXPECT_EQ(SM.getFileID(MacroLoc), SM.getMainFileID());
+    EXPECT_EQ(SM.getFileOffset(MacroLoc), Modified.point("def"));
+  }
+}
+
+TEST(PreamblePatchTest, LocateMacroAtDeletion) {
+  {
+    // We don't patch deleted define directives, make sure we don't crash.
+    llvm::StringLiteral Baseline = "#define FOO";
+    llvm::Annotations Modified("^FOO");
+
+    auto AST = createPatchedAST(Baseline, Modified.code());
+    ASSERT_TRUE(AST);
+
+    const auto &SM = AST->getSourceManager();
+    auto *MacroTok = AST->getTokens().spelledTokenAt(
+        SM.getComposedLoc(SM.getMainFileID(), Modified.point()));
+    ASSERT_TRUE(MacroTok);
+
+    auto FoundMacro = locateMacroAt(*MacroTok, AST->getPreprocessor());
+    ASSERT_TRUE(FoundMacro);
+    EXPECT_THAT(FoundMacro->Name, "FOO");
+    auto HI =
+        getHover(*AST, offsetToPosition(Modified.code(), Modified.point()),
+                 format::getLLVMStyle(), nullptr);
+    ASSERT_TRUE(HI);
+    EXPECT_THAT(HI->Definition, testing::IsEmpty());
+  }
+
+  {
+    // Offset is valid, but underlying text is different.
+    llvm::StringLiteral Baseline = "#define FOO";
+    Annotations Modified(R"cpp(#define BAR
+    ^FOO")cpp");
+
+    auto AST = createPatchedAST(Baseline, Modified.code());
+    ASSERT_TRUE(AST);
+
+    auto HI = getHover(*AST, Modified.point(), format::getLLVMStyle(), nullptr);
+    ASSERT_TRUE(HI);
+    EXPECT_THAT(HI->Definition, "#define BAR");
+  }
+}
+
+TEST(PreamblePatchTest, RefsToMacros) {
+  struct {
+    llvm::StringLiteral Baseline;
+    llvm::StringLiteral Modified;
+  } Cases[] = {
+      // Newly added
+      {
+          "",
+          R"cpp(
+            #define ^FOO
+            ^[[FOO]])cpp",
+      },
+      // Moved around
+      {
+          "#define FOO",
+          R"cpp(
+            #define BAR
+            #define ^FOO
+            ^[[FOO]])cpp",
+      },
+      // Ref in preamble section
+      {
+          "",
+          R"cpp(
+            #define ^FOO
+            #undef ^FOO)cpp",
+      },
+  };
+
+  for (const auto &Case : Cases) {
+    Annotations Modified(Case.Modified);
+    auto AST = createPatchedAST("", Modified.code());
+    ASSERT_TRUE(AST);
+
+    const auto &SM = AST->getSourceManager();
+    std::vector<Matcher<Location>> ExpectedLocations;
+    for (const auto &R : Modified.ranges())
+      ExpectedLocations.push_back(Field(&Location::range, R));
+
+    for (const auto &P : Modified.points()) {
+      auto *MacroTok = AST->getTokens().spelledTokenAt(SM.getComposedLoc(
+          SM.getMainFileID(),
+          llvm::cantFail(positionToOffset(Modified.code(), P))));
+      ASSERT_TRUE(MacroTok);
+      EXPECT_THAT(findReferences(*AST, P, 0).References,
+                  testing::ElementsAreArray(ExpectedLocations));
+    }
+  }
+}
+
+TEST(TranslatePreamblePatchLocation, Simple) {
+  auto TU = TestTU::withHeaderCode(R"cpp(
+    #line 3 "main.cpp"
+    int foo();)cpp");
+  // Presumed line/col needs to be valid in the main file.
+  TU.Code = R"cpp(// line 1
+    // line 2
+    // line 3
+    // line 4)cpp";
+  TU.Filename = "main.cpp";
+  TU.HeaderFilename = "__preamble_patch__.h";
+  TU.ImplicitHeaderGuard = false;
+
+  auto AST = TU.build();
+  auto &SM = AST.getSourceManager();
+  auto &ND = findDecl(AST, "foo");
+  EXPECT_NE(SM.getFileID(ND.getLocation()), SM.getMainFileID());
+
+  auto TranslatedLoc = translatePreamblePatchLocation(ND.getLocation(), SM);
+  auto DecompLoc = SM.getDecomposedLoc(TranslatedLoc);
+  EXPECT_EQ(DecompLoc.first, SM.getMainFileID());
+  EXPECT_EQ(SM.getLineNumber(DecompLoc.first, DecompLoc.second), 3U);
+}
 } // namespace
 } // namespace clangd
 } // namespace clang

From a91b801b3914bfed334f85fb2e43a25b91a9056d Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Fri, 29 May 2020 12:49:11 +0200
Subject: [PATCH 498/770] Fix broken include

---
 llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h b/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
index 80d8a68bd032d..47b6397099b78 100644
--- a/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
+++ b/llvm/include/llvm/DebugInfo/PDB/IPDBLineNumber.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_IPDBLINENUMBER_H
 #define LLVM_DEBUGINFO_PDB_IPDBLINENUMBER_H
 
-#include <cstdint.>
+#include <cstdint>
 
 namespace llvm {
 namespace pdb {

From 4265f1d23cc9d10b9544bafc7a285ca5f143faea Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Wed, 27 May 2020 10:36:25 +0100
Subject: [PATCH 499/770] [CodeGen] Fix warnings in getZeroExtendInReg

We should be using getVectorElementCount() to assert that two types
have the same numbers of elements. I encountered the warnings while
compiling this test:

  CodeGen/AArch64/sve-intrinsics-ld1.ll

Differential Revision: https://reviews.llvm.org/D80616
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2f277eee84956..bd1a5a4a876ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1175,7 +1175,7 @@ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
          "getZeroExtendInReg type should be vector iff the operand "
          "type is vector!");
   assert((!VT.isVector() ||
-          VT.getVectorNumElements() == OpVT.getVectorNumElements()) &&
+          VT.getVectorElementCount() == OpVT.getVectorElementCount()) &&
          "Vector element counts must match in getZeroExtendInReg");
   assert(VT.bitsLE(OpVT) && "Not extending!");
   if (OpVT == VT)

From 838a955ab9c79f906ef936fc0100f94cae71a83c Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Fri, 29 May 2020 12:55:11 +0200
Subject: [PATCH 500/770] [readobj] Fix dangling else warning

---
 llvm/tools/llvm-readobj/ELFDumper.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index eebe65f8400f3..dc080c8dd49cb 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -4693,11 +4693,12 @@ void GNUStyle<ELFT>::printHashHistograms(const ELFFile<ELFT> *Obj) {
       printHashHistogram(*HashTable);
 
   // Print histogram for the .gnu.hash section.
-  if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable())
+  if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable()) {
     if (Error E = checkGNUHashTable<ELFT>(Obj, GnuHashTable))
       this->reportUniqueWarning(std::move(E));
     else
       printGnuHashHistogram(*GnuHashTable);
+  }
 }
 
 template <class ELFT>

From 01f999ae8871544ab4996fd1368c0dfe4c4a0765 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 29 May 2020 09:29:39 +0100
Subject: [PATCH 501/770] [SCCP] Switch to widen at PHIs, stores and call
 edges.

Currently SCCP does not widen PHIs, stores or along call edges
(arguments/return values), but on operations that directly extend ranges
(like binary operators).

This means PHIs, stores and call edges are not pessimized by widening
currently, while binary operators are. The main reason for widening
operators initially was that opting-out for certain operations was
more straight-forward in the initial implementation (and it did not
matter too much, as range support initially was only implemented for a
very limited set of operations).

During the discussion in D78391, it was suggested to consider flipping
widening to PHIs, stores and along call edges. After adding support for
tracking the number of range extensions in ValueLattice, limiting the
number of range extensions per value is straight forward.

This patch introduces a MaxWidenSteps option to the MergeOptions,
limiting the number of range extensions per value. For PHIs, it seems
natural to allow an extension for each (active) incoming value plus 1. For
the other cases, an arbitrary limit of 10 has been chosen initially. It would
potentially make sense to set it depending on the users of a
function/global, but that still needs investigating. This potentially
leads to more state-changes and longer compile-times.

The results look quite promising (MultiSource, SPEC):

Same hash: 179 (filtered out)
Remaining: 58
Metric: sccp.IPNumInstRemoved

Program                                        base    widen-phi diff
 test-suite...ks/Prolangs-C/agrep/agrep.test    58.00   82.00    41.4%
 test-suite...marks/SciMark2-C/scimark2.test    32.00   43.00    34.4%
 test-suite...rks/FreeBench/mason/mason.test     6.00    8.00    33.3%
 test-suite...langs-C/football/football.test   104.00  128.00    23.1%
 test-suite...cations/hexxagon/hexxagon.test    36.00   42.00    16.7%
 test-suite...CFP2000/177.mesa/177.mesa.test   214.00  249.00    16.4%
 test-suite...ngs-C/assembler/assembler.test    14.00   16.00    14.3%
 test-suite...arks/VersaBench/dbms/dbms.test    10.00   11.00    10.0%
 test-suite...oxyApps-C++/miniFE/miniFE.test    43.00   47.00     9.3%
 test-suite...ications/JM/ldecod/ldecod.test   179.00  195.00     8.9%
 test-suite...CFP2006/433.milc/433.milc.test   249.00  265.00     6.4%
 test-suite.../CINT2000/175.vpr/175.vpr.test    98.00  104.00     6.1%
 test-suite...peg2/mpeg2dec/mpeg2decode.test    70.00   74.00     5.7%
 test-suite...CFP2000/188.ammp/188.ammp.test    71.00   75.00     5.6%
 test-suite...ce/Benchmarks/PAQ8p/paq8p.test   111.00  117.00     5.4%
 test-suite...ce/Applications/Burg/burg.test    41.00   43.00     4.9%
 test-suite...000/197.parser/197.parser.test    66.00   69.00     4.5%
 test-suite...tions/lambda-0.1.3/lambda.test    23.00   24.00     4.3%
 test-suite...urce/Applications/lua/lua.test   301.00  313.00     4.0%
 test-suite...TimberWolfMC/timberwolfmc.test    76.00   79.00     3.9%
 test-suite...lications/ClamAV/clamscan.test   991.00  1030.00    3.9%
 test-suite...plications/d/make_dparser.test    53.00   55.00     3.8%
 test-suite...fice-ispell/office-ispell.test    83.00   86.00     3.6%
 test-suite...lications/obsequi/Obsequi.test    28.00   29.00     3.6%
 test-suite.../Prolangs-C/bison/mybison.test    56.00   58.00     3.6%
 test-suite.../CINT2000/254.gap/254.gap.test   170.00  176.00     3.5%
 test-suite.../Applications/lemon/lemon.test    30.00   31.00     3.3%
 test-suite.../CINT2000/176.gcc/176.gcc.test   1202.00 1240.00    3.2%
 test-suite...pplications/treecc/treecc.test    79.00   81.00     2.5%
 test-suite...chmarks/MallocBench/gs/gs.test   357.00  366.00     2.5%
 test-suite...eeBench/analyzer/analyzer.test   103.00  105.00     1.9%
 test-suite...T2006/445.gobmk/445.gobmk.test   1697.00 1724.00    1.6%
 test-suite...006/453.povray/453.povray.test   1812.00 1839.00    1.5%
 test-suite.../Benchmarks/Bullet/bullet.test   337.00  342.00     1.5%
 test-suite.../CINT2000/252.eon/252.eon.test   426.00  432.00     1.4%
 test-suite...T2000/300.twolf/300.twolf.test   214.00  217.00     1.4%
 test-suite...pplications/oggenc/oggenc.test   244.00  247.00     1.2%
 test-suite.../CINT2006/403.gcc/403.gcc.test   4008.00 4055.00    1.2%
 test-suite...T2006/456.hmmer/456.hmmer.test   175.00  177.00     1.1%
 test-suite...nal/skidmarks10/skidmarks.test   430.00  434.00     0.9%
 test-suite.../Applications/sgefa/sgefa.test   115.00  116.00     0.9%
 test-suite...006/447.dealII/447.dealII.test   1082.00 1091.00    0.8%
 test-suite...6/482.sphinx3/482.sphinx3.test   141.00  142.00     0.7%
 test-suite...ocBench/espresso/espresso.test   152.00  153.00     0.7%
 test-suite...3.xalancbmk/483.xalancbmk.test   4003.00 4025.00    0.5%
 test-suite...lications/sqlite3/sqlite3.test   548.00  551.00     0.5%
 test-suite...marks/7zip/7zip-benchmark.test   5522.00 5551.00    0.5%
 test-suite...nsumer-lame/consumer-lame.test   208.00  209.00     0.5%
 test-suite...:: External/Povray/povray.test   1556.00 1563.00    0.4%
 test-suite...000/186.crafty/186.crafty.test   298.00  299.00     0.3%
 test-suite.../Applications/SPASS/SPASS.test   2019.00 2025.00    0.3%
 test-suite...ications/JM/lencod/lencod.test   8427.00 8449.00    0.3%
 test-suite...6/464.h264ref/464.h264ref.test   6797.00 6813.00    0.2%
 test-suite...6/471.omnetpp/471.omnetpp.test   431.00  430.00    -0.2%
 test-suite...006/450.soplex/450.soplex.test   446.00  447.00     0.2%
 test-suite...0.perlbench/400.perlbench.test   1729.00 1727.00   -0.1%
 test-suite...000/255.vortex/255.vortex.test   3815.00 3819.00    0.1%

Reviewers: efriedma, nikic, davide

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D79036
---
 llvm/include/llvm/Analysis/ValueLattice.h     |  21 +-
 llvm/lib/Transforms/Scalar/SCCP.cpp           |  57 +++--
 .../Transforms/SCCP/constant-range-struct.ll  |  24 +-
 llvm/test/Transforms/SCCP/ipsccp-cycles.ll    | 242 ++++++++++++++++++
 .../SCCP/resolvedundefsin-tracked-fn.ll       |   5 +-
 llvm/test/Transforms/SCCP/widening.ll         | 130 ++++++++--
 6 files changed, 410 insertions(+), 69 deletions(-)
 create mode 100644 llvm/test/Transforms/SCCP/ipsccp-cycles.ll

diff --git a/llvm/include/llvm/Analysis/ValueLattice.h b/llvm/include/llvm/Analysis/ValueLattice.h
index 718c2646ec8f7..00a230fb08c52 100644
--- a/llvm/include/llvm/Analysis/ValueLattice.h
+++ b/llvm/include/llvm/Analysis/ValueLattice.h
@@ -113,10 +113,16 @@ class ValueLatticeElement {
     /// number of steps.
     bool CheckWiden;
 
+    /// The number of allowed widening steps (including setting the range
+    /// initially).
+    unsigned MaxWidenSteps;
+
     MergeOptions() : MergeOptions(false, false) {}
 
-    MergeOptions(bool MayIncludeUndef, bool CheckWiden)
-        : MayIncludeUndef(MayIncludeUndef), CheckWiden(CheckWiden) {}
+    MergeOptions(bool MayIncludeUndef, bool CheckWiden,
+                 unsigned MaxWidenSteps = 1)
+        : MayIncludeUndef(MayIncludeUndef), CheckWiden(CheckWiden),
+          MaxWidenSteps(MaxWidenSteps) {}
 
     MergeOptions &setMayIncludeUndef(bool V = true) {
       MayIncludeUndef = V;
@@ -127,6 +133,12 @@ class ValueLatticeElement {
       CheckWiden = V;
       return *this;
     }
+
+    MergeOptions &setMaxWidenSteps(unsigned Steps = 1) {
+      CheckWiden = true;
+      MaxWidenSteps = Steps;
+      return *this;
+    }
   };
 
   // ConstVal and Range are initialized on-demand.
@@ -349,7 +361,7 @@ class ValueLatticeElement {
 
       // Simple form of widening. If a range is extended multiple times, go to
       // overdefined.
-      if (Opts.CheckWiden && ++NumRangeExtensions == 1)
+      if (Opts.CheckWiden && ++NumRangeExtensions > Opts.MaxWidenSteps)
         return markOverdefined();
 
       assert(NewR.contains(getConstantRange()) &&
@@ -458,6 +470,9 @@ class ValueLatticeElement {
 
     return nullptr;
   }
+
+  unsigned getNumRangeExtensions() const { return NumRangeExtensions; }
+  void setNumRangeExtensions(unsigned N) { NumRangeExtensions = N; }
 };
 
 static_assert(sizeof(ValueLatticeElement) <= 40,
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index e6477e3d259bb..6eb83af8a3718 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -72,6 +72,15 @@ STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
 STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
 STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
 
+// The maximum number of range extensions allowed for operations requiring
+// widening.
+static const unsigned MaxNumRangeExtensions = 10;
+
+/// Returns MergeOptions with MaxWidenSteps set to MaxNumRangeExtensions.
+static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
+  return ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+      MaxNumRangeExtensions);
+}
 namespace {
 
 // Helper to check if \p LV is either a constant or a constant
@@ -401,7 +410,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
   bool mergeInValue(ValueLatticeElement &IV, Value *V,
                     ValueLatticeElement MergeWithV,
                     ValueLatticeElement::MergeOptions Opts = {
-                        /*MayIncludeUndef=*/false, /*CheckWiden=*/true}) {
+                        /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
     if (IV.mergeIn(MergeWithV, Opts)) {
       pushToWorkList(IV, V);
       LLVM_DEBUG(dbgs() << "Merged " << MergeWithV << " into " << *V << " : "
@@ -413,7 +422,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
 
   bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
                     ValueLatticeElement::MergeOptions Opts = {
-                        /*MayIncludeUndef=*/false, /*CheckWiden=*/true}) {
+                        /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
     assert(!V->getType()->isStructTy() &&
            "non-structs should use markConstant");
     return mergeInValue(ValueState[V], V, MergeWithV, Opts);
@@ -725,24 +734,36 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
   if (PN.getNumIncomingValues() > 64)
     return (void)markOverdefined(&PN);
 
+  unsigned NumActiveIncoming = 0;
+
   // Look at all of the executable operands of the PHI node.  If any of them
   // are overdefined, the PHI becomes overdefined as well.  If they are all
   // constant, and they agree with each other, the PHI becomes the identical
-  // constant.  If they are constant and don't agree, the PHI is overdefined.
-  // If there are no executable operands, the PHI remains unknown.
-  bool Changed = false;
+  // constant.  If they are constant and don't agree, the PHI is a constant
+  // range. If there are no executable operands, the PHI remains unknown.
+  ValueLatticeElement PhiState = getValueState(&PN);
   for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
-    ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
     if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
       continue;
 
-    ValueLatticeElement &Res = getValueState(&PN);
-    Changed |= Res.mergeIn(IV);
-    if (Res.isOverdefined())
+    ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
+    PhiState.mergeIn(IV);
+    NumActiveIncoming++;
+    if (PhiState.isOverdefined())
       break;
   }
-  if (Changed)
-    pushToWorkListMsg(ValueState[&PN], &PN);
+
+  // We allow up to 1 range extension per active incoming value and one
+  // additional extension. Note that we manually adjust the number of range
+  // extensions to match the number of active incoming values. This helps to
+  // limit multiple extensions caused by the same incoming value, if other
+  // incoming values are equal.
+  mergeInValue(&PN, PhiState,
+               ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+                   NumActiveIncoming + 1));
+  ValueLatticeElement &PhiStateRef = getValueState(&PN);
+  PhiStateRef.setNumRangeExtensions(
+      std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
 }
 
 void SCCPSolver::visitReturnInst(ReturnInst &I) {
@@ -1112,8 +1133,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
       // If we are tracking this global, merge in the known value for it.
       auto It = TrackedGlobals.find(GV);
       if (It != TrackedGlobals.end()) {
-        mergeInValue(IV, &I, It->second,
-                     ValueLatticeElement::MergeOptions().setCheckWiden(false));
+        mergeInValue(IV, &I, It->second, getMaxWidenStepsOpts());
         return;
       }
     }
@@ -1201,11 +1221,11 @@ void SCCPSolver::handleCallArguments(CallBase &CB) {
       if (auto *STy = dyn_cast<StructType>(AI->getType())) {
         for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
           ValueLatticeElement CallArg = getStructValueState(*CAI, i);
-          mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
+          mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
+                       getMaxWidenStepsOpts());
         }
       } else
-        mergeInValue(&*AI, getValueState(*CAI),
-                     ValueLatticeElement::MergeOptions().setCheckWiden(false));
+        mergeInValue(&*AI, getValueState(*CAI), getMaxWidenStepsOpts());
     }
   }
 }
@@ -1316,14 +1336,15 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
     // into this call site.
     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
       mergeInValue(getStructValueState(&CB, i), &CB,
-                   TrackedMultipleRetVals[std::make_pair(F, i)]);
+                   TrackedMultipleRetVals[std::make_pair(F, i)],
+                   getMaxWidenStepsOpts());
   } else {
     auto TFRVI = TrackedRetVals.find(F);
     if (TFRVI == TrackedRetVals.end())
       return handleCallOverdefined(CB); // Not tracking this callee.
 
     // If so, propagate the return value of the callee into this call result.
-    mergeInValue(&CB, TFRVI->second);
+    mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
   }
 }
 
diff --git a/llvm/test/Transforms/SCCP/constant-range-struct.ll b/llvm/test/Transforms/SCCP/constant-range-struct.ll
index 6a602feefa4ca..290437d386aaf 100644
--- a/llvm/test/Transforms/SCCP/constant-range-struct.ll
+++ b/llvm/test/Transforms/SCCP/constant-range-struct.ll
@@ -102,22 +102,14 @@ define void @struct2_caller() {
 ; CHECK-NEXT:    [[S:%.*]] = call { i64, i64 } @struct2()
 ; CHECK-NEXT:    [[V1:%.*]] = extractvalue { i64, i64 } [[S]], 0
 ; CHECK-NEXT:    [[V2:%.*]] = extractvalue { i64, i64 } [[S]], 1
-; CHECK-NEXT:    [[T_1:%.*]] = icmp ne i64 [[V1]], 10
-; CHECK-NEXT:    call void @use(i1 [[T_1]])
-; CHECK-NEXT:    [[T_2:%.*]] = icmp ult i64 [[V1]], 100
-; CHECK-NEXT:    call void @use(i1 [[T_2]])
-; CHECK-NEXT:    [[T_3:%.*]] = icmp ne i64 [[V2]], 0
-; CHECK-NEXT:    call void @use(i1 [[T_3]])
-; CHECK-NEXT:    [[T_4:%.*]] = icmp ult i64 [[V2]], 301
-; CHECK-NEXT:    call void @use(i1 [[T_4]])
-; CHECK-NEXT:    [[F_1:%.*]] = icmp eq i64 [[V1]], 10
-; CHECK-NEXT:    call void @use(i1 [[F_1]])
-; CHECK-NEXT:    [[F_2:%.*]] = icmp ult i64 [[V1]], 19
-; CHECK-NEXT:    call void @use(i1 [[F_2]])
-; CHECK-NEXT:    [[F_3:%.*]] = icmp eq i64 [[V2]], 50
-; CHECK-NEXT:    call void @use(i1 [[F_3]])
-; CHECK-NEXT:    [[F_4:%.*]] = icmp ugt i64 [[V2]], 301
-; CHECK-NEXT:    call void @use(i1 [[F_4]])
+; CHECK-NEXT:    call void @use(i1 true)
+; CHECK-NEXT:    call void @use(i1 true)
+; CHECK-NEXT:    call void @use(i1 true)
+; CHECK-NEXT:    call void @use(i1 true)
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    call void @use(i1 false)
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i64 [[V1]], 25
 ; CHECK-NEXT:    call void @use(i1 [[C_1]])
 ; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i64 [[V1]], 25
diff --git a/llvm/test/Transforms/SCCP/ipsccp-cycles.ll b/llvm/test/Transforms/SCCP/ipsccp-cycles.ll
new file mode 100644
index 0000000000000..e4b81fbb607b2
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/ipsccp-cycles.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+
+define internal i32 @test1a(i32 %A, i32 %b) {
+; CHECK-LABEL: @test1a(
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[A:%.*]], 1
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[C]], label [[BB_TRUE:%.*]], label [[BB_FALSE:%.*]]
+; CHECK:       bb.true:
+; CHECK-NEXT:    [[R:%.*]] = call i32 @test1a(i32 [[X]], i32 [[B]])
+; CHECK-NEXT:    ret i32 [[R]]
+; CHECK:       bb.false:
+; CHECK-NEXT:    ret i32 [[A]]
+;
+  %X = add i32 %A, 1
+  %c = icmp eq i32 %X, %b
+  br i1 %c, label %bb.true, label %bb.false
+
+bb.true:
+  %r = call i32 @test1a(i32 %X, i32 %b)
+  ret i32 %r
+
+bb.false:
+  ret i32 %A
+}
+
+define i32 @test1b(i32 %b) {
+; CHECK-LABEL: @test1b(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @test1a(i32 17, i32 [[B:%.*]])
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %X = call i32 @test1a( i32 17, i32 %b)
+  ret i32 %X
+}
+
+@Getopt.optind = internal global i32 1, align 4
+
+define i32 @test2(i32 %a) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* @Getopt.optind, align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LV]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* @Getopt.optind, align 4
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[ADD]], [[A:%.*]]
+; CHECK-NEXT:    br i1 [[C]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+entry:
+  br label %loop
+
+loop:
+  %lv = load i32, i32* @Getopt.optind, align 4
+  %add = add i32 %lv, 1
+  store i32 %add, i32* @Getopt.optind
+  %c = icmp eq i32 %add, %a
+  br i1 %c, label %exit, label %loop
+
+exit:
+  ret i32 %add
+}
+
+
+define internal i32 @test3a(i32 %a) {
+; CHECK-LABEL: @test3a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[A:%.*]], 1
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[RES]], 1000
+; CHECK-NEXT:    br i1 [[C]], label [[BB_TRUE:%.*]], label [[BB_FALSE:%.*]]
+; CHECK:       bb.true:
+; CHECK-NEXT:    ret i32 [[RES]]
+; CHECK:       bb.false:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %res = add i32 %a, 1
+  %c = icmp ult i32 %res, 1000
+  br i1 %c, label %bb.true, label %bb.false
+
+bb.true:
+  ret i32 %res
+
+bb.false:
+  ret i32 0
+}
+
+define i32 @test3b(i32 %a) {
+; CHECK-LABEL: @test3b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[V1:%.*]] = call i32 @test3a(i32 0)
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[V2:%.*]] = call i32 @test3a(i32 [[V1]])
+; CHECK-NEXT:    [[V3:%.*]] = add i32 [[V2]], 1
+; CHECK-NEXT:    [[V4:%.*]] = call i32 @test3a(i32 [[V3]])
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[V4]], [[A:%.*]]
+; CHECK-NEXT:    br i1 [[C]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 [[V4]]
+;
+entry:
+  %v1 = call i32 @test3a(i32 0)
+  br label %loop
+
+loop:
+  %v2 = call i32 @test3a(i32 %v1)
+  %v3 = add i32 %v2, 1
+  %v4 = call i32 @test3a(i32 %v3)
+  %c = icmp eq i32 %v4, %a
+  br i1 %c, label %exit, label %loop
+
+exit:
+  ret i32 %v4
+}
+
+%struct.S = type { i32, i32 }
+
+; Check for a range extension cycle through a struct argument.
+define internal i32 @test4a(%struct.S %s) {
+; CHECK-LABEL: @test4a(
+; CHECK-NEXT:    [[A:%.*]] = extractvalue [[STRUCT_S:%.*]] %s, 0
+; CHECK-NEXT:    [[B:%.*]] = extractvalue [[STRUCT_S]] %s, 1
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X]], [[B]]
+; CHECK-NEXT:    br i1 [[C]], label [[BB_TRUE:%.*]], label [[BB_FALSE:%.*]]
+; CHECK:       bb.true:
+; CHECK-NEXT:    [[S2:%.*]] = insertvalue [[STRUCT_S]] %s, i32 [[X]], 0
+; CHECK-NEXT:    [[R:%.*]] = call i32 @test4a(%struct.S [[S2]])
+; CHECK-NEXT:    ret i32 [[R]]
+; CHECK:       bb.false:
+; CHECK-NEXT:    ret i32 [[A]]
+;
+  %a = extractvalue %struct.S %s, 0
+  %b = extractvalue %struct.S %s, 1
+
+  %x = add i32 %a, 1
+  %c = icmp eq i32 %x, %b
+  br i1 %c, label %bb.true, label %bb.false
+
+bb.true:
+  %s2 = insertvalue %struct.S %s, i32 %x, 0
+  %r = call i32 @test4a(%struct.S %s2)
+  ret i32 %r
+
+bb.false:
+  ret i32 %a
+}
+
+define i32 @test4b(i32 %b) {
+; CHECK-LABEL: @test4b(
+; CHECK-NEXT:    [[S2:%.*]] = insertvalue [[STRUCT_S:%.*]] { i32 17, i32 undef }, i32 [[B:%.*]], 1
+; CHECK-NEXT:    [[X:%.*]] = call i32 @test4a(%struct.S [[S2]])
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %s1 = insertvalue %struct.S undef, i32 17, 0
+  %s2 = insertvalue %struct.S %s1, i32 %b, 1
+  %X = call i32 @test4a(%struct.S %s2)
+  ret i32 %X
+}
+
+; Check for a range extension cycle through a returned value.
+
+define internal i32 @test5a(i8* %arg, i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: @test5a(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = icmp eq i8* [[ARG:%.*]], null
+; CHECK-NEXT:    br i1 [[TMP]], label [[BB6:%.*]], label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @test5a(i8* [[ARG]], i32 0, i32 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], -1
+; CHECK-NEXT:    ret i32 [[TMP5]]
+; CHECK:       bb6:
+; CHECK-NEXT:    ret i32 0
+;
+bb:
+  %tmp = icmp eq i8* %arg, null
+  br i1 %tmp, label %bb6, label %bb3
+
+bb3:                                              ; preds = %bb
+  %tmp4 = tail call i32 @test5a(i8* %arg, i32 %arg1, i32 %arg2)
+  %tmp5 = add nsw i32 %tmp4, %arg2
+  ret i32 %tmp5
+
+bb6:                                              ; preds = %bb
+  ret i32 %arg1
+}
+
+define void @test5b(i8* %ptr) {
+; CHECK-LABEL: @test5b(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @test5a(i8* [[PTR:%.*]], i32 0, i32 -1)
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @test5a(i8* %ptr, i32 0, i32 -1)
+  ret void
+}
+
+%struct = type { i32, i32 }
+
+define internal %struct @test6a(i8* %arg, i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: @test6a(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = icmp eq i8* [[ARG:%.*]], null
+; CHECK-NEXT:    br i1 [[TMP]], label [[BB6:%.*]], label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[S1:%.*]] = tail call [[STRUCT:%.*]] @test6a(i8* [[ARG]], i32 0, i32 -1)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[STRUCT]] %s1, 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], -1
+; CHECK-NEXT:    [[S2:%.*]] = insertvalue [[STRUCT]] %s1, i32 [[TMP5]], 0
+; CHECK-NEXT:    ret [[STRUCT]] %s2
+; CHECK:       bb6:
+; CHECK-NEXT:    ret [[STRUCT]] { i32 0, i32 undef }
+;
+bb:
+  %tmp = icmp eq i8* %arg, null
+  br i1 %tmp, label %bb6, label %bb3
+
+bb3:                                              ; preds = %bb
+  %s1 = tail call %struct @test6a(i8* %arg, i32 %arg1, i32 %arg2)
+  %tmp4 = extractvalue %struct %s1, 0
+  %tmp5 = add nsw i32 %tmp4, %arg2
+  %s2 = insertvalue %struct %s1, i32 %tmp5, 0
+  ret %struct %s2
+
+bb6:                                              ; preds = %bb
+  %s3 = insertvalue %struct undef, i32 %arg1, 0
+  ret %struct %s3
+}
+
+define void @test6b(i8* %ptr) {
+; CHECK-LABEL: @test6b(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = tail call [[STRUCT:%.*]] @test6a(i8* [[PTR:%.*]], i32 0, i32 -1)
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = tail call %struct @test6a(i8* %ptr, i32 0, i32 -1)
+  ret void
+}
diff --git a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll
index d56c91061cadc..e1c7b3d5662d0 100644
--- a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll
+++ b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll
@@ -386,10 +386,7 @@ define void @test3() {
 ; CHECK-NEXT:    store i32 [[MUL]], i32* @pcount, align 4
 ; CHECK-NEXT:    ret void
 ; CHECK:       if.end24:
-; CHECK-NEXT:    [[CMP25474:%.*]] = icmp sgt i32 [[TMP2]], 0
-; CHECK-NEXT:    br i1 [[CMP25474]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SCCP/widening.ll b/llvm/test/Transforms/SCCP/widening.ll
index 9f5e1c78ae3ad..825175882b1e1 100644
--- a/llvm/test/Transforms/SCCP/widening.ll
+++ b/llvm/test/Transforms/SCCP/widening.ll
@@ -17,10 +17,8 @@ define void @test_2_incoming_constants(i32 %x) {
 ; SCCP:       exit:
 ; SCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB1]] ]
 ; SCCP-NEXT:    [[A:%.*]] = add i32 [[P]], 1
-; SCCP-NEXT:    [[T_1:%.*]] = icmp ult i32 [[A]], 20
-; SCCP-NEXT:    call void @use(i1 [[T_1]])
-; SCCP-NEXT:    [[F_1:%.*]] = icmp ugt i32 [[A]], 10
-; SCCP-NEXT:    call void @use(i1 [[F_1]])
+; SCCP-NEXT:    call void @use(i1 true)
+; SCCP-NEXT:    call void @use(i1 false)
 ; SCCP-NEXT:    ret void
 ;
 ; IPSCCP-LABEL: @test_2_incoming_constants(
@@ -32,10 +30,8 @@ define void @test_2_incoming_constants(i32 %x) {
 ; IPSCCP:       exit:
 ; IPSCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB1]] ]
 ; IPSCCP-NEXT:    [[A:%.*]] = add i32 [[P]], 1
-; IPSCCP-NEXT:    [[T_1:%.*]] = icmp ult i32 [[A]], 20
-; IPSCCP-NEXT:    call void @use(i1 [[T_1]])
-; IPSCCP-NEXT:    [[F_1:%.*]] = icmp ugt i32 [[A]], 10
-; IPSCCP-NEXT:    call void @use(i1 [[F_1]])
+; IPSCCP-NEXT:    call void @use(i1 true)
+; IPSCCP-NEXT:    call void @use(i1 false)
 ; IPSCCP-NEXT:    ret void
 ;
 entry:
@@ -68,10 +64,8 @@ define void @test_3_incoming_constants(i32 %x) {
 ; SCCP:       exit:
 ; SCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB1]] ], [ 2, [[BB2]] ]
 ; SCCP-NEXT:    [[A:%.*]] = add i32 [[P]], 1
-; SCCP-NEXT:    [[T_1:%.*]] = icmp ult i32 [[A]], 20
-; SCCP-NEXT:    call void @use(i1 [[T_1]])
-; SCCP-NEXT:    [[F_1:%.*]] = icmp ugt i32 [[A]], 10
-; SCCP-NEXT:    call void @use(i1 [[F_1]])
+; SCCP-NEXT:    call void @use(i1 true)
+; SCCP-NEXT:    call void @use(i1 false)
 ; SCCP-NEXT:    ret void
 ;
 ; IPSCCP-LABEL: @test_3_incoming_constants(
@@ -86,10 +80,8 @@ define void @test_3_incoming_constants(i32 %x) {
 ; IPSCCP:       exit:
 ; IPSCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB1]] ], [ 2, [[BB2]] ]
 ; IPSCCP-NEXT:    [[A:%.*]] = add i32 [[P]], 1
-; IPSCCP-NEXT:    [[T_1:%.*]] = icmp ult i32 [[A]], 20
-; IPSCCP-NEXT:    call void @use(i1 [[T_1]])
-; IPSCCP-NEXT:    [[F_1:%.*]] = icmp ugt i32 [[A]], 10
-; IPSCCP-NEXT:    call void @use(i1 [[F_1]])
+; IPSCCP-NEXT:    call void @use(i1 true)
+; IPSCCP-NEXT:    call void @use(i1 false)
 ; IPSCCP-NEXT:    ret void
 ;
 entry:
@@ -132,10 +124,8 @@ define void @test_5_incoming_constants(i32 %x) {
 ; SCCP:       exit:
 ; SCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB1]] ], [ 2, [[BB2]] ], [ 3, [[BB3]] ], [ 4, [[BB4]] ]
 ; SCCP-NEXT:    [[A:%.*]] = add i32 [[P]], 1
-; SCCP-NEXT:    [[T_1:%.*]] = icmp ult i32 [[A]], 20
-; SCCP-NEXT:    call void @use(i1 [[T_1]])
-; SCCP-NEXT:    [[F_1:%.*]] = icmp ugt i32 [[A]], 10
-; SCCP-NEXT:    call void @use(i1 [[F_1]])
+; SCCP-NEXT:    call void @use(i1 true)
+; SCCP-NEXT:    call void @use(i1 false)
 ; SCCP-NEXT:    ret void
 ;
 ; IPSCCP-LABEL: @test_5_incoming_constants(
@@ -156,10 +146,8 @@ define void @test_5_incoming_constants(i32 %x) {
 ; IPSCCP:       exit:
 ; IPSCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB1]] ], [ 2, [[BB2]] ], [ 3, [[BB3]] ], [ 4, [[BB4]] ]
 ; IPSCCP-NEXT:    [[A:%.*]] = add i32 [[P]], 1
-; IPSCCP-NEXT:    [[T_1:%.*]] = icmp ult i32 [[A]], 20
-; IPSCCP-NEXT:    call void @use(i1 [[T_1]])
-; IPSCCP-NEXT:    [[F_1:%.*]] = icmp ugt i32 [[A]], 10
-; IPSCCP-NEXT:    call void @use(i1 [[F_1]])
+; IPSCCP-NEXT:    call void @use(i1 true)
+; IPSCCP-NEXT:    call void @use(i1 false)
 ; IPSCCP-NEXT:    ret void
 ;
 entry:
@@ -369,8 +357,7 @@ define void @loop_with_header_1(i32 %x) {
 ; IPSCCP-NEXT:    [[C_1:%.*]] = icmp slt i32 [[IV]], 2
 ; IPSCCP-NEXT:    br i1 [[C_1]], label [[LOOP_BODY]], label [[EXIT:%.*]]
 ; IPSCCP:       loop.body:
-; IPSCCP-NEXT:    [[T_1:%.*]] = icmp slt i32 [[IV]], 2
-; IPSCCP-NEXT:    call void @use(i1 [[T_1]])
+; IPSCCP-NEXT:    call void @use(i1 true)
 ; IPSCCP-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
 ; IPSCCP-NEXT:    br label [[LOOP_HEADER]]
 ; IPSCCP:       exit:
@@ -418,8 +405,7 @@ define void @loop_with_header_2(i32 %x) {
 ; IPSCCP-NEXT:    [[C_1:%.*]] = icmp slt i32 [[IV]], 200
 ; IPSCCP-NEXT:    br i1 [[C_1]], label [[LOOP_BODY]], label [[EXIT:%.*]]
 ; IPSCCP:       loop.body:
-; IPSCCP-NEXT:    [[T_1:%.*]] = icmp slt i32 [[IV]], 200
-; IPSCCP-NEXT:    call void @use(i1 [[T_1]])
+; IPSCCP-NEXT:    call void @use(i1 true)
 ; IPSCCP-NEXT:    [[IV_NEXT]] = add nsw i32 [[IV]], 1
 ; IPSCCP-NEXT:    br label [[LOOP_HEADER]]
 ; IPSCCP:       exit:
@@ -877,3 +863,91 @@ bb66:                                             ; preds = %bb60, %bb35
   %tmp67 = phi i8* [ %tmp36, %bb35 ], [ null, %bb60 ]
   ret i8* %tmp67
 }
+
+
+define i32 @loop_with_multiple_euqal_incomings(i32 %N) {
+; SCCP-LABEL: @loop_with_multiple_euqal_incomings(
+; SCCP-NEXT:  entry:
+; SCCP-NEXT:    br label [[LOOP:%.*]]
+; SCCP:       loop:
+; SCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[P_NEXT:%.*]], [[BB3:%.*]] ], [ 0, [[BB4:%.*]] ], [ 0, [[BB5:%.*]] ], [ 0, [[BB6:%.*]] ]
+; SCCP-NEXT:    [[C_1:%.*]] = call i1 @cond()
+; SCCP-NEXT:    br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]]
+; SCCP:       bb1:
+; SCCP-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; SCCP-NEXT:    br i1 [[C_2]], label [[BB3]], label [[BB4]]
+; SCCP:       bb2:
+; SCCP-NEXT:    [[C_4:%.*]] = call i1 @cond()
+; SCCP-NEXT:    br i1 [[C_4]], label [[BB5]], label [[BB6]]
+; SCCP:       bb3:
+; SCCP-NEXT:    [[P_NEXT]] = add i32 [[P]], 1
+; SCCP-NEXT:    br label [[LOOP]]
+; SCCP:       bb4:
+; SCCP-NEXT:    [[C_3:%.*]] = call i1 @cond()
+; SCCP-NEXT:    br i1 [[C_3]], label [[LOOP]], label [[END:%.*]]
+; SCCP:       bb5:
+; SCCP-NEXT:    br label [[LOOP]]
+; SCCP:       bb6:
+; SCCP-NEXT:    br label [[LOOP]]
+; SCCP:       end:
+; SCCP-NEXT:    ret i32 [[P]]
+;
+; IPSCCP-LABEL: @loop_with_multiple_euqal_incomings(
+; IPSCCP-NEXT:  entry:
+; IPSCCP-NEXT:    br label [[LOOP:%.*]]
+; IPSCCP:       loop:
+; IPSCCP-NEXT:    [[P:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[P_NEXT:%.*]], [[BB3:%.*]] ], [ 0, [[BB4:%.*]] ], [ 0, [[BB5:%.*]] ], [ 0, [[BB6:%.*]] ]
+; IPSCCP-NEXT:    [[C_1:%.*]] = call i1 @cond()
+; IPSCCP-NEXT:    br i1 [[C_1]], label [[BB1:%.*]], label [[BB2:%.*]]
+; IPSCCP:       bb1:
+; IPSCCP-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; IPSCCP-NEXT:    br i1 [[C_2]], label [[BB3]], label [[BB4]]
+; IPSCCP:       bb2:
+; IPSCCP-NEXT:    [[C_4:%.*]] = call i1 @cond()
+; IPSCCP-NEXT:    br i1 [[C_4]], label [[BB5]], label [[BB6]]
+; IPSCCP:       bb3:
+; IPSCCP-NEXT:    [[P_NEXT]] = add i32 [[P]], 1
+; IPSCCP-NEXT:    br label [[LOOP]]
+; IPSCCP:       bb4:
+; IPSCCP-NEXT:    [[C_3:%.*]] = call i1 @cond()
+; IPSCCP-NEXT:    br i1 [[C_3]], label [[LOOP]], label [[END:%.*]]
+; IPSCCP:       bb5:
+; IPSCCP-NEXT:    br label [[LOOP]]
+; IPSCCP:       bb6:
+; IPSCCP-NEXT:    br label [[LOOP]]
+; IPSCCP:       end:
+; IPSCCP-NEXT:    ret i32 [[P]]
+;
+entry:
+  br label %loop
+
+loop:
+  %p = phi i32 [ 0, %entry ], [ %p.next, %bb3 ], [ 0, %bb4 ], [ 0, %bb5], [ 0, %bb6 ]
+  %c.1 = call i1 @cond()
+  br i1 %c.1, label %bb1, label %bb2
+
+bb1:
+  %c.2 = call i1 @cond()
+  br i1 %c.2, label %bb3, label %bb4
+
+bb2:
+  %c.4 = call i1 @cond()
+  br i1 %c.4, label %bb5, label %bb6
+
+bb3:
+  %p.next = add i32 %p, 1
+  br label %loop
+
+bb4:
+  %c.3 = call i1 @cond()
+  br i1 %c.3, label %loop, label %end
+
+bb5:
+  br label %loop
+
+bb6:
+  br label %loop
+
+end:
+  ret i32 %p
+}

From dac21fd29cd2ae2b979a276747ad5ad82fca09bf Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne@apple.com>
Date: Fri, 10 Apr 2020 17:41:45 -0400
Subject: [PATCH 502/770] [lit] Add an option to print all features used in
 tests

Lit test suites tend to accumulate annotations that are no longer
relevant as time goes by, for example XFAILs on old compilers or platforms.
To help spot old annotations that can be cleaned up, it can be useful to
look at all features used inside a test suite.

This commit adds a new Lit option '--show-used-features' that prints all
the features used in the XFAIL, REQUIRES and UNSUPPORTED annotations of
all tests that are discovered.

Differential Revision: https://reviews.llvm.org/D78589
---
 llvm/utils/lit/lit/BooleanExpression.py       | 11 +--
 llvm/utils/lit/lit/Test.py                    | 23 +++++-
 llvm/utils/lit/lit/TestRunner.py              | 73 ++++++++++++-------
 llvm/utils/lit/lit/cl_arguments.py            |  3 +
 llvm/utils/lit/lit/main.py                    |  7 +-
 .../tests/Inputs/show-used-features/lit.cfg   |  6 ++
 .../tests/Inputs/show-used-features/mixed.txt |  4 +
 .../Inputs/show-used-features/requires.txt    |  2 +
 .../Inputs/show-used-features/unsupported.txt |  2 +
 .../tests/Inputs/show-used-features/xfail.txt |  2 +
 llvm/utils/lit/tests/show-used-features.py    |  6 ++
 11 files changed, 104 insertions(+), 35 deletions(-)
 create mode 100644 llvm/utils/lit/tests/Inputs/show-used-features/lit.cfg
 create mode 100644 llvm/utils/lit/tests/Inputs/show-used-features/mixed.txt
 create mode 100644 llvm/utils/lit/tests/Inputs/show-used-features/requires.txt
 create mode 100644 llvm/utils/lit/tests/Inputs/show-used-features/unsupported.txt
 create mode 100644 llvm/utils/lit/tests/Inputs/show-used-features/xfail.txt
 create mode 100644 llvm/utils/lit/tests/show-used-features.py

diff --git a/llvm/utils/lit/lit/BooleanExpression.py b/llvm/utils/lit/lit/BooleanExpression.py
index 3eb5060de3e30..34e07fc1b8e5e 100644
--- a/llvm/utils/lit/lit/BooleanExpression.py
+++ b/llvm/utils/lit/lit/BooleanExpression.py
@@ -79,9 +79,10 @@ def expect(self, t):
             raise ValueError("expected: %s\nhave: %s" %
                              (self.quote(t), self.quote(self.token)))
 
-    def isIdentifier(self, t):
-        if (t is BooleanExpression.END or t == '&&' or t == '||' or
-            t == '!' or t == '(' or t == ')'):
+    @staticmethod
+    def isIdentifier(token):
+        if (token is BooleanExpression.END or token == '&&' or token == '||' or
+            token == '!' or token == '(' or token == ')'):
             return False
         return True
 
@@ -92,7 +93,7 @@ def parseNOT(self):
         elif self.accept('('):
             self.parseOR()
             self.expect(')')
-        elif not self.isIdentifier(self.token):
+        elif not BooleanExpression.isIdentifier(self.token):
             raise ValueError("expected: '!' or '(' or identifier\nhave: %s" %
                              self.quote(self.token))
         else:
@@ -191,7 +192,7 @@ def checkException(self, expr, error):
                            "actual error was:\n%s\n" +
                            "expected error was:\n%s\n") % (expr, e, error))
         except BaseException as e:
-            self.fail(("expression %r caused the wrong exception; actual " + 
+            self.fail(("expression %r caused the wrong exception; actual " +
                       "exception was: \n%r") % (expr, e))
 
     def test_errors(self):
diff --git a/llvm/utils/lit/lit/Test.py b/llvm/utils/lit/lit/Test.py
index 627785829efca..4c94e9806a160 100644
--- a/llvm/utils/lit/lit/Test.py
+++ b/llvm/utils/lit/lit/Test.py
@@ -1,3 +1,4 @@
+import itertools
 import os
 from xml.sax.saxutils import quoteattr
 from json import JSONEncoder
@@ -162,7 +163,7 @@ def addMicroResult(self, name, microResult):
         addMicroResult(microResult)
 
         Attach a micro-test result to the test result, with the given name and
-        result.  It is an error to attempt to attach a micro-test with the 
+        result.  It is an error to attempt to attach a micro-test with the
         same name multiple times.
 
         Each micro-test result must be an instance of the Result class.
@@ -359,6 +360,26 @@ def getUnsupportedFeatures(self):
         except ValueError as e:
             raise ValueError('Error in UNSUPPORTED list:\n%s' % str(e))
 
+    def getUsedFeatures(self):
+        """
+        getUsedFeatures() -> set of strings
+
+        Returns the set of all features appearing in XFAIL, UNSUPPORTED and
+        REQUIRES annotations for this test.
+        """
+        import lit.TestRunner
+        parsed = lit.TestRunner._parseKeywords(self.getSourcePath(), require_script=False)
+        feature_keywords = ('UNSUPPORTED:', 'REQUIRES:', 'XFAIL:')
+        boolean_expressions = itertools.chain.from_iterable(
+            parsed[k] or [] for k in feature_keywords
+        )
+        tokens = itertools.chain.from_iterable(
+            BooleanExpression.tokenize(expr) for expr in
+                boolean_expressions if expr != '*'
+        )
+        identifiers = set(filter(BooleanExpression.isIdentifier, tokens))
+        return identifiers
+
     def isEarlyTest(self):
         """
         isEarlyTest() -> bool
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 6c92a2bc26c42..edcb696a8b162 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -1373,31 +1373,26 @@ def _handleBooleanExpr(line_number, line, output):
                 BooleanExpression.evaluate(s, [])
         return output
 
-def parseIntegratedTestScript(test, additional_parsers=[],
-                              require_script=True):
-    """parseIntegratedTestScript - Scan an LLVM/Clang style integrated test
-    script and extract the lines to 'RUN' as well as 'XFAIL', 'REQUIRES',
-    'UNSUPPORTED' and 'ALLOW_RETRIES' information.
 
-    If additional parsers are specified then the test is also scanned for the
-    keywords they specify and all matches are passed to the custom parser.
+def _parseKeywords(sourcepath, additional_parsers=[],
+                   require_script=True):
+    """_parseKeywords
 
-    If 'require_script' is False an empty script
-    may be returned. This can be used for test formats where the actual script
-    is optional or ignored.
-    """
+    Scan an LLVM/Clang style integrated test script and extract all the lines
+    pertaining to a special parser. This includes 'RUN', 'XFAIL', 'REQUIRES',
+    'UNSUPPORTED' and 'ALLOW_RETRIES', as well as other specified custom
+    parsers.
 
+    Returns a dictionary mapping each parser's keyword (built-in and custom)
+    to its value after parsing the test.
+    """
     # Install the built-in keyword parsers.
     script = []
     builtin_parsers = [
-        IntegratedTestKeywordParser('RUN:', ParserKind.COMMAND,
-                                    initial_value=script),
-        IntegratedTestKeywordParser('XFAIL:', ParserKind.BOOLEAN_EXPR,
-                                    initial_value=test.xfails),
-        IntegratedTestKeywordParser('REQUIRES:', ParserKind.BOOLEAN_EXPR,
-                                    initial_value=test.requires),
-        IntegratedTestKeywordParser('UNSUPPORTED:', ParserKind.BOOLEAN_EXPR,
-                                    initial_value=test.unsupported),
+        IntegratedTestKeywordParser('RUN:', ParserKind.COMMAND, initial_value=script),
+        IntegratedTestKeywordParser('XFAIL:', ParserKind.BOOLEAN_EXPR),
+        IntegratedTestKeywordParser('REQUIRES:', ParserKind.BOOLEAN_EXPR),
+        IntegratedTestKeywordParser('UNSUPPORTED:', ParserKind.BOOLEAN_EXPR),
         IntegratedTestKeywordParser('ALLOW_RETRIES:', ParserKind.INTEGER),
         IntegratedTestKeywordParser('END.', ParserKind.TAG)
     ]
@@ -1414,7 +1409,6 @@ def parseIntegratedTestScript(test, additional_parsers=[],
         keyword_parsers[parser.keyword] = parser
 
     # Collect the test lines from the script.
-    sourcepath = test.getSourcePath()
     for line_number, command_type, ln in \
             parseIntegratedTestScriptCommands(sourcepath,
                                               keyword_parsers.keys()):
@@ -1441,6 +1435,37 @@ def parseIntegratedTestScript(test, additional_parsers=[],
         if value and value[-1][-1] == '\\':
             raise ValueError("Test has unterminated %s lines (with '\\')" % key)
 
+    # Make sure there's at most one ALLOW_RETRIES: line
+    allowed_retries = keyword_parsers['ALLOW_RETRIES:'].getValue()
+    if allowed_retries and len(allowed_retries) > 1:
+        return lit.Test.Result(Test.UNRESOLVED,
+                               "Test has more than one ALLOW_RETRIES lines")
+
+    return {p.keyword: p.getValue() for p in keyword_parsers.values()}
+
+
+def parseIntegratedTestScript(test, additional_parsers=[],
+                              require_script=True):
+    """parseIntegratedTestScript - Scan an LLVM/Clang style integrated test
+    script and extract the lines to 'RUN' as well as 'XFAIL', 'REQUIRES',
+    'UNSUPPORTED' and 'ALLOW_RETRIES' information into the given test.
+
+    If additional parsers are specified then the test is also scanned for the
+    keywords they specify and all matches are passed to the custom parser.
+
+    If 'require_script' is False an empty script
+    may be returned. This can be used for test formats where the actual script
+    is optional or ignored.
+    """
+    # Parse the test sources and extract test properties
+    parsed = _parseKeywords(test.getSourcePath(), additional_parsers, require_script)
+    script = parsed['RUN:'] or []
+    test.xfails = parsed['XFAIL:'] or []
+    test.requires = parsed['REQUIRES:'] or []
+    test.unsupported = parsed['UNSUPPORTED:'] or []
+    if parsed['ALLOW_RETRIES:']:
+        test.allowed_retries = parsed['ALLOW_RETRIES:'][0]
+
     # Enforce REQUIRES:
     missing_required_features = test.getMissingRequiredFeatures()
     if missing_required_features:
@@ -1458,14 +1483,6 @@ def parseIntegratedTestScript(test, additional_parsers=[],
             "Test does not support the following features "
             "and/or targets: %s" % msg)
 
-    # Handle ALLOW_RETRIES:
-    allowed_retries = keyword_parsers['ALLOW_RETRIES:'].getValue()
-    if allowed_retries:
-        if len(allowed_retries) > 1:
-            return lit.Test.Result(Test.UNRESOLVED,
-                                   "Test has more than one ALLOW_RETRIES lines")
-        test.allowed_retries = allowed_retries[0]
-
     # Enforce limit_to_features.
     if not test.isWithinFeatureLimits():
         msg = ', '.join(test.config.limit_to_features)
diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py
index 06b1313cb8bb1..6bbaa4203b478 100644
--- a/llvm/utils/lit/lit/cl_arguments.py
+++ b/llvm/utils/lit/lit/cl_arguments.py
@@ -157,6 +157,9 @@ def parse_args():
     debug_group.add_argument("--show-tests",
             help="Show all discovered tests and exit",
             action="store_true")
+    debug_group.add_argument("--show-used-features",
+            help="Show all features used in the test suite (in XFAIL, UNSUPPORTED and REQUIRES) and exit",
+            action="store_true")
 
     # LIT is special: environment variables override command line arguments.
     env_args = shlex.split(os.environ.get("LIT_OPTS", ""))
diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py
index d2958590aa8a2..6c423167ff4c4 100755
--- a/llvm/utils/lit/lit/main.py
+++ b/llvm/utils/lit/lit/main.py
@@ -4,6 +4,7 @@
 See lit.pod for more information.
 """
 
+import itertools
 import os
 import platform
 import sys
@@ -47,6 +48,11 @@ def main(builtin_params={}):
         print_discovered(discovered_tests, opts.show_suites, opts.show_tests)
         sys.exit(0)
 
+    if opts.show_used_features:
+        features = set(itertools.chain.from_iterable(t.getUsedFeatures() for t in discovered_tests))
+        print(' '.join(sorted(features)))
+        sys.exit(0)
+
     # Command line overrides configuration for maxIndividualTestTime.
     if opts.maxIndividualTestTime is not None:  # `not None` is important (default: 0)
         if opts.maxIndividualTestTime != lit_config.maxIndividualTestTime:
@@ -127,7 +133,6 @@ def print_discovered(tests, show_suites, show_tests):
     tests.sort(key=lit.reports.by_suite_and_test_path)
 
     if show_suites:
-        import itertools
         tests_by_suite = itertools.groupby(tests, lambda t: t.suite)
         print('-- Test Suites --')
         for suite, test_iter in tests_by_suite:
diff --git a/llvm/utils/lit/tests/Inputs/show-used-features/lit.cfg b/llvm/utils/lit/tests/Inputs/show-used-features/lit.cfg
new file mode 100644
index 0000000000000..7ee2154d2e19b
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/show-used-features/lit.cfg
@@ -0,0 +1,6 @@
+import lit.formats
+config.name = 'show-used-features'
+config.suffixes = ['.txt']
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
diff --git a/llvm/utils/lit/tests/Inputs/show-used-features/mixed.txt b/llvm/utils/lit/tests/Inputs/show-used-features/mixed.txt
new file mode 100644
index 0000000000000..1de0f7442a086
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/show-used-features/mixed.txt
@@ -0,0 +1,4 @@
+
+// REQUIRES: my-require-feature-2 || my-require-feature-3
+// UNSUPPORTED: my-unsupported-feature-2, my-unsupported-feature-3
+// XFAIL: my-xfail-feature-2, my-xfail-feature-3
diff --git a/llvm/utils/lit/tests/Inputs/show-used-features/requires.txt b/llvm/utils/lit/tests/Inputs/show-used-features/requires.txt
new file mode 100644
index 0000000000000..3e550f4495fba
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/show-used-features/requires.txt
@@ -0,0 +1,2 @@
+
+// REQUIRES: my-require-feature-1
diff --git a/llvm/utils/lit/tests/Inputs/show-used-features/unsupported.txt b/llvm/utils/lit/tests/Inputs/show-used-features/unsupported.txt
new file mode 100644
index 0000000000000..b19582624abc8
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/show-used-features/unsupported.txt
@@ -0,0 +1,2 @@
+
+// UNSUPPORTED: my-unsupported-feature-1
diff --git a/llvm/utils/lit/tests/Inputs/show-used-features/xfail.txt b/llvm/utils/lit/tests/Inputs/show-used-features/xfail.txt
new file mode 100644
index 0000000000000..f8cde28ab4c7f
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/show-used-features/xfail.txt
@@ -0,0 +1,2 @@
+
+// XFAIL: my-xfail-feature-1
diff --git a/llvm/utils/lit/tests/show-used-features.py b/llvm/utils/lit/tests/show-used-features.py
new file mode 100644
index 0000000000000..069ee08196701
--- /dev/null
+++ b/llvm/utils/lit/tests/show-used-features.py
@@ -0,0 +1,6 @@
+# Check that --show-used-features works correctly.
+#
+# RUN: %{lit} %{inputs}/show-used-features --show-used-features | FileCheck %s
+# CHECK: my-require-feature-1 my-require-feature-2 my-require-feature-3
+# CHECK: my-unsupported-feature-1 my-unsupported-feature-2 my-unsupported-feature-3
+# CHECK: my-xfail-feature-1 my-xfail-feature-2 my-xfail-feature-3

From 92f3d29af0c0f2c98ba0dfacac00bbb6eb1f741d Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Thu, 28 May 2020 10:21:27 +0000
Subject: [PATCH 503/770] [SelectionDAG] Update getNode asserts for
 EXTRACT/INSERT_SUBVECTOR.

Summary:
The description of EXTRACT_SUBVECTOR and INSERT_SUBVECTOR has been
changed to accommodate scalable vectors (see ISDOpcodes.h). This
patch updates the asserts used to verify these requirements when
using SelectionDAG's getNode interface.

This patch introduces the MVT function getVectorMinNumElements
that can be used against fixed-length and scalable vectors when
only the known minimum vector length is required.

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80709
---
 llvm/include/llvm/CodeGen/ValueTypes.h        |  5 +++
 llvm/include/llvm/Support/MachineValueType.h  |  5 +++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 42 +++++++++++--------
 3 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h
index 15a4bfe1e5553..c6f8a813ca333 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.h
+++ b/llvm/include/llvm/CodeGen/ValueTypes.h
@@ -292,6 +292,11 @@ namespace llvm {
       return {getExtendedVectorNumElements(), isExtendedScalableVector()};
     }
 
+    /// Given a vector type, return the minimum number of elements it contains.
+    unsigned getVectorMinNumElements() const {
+      return getVectorElementCount().Min;
+    }
+
     /// Return the size of the specified value type in bits.
     ///
     /// If the value type is a scalable vector type, the scalable property will
diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h
index 93683eb7dcf74..9d2d0c85f585e 100644
--- a/llvm/include/llvm/Support/MachineValueType.h
+++ b/llvm/include/llvm/Support/MachineValueType.h
@@ -706,6 +706,11 @@ namespace llvm {
       return { getVectorNumElements(), isScalableVector() };
     }
 
+    /// Given a vector type, return the minimum number of elements it contains.
+    unsigned getVectorMinNumElements() const {
+      return getVectorElementCount().Min;
+    }
+
     /// Returns the size of the specified MVT in bits.
     ///
     /// If the value type is a scalable vector type, the scalable property will
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bd1a5a4a876ed..0a108c95bd286 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5461,21 +5461,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     }
     break;
   case ISD::EXTRACT_SUBVECTOR:
-    assert(VT.isVector() && N1.getValueType().isVector() &&
-           "Extract subvector VTs must be a vectors!");
-    assert(VT.getVectorElementType() ==
-               N1.getValueType().getVectorElementType() &&
+    EVT N1VT = N1.getValueType();
+    assert(VT.isVector() && N1VT.isVector() &&
+           "Extract subvector VTs must be vectors!");
+    assert(VT.getVectorElementType() == N1VT.getVectorElementType() &&
            "Extract subvector VTs must have the same element type!");
-    assert(VT.getVectorNumElements() <=
-               N1.getValueType().getVectorNumElements() &&
+    assert((VT.isFixedLengthVector() || N1VT.isScalableVector()) &&
+           "Cannot extract a scalable vector from a fixed length vector!");
+    assert((VT.isScalableVector() != N1VT.isScalableVector() ||
+            VT.getVectorMinNumElements() <= N1VT.getVectorMinNumElements()) &&
            "Extract subvector must be from larger vector to smaller vector!");
     assert(N2C && "Extract subvector index must be a constant");
-    assert(VT.getVectorNumElements() + N2C->getZExtValue() <=
-               N1.getValueType().getVectorNumElements() &&
+    assert((VT.isScalableVector() != N1VT.isScalableVector() ||
+            (VT.getVectorMinNumElements() + N2C->getZExtValue()) <=
+                N1VT.getVectorMinNumElements()) &&
            "Extract subvector overflow!");
 
     // Trivial extraction.
-    if (VT == N1.getValueType())
+    if (VT == N1VT)
       return N1;
 
     // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
@@ -5665,22 +5668,27 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     // Inserting undef into undef is still undef.
     if (N1.isUndef() && N2.isUndef())
       return getUNDEF(VT);
-    assert(VT.isVector() && N1.getValueType().isVector() &&
-           N2.getValueType().isVector() &&
-           "Insert subvector VTs must be a vectors");
+
+    EVT N2VT = N2.getValueType();
     assert(VT == N1.getValueType() &&
            "Dest and insert subvector source types must match!");
-    assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
+    assert(VT.isVector() && N2VT.isVector() &&
+           "Insert subvector VTs must be vectors!");
+    assert((VT.isScalableVector() || N2VT.isFixedLengthVector()) &&
+           "Cannot insert a scalable vector into a fixed length vector!");
+    assert((VT.isScalableVector() != N2VT.isScalableVector() ||
+            VT.getVectorMinNumElements() >= N2VT.getVectorMinNumElements()) &&
            "Insert subvector must be from smaller vector to larger vector!");
     assert(isa<ConstantSDNode>(N3) &&
            "Insert subvector index must be constant");
-    assert(N2.getValueType().getVectorNumElements() +
-                   cast<ConstantSDNode>(N3)->getZExtValue() <=
-               VT.getVectorNumElements() &&
+    assert((VT.isScalableVector() != N2VT.isScalableVector() ||
+            (N2VT.getVectorMinNumElements() +
+             cast<ConstantSDNode>(N3)->getZExtValue()) <=
+                VT.getVectorMinNumElements()) &&
            "Insert subvector overflow!");
 
     // Trivial insertion.
-    if (VT == N2.getValueType())
+    if (VT == N2VT)
       return N2;
 
     // If this is an insert of an extracted vector into an undef vector, we

From 1f4ba66ecc877562e75059e32d4c95a67e1fd483 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Tue, 7 Apr 2020 21:18:00 +0200
Subject: [PATCH 504/770] [clangd] Run PreambleThread in async mode behind a
 flag

Summary: Depends on D80198.

This patch implies ASTs might be built with stale preambles without
blocking for a fresh one. It also drops any guarantee that every preamble
version gets built: when multiple preamble build requests are queued,
older ones may be dropped, in addition to being debounced.

Any preamble requested with WantDiagnostics::Yes will always be built; this
is ensured by blocking the enqueueing of any subsequent request.

AST worker will still block for initial preamble to reduce duplicate
work.

Subscribers: ilya-biryukov, javed.absar, MaskRay, jkorous, arphaman, jfb, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80293
---
 clang-tools-extra/clangd/ClangdServer.cpp     |  2 +
 clang-tools-extra/clangd/ClangdServer.h       |  3 +
 clang-tools-extra/clangd/ParsedAST.cpp        |  5 +
 clang-tools-extra/clangd/ParsedAST.h          |  7 ++
 clang-tools-extra/clangd/TUScheduler.cpp      | 99 ++++++++++++-------
 clang-tools-extra/clangd/TUScheduler.h        |  7 +-
 clang-tools-extra/clangd/tool/ClangdMain.cpp  |  9 ++
 .../clangd/unittests/ClangdTests.cpp          | 12 +--
 .../clangd/unittests/TUSchedulerTests.cpp     | 78 ++++++++++++++-
 9 files changed, 174 insertions(+), 48 deletions(-)

diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp
index 910c591c1f1ec..044b37944b6ce 100644
--- a/clang-tools-extra/clangd/ClangdServer.cpp
+++ b/clang-tools-extra/clangd/ClangdServer.cpp
@@ -114,6 +114,7 @@ ClangdServer::Options ClangdServer::optsForTest() {
   Opts.StorePreamblesInMemory = true;
   Opts.AsyncThreadsCount = 4; // Consistent!
   Opts.TheiaSemanticHighlighting = true;
+  Opts.AsyncPreambleBuilds = true;
   return Opts;
 }
 
@@ -123,6 +124,7 @@ ClangdServer::Options::operator TUScheduler::Options() const {
   Opts.RetentionPolicy = RetentionPolicy;
   Opts.StorePreamblesInMemory = StorePreamblesInMemory;
   Opts.UpdateDebounce = UpdateDebounce;
+  Opts.AsyncPreambleBuilds = AsyncPreambleBuilds;
   return Opts;
 }
 
diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
index 68344eb8f51e7..2c477faecedd1 100644
--- a/clang-tools-extra/clangd/ClangdServer.h
+++ b/clang-tools-extra/clangd/ClangdServer.h
@@ -97,6 +97,9 @@ class ClangdServer {
 
     /// Cached preambles are potentially large. If false, store them on disk.
     bool StorePreamblesInMemory = true;
+    /// Reuse even stale preambles, and rebuild them in the background.
+    /// This improves latency at the cost of accuracy.
+    bool AsyncPreambleBuilds = false;
 
     /// If true, ClangdServer builds a dynamic in-memory index for symbols in
     /// opened files and uses the index to augment code completion results.
diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index 660932a7b259e..082c7cae00219 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -551,5 +551,10 @@ ParsedAST::ParsedAST(llvm::StringRef Version,
   assert(this->Action);
 }
 
+llvm::Optional<llvm::StringRef> ParsedAST::preambleVersion() const {
+  if (!Preamble)
+    return llvm::None;
+  return llvm::StringRef(Preamble->Version);
+}
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/ParsedAST.h b/clang-tools-extra/clangd/ParsedAST.h
index d90f77f9263b3..c01f1fa0e6d8d 100644
--- a/clang-tools-extra/clangd/ParsedAST.h
+++ b/clang-tools-extra/clangd/ParsedAST.h
@@ -33,6 +33,9 @@
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Syntax/Tokens.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
 #include <memory>
 #include <string>
 #include <vector>
@@ -102,6 +105,10 @@ class ParsedAST {
   /// Returns the version of the ParseInputs this AST was built from.
   llvm::StringRef version() const { return Version; }
 
+  /// Returns the version of the ParseInputs used to build Preamble part of this
+  /// AST. Might be None if no Preamble is used.
+  llvm::Optional<llvm::StringRef> preambleVersion() const;
+
 private:
   ParsedAST(llvm::StringRef Version,
             std::shared_ptr<const PreambleData> Preamble,
diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp
index ee6d52188934a..b53daa251b035 100644
--- a/clang-tools-extra/clangd/TUScheduler.cpp
+++ b/clang-tools-extra/clangd/TUScheduler.cpp
@@ -239,10 +239,14 @@ class PreambleThread {
       return;
     }
     {
-      std::lock_guard<std::mutex> Lock(Mutex);
-      // If shutdown is issued, don't bother building.
-      if (Done)
-        return;
+      std::unique_lock<std::mutex> Lock(Mutex);
+      // If NextReq was requested with WantDiagnostics::Yes we cannot just drop
+      // that on the floor. Block until we start building it. This won't
+      // dead-lock as we are blocking the caller thread, while builds continue
+      // on preamble thread.
+      ReqCV.wait(Lock, [this] {
+        return !NextReq || NextReq->WantDiags != WantDiagnostics::Yes;
+      });
       NextReq = std::move(Req);
     }
     // Let the worker thread know there's a request, notify_one is safe as there
@@ -359,8 +363,7 @@ class ASTWorker {
   friend class ASTWorkerHandle;
   ASTWorker(PathRef FileName, const GlobalCompilationDatabase &CDB,
             TUScheduler::ASTCache &LRUCache, Semaphore &Barrier, bool RunSync,
-            DebouncePolicy UpdateDebounce, bool StorePreamblesInMemory,
-            ParsingCallbacks &Callbacks);
+            const TUScheduler::Options &Opts, ParsingCallbacks &Callbacks);
 
 public:
   /// Create a new ASTWorker and return a handle to it.
@@ -368,11 +371,12 @@ class ASTWorker {
   /// is null, all requests will be processed on the calling thread
   /// synchronously instead. \p Barrier is acquired when processing each
   /// request, it is used to limit the number of actively running threads.
-  static ASTWorkerHandle
-  create(PathRef FileName, const GlobalCompilationDatabase &CDB,
-         TUScheduler::ASTCache &IdleASTs, AsyncTaskRunner *Tasks,
-         Semaphore &Barrier, DebouncePolicy UpdateDebounce,
-         bool StorePreamblesInMemory, ParsingCallbacks &Callbacks);
+  static ASTWorkerHandle create(PathRef FileName,
+                                const GlobalCompilationDatabase &CDB,
+                                TUScheduler::ASTCache &IdleASTs,
+                                AsyncTaskRunner *Tasks, Semaphore &Barrier,
+                                const TUScheduler::Options &Opts,
+                                ParsingCallbacks &Callbacks);
   ~ASTWorker();
 
   void update(ParseInputs Inputs, WantDiagnostics);
@@ -476,6 +480,7 @@ class ASTWorker {
   std::queue<Request> PreambleRequests;   /* GUARDED_BY(Mutex) */
   llvm::Optional<Request> CurrentRequest; /* GUARDED_BY(Mutex) */
   mutable std::condition_variable RequestsCV;
+  Notification ReceivedPreamble;
   /// Guards the callback that publishes results of AST-related computations
   /// (diagnostics, highlightings) and file statuses.
   std::mutex PublishMu;
@@ -535,14 +540,14 @@ class ASTWorkerHandle {
   std::shared_ptr<ASTWorker> Worker;
 };
 
-ASTWorkerHandle
-ASTWorker::create(PathRef FileName, const GlobalCompilationDatabase &CDB,
-                  TUScheduler::ASTCache &IdleASTs, AsyncTaskRunner *Tasks,
-                  Semaphore &Barrier, DebouncePolicy UpdateDebounce,
-                  bool StorePreamblesInMemory, ParsingCallbacks &Callbacks) {
-  std::shared_ptr<ASTWorker> Worker(
-      new ASTWorker(FileName, CDB, IdleASTs, Barrier, /*RunSync=*/!Tasks,
-                    UpdateDebounce, StorePreamblesInMemory, Callbacks));
+ASTWorkerHandle ASTWorker::create(PathRef FileName,
+                                  const GlobalCompilationDatabase &CDB,
+                                  TUScheduler::ASTCache &IdleASTs,
+                                  AsyncTaskRunner *Tasks, Semaphore &Barrier,
+                                  const TUScheduler::Options &Opts,
+                                  ParsingCallbacks &Callbacks) {
+  std::shared_ptr<ASTWorker> Worker(new ASTWorker(
+      FileName, CDB, IdleASTs, Barrier, /*RunSync=*/!Tasks, Opts, Callbacks));
   if (Tasks) {
     Tasks->runAsync("ASTWorker:" + llvm::sys::path::filename(FileName),
                     [Worker]() { Worker->run(); });
@@ -555,15 +560,13 @@ ASTWorker::create(PathRef FileName, const GlobalCompilationDatabase &CDB,
 
 ASTWorker::ASTWorker(PathRef FileName, const GlobalCompilationDatabase &CDB,
                      TUScheduler::ASTCache &LRUCache, Semaphore &Barrier,
-                     bool RunSync, DebouncePolicy UpdateDebounce,
-                     bool StorePreamblesInMemory, ParsingCallbacks &Callbacks)
-    : IdleASTs(LRUCache), RunSync(RunSync), UpdateDebounce(UpdateDebounce),
+                     bool RunSync, const TUScheduler::Options &Opts,
+                     ParsingCallbacks &Callbacks)
+    : IdleASTs(LRUCache), RunSync(RunSync), UpdateDebounce(Opts.UpdateDebounce),
       FileName(FileName), CDB(CDB), Callbacks(Callbacks), Barrier(Barrier),
       Done(false), Status(FileName, Callbacks),
-      PreamblePeer(FileName, Callbacks, StorePreamblesInMemory,
-                   // FIXME: Run PreamblePeer asynchronously once ast patching
-                   // is available.
-                   /*RunSync=*/true, Status, *this) {
+      PreamblePeer(FileName, Callbacks, Opts.StorePreamblesInMemory,
+                   RunSync || !Opts.AsyncPreambleBuilds, Status, *this) {
   // Set a fallback command because compile command can be accessed before
   // `Inputs` is initialized. Other fields are only used after initialization
   // from client inputs.
@@ -648,6 +651,12 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) {
 
     PreamblePeer.update(std::move(Invocation), std::move(Inputs),
                         std::move(CompilerInvocationDiags), WantDiags);
+    // Block until first preamble is ready, as patching an empty preamble would
+    // imply rebuilding it from scratch.
+    // This isn't the natural place to block, rather where the preamble would be
+    // consumed. But that's too late, we'd be running on the worker thread with
+    // the PreambleTask scheduled and so we'd deadlock.
+    ReceivedPreamble.wait();
     return;
   };
   startTask(TaskName, std::move(Task), WantDiags, TUScheduler::NoInvalidation);
@@ -771,6 +780,7 @@ void ASTWorker::updatePreamble(std::unique_ptr<CompilerInvocation> CI,
   };
   if (RunSync) {
     Task();
+    ReceivedPreamble.notify();
     return;
   }
   {
@@ -779,6 +789,7 @@ void ASTWorker::updatePreamble(std::unique_ptr<CompilerInvocation> CI,
                            steady_clock::now(), Context::current().clone(),
                            llvm::None, TUScheduler::NoInvalidation, nullptr});
   }
+  ReceivedPreamble.notify();
   RequestsCV.notify_all();
 }
 
@@ -915,6 +926,7 @@ void ASTWorker::stop() {
     Done = true;
   }
   // We are no longer going to build any preambles, let the waiters know that.
+  ReceivedPreamble.notify();
   BuiltFirstPreamble.notify();
   PreamblePeer.stop();
   Status.stop();
@@ -1117,10 +1129,24 @@ bool ASTWorker::shouldSkipHeadLocked() const {
 }
 
 bool ASTWorker::blockUntilIdle(Deadline Timeout) const {
-  std::unique_lock<std::mutex> Lock(Mutex);
-  return wait(Lock, RequestsCV, Timeout, [&] {
-    return PreambleRequests.empty() && Requests.empty() && !CurrentRequest;
-  });
+  auto WaitUntilASTWorkerIsIdle = [&] {
+    std::unique_lock<std::mutex> Lock(Mutex);
+    return wait(Lock, RequestsCV, Timeout, [&] {
+      return PreambleRequests.empty() && Requests.empty() && !CurrentRequest;
+    });
+  };
+  // Make sure ASTWorker has processed all requests, which might issue new
+  // updates to PreamblePeer.
+  WaitUntilASTWorkerIsIdle();
+  // Now that ASTWorker processed all requests, ensure PreamblePeer has served
+  // all update requests. This might create new PreambleRequests for the
+  // ASTWorker.
+  PreamblePeer.blockUntilIdle(Timeout);
+  assert(Requests.empty() &&
+         "No new normal tasks can be scheduled concurrently with "
+         "blockUntilIdle(): ASTWorker isn't threadsafe");
+  // Finally make sure ASTWorker has processed all of the preamble updates.
+  return WaitUntilASTWorkerIsIdle();
 }
 
 // Render a TUAction to a user-facing string representation.
@@ -1178,13 +1204,12 @@ struct TUScheduler::FileData {
 TUScheduler::TUScheduler(const GlobalCompilationDatabase &CDB,
                          const Options &Opts,
                          std::unique_ptr<ParsingCallbacks> Callbacks)
-    : CDB(CDB), StorePreamblesInMemory(Opts.StorePreamblesInMemory),
+    : CDB(CDB), Opts(Opts),
       Callbacks(Callbacks ? move(Callbacks)
                           : std::make_unique<ParsingCallbacks>()),
       Barrier(Opts.AsyncThreadsCount),
       IdleASTs(
-          std::make_unique<ASTCache>(Opts.RetentionPolicy.MaxRetainedASTs)),
-      UpdateDebounce(Opts.UpdateDebounce) {
+          std::make_unique<ASTCache>(Opts.RetentionPolicy.MaxRetainedASTs)) {
   if (0 < Opts.AsyncThreadsCount) {
     PreambleTasks.emplace();
     WorkerThreads.emplace();
@@ -1218,10 +1243,10 @@ bool TUScheduler::update(PathRef File, ParseInputs Inputs,
   bool NewFile = FD == nullptr;
   if (!FD) {
     // Create a new worker to process the AST-related tasks.
-    ASTWorkerHandle Worker = ASTWorker::create(
-        File, CDB, *IdleASTs,
-        WorkerThreads ? WorkerThreads.getPointer() : nullptr, Barrier,
-        UpdateDebounce, StorePreamblesInMemory, *Callbacks);
+    ASTWorkerHandle Worker =
+        ASTWorker::create(File, CDB, *IdleASTs,
+                          WorkerThreads ? WorkerThreads.getPointer() : nullptr,
+                          Barrier, Opts, *Callbacks);
     FD = std::unique_ptr<FileData>(
         new FileData{Inputs.Contents, std::move(Worker)});
   } else {
diff --git a/clang-tools-extra/clangd/TUScheduler.h b/clang-tools-extra/clangd/TUScheduler.h
index f24a777d5836f..fb32c3b2edff4 100644
--- a/clang-tools-extra/clangd/TUScheduler.h
+++ b/clang-tools-extra/clangd/TUScheduler.h
@@ -191,6 +191,10 @@ class TUScheduler {
 
     /// Determines when to keep idle ASTs in memory for future use.
     ASTRetentionPolicy RetentionPolicy;
+
+    /// Whether to run PreamblePeer asynchronously.
+    /// No-op if AsyncThreadsCount is 0.
+    bool AsyncPreambleBuilds = false;
   };
 
   TUScheduler(const GlobalCompilationDatabase &CDB, const Options &Opts,
@@ -301,7 +305,7 @@ class TUScheduler {
 
 private:
   const GlobalCompilationDatabase &CDB;
-  const bool StorePreamblesInMemory;
+  const Options Opts;
   std::unique_ptr<ParsingCallbacks> Callbacks; // not nullptr
   Semaphore Barrier;
   llvm::StringMap<std::unique_ptr<FileData>> Files;
@@ -310,7 +314,6 @@ class TUScheduler {
   // asynchronously.
   llvm::Optional<AsyncTaskRunner> PreambleTasks;
   llvm::Optional<AsyncTaskRunner> WorkerThreads;
-  DebouncePolicy UpdateDebounce;
 };
 
 } // namespace clangd
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index cab6c97cf121e..eec3a830f6e77 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -417,6 +417,15 @@ opt<bool> PrettyPrint{
     init(false),
 };
 
+opt<bool> AsyncPreamble{
+    "async-preamble",
+    cat(Misc),
+    desc("Reuse even stale preambles, and rebuild them in the background. This "
+         "improves latency at the cost of accuracy."),
+    init(ClangdServer::Options().AsyncPreambleBuilds),
+    Hidden,
+};
+
 /// Supports a test URI scheme with relaxed constraints for lit tests.
 /// The path in a test URI will be combined with a platform-specific fake
 /// directory to form an absolute path. For example, test:///a.cpp is resolved
diff --git a/clang-tools-extra/clangd/unittests/ClangdTests.cpp b/clang-tools-extra/clangd/unittests/ClangdTests.cpp
index 81075ff1bbe73..46164015a3ce0 100644
--- a/clang-tools-extra/clangd/unittests/ClangdTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ClangdTests.cpp
@@ -211,18 +211,18 @@ int b = a;
   FS.Files[FooCpp] = SourceContents;
 
   Server.addDocument(FooCpp, SourceContents);
-  auto DumpParse1 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   ASSERT_TRUE(Server.blockUntilIdleForTest()) << "Waiting for diagnostics";
+  auto DumpParse1 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   EXPECT_FALSE(DiagConsumer.hadErrorInLastDiags());
 
   Server.addDocument(FooCpp, "");
-  auto DumpParseEmpty = dumpASTWithoutMemoryLocs(Server, FooCpp);
   ASSERT_TRUE(Server.blockUntilIdleForTest()) << "Waiting for diagnostics";
+  auto DumpParseEmpty = dumpASTWithoutMemoryLocs(Server, FooCpp);
   EXPECT_FALSE(DiagConsumer.hadErrorInLastDiags());
 
   Server.addDocument(FooCpp, SourceContents);
-  auto DumpParse2 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   ASSERT_TRUE(Server.blockUntilIdleForTest()) << "Waiting for diagnostics";
+  auto DumpParse2 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   EXPECT_FALSE(DiagConsumer.hadErrorInLastDiags());
 
   EXPECT_EQ(DumpParse1, DumpParse2);
@@ -247,20 +247,20 @@ int b = a;
   FS.Files[FooCpp] = SourceContents;
 
   Server.addDocument(FooCpp, SourceContents);
-  auto DumpParse1 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   ASSERT_TRUE(Server.blockUntilIdleForTest()) << "Waiting for diagnostics";
+  auto DumpParse1 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   EXPECT_FALSE(DiagConsumer.hadErrorInLastDiags());
 
   FS.Files[FooH] = "";
   Server.addDocument(FooCpp, SourceContents);
-  auto DumpParseDifferent = dumpASTWithoutMemoryLocs(Server, FooCpp);
   ASSERT_TRUE(Server.blockUntilIdleForTest()) << "Waiting for diagnostics";
+  auto DumpParseDifferent = dumpASTWithoutMemoryLocs(Server, FooCpp);
   EXPECT_TRUE(DiagConsumer.hadErrorInLastDiags());
 
   FS.Files[FooH] = "int a;";
   Server.addDocument(FooCpp, SourceContents);
-  auto DumpParse2 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   ASSERT_TRUE(Server.blockUntilIdleForTest()) << "Waiting for diagnostics";
+  auto DumpParse2 = dumpASTWithoutMemoryLocs(Server, FooCpp);
   EXPECT_FALSE(DiagConsumer.hadErrorInLastDiags());
 
   EXPECT_EQ(DumpParse1, DumpParse2);
diff --git a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp
index 9e6ad6c9b6e1e..8c8520aae620c 100644
--- a/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/TUSchedulerTests.cpp
@@ -21,14 +21,20 @@
 #include "support/Threading.h"
 #include "clang/Basic/DiagnosticDriver.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FunctionExtras.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <atomic>
 #include <chrono>
 #include <cstdint>
+#include <memory>
+#include <string>
 #include <utility>
 
 namespace clang {
@@ -407,6 +413,7 @@ TEST_F(TUSchedulerTests, ManyUpdates) {
   int TotalASTReads = 0;
   int TotalPreambleReads = 0;
   int TotalUpdates = 0;
+  llvm::StringMap<int> LatestDiagVersion;
 
   // Run TUScheduler and collect some stats.
   {
@@ -441,15 +448,23 @@ TEST_F(TUSchedulerTests, ManyUpdates) {
         auto Inputs = getInputs(File, Contents.str());
         {
           WithContextValue WithNonce(NonceKey, ++Nonce);
-          Inputs.Version = std::to_string(Nonce);
+          Inputs.Version = std::to_string(UpdateI);
           updateWithDiags(
               S, File, Inputs, WantDiagnostics::Auto,
-              [File, Nonce, &Mut, &TotalUpdates](std::vector<Diag>) {
+              [File, Nonce, Version(Inputs.Version), &Mut, &TotalUpdates,
+               &LatestDiagVersion](std::vector<Diag>) {
                 EXPECT_THAT(Context::current().get(NonceKey), Pointee(Nonce));
 
                 std::lock_guard<std::mutex> Lock(Mut);
                 ++TotalUpdates;
                 EXPECT_EQ(File, *TUScheduler::getFileBeingProcessedInContext());
+                // Make sure Diags are for a newer version.
+                auto It = LatestDiagVersion.try_emplace(File, -1);
+                const int PrevVersion = It.first->second;
+                int CurVersion;
+                ASSERT_TRUE(llvm::to_integer(Version, CurVersion, 10));
+                EXPECT_LT(PrevVersion, CurVersion);
+                It.first->getValue() = CurVersion;
               });
         }
         {
@@ -494,7 +509,13 @@ TEST_F(TUSchedulerTests, ManyUpdates) {
   } // TUScheduler destructor waits for all operations to finish.
 
   std::lock_guard<std::mutex> Lock(Mut);
-  EXPECT_EQ(TotalUpdates, FilesCount * UpdatesPerFile);
+  // Updates might get coalesced in preamble thread and result in dropping
+  // diagnostics for intermediate snapshots.
+  EXPECT_GE(TotalUpdates, FilesCount);
+  EXPECT_LE(TotalUpdates, FilesCount * UpdatesPerFile);
+  // We should receive diags for last update.
+  for (const auto &Entry : LatestDiagVersion)
+    EXPECT_EQ(Entry.second, UpdatesPerFile - 1);
   EXPECT_EQ(TotalASTReads, FilesCount * UpdatesPerFile);
   EXPECT_EQ(TotalPreambleReads, FilesCount * UpdatesPerFile);
 }
@@ -972,6 +993,57 @@ TEST(DebouncePolicy, Compute) {
   EXPECT_NEAR(25, Compute({}), 0.01) << "no history -> max";
 }
 
+TEST_F(TUSchedulerTests, AsyncPreambleThread) {
+  // Blocks preamble thread while building preamble with \p BlockVersion until
+  // \p N is notified.
+  class BlockPreambleThread : public ParsingCallbacks {
+  public:
+    BlockPreambleThread(llvm::StringRef BlockVersion, Notification &N)
+        : BlockVersion(BlockVersion), N(N) {}
+    void onPreambleAST(PathRef Path, llvm::StringRef Version, ASTContext &Ctx,
+                       std::shared_ptr<clang::Preprocessor> PP,
+                       const CanonicalIncludes &) override {
+      if (Version == BlockVersion)
+        N.wait();
+    }
+
+  private:
+    llvm::StringRef BlockVersion;
+    Notification &N;
+  };
+
+  static constexpr llvm::StringLiteral InputsV0 = "v0";
+  static constexpr llvm::StringLiteral InputsV1 = "v1";
+  Notification Ready;
+  TUScheduler S(CDB, optsForTest(),
+                std::make_unique<BlockPreambleThread>(InputsV1, Ready));
+
+  Path File = testPath("foo.cpp");
+  auto PI = getInputs(File, "");
+  PI.Version = InputsV0.str();
+  S.update(File, PI, WantDiagnostics::Auto);
+  S.blockUntilIdle(timeoutSeconds(10));
+
+  // Block preamble builds.
+  PI.Version = InputsV1.str();
+  // Issue second update which will block preamble thread.
+  S.update(File, PI, WantDiagnostics::Auto);
+
+  Notification RunASTAction;
+  // Issue an AST read, which shouldn't be blocked and should see the latest
+  // version of the file.
+  S.runWithAST("test", File, [&](Expected<InputsAndAST> AST) {
+    ASSERT_TRUE(bool(AST));
+    // Make sure preamble is built with stale inputs, but AST was built using
+    // new ones.
+    EXPECT_THAT(AST->AST.preambleVersion(), InputsV0);
+    EXPECT_THAT(AST->Inputs.Version, InputsV1.str());
+    RunASTAction.notify();
+  });
+  RunASTAction.wait();
+  Ready.notify();
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang

From b9826c10866997a8869a7356a37aade759338b08 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 29 May 2020 12:25:27 +0100
Subject: [PATCH 505/770] [CGP] Ensure address scaled offset is representable
 as int64_t

AddressingModeMatcher::matchScaledValue was calling getSExtValue for a constant before ensuring that we can actually represent the value as int64_t

Fixes OSSFuzz#22723 which is a followup to rGc479052a74b2 (PR46004 / OSSFuzz#22357)
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp |  5 +++--
 llvm/test/CodeGen/X86/pr46004.ll    | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index ee4b43446ee1c..c22cf5f81ee5a 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3715,10 +3715,11 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
   // X*Scale + C*Scale to addr mode.
   ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
   if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
-      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
+      CI->getValue().isSignedIntN(64)) {
     TestAddrMode.InBounds = false;
     TestAddrMode.ScaledReg = AddLHS;
-    TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+    TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
 
     // If this addressing mode is legal, commit it and remember that we folded
     // this instruction.
diff --git a/llvm/test/CodeGen/X86/pr46004.ll b/llvm/test/CodeGen/X86/pr46004.ll
index 5b00e5998a3ec..19353560e738d 100644
--- a/llvm/test/CodeGen/X86/pr46004.ll
+++ b/llvm/test/CodeGen/X86/pr46004.ll
@@ -19,3 +19,18 @@ define void @fuzz22357(i128 %a0) {
   store i8 0, i8* %3, align 1
   ret void
 }
+
+; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=22723
+define void @fuzz22723(i128 %a0) {
+; X86-LABEL: fuzz22723:
+; X86:       # %bb.0:
+; X86-NEXT:    retl
+;
+; X64-LABEL: fuzz22723:
+; X64:       # %bb.0:
+; X64-NEXT:    retq
+  %1 = add i128 %a0, 170141183460469231731687303715884105727
+  %2 = getelementptr i128*, i128** undef, i128 %1
+  store i128* undef, i128** %2, align 8
+  ret void
+}

From aa93659c9ffcf36dc1b0c70ce5b0f526a2b9c007 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Fri, 29 May 2020 06:42:35 -0400
Subject: [PATCH 506/770] [mlir][SCF] Add utility to clone an scf.ForOp while
 appending new yield values.

This utility factors out the machinery required to add iterArgs and yield values to an scf.ForOp.

Differential Revision: https://reviews.llvm.org/D80656
---
 mlir/include/mlir/Dialect/SCF/Utils.h         | 50 +++++++++++++
 .../lib/Dialect/SCF/Transforms/CMakeLists.txt |  1 +
 mlir/lib/Dialect/SCF/Transforms/Utils.cpp     | 73 +++++++++++++++++++
 mlir/test/Transforms/loop-utils.mlir          | 40 ++++++++++
 mlir/test/lib/Transforms/CMakeLists.txt       |  2 +
 mlir/test/lib/Transforms/TestSCFUtils.cpp     | 58 +++++++++++++++
 mlir/tools/mlir-opt/mlir-opt.cpp              |  2 +
 7 files changed, 226 insertions(+)
 create mode 100644 mlir/include/mlir/Dialect/SCF/Utils.h
 create mode 100644 mlir/lib/Dialect/SCF/Transforms/Utils.cpp
 create mode 100644 mlir/test/Transforms/loop-utils.mlir
 create mode 100644 mlir/test/lib/Transforms/TestSCFUtils.cpp

diff --git a/mlir/include/mlir/Dialect/SCF/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils.h
new file mode 100644
index 0000000000000..7f8ebd3a42606
--- /dev/null
+++ b/mlir/include/mlir/Dialect/SCF/Utils.h
@@ -0,0 +1,50 @@
+//===- Utils.h - SCF dialect utilities --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes for various SCF utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_SCF_UTILS_H_
+#define MLIR_DIALECT_SCF_UTILS_H_
+
+namespace mlir {
+class OpBuilder;
+class ValueRange;
+
+namespace scf {
+class ForOp;
+class ParallelOp;
+} // end namespace scf
+
+/// Create a clone of `loop` with `newIterOperands` added as new initialization
+/// values and `newYieldedValues` added as new yielded values. The returned
+/// ForOp has `newYieldedValues.size()` new result values.  The `loop` induction
+/// variable and `newIterOperands` are remapped to the new induction variable
+/// and the new entry block arguments respectively.
+///
+/// Additionally, if `replaceLoopResults` is true, all uses of
+/// `loop.getResults()` are replaced with the first `loop.getNumResults()`
+/// return values respectively. This additional replacement is provided as a
+/// convenience to update the consumers of `loop`, in the case e.g. when `loop`
+/// is soon to be deleted.
+///
+/// Return the cloned loop.
+///
+/// This convenience function is useful to factorize common mechanisms related
+/// to hoisting roundtrips to memory into yields. It does not perform any
+/// legality checks.
+///
+/// Prerequisite: `newIterOperands.size() == newYieldedValues.size()`.
+scf::ForOp cloneWithNewYields(OpBuilder &b, scf::ForOp loop,
+                              ValueRange newIterOperands,
+                              ValueRange newYieldedValues,
+                              bool replaceLoopResults = true);
+
+} // end namespace mlir
+#endif // MLIR_DIALECT_SCF_UTILS_H_
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
index f2b0265ed74dd..58890d4a3782a 100644
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
   ParallelLoopFusion.cpp
   ParallelLoopSpecialization.cpp
   ParallelLoopTiling.cpp
+  Utils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SCF
diff --git a/mlir/lib/Dialect/SCF/Transforms/Utils.cpp b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
new file mode 100644
index 0000000000000..6ae360a34abc4
--- /dev/null
+++ b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp
@@ -0,0 +1,73 @@
+//===- Utils.cpp ---- Misc utilities for loop transformation --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements miscellaneous loop transformation routines.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SCF/Utils.h"
+
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+
+using namespace mlir;
+
+scf::ForOp mlir::cloneWithNewYields(OpBuilder &b, scf::ForOp loop,
+                                    ValueRange newIterOperands,
+                                    ValueRange newYieldedValues,
+                                    bool replaceLoopResults) {
+  assert(newIterOperands.size() == newYieldedValues.size() &&
+         "newIterOperands must be of the same size as newYieldedValues");
+
+  // Create a new loop before the existing one, with the extra operands.
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(loop);
+  auto operands = llvm::to_vector<4>(loop.getIterOperands());
+  operands.append(newIterOperands.begin(), newIterOperands.end());
+  scf::ForOp newLoop =
+      b.create<scf::ForOp>(loop.getLoc(), loop.lowerBound(), loop.upperBound(),
+                           loop.step(), operands);
+
+  auto &loopBody = *loop.getBody();
+  auto &newLoopBody = *newLoop.getBody();
+  // Clone / erase the yield inside the original loop to both:
+  //   1. augment its operands with the newYieldedValues.
+  //   2. automatically apply the BlockAndValueMapping on its operands.
+  auto yield = cast<scf::YieldOp>(loopBody.getTerminator());
+  b.setInsertionPoint(yield);
+  auto yieldOperands = llvm::to_vector<4>(yield.getOperands());
+  yieldOperands.append(newYieldedValues.begin(), newYieldedValues.end());
+  auto newYield = b.create<scf::YieldOp>(yield.getLoc(), yieldOperands);
+
+  // Clone the loop body with remaps.
+  BlockAndValueMapping bvm;
+  // a. remap the induction variable.
+  bvm.map(loop.getInductionVar(), newLoop.getInductionVar());
+  // b. remap the BB args.
+  bvm.map(loopBody.getArguments(),
+          newLoopBody.getArguments().take_front(loopBody.getNumArguments()));
+  // c. remap the iter args.
+  bvm.map(newIterOperands,
+          newLoop.getRegionIterArgs().take_back(newIterOperands.size()));
+  b.setInsertionPointToStart(&newLoopBody);
+  // Skip the original yield terminator which does not have enough operands.
+  for (auto &o : loopBody.without_terminator())
+    b.clone(o, bvm);
+
+  // Replace `loop`'s results if requested.
+  if (replaceLoopResults) {
+    for (auto it : llvm::zip(loop.getResults(), newLoop.getResults().take_front(
+                                                    loop.getNumResults())))
+      std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
+  }
+
+  // TODO: this is unsafe in the context of a PatternRewrite.
+  newYield.erase();
+
+  return newLoop;
+}
diff --git a/mlir/test/Transforms/loop-utils.mlir b/mlir/test/Transforms/loop-utils.mlir
new file mode 100644
index 0000000000000..3d3dadfba1791
--- /dev/null
+++ b/mlir/test/Transforms/loop-utils.mlir
@@ -0,0 +1,40 @@
+// RUN: mlir-opt -allow-unregistered-dialect -test-scf-utils -mlir-disable-threading %s | FileCheck %s
+
+// CHECK-LABEL: @hoist
+//  CHECK-SAME: %[[lb:[a-zA-Z0-9]*]]: index,
+//  CHECK-SAME: %[[ub:[a-zA-Z0-9]*]]: index,
+//  CHECK-SAME: %[[step:[a-zA-Z0-9]*]]: index
+func @hoist(%lb: index, %ub: index, %step: index) {
+  // CHECK: %[[A:.*]] = "fake_read"() : () -> index
+  // CHECK: %[[RES:.*]] = scf.for %[[I:.*]] = %[[lb]] to %[[ub]] step %[[step]] iter_args(%[[VAL:.*]] = %[[A]]) -> (index)
+  // CHECK:   %[[YIELD:.*]] = "fake_compute"(%[[VAL]]) : (index) -> index
+  // CHECK:   scf.yield %[[YIELD]] : index
+  // CHECK: "fake_write"(%[[RES]]) : (index) -> ()
+  scf.for %i = %lb to %ub step %step {
+    %0 = "fake_read"() : () -> (index)
+    %1 = "fake_compute"(%0) : (index) -> (index)
+    "fake_write"(%1) : (index) -> ()
+  }
+  return
+}
+
+// CHECK-LABEL: @hoist2
+//  CHECK-SAME: %[[lb:[a-zA-Z0-9]*]]: index,
+//  CHECK-SAME: %[[ub:[a-zA-Z0-9]*]]: index,
+//  CHECK-SAME: %[[step:[a-zA-Z0-9]*]]: index
+//  CHECK-SAME: %[[extra_arg:[a-zA-Z0-9]*]]: f32
+func @hoist2(%lb: index, %ub: index, %step: index, %extra_arg: f32) -> f32 {
+  // CHECK: %[[A:.*]] = "fake_read"() : () -> index
+  // CHECK: %[[RES:.*]]:2 = scf.for %[[I:.*]] = %[[lb]] to %[[ub]] step %[[step]] iter_args(%[[VAL0:.*]] = %[[extra_arg]], %[[VAL1:.*]] = %[[A]]) -> (f32, index)
+  // CHECK:   %[[YIELD:.*]] = "fake_compute"(%[[VAL1]]) : (index) -> index
+  // CHECK:   scf.yield %[[VAL0]], %[[YIELD]] : f32, index
+  // CHECK: "fake_write"(%[[RES]]#1) : (index) -> ()
+  // CHECK: return %[[RES]]#0 : f32
+  %0 = scf.for %i = %lb to %ub step %step iter_args(%iter = %extra_arg) -> (f32) {
+    %0 = "fake_read"() : () -> (index)
+    %1 = "fake_compute"(%0) : (index) -> (index)
+    "fake_write"(%1) : (index) -> ()
+    scf.yield %iter: f32
+  }
+  return %0: f32
+}
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 6069570316a8f..3f2befefe704e 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@ add_mlir_library(MLIRTestTransforms
   TestMemRefBoundCheck.cpp
   TestMemRefDependenceCheck.cpp
   TestMemRefStrideCalculation.cpp
+  TestSCFUtils.cpp
   TestVectorTransforms.cpp
 
   EXCLUDE_FROM_LIBMLIR
@@ -41,6 +42,7 @@ add_mlir_library(MLIRTestTransforms
   MLIRLinalgTransforms
   MLIRNVVMIR
   MLIRSCF
+  MLIRSCFTransforms
   MLIRGPU
   MLIRPass
   MLIRROCDLIR
diff --git a/mlir/test/lib/Transforms/TestSCFUtils.cpp b/mlir/test/lib/Transforms/TestSCFUtils.cpp
new file mode 100644
index 0000000000000..ba06bbcc8860f
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestSCFUtils.cpp
@@ -0,0 +1,58 @@
+//===- TestSCFUtils.cpp --- Pass to test independent SCF dialect utils ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to test SCF dialect utils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/SCF/Utils.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "llvm/ADT/SetVector.h"
+
+using namespace mlir;
+
+namespace {
+class TestSCFUtilsPass : public PassWrapper<TestSCFUtilsPass, FunctionPass> {
+public:
+  explicit TestSCFUtilsPass() {}
+
+  void runOnFunction() override {
+    FuncOp func = getFunction();
+    SmallVector<scf::ForOp, 4> toErase;
+
+    func.walk([&](Operation *fakeRead) {
+      if (fakeRead->getName().getStringRef() != "fake_read")
+        return;
+      auto *fakeCompute = fakeRead->getResult(0).use_begin()->getOwner();
+      auto *fakeWrite = fakeCompute->getResult(0).use_begin()->getOwner();
+      auto loop = fakeRead->getParentOfType<scf::ForOp>();
+
+      OpBuilder b(loop);
+      loop.moveOutOfLoop({fakeRead});
+      fakeWrite->moveAfter(loop);
+      auto newLoop = cloneWithNewYields(b, loop, fakeRead->getResult(0),
+                                        fakeCompute->getResult(0));
+      fakeCompute->getResult(0).replaceAllUsesWith(
+          newLoop.getResults().take_back()[0]);
+      toErase.push_back(loop);
+    });
+    for (auto loop : llvm::reverse(toErase))
+      loop.erase();
+  }
+};
+} // end namespace
+
+namespace mlir {
+void registerTestSCFUtilsPass() {
+  PassRegistration<TestSCFUtilsPass>("test-scf-utils", "test scf utils");
+}
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 2764b23b7b35e..165d6ad159ac9 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -61,6 +61,7 @@ void registerTestMemRefStrideCalculation();
 void registerTestOpaqueLoc();
 void registerTestParallelismDetection();
 void registerTestGpuParallelLoopMappingPass();
+void registerTestSCFUtilsPass();
 void registerTestVectorConversions();
 void registerVectorizerTestPass();
 } // namespace mlir
@@ -131,6 +132,7 @@ void registerTestPasses() {
   registerTestOpaqueLoc();
   registerTestParallelismDetection();
   registerTestGpuParallelLoopMappingPass();
+  registerTestSCFUtilsPass();
   registerTestVectorConversions();
   registerVectorizerTestPass();
 }

From 20b2af3e5559e50d1b5279311c6e5034a2d1928d Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Fri, 29 May 2020 12:31:35 +0200
Subject: [PATCH 507/770] [clangd][NFC] Add traces for PreamblePatch::create

---
 clang-tools-extra/clangd/Preamble.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp
index 667fa04ff3458..d02685f4bf46d 100644
--- a/clang-tools-extra/clangd/Preamble.cpp
+++ b/clang-tools-extra/clangd/Preamble.cpp
@@ -385,6 +385,8 @@ void escapeBackslashAndQuotes(llvm::StringRef Text, llvm::raw_ostream &OS) {
 PreamblePatch PreamblePatch::create(llvm::StringRef FileName,
                                     const ParseInputs &Modified,
                                     const PreambleData &Baseline) {
+  trace::Span Tracer("CreatePreamblePatch");
+  SPAN_ATTACH(Tracer, "File", FileName);
   assert(llvm::sys::path::is_absolute(FileName) && "relative FileName!");
   // First scan preprocessor directives in Baseline and Modified. These will be
   // used to figure out newly added directives in Modified. Scanning can fail,

From ea7db621d289022f0733eb63bb10a837936cbb38 Mon Sep 17 00:00:00 2001
From: Xing GUO <higuoxing@gmail.com>
Date: Fri, 29 May 2020 19:56:32 +0800
Subject: [PATCH 508/770] [ObjectYAML][DWARF] Make the `PubSection` optional.

This patch helps make the `PubSection` optional in the DWARF structure.

Reviewed By: jhenderson, aprantl

Differential Revision: https://reviews.llvm.org/D80722
---
 llvm/include/llvm/ObjectYAML/DWARFYAML.h      | 11 ++-
 llvm/lib/ObjectYAML/DWARFYAML.cpp             | 18 ++--
 llvm/lib/ObjectYAML/MachOEmitter.cpp          | 10 ++-
 .../ObjectYAML/MachO/DWARF-pubsections.yaml   | 90 +++++++++++++++++++
 llvm/tools/obj2yaml/dwarf2yaml.cpp            | 32 +++++--
 5 files changed, 133 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
index 806dd13715e51..509417beb2837 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
@@ -99,6 +99,9 @@ struct PubSection {
   uint32_t UnitSize;
   bool IsGNUStyle = false;
   std::vector<PubEntry> Entries;
+
+  PubSection() = default;
+  PubSection(bool IsGNUStyle) : IsGNUStyle(IsGNUStyle) {}
 };
 
 struct FormValue {
@@ -161,11 +164,11 @@ struct Data {
   std::vector<StringRef> DebugStrings;
   std::vector<ARange> ARanges;
   std::vector<Ranges> DebugRanges;
-  PubSection PubNames;
-  PubSection PubTypes;
+  Optional<PubSection> PubNames;
+  Optional<PubSection> PubTypes;
 
-  PubSection GNUPubNames;
-  PubSection GNUPubTypes;
+  Optional<PubSection> GNUPubNames;
+  Optional<PubSection> GNUPubTypes;
 
   std::vector<Unit> CompileUnits;
 
diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp
index 4805f727e0ce9..30a77c74dc6c9 100644
--- a/llvm/lib/ObjectYAML/DWARFYAML.cpp
+++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp
@@ -17,10 +17,8 @@ namespace llvm {
 
 bool DWARFYAML::Data::isEmpty() const {
   return DebugStrings.empty() && AbbrevDecls.empty() && ARanges.empty() &&
-         DebugRanges.empty() && PubNames.Entries.empty() &&
-         PubTypes.Entries.empty() && GNUPubNames.Entries.empty() &&
-         GNUPubTypes.Entries.empty() && CompileUnits.empty() &&
-         DebugLines.empty();
+         DebugRanges.empty() && !PubNames && !PubTypes && !GNUPubNames &&
+         !GNUPubTypes && CompileUnits.empty() && DebugLines.empty();
 }
 
 SetVector<StringRef> DWARFYAML::Data::getUsedSectionNames() const {
@@ -41,14 +39,10 @@ void MappingTraits<DWARFYAML::Data>::mapping(IO &IO, DWARFYAML::Data &DWARF) {
     IO.mapOptional("debug_aranges", DWARF.ARanges);
   if (!DWARF.DebugRanges.empty() || !IO.outputting())
     IO.mapOptional("debug_ranges", DWARF.DebugRanges);
-  if (!DWARF.PubNames.Entries.empty() || !IO.outputting())
-    IO.mapOptional("debug_pubnames", DWARF.PubNames);
-  if (!DWARF.PubTypes.Entries.empty() || !IO.outputting())
-    IO.mapOptional("debug_pubtypes", DWARF.PubTypes);
-  if (!DWARF.GNUPubNames.Entries.empty() || !IO.outputting())
-    IO.mapOptional("debug_gnu_pubnames", DWARF.GNUPubNames);
-  if (!DWARF.GNUPubTypes.Entries.empty() || !IO.outputting())
-    IO.mapOptional("debug_gnu_pubtypes", DWARF.GNUPubTypes);
+  IO.mapOptional("debug_pubnames", DWARF.PubNames);
+  IO.mapOptional("debug_pubtypes", DWARF.PubTypes);
+  IO.mapOptional("debug_gnu_pubnames", DWARF.GNUPubNames);
+  IO.mapOptional("debug_gnu_pubtypes", DWARF.GNUPubTypes);
   IO.mapOptional("debug_info", DWARF.CompileUnits);
   IO.mapOptional("debug_line", DWARF.DebugLines);
   IO.setContext(&oldContext);
diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp
index f8661e0c3c317..c9bca43a4ef9f 100644
--- a/llvm/lib/ObjectYAML/MachOEmitter.cpp
+++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp
@@ -295,11 +295,13 @@ Error MachOWriter::writeSectionData(raw_ostream &OS) {
           } else if (0 == strncmp(&Sec.sectname[0], "__debug_ranges", 16)) {
             DWARFYAML::EmitDebugRanges(OS, Obj.DWARF);
           } else if (0 == strncmp(&Sec.sectname[0], "__debug_pubnames", 16)) {
-            DWARFYAML::EmitPubSection(OS, Obj.DWARF.PubNames,
-                                      Obj.IsLittleEndian);
+            if (Obj.DWARF.PubNames)
+              DWARFYAML::EmitPubSection(OS, *Obj.DWARF.PubNames,
+                                        Obj.IsLittleEndian);
           } else if (0 == strncmp(&Sec.sectname[0], "__debug_pubtypes", 16)) {
-            DWARFYAML::EmitPubSection(OS, Obj.DWARF.PubTypes,
-                                      Obj.IsLittleEndian);
+            if (Obj.DWARF.PubTypes)
+              DWARFYAML::EmitPubSection(OS, *Obj.DWARF.PubTypes,
+                                        Obj.IsLittleEndian);
           } else if (0 == strncmp(&Sec.sectname[0], "__debug_info", 16)) {
             DWARFYAML::EmitDebugInfo(OS, Obj.DWARF);
           } else if (0 == strncmp(&Sec.sectname[0], "__debug_line", 16)) {
diff --git a/llvm/test/ObjectYAML/MachO/DWARF-pubsections.yaml b/llvm/test/ObjectYAML/MachO/DWARF-pubsections.yaml
index f2eadd20d8a73..4faac060c8a57 100644
--- a/llvm/test/ObjectYAML/MachO/DWARF-pubsections.yaml
+++ b/llvm/test/ObjectYAML/MachO/DWARF-pubsections.yaml
@@ -1,3 +1,9 @@
+## This file contains test cases for generating .debug_pubnames/.debug_pubtypes
+## sections in object files from the DWARF entry of Mach-O YAML inputs.
+
+## a) Test that yaml2obj emits the .debug_pubnames and .debug_pubtypes sections and
+## obj2yaml converts them back.
+
 # RUN: yaml2obj %s | obj2yaml | FileCheck %s
 
 --- !mach-o
@@ -345,3 +351,87 @@ DWARF:
 #CHECK:       - DieOffset:       0x00000071
 #CHECK:         Name:            char
 #CHECK: ...
+
+## b) Test that yaml2obj will not emit the .debug_pubnames/.debug_pubtypes section's
+## contents, if the "debug_pubnames"/"debug_pubtypes" entry doesn't exist in the
+## "DWARF" entry.
+
+# RUN: yaml2obj --docnum=2 %s | obj2yaml | FileCheck %s --check-prefix=EMPTY
+
+#      EMPTY: Sections:
+# EMPTY-NEXT:   - sectname:        __debug_pubnames
+# EMPTY-NEXT:     segname:         __DWARF
+# EMPTY-NEXT:     addr:            0x0000000000000000
+# EMPTY-NEXT:     size:            0
+# EMPTY-NEXT:     offset:          0x00000000
+# EMPTY-NEXT:     align:           0
+# EMPTY-NEXT:     reloff:          0x00000000
+# EMPTY-NEXT:     nreloc:          0
+# EMPTY-NEXT:     flags:           0x00000000
+# EMPTY-NEXT:     reserved1:       0x00000000
+# EMPTY-NEXT:     reserved2:       0x00000000
+# EMPTY-NEXT:     reserved3:       0x00000000
+# EMPTY-NEXT:     content:         ''
+# EMPTY-NEXT:   - sectname:        __debug_pubtypes
+# EMPTY-NEXT:     segname:         __DWARF
+# EMPTY-NEXT:     addr:            0x0000000000000000
+# EMPTY-NEXT:     size:            0
+# EMPTY-NEXT:     offset:          0x00000720
+# EMPTY-NEXT:     align:           0
+# EMPTY-NEXT:     reloff:          0x00000000
+# EMPTY-NEXT:     nreloc:          0
+# EMPTY-NEXT:     flags:           0x00000000
+# EMPTY-NEXT:     reserved1:       0x00000000
+# EMPTY-NEXT:     reserved2:       0x00000000
+# EMPTY-NEXT:     reserved3:       0x00000000
+# EMPTY-NEXT:     content:         ''
+# EMPTY-NEXT: ...
+
+--- !mach-o
+FileHeader:
+  magic:      0xFEEDFACF
+  cputype:    0x01000007
+  cpusubtype: 0x00000003
+  filetype:   0x0000000A
+  ncmds:      1
+  sizeofcmds: 1800
+  flags:      0x00000000
+  reserved:   0x00000000
+LoadCommands:
+  - cmd:      LC_SEGMENT_64
+    cmdsize:  232
+    segname:  __DWARF
+    vmaddr:   0x00000000
+    vmsize:   0x00000000
+    fileoff:  0
+    filesize: 0
+    maxprot:  0
+    initprot: 0
+    nsects:   2
+    flags:    0
+    Sections:
+      - sectname:  __debug_pubnames
+        segname:   __DWARF
+        addr:      0x0000000000000000
+        size:      0
+        offset:    0x00000000
+        align:     0
+        reloff:    0x00000000
+        nreloc:    0
+        flags:     0x00000000
+        reserved1: 0x00000000
+        reserved2: 0x00000000
+        reserved3: 0x00000000
+      - sectname:  __debug_pubtypes
+        segname:   __DWARF
+        addr:      0x0000000000000000
+        size:      0
+        offset:    0x00000720
+        align:     0
+        reloff:    0x00000000
+        nreloc:    0
+        flags:     0x00000000
+        reserved1: 0x00000000
+        reserved2: 0x00000000
+        reserved3: 0x00000000
+DWARF:
diff --git a/llvm/tools/obj2yaml/dwarf2yaml.cpp b/llvm/tools/obj2yaml/dwarf2yaml.cpp
index a007efe7e507f..0a1cac8c73c1e 100644
--- a/llvm/tools/obj2yaml/dwarf2yaml.cpp
+++ b/llvm/tools/obj2yaml/dwarf2yaml.cpp
@@ -11,6 +11,7 @@
 #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
 #include "llvm/ObjectYAML/DWARFYAML.h"
 
 #include <algorithm>
@@ -135,17 +136,32 @@ void dumpPubSection(DWARFContext &DCtx, DWARFYAML::PubSection &Y,
 
 void dumpDebugPubSections(DWARFContext &DCtx, DWARFYAML::Data &Y) {
   const DWARFObject &D = DCtx.getDWARFObj();
-  Y.PubNames.IsGNUStyle = false;
-  dumpPubSection(DCtx, Y.PubNames, D.getPubnamesSection());
 
-  Y.PubTypes.IsGNUStyle = false;
-  dumpPubSection(DCtx, Y.PubTypes, D.getPubtypesSection());
+  const DWARFSection PubNames = D.getPubnamesSection();
+  if (!PubNames.Data.empty()) {
+    Y.PubNames.emplace(/*IsGNUStyle=*/false);
+    dumpPubSection(DCtx, *Y.PubNames, PubNames);
+  }
 
-  Y.GNUPubNames.IsGNUStyle = true;
-  dumpPubSection(DCtx, Y.GNUPubNames, D.getGnuPubnamesSection());
+  const DWARFSection PubTypes = D.getPubtypesSection();
+  if (!PubTypes.Data.empty()) {
+    Y.PubTypes.emplace(/*IsGNUStyle=*/false);
+    dumpPubSection(DCtx, *Y.PubTypes, PubTypes);
+  }
 
-  Y.GNUPubTypes.IsGNUStyle = true;
-  dumpPubSection(DCtx, Y.GNUPubTypes, D.getGnuPubtypesSection());
+  const DWARFSection GNUPubNames = D.getGnuPubnamesSection();
+  if (!GNUPubNames.Data.empty()) {
+    // TODO: Test dumping .debug_gnu_pubnames section.
+    Y.GNUPubNames.emplace(/*IsGNUStyle=*/true);
+    dumpPubSection(DCtx, *Y.GNUPubNames, GNUPubNames);
+  }
+
+  const DWARFSection GNUPubTypes = D.getGnuPubtypesSection();
+  if (!GNUPubTypes.Data.empty()) {
+    // TODO: Test dumping .debug_gnu_pubtypes section.
+    Y.GNUPubTypes.emplace(/*IsGNUStyle=*/true);
+    dumpPubSection(DCtx, *Y.GNUPubTypes, GNUPubTypes);
+  }
 }
 
 void dumpDebugInfo(DWARFContext &DCtx, DWARFYAML::Data &Y) {

From 1ee114322cb251f851028c72e7974bf85e707e55 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Fri, 29 May 2020 08:01:15 -0400
Subject: [PATCH 509/770] [mlir][Linalg][Vector] Add forwarding patterns
 between linalg.copy and vector.transfer

This revision adds custom rewrites for patterns that arise during linalg structured
ops vectorization. These patterns allow the composition of linalg promotion,
vectorization and removal of redundant copies.

The patterns are voluntarily limited and restrictive atm.
More robust behavior will be implemented once more powerful side effect modeling and analyses are available on view/subview.

On the transfer_read side, the following pattern is rewritten:
```
   %alloc = ...
   [optional] %view = std.view %alloc ...
   %subView = subview %allocOrView ...
   [optional] linalg.fill(%allocOrView, %cst) ...
   ...
   linalg.copy(%in, %subView) ...
   vector.transfer_read %allocOrView[...], %cst ...
```
into
```
   [unchanged] %alloc = ...
   [unchanged] [optional] %view = std.view %alloc ...
   [unchanged] %subView = subview %allocOrView ...
   ...
   vector.transfer_read %in[...], %cst ...
```

On the transfer_write side, the following pattern is rewritten:
```
   %alloc = ...
   [optional] %view = std.view %alloc ...
   %subView = subview %allocOrView...
   ...
   vector.transfer_write %..., %allocOrView[...]
   linalg.copy(%subView, %out)
```

Differential Revision: https://reviews.llvm.org/D80728
---
 .../Dialect/Linalg/Transforms/Transforms.h    |  68 +++++++
 .../Linalg/Transforms/Vectorization.cpp       | 177 +++++++++++++++++-
 .../Linalg/forward-vector-transfers.mlir      | 153 +++++++++++++++
 .../lib/Transforms/TestLinalgTransforms.cpp   |  91 +++++----
 4 files changed, 443 insertions(+), 46 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/forward-vector-transfers.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 2da631956572f..2e0673795f305 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -14,6 +14,13 @@
 #include "llvm/ADT/SmallBitVector.h"
 
 namespace mlir {
+namespace vector {
+
+class TransferReadOp;
+class TransferWriteOp;
+
+} // namespace vector
+
 namespace linalg {
 
 struct LinalgTilingOptions;
@@ -437,6 +444,67 @@ struct LinalgLoweringPattern : public RewritePattern {
   LinalgLoweringType loweringType;
 };
 
+//===----------------------------------------------------------------------===//
+// Op-specific patterns.
+//===----------------------------------------------------------------------===//
+/// Match and rewrite for the pattern:
+/// ```
+///    %alloc = ...
+///    [optional] %view = std.view %alloc ...
+///    %subView = subview %allocOrView ...
+///    [optional] linalg.fill(%allocOrView, %cst) ...
+///    ...
+///    linalg.copy(%in, %subView) ...
+///    vector.transfer_read %allocOrView[...], %cst ...
+/// ```
+/// into
+/// ```
+///    [unchanged] %alloc = ...
+///    [unchanged] [optional] %view = std.view %alloc ...
+///    [unchanged] %subView = subview %allocOrView ...
+///    ...
+///    vector.transfer_read %in[...], %cst ...
+/// ```
+/// Where there is no interleaved use between linalg.copy and transfer_read as
+/// well as no interleaved use between linalg.fill and linalg.copy (if
+/// linalg.fill is specified).
+/// This is a custom rewrite to forward partial reads (with optional fills) to
+/// vector.transfer_read.
+struct LinalgCopyVTRForwardingPattern
+    : public OpRewritePattern<vector::TransferReadOp> {
+  using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferReadOp xferOp,
+                                PatternRewriter &rewriter) const override;
+};
+
+/// Match and rewrite for the pattern:
+/// ```
+///    %alloc = ...
+///    [optional] %view = std.view %alloc ...
+///    %subView = subview %allocOrView...
+///    ...
+///    vector.transfer_write %..., %allocOrView[...]
+///    linalg.copy(%subView, %out)
+/// ```
+/// into
+/// ```
+///    [unchanged] %alloc = ...
+///    [unchanged] [optional] %view = std.view %alloc ...
+///    [unchanged] %subView = subview %allocOrView...
+///    ...
+///    vector.transfer_write %..., %out[...]
+/// ```
+/// Where there is no interleaved use between transfer_write and linalg.copy.
+/// This is a custom rewrite to forward partial writes to vector.transfer_write.
+struct LinalgCopyVTWForwardingPattern
+    : public OpRewritePattern<vector::TransferWriteOp> {
+  using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp,
+                                PatternRewriter &rewriter) const override;
+};
+
 //===----------------------------------------------------------------------===//
 // Support for staged pattern application.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index f27baa3c662a9..8fa0aa35a8746 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -103,12 +103,13 @@ void mlir::linalg::vectorizeLinalgOp(OpBuilder &builder, Operation *op) {
       llvm_unreachable("Unexpected conv with padding");
   }
 
+  StringRef dbgPref = "\n[" DEBUG_TYPE "]: ";
+  (void)dbgPref;
   edsc::ScopedContext scope(builder, op->getLoc());
   if (auto fillOp = dyn_cast<linalg::FillOp>(op)) {
     // Vectorize fill as a vector.broadcast.
-    LLVM_DEBUG(dbgs() << "\n[" DEBUG_TYPE
-                         "]: Rewrite linalg.fill as vector.broadcast: "
-                      << *op << ":\n");
+    LLVM_DEBUG(dbgs() << dbgPref
+                      << "Rewrite linalg.fill as vector.broadcast: " << *op);
     Value memref = vector_type_cast(fillOp.getOutputBuffer(0));
     Value dst = std_load(memref);
     Value res = vector_broadcast(dst.getType(), fillOp.value());
@@ -117,9 +118,8 @@ void mlir::linalg::vectorizeLinalgOp(OpBuilder &builder, Operation *op) {
   }
 
   // Vectorize other ops as vector contraction (currently only matmul).
-  LLVM_DEBUG(dbgs() << "\n[" DEBUG_TYPE
-                       "]: Rewrite linalg op as vector.contract: "
-                    << *op << ":\n");
+  LLVM_DEBUG(dbgs() << dbgPref
+                    << "Rewrite linalg op as vector.contract: " << *op);
   auto linalgOp = cast<linalg::LinalgOp>(op);
   Value a = std_load(vector_type_cast(linalgOp.getInput(0)));
   Value b = std_load(vector_type_cast(linalgOp.getInput(1)));
@@ -129,3 +129,168 @@ void mlir::linalg::vectorizeLinalgOp(OpBuilder &builder, Operation *op) {
                               linalgOp.iterator_types());
   std_store(res, memref);
 }
+
+/// Check whether there is any interleaved use of any `values` between `firstOp`
+/// and `secondOp`. Conservatively return `true` if any op or value is in a
+/// different block.
+static bool mayExistInterleavedUses(Operation *firstOp, Operation *secondOp,
+                                    ValueRange values) {
+  StringRef dbgPref = "\n[" DEBUG_TYPE "]: ";
+  (void)dbgPref;
+  if (firstOp->getBlock() != secondOp->getBlock() ||
+      !firstOp->isBeforeInBlock(secondOp)) {
+    LLVM_DEBUG(llvm::dbgs()
+               << dbgPref << "interleavedUses precondition failed, firstOp: "
+               << *firstOp << ", second op: " << *secondOp);
+    return true;
+  }
+  for (auto v : values) {
+    for (auto &u : v.getUses()) {
+      Operation *owner = u.getOwner();
+      if (owner == firstOp || owner == secondOp)
+        continue;
+      // TODO: this is too conservative, use dominance info in the future.
+      if (owner->getBlock() == firstOp->getBlock() &&
+          (owner->isBeforeInBlock(firstOp) || secondOp->isBeforeInBlock(owner)))
+        continue;
+      LLVM_DEBUG(llvm::dbgs()
+                 << dbgPref << " found interleaved op " << *owner
+                 << ", firstOp: " << *firstOp << ", second op: " << *secondOp);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Return the unique subview use of `v` if it is indeed unique, null otherwise.
+static SubViewOp getSubViewUseIfUnique(Value v) {
+  SubViewOp subViewOp;
+  for (auto &u : v.getUses()) {
+    if (auto newSubViewOp = dyn_cast<SubViewOp>(u.getOwner())) {
+      if (subViewOp)
+        return SubViewOp();
+      subViewOp = newSubViewOp;
+    }
+  }
+  return subViewOp;
+}
+
+/// TODO: use interfaces, side-effects and aliasing analysis as appropriate,
+/// when available.
+LogicalResult LinalgCopyVTRForwardingPattern::matchAndRewrite(
+    vector::TransferReadOp xferOp, PatternRewriter &rewriter) const {
+
+  // Transfer reads from `viewOrAlloc`.
+  Value viewOrAlloc = xferOp.memref();
+  if (!viewOrAlloc.getDefiningOp<ViewOp>() &&
+      !viewOrAlloc.getDefiningOp<AllocOp>())
+    return failure();
+
+  StringRef dbgPref = "\n[" DEBUG_TYPE "]: VTRForwarding: ";
+  (void)dbgPref;
+  LLVM_DEBUG(llvm::dbgs() << dbgPref << viewOrAlloc);
+
+  // Ensure there is exactly one subview of `viewOrAlloc` defining `subView`.
+  SubViewOp subViewOp = getSubViewUseIfUnique(viewOrAlloc);
+  if (!subViewOp)
+    return failure();
+  Value subView = subViewOp.getResult();
+  LLVM_DEBUG(llvm::dbgs() << dbgPref << "with subView " << subView);
+
+  // Find the copy into `subView` without interleaved uses.
+  CopyOp copyOp;
+  for (auto &u : subView.getUses()) {
+    if (auto newCopyOp = dyn_cast<CopyOp>(u.getOwner())) {
+      if (newCopyOp.getOutputBuffer(0) != subView)
+        continue;
+      LLVM_DEBUG(llvm::dbgs() << dbgPref << "copy candidate " << *newCopyOp);
+      if (mayExistInterleavedUses(newCopyOp, xferOp, {viewOrAlloc, subView}))
+        continue;
+      copyOp = newCopyOp;
+      break;
+    }
+  }
+  if (!copyOp)
+    return failure();
+  LLVM_DEBUG(llvm::dbgs() << dbgPref << "with copy " << *copyOp);
+
+  // Find the fill into `viewOrAlloc` without interleaved uses before the copy.
+  FillOp maybeFillOp;
+  for (auto &u : viewOrAlloc.getUses()) {
+    if (auto newFillOp = dyn_cast<FillOp>(u.getOwner())) {
+      if (newFillOp.getOutputBuffer(0) != viewOrAlloc)
+        continue;
+      LLVM_DEBUG(llvm::dbgs() << dbgPref << "fill candidate " << *newFillOp);
+      if (mayExistInterleavedUses(newFillOp, copyOp, {viewOrAlloc, subView}))
+        continue;
+      maybeFillOp = newFillOp;
+      break;
+    }
+  }
+  // Ensure padding matches.
+  if (maybeFillOp && xferOp.padding() != maybeFillOp.value())
+    return failure();
+  if (maybeFillOp)
+    LLVM_DEBUG(llvm::dbgs() << dbgPref << "with maybeFillOp " << *maybeFillOp);
+
+  // `in` is the subview that linalg.copy reads. Replace it.
+  Value in = copyOp.getInput(0);
+
+  Value res = rewriter.create<vector::TransferReadOp>(
+      xferOp.getLoc(), xferOp.getVectorType(), in, xferOp.indices(),
+      xferOp.permutation_map(), xferOp.padding(),
+      xferOp.masked() ? *xferOp.masked() : ArrayAttr());
+
+  if (maybeFillOp)
+    rewriter.eraseOp(maybeFillOp);
+  rewriter.eraseOp(copyOp);
+  rewriter.replaceOp(xferOp, res);
+
+  return success();
+}
+
+/// TODO: use interfaces, side-effects and aliasing analysis as appropriate,
+/// when available.
+LogicalResult LinalgCopyVTWForwardingPattern::matchAndRewrite(
+    vector::TransferWriteOp xferOp, PatternRewriter &rewriter) const {
+  // Transfer into `viewOrAlloc`.
+  Value viewOrAlloc = xferOp.memref();
+  if (!viewOrAlloc.getDefiningOp<ViewOp>() &&
+      !viewOrAlloc.getDefiningOp<AllocOp>())
+    return failure();
+
+  // Ensure there is exactly one subview of `viewOrAlloc` defining `subView`.
+  SubViewOp subViewOp = getSubViewUseIfUnique(viewOrAlloc);
+  if (!subViewOp)
+    return failure();
+  Value subView = subViewOp.getResult();
+
+  // Find the copy from `subView` without interleaved uses.
+  CopyOp copyOp;
+  for (auto &u : subViewOp.getResult().getUses()) {
+    if (auto newCopyOp = dyn_cast<CopyOp>(u.getOwner())) {
+      if (newCopyOp.getInput(0) != subView)
+        continue;
+      if (mayExistInterleavedUses(xferOp, newCopyOp, {viewOrAlloc, subView}))
+        continue;
+      copyOp = newCopyOp;
+      break;
+    }
+  }
+  if (!copyOp)
+    return failure();
+
+  // `out` is the subview copied into that we replace.
+  Value out = copyOp.getOutputBuffer(0);
+
+  // Forward vector.transfer into copy.
+  rewriter.create<vector::TransferWriteOp>(
+      xferOp.getLoc(), xferOp.vector(), out, xferOp.indices(),
+      xferOp.permutation_map(),
+      xferOp.masked() ? *xferOp.masked() : ArrayAttr());
+
+  rewriter.eraseOp(copyOp);
+  rewriter.eraseOp(xferOp);
+
+  return success();
+}
diff --git a/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir b/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
new file mode 100644
index 0000000000000..7f56234219fe6
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/forward-vector-transfers.mlir
@@ -0,0 +1,153 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -test-linalg-transform-patterns=test-vector-transfer-forwarding-patterns | FileCheck %s
+
+// CHECK-LABEL: testAllocRead
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: linalg.fill
+//   CHECK-NOT: linalg.copy
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: vector.transfer_read %[[ARG0]]
+func @testAllocRead(%in: memref<? x f32>) -> vector<32 x f32> {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %alloc = alloc() : memref<32 x f32>
+  %subview = subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
+  %0 = vector.transfer_read %alloc[%c0], %f0: memref<32 x f32>, vector<32 x f32>
+  dealloc %alloc : memref<32 x f32>
+  return %0: vector<32 x f32>
+}
+
+// CHECK-LABEL: testAllocFillRead
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: linalg.fill
+//   CHECK-NOT: linalg.copy
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: vector.transfer_read %[[ARG0]]
+func @testAllocFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %alloc = alloc() : memref<32 x f32>
+  linalg.fill(%alloc, %f0): memref<32 x f32>, f32
+  %subview = subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
+  %0 = vector.transfer_read %alloc[%c0], %f0: memref<32 x f32>, vector<32 x f32>
+  dealloc %alloc : memref<32 x f32>
+  return %0: vector<32 x f32>
+}
+
+// CHECK-LABEL: testViewRead
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: linalg.fill
+//   CHECK-NOT: linalg.copy
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: vector.transfer_read %[[ARG0]]
+func @testViewRead(%in: memref<? x f32>) -> vector<32 x f32> {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %alloc = alloc() : memref<128 x i8>
+  %view = view %alloc[%c0][] : memref<128 x i8> to memref<32 x f32>
+  %subview = subview %view[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
+  %0 = vector.transfer_read %view[%c0], %f0: memref<32 x f32>, vector<32 x f32>
+  dealloc %alloc : memref<128 x i8>
+  return %0: vector<32 x f32>
+}
+
+// CHECK-LABEL: testViewFillRead
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: linalg.fill
+//   CHECK-NOT: linalg.copy
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: vector.transfer_read %[[ARG0]]
+func @testViewFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %alloc = alloc() : memref<128 x i8>
+  %view = view %alloc[%c0][] : memref<128 x i8> to memref<32 x f32>
+  %subview = subview %view[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  linalg.fill(%view, %f0): memref<32 x f32>, f32
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
+  %0 = vector.transfer_read %view[%c0], %f0: memref<32 x f32>, vector<32 x f32>
+  dealloc %alloc : memref<128 x i8>
+  return %0: vector<32 x f32>
+}
+
+// CHECK-LABEL: testAllocWrite
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: vector
+//  CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: linalg.copy
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: vector.transfer_write %[[ARG0]], %[[ARG1]]
+func @testAllocWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %alloc = alloc() : memref<32 x f32>
+  %subview = subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  vector.transfer_write %vec, %alloc[%c0] : vector<32 x f32>, memref<32 x f32>
+  linalg.copy(%subview, %out): memref<16 x f32>, memref<? x f32>
+  dealloc %alloc : memref<32 x f32>
+  return
+}
+
+// CHECK-LABEL: testViewWrite
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: vector
+//  CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: linalg.copy
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: vector.transfer_write %[[ARG0]], %[[ARG1]]
+func @testViewWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %alloc = alloc() : memref<128 x i8>
+  %view = view %alloc[%c0][] : memref<128 x i8> to memref<32 x f32>
+  %subview = subview %view[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  vector.transfer_write %vec, %view[%c0] : vector<32 x f32>, memref<32 x f32>
+  linalg.copy(%subview, %out): memref<16 x f32>, memref<? x f32>
+  dealloc %alloc : memref<128 x i8>
+  return
+}
+
+///===--------------------------------------------------------------------===///
+// Negative tests
+///===--------------------------------------------------------------------===///
+
+// This should fail the rewrite due to mismatching fill and transfer read value.
+// CHECK-LABEL: failAllocFillRead
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: vector.transfer_read %[[ARG0]]
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: linalg.copy
+//       CHECK: vector.transfer_read %[[ALLOC]]
+func @failAllocFillRead(%in: memref<? x f32>) -> vector<32 x f32> {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %f1 = constant 1.0: f32
+  %alloc = alloc() : memref<32 x f32>
+  linalg.fill(%alloc, %f0): memref<32 x f32>, f32
+  %subview = subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  linalg.copy(%in, %subview): memref<? x f32>, memref<16 x f32>
+  "some_interleaved_use"(%subview) : (memref<16 x f32>) -> ()
+  %0 = vector.transfer_read %alloc[%c0], %f1: memref<32 x f32>, vector<32 x f32>
+  dealloc %alloc : memref<32 x f32>
+  return %0: vector<32 x f32>
+}
+
+// This should fail the rewrite due to some interleaved use.
+// CHECK-LABEL: failAllocWrite
+//  CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: vector
+//  CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
+//   CHECK-NOT: vector.transfer_write %[[ARG0]], %[[ARG1]]
+//       CHECK: %[[ALLOC:.*]] = alloc
+//       CHECK: vector.transfer_write %[[ARG0]], %[[ALLOC]]
+//       CHECK: linalg.copy
+func @failAllocWrite(%vec: vector<32 x f32>, %out: memref<? x f32>) {
+  %c0 = constant 0: index
+  %f0 = constant 0.0: f32
+  %alloc = alloc() : memref<32 x f32>
+  %subview = subview %alloc[0][16][1] : memref<32 x f32> to memref<16 x f32>
+  vector.transfer_write %vec, %alloc[%c0] : vector<32 x f32>, memref<32 x f32>
+  "some_interleaved_use"(%subview) : (memref<16 x f32>) -> ()
+  linalg.copy(%subview, %out): memref<16 x f32>, memref<? x f32>
+  dealloc %alloc : memref<32 x f32>
+  return
+}
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
index c38494fe27783..31189f47f9aea 100644
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 
@@ -48,6 +49,11 @@ struct TestLinalgTransforms
   Option<bool> testPromotionOptions{*this, "test-linalg-promotion-options",
                                     llvm::cl::desc("Test promotion options"),
                                     llvm::cl::init(false)};
+  Option<bool> testVectorTransferForwardingPatterns{
+      *this, "test-vector-transfer-forwarding-patterns",
+      llvm::cl::desc(
+          "Test a fused pass that forwards linalg.copy to vector.transfer"),
+      llvm::cl::init(false)};
 };
 } // end anonymous namespace
 
@@ -167,19 +173,6 @@ static void applyPatterns(FuncOp funcOp) {
   });
 }
 
-static OwningRewritePatternList
-getMatmulToVectorCanonicalizationPatterns(MLIRContext *context) {
-  OwningRewritePatternList patterns;
-  AffineApplyOp::getCanonicalizationPatterns(patterns, context);
-  AffineMinOp::getCanonicalizationPatterns(patterns, context);
-  AffineMaxOp::getCanonicalizationPatterns(patterns, context);
-  AllocOp::getCanonicalizationPatterns(patterns, context);
-  SubViewOp::getCanonicalizationPatterns(patterns, context);
-  ViewOp::getCanonicalizationPatterns(patterns, context);
-  MatmulOp::getCanonicalizationPatterns(patterns, context);
-  return patterns;
-}
-
 static void fillL1TilingAndMatmulToVectorPatterns(
     FuncOp funcOp, StringRef startMarker,
     SmallVectorImpl<OwningRewritePatternList> &patternsVector) {
@@ -261,40 +254,58 @@ void fillPromotionCallBackPatterns(MLIRContext *context,
       LinalgMarker({"PROMOTE"}));
 }
 
+static void
+applyMatmulToVectorPatterns(FuncOp funcOp,
+                            bool testMatmulToVectorPatterns1dTiling,
+                            bool testMatmulToVectorPatterns2dTiling) {
+  MLIRContext *ctx = funcOp.getContext();
+  SmallVector<OwningRewritePatternList, 4> stage1Patterns;
+  if (testMatmulToVectorPatterns1dTiling) {
+    fillL1TilingAndMatmulToVectorPatterns(funcOp, "START", stage1Patterns);
+  } else if (testMatmulToVectorPatterns2dTiling) {
+    stage1Patterns.emplace_back(
+        LinalgTilingPattern<MatmulOp>(ctx,
+                                      LinalgTilingOptions()
+                                          .setTileSizes({768, 264, 768})
+                                          .setInterchange({1, 2, 0}),
+                                      LinalgMarker({"START"}, "L2")));
+    fillL1TilingAndMatmulToVectorPatterns(funcOp, "L2", stage1Patterns);
+  }
+  OwningRewritePatternList stage2Patterns =
+      getLinalgTilingCanonicalizationPatterns(ctx);
+  applyStagedPatterns(funcOp, stage1Patterns, stage2Patterns);
+}
+
+static void applyVectorTransferForwardingPatterns(FuncOp funcOp) {
+  OwningRewritePatternList forwardPattern;
+  forwardPattern.insert<LinalgCopyVTRForwardingPattern>(funcOp.getContext());
+  forwardPattern.insert<LinalgCopyVTWForwardingPattern>(funcOp.getContext());
+  applyPatternsAndFoldGreedily(funcOp, forwardPattern);
+}
+
 /// Apply transformations specified as patterns.
 void TestLinalgTransforms::runOnFunction() {
-  if (testPatterns) {
-    applyPatterns(getFunction());
-    return;
-  }
+  auto lambda = [&](void *) {
+    getFunction().walk([](LinalgOp op) {
+      op.removeAttr(LinalgTransforms::kLinalgTransformMarker);
+    });
+  };
+  std::unique_ptr<void, decltype(lambda)> cleanupGuard{(void *)1, lambda};
+
   if (testPromotionOptions) {
     OwningRewritePatternList patterns;
     fillPromotionCallBackPatterns(&getContext(), patterns);
     applyPatternsAndFoldGreedily(getFunction(), patterns);
-  } else {
-    SmallVector<OwningRewritePatternList, 4> stage1Patterns;
-    if (testMatmulToVectorPatterns1dTiling) {
-      fillL1TilingAndMatmulToVectorPatterns(getFunction(), "START",
-                                            stage1Patterns);
-    } else if (testMatmulToVectorPatterns2dTiling) {
-      stage1Patterns.emplace_back(
-          LinalgTilingPattern<MatmulOp>(&getContext(),
-                                        LinalgTilingOptions()
-                                            .setTileSizes({768, 264, 768})
-                                            .setInterchange({1, 2, 0}),
-                                        LinalgMarker({"START"}, "L2")));
-      fillL1TilingAndMatmulToVectorPatterns(getFunction(), "L2",
-                                            stage1Patterns);
-    }
-    OwningRewritePatternList stage2Patterns =
-        getMatmulToVectorCanonicalizationPatterns(&getContext());
-    applyStagedPatterns(getFunction(), stage1Patterns, stage2Patterns);
+    return;
   }
-
-  // Drop the marker.
-  getFunction().walk([](LinalgOp op) {
-    op.removeAttr(LinalgTransforms::kLinalgTransformMarker);
-  });
+  if (testPatterns)
+    return applyPatterns(getFunction());
+  if (testMatmulToVectorPatterns1dTiling || testMatmulToVectorPatterns2dTiling)
+    return applyMatmulToVectorPatterns(getFunction(),
+                                       testMatmulToVectorPatterns1dTiling,
+                                       testMatmulToVectorPatterns2dTiling);
+  if (testVectorTransferForwardingPatterns)
+    return applyVectorTransferForwardingPatterns(getFunction());
 }
 
 namespace mlir {

From d20a3d35e1875d7a4928184117e6a875c35f3f63 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 29 May 2020 12:53:30 +0100
Subject: [PATCH 510/770] [DAGComb] Do not turn insert_elt into shuffle for
 single elt vectors.

Currently combineInsertEltToShuffle turns insert_vector_elt into a
vector_shuffle, even if the inserted element is a vector with a single
element. In this case, it should be unlikely that the additional shuffle
would be more efficient than an insert_vector_elt.

Additionally, this fixes an infinite cycle in DAGCombine, where
combineInsertEltToShuffle turns an insert_vector_elt into a shuffle,
which gets turned back into an insert_vector_elt/extract_vector_elt by
a custom AArch64 lowering (in visitVECTOR_SHUFFLE).

Such insert_vector_elt and extract_vector_elt combinations can be
lowered efficiently using mov on AArch64.

There are 2 test changes in arm64-neon-copy.ll: we now use one or two
mov instructions instead of a single zip1. The reason that we need a
second mov in ins1f2 is that we have to move the result to the result
register, which is not really related to the DAGCombine fold I think.
But in any case, on most uarchs, mov should be cheaper than zip1. On a
Cortex-A75 for example, zip1 is twice as expensive as mov
(https://developer.arm.com/docs/101398/latest/arm-cortex-a75-software-optimization-guide-v20)

Reviewers: spatel, efriedma, dmgreen, RKSimon

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D80710
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  4 +++
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll  |  5 +--
 .../AArch64/vector-insert-shuffle-cycle.ll    | 35 +++++++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9216151272851..0176ae3a0ab5e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17137,6 +17137,10 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
   EVT SubVecVT = SubVec.getValueType();
   EVT VT = DestVec.getValueType();
   unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+  // If the source only has a single vector element, the cost of creating a
+  // shuffle for it is likely to exceed the cost of an insert_vector_elt.
+  if (NumSrcElts == 1)
+    return SDValue();
   unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
   unsigned NumMaskVals = ExtendRatio * NumSrcElts;
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 7820734e366d0..05a273f5f2d9b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -200,7 +200,8 @@ define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
 ; CHECK-LABEL: ins1f2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    zip1 v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp3 = extractelement <1 x double> %tmp1, i32 0
   %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
@@ -211,7 +212,7 @@ define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1)
 ; CHECK-LABEL: ins1f2_args_flipped:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill:  def $d1 killed $d1 def $q1
-; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
   %tmp3 = extractelement <1 x double> %tmp1, i32 0
   %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
diff --git a/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
new file mode 100644
index 0000000000000..57e7ef1a0e77f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - | FileCheck %s
+
+target triple = "arm64-apple-ios13.4.0"
+
+; Make sure we do not get stuck in a cycle in DAGCombiner.
+
+define void @test(i1 %c, <1 x double>* %ptr) {
+; CHECK-LABEL: test:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    tbz w0, #0, LBB0_2
+; CHECK-NEXT:  ; %bb.1: ; %bb1
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:  LBB0_2: ; %bb2
+; CHECK-NEXT:    ldr q1, [x8]
+; CHECK-NEXT:    mov.d v1[0], v0[0]
+; CHECK-NEXT:    str q1, [x8]
+; CHECK-NEXT:    ret
+entry:
+  br i1 %c, label %bb1, label %bb2
+
+bb1:
+  %lv1 = load <1 x double>, <1 x double>* %ptr, align 16
+  br label %bb2
+
+bb2:
+  %p = phi <1 x double> [ %lv1, %bb1 ], [ zeroinitializer, %entry ]
+  %vecext19 = extractelement <1 x double> %p, i32 0
+  %arrayidx21 = getelementptr inbounds [4 x <4 x double>], [4 x <4 x double>]* undef, i64 0, i64 3
+  %lv2 = load <4 x double>, <4 x double>* %arrayidx21, align 16
+  %vecins22 = insertelement <4 x double> %lv2, double %vecext19, i32 2
+  store <4 x double> %vecins22, <4 x double>* %arrayidx21, align 16
+  ret void
+}

From ac1f7ab007e347dc4a542aa3415e6378289480f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 29 Apr 2020 22:19:48 +0300
Subject: [PATCH 511/770] [clang] [Darwin] Add reverse mappings for
 aarch64/aarch64_32 to darwin arch names

These are mapped in MachO::getMachOArchName already, but were missing
in ToolChain::getDefaultUniversalArchName.

Having these reverse mapped here fixes weird inconsistencies like
-dumpmachine showing a target triple like "aarch64-apple-darwin",
while "clang -target aarch64-apple-darwin" didn't use to work (ended
up mapped as unknown-apple-ios).

Differential Revision: https://reviews.llvm.org/D79117
---
 clang/lib/Driver/ToolChain.cpp          | 7 +++++--
 clang/test/Driver/darwin-arm64-target.c | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Driver/darwin-arm64-target.c

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index ad66e8e6b5d31..cf04fd07e2a0e 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -230,9 +230,12 @@ ToolChain::getTargetAndModeFromProgramName(StringRef PN) {
 StringRef ToolChain::getDefaultUniversalArchName() const {
   // In universal driver terms, the arch name accepted by -arch isn't exactly
   // the same as the ones that appear in the triple. Roughly speaking, this is
-  // an inverse of the darwin::getArchTypeForDarwinArchName() function, but the
-  // only interesting special case is powerpc.
+  // an inverse of the darwin::getArchTypeForDarwinArchName() function.
   switch (Triple.getArch()) {
+  case llvm::Triple::aarch64:
+    return "arm64";
+  case llvm::Triple::aarch64_32:
+    return "arm64_32";
   case llvm::Triple::ppc:
     return "ppc";
   case llvm::Triple::ppc64:
diff --git a/clang/test/Driver/darwin-arm64-target.c b/clang/test/Driver/darwin-arm64-target.c
new file mode 100644
index 0000000000000..397afa288360d
--- /dev/null
+++ b/clang/test/Driver/darwin-arm64-target.c
@@ -0,0 +1,3 @@
+// RUN: %clang -target aarch64-apple-darwin %s -miphoneos-version-min=8.0 -### 2>&1 | FileCheck %s
+
+// CHECK: "-cc1"{{.*}} "-triple" "arm64-apple-ios8.0.0"

From ab4d02cf265982d4c03123d2f52b9d5ee8df575d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Miku=C5=82a?= <mati865@gmail.com>
Date: Fri, 29 May 2020 15:09:44 +0300
Subject: [PATCH 512/770] [clang] [MinGW] Fix libunwind extension

Differential Revision: https://reviews.llvm.org/D79995
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp |  9 ++++++++-
 clang/test/Driver/compiler-rt-unwind.c     | 23 ++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 33c43222b5f9d..b2c984912154c 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1235,7 +1235,14 @@ static void AddUnwindLibrary(const ToolChain &TC, const Driver &D,
   case ToolChain::UNW_CompilerRT:
     if (LGT == LibGccType::StaticLibGcc)
       CmdArgs.push_back("-l:libunwind.a");
-    else
+    else if (TC.getTriple().isOSCygMing()) {
+      if (LGT == LibGccType::SharedLibGcc)
+        CmdArgs.push_back("-l:libunwind.dll.a");
+      else
+        // Let the linker choose between libunwind.dll.a and libunwind.a
+        // depending on what's available, and depending on the -static flag
+        CmdArgs.push_back("-lunwind");
+    } else
       CmdArgs.push_back("-l:libunwind.so");
     break;
   }
diff --git a/clang/test/Driver/compiler-rt-unwind.c b/clang/test/Driver/compiler-rt-unwind.c
index 652a48c6ad78e..e21916d41f930 100644
--- a/clang/test/Driver/compiler-rt-unwind.c
+++ b/clang/test/Driver/compiler-rt-unwind.c
@@ -48,3 +48,26 @@
 // RUN:     --gcc-toolchain="" \
 // RUN: FileCheck --input-file=%t.err --check-prefix=RTLIB-GCC-UNWINDLIB-COMPILER_RT %s
 // RTLIB-GCC-UNWINDLIB-COMPILER_RT: "{{[.|\\\n]*}}--rtlib=libgcc requires --unwindlib=libgcc"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=x86_64-w64-mingw32 -rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:     -shared-libgcc \
+// RUN:     --gcc-toolchain="" \
+// RUN:   | FileCheck --check-prefix=MINGW-RTLIB-COMPILER-RT-SHARED-UNWINDLIB-COMPILER-RT %s
+// MINGW-RTLIB-COMPILER-RT-SHARED-UNWINDLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins-x86_64.a"
+// MINGW-RTLIB-COMPILER-RT-SHARED-UNWINDLIB-COMPILER-RT: "{{.*}}l:libunwind.dll.a"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=x86_64-w64-mingw32 -rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:     -static-libgcc \
+// RUN:     --gcc-toolchain="" \
+// RUN:   | FileCheck --check-prefix=MINGW-RTLIB-COMPILER-RT-STATIC-UNWINDLIB-COMPILER-RT %s
+// MINGW-RTLIB-COMPILER-RT-STATIC-UNWINDLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins-x86_64.a"
+// MINGW-RTLIB-COMPILER-RT-STATIC-UNWINDLIB-COMPILER-RT: "{{.*}}l:libunwind.a"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=x86_64-w64-mingw32 -rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:     --gcc-toolchain="" \
+// RUN:   | FileCheck --check-prefix=MINGW-RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT %s
+// MINGW-RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins-x86_64.a"
+// MINGW-RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT: "{{.*}}lunwind"

From d4ef654673a921878ba5aedb9725b2ac32681f01 Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr@gmail.com>
Date: Fri, 29 May 2020 14:12:51 +0200
Subject: [PATCH 513/770] Rename APIs in unittests/AST/Language.h in
 preparation to share them

Summary:
Declaring these helpers in the ast_matchers namespace in the clangAST
unit test seems inappropriate -- neither these helpers, nor clangAST have
anything to do with AST matchers. Therefore, I moved these helpers to
the clang namespace.

Declaring another typedef called "ArgVector" is not a good idea -- we
already have "ArgVector", "ArgsVector", and "ArgList". I expanded
it into the underlying type.

Declaring another enum called "Language" is not a good idea because we
already have the "clang::Language" enum. I renamed it to
"TestLanguage".

Similarly, I renamed "getBasicRunOptionsForLanguage" to
"getCommandLineArgsForTesting" to explain the semantics better (what are
"run options"?) and not repeat types in the function name
("ForLanguage").

Reviewers: shafik, rengolin, sammccall

Reviewed By: sammccall

Subscribers: gribozavr2, sammccall, martong, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80786
---
 clang/unittests/AST/ASTImporterFixtures.cpp   | 33 ++++++-------
 clang/unittests/AST/ASTImporterFixtures.h     | 46 +++++++++++--------
 .../AST/ASTImporterODRStrategiesTest.cpp      | 30 ++++++------
 clang/unittests/AST/ASTImporterTest.cpp       | 46 ++++++++++---------
 .../AST/ASTImporterVisibilityTest.cpp         | 16 ++++---
 clang/unittests/AST/Language.cpp              | 25 +++++-----
 clang/unittests/AST/Language.h                | 27 +++++------
 clang/unittests/AST/MatchVerifier.h           | 20 ++++----
 .../AST/StructuralEquivalenceTest.cpp         | 28 +++++------
 9 files changed, 141 insertions(+), 130 deletions(-)

diff --git a/clang/unittests/AST/ASTImporterFixtures.cpp b/clang/unittests/AST/ASTImporterFixtures.cpp
index 36732ee712f36..897b370dd3cdc 100644
--- a/clang/unittests/AST/ASTImporterFixtures.cpp
+++ b/clang/unittests/AST/ASTImporterFixtures.cpp
@@ -38,7 +38,8 @@ void createVirtualFileIfNeeded(ASTUnit *ToAST, StringRef FileName,
                                    llvm::MemoryBuffer::getMemBuffer(Code));
 }
 
-ASTImporterTestBase::TU::TU(StringRef Code, StringRef FileName, ArgVector Args,
+ASTImporterTestBase::TU::TU(StringRef Code, StringRef FileName,
+                            std::vector<std::string> Args,
                             ImporterConstructor C,
                             ASTImporter::ODRHandlingType ODRHandling)
     : Code(std::string(Code)), FileName(std::string(FileName)),
@@ -112,11 +113,12 @@ void ASTImporterTestBase::lazyInitSharedState(TranslationUnitDecl *ToTU) {
     SharedStatePtr = std::make_shared<ASTImporterSharedState>(*ToTU);
 }
 
-void ASTImporterTestBase::lazyInitToAST(Language ToLang, StringRef ToSrcCode,
+void ASTImporterTestBase::lazyInitToAST(TestLanguage ToLang,
+                                        StringRef ToSrcCode,
                                         StringRef FileName) {
   if (ToAST)
     return;
-  ArgVector ToArgs = getArgVectorForLanguage(ToLang);
+  std::vector<std::string> ToArgs = getCommandLineArgsForLanguage(ToLang);
   // Source code must be a valid live buffer through the tests lifetime.
   ToCode = std::string(ToSrcCode);
   // Build the AST from an empty file.
@@ -136,12 +138,11 @@ ASTImporterTestBase::TU *ASTImporterTestBase::findFromTU(Decl *From) {
   return &*It;
 }
 
-std::tuple<Decl *, Decl *>
-ASTImporterTestBase::getImportedDecl(StringRef FromSrcCode, Language FromLang,
-                                     StringRef ToSrcCode, Language ToLang,
-                                     StringRef Identifier) {
-  ArgVector FromArgs = getArgVectorForLanguage(FromLang),
-            ToArgs = getArgVectorForLanguage(ToLang);
+std::tuple<Decl *, Decl *> ASTImporterTestBase::getImportedDecl(
+    StringRef FromSrcCode, TestLanguage FromLang, StringRef ToSrcCode,
+    TestLanguage ToLang, StringRef Identifier) {
+  std::vector<std::string> FromArgs = getCommandLineArgsForLanguage(FromLang);
+  std::vector<std::string> ToArgs = getCommandLineArgsForLanguage(ToLang);
 
   FromTUs.emplace_back(FromSrcCode, InputFileName, FromArgs, Creator,
                        ODRHandling);
@@ -170,13 +171,13 @@ ASTImporterTestBase::getImportedDecl(StringRef FromSrcCode, Language FromLang,
 }
 
 TranslationUnitDecl *ASTImporterTestBase::getTuDecl(StringRef SrcCode,
-                                                    Language Lang,
+                                                    TestLanguage Lang,
                                                     StringRef FileName) {
   assert(llvm::find_if(FromTUs, [FileName](const TU &E) {
            return E.FileName == FileName;
          }) == FromTUs.end());
 
-  ArgVector Args = getArgVectorForLanguage(Lang);
+  std::vector<std::string> Args = getCommandLineArgsForLanguage(Lang);
   FromTUs.emplace_back(SrcCode, FileName, Args, Creator, ODRHandling);
   TU &Tu = FromTUs.back();
 
@@ -184,14 +185,14 @@ TranslationUnitDecl *ASTImporterTestBase::getTuDecl(StringRef SrcCode,
 }
 
 TranslationUnitDecl *ASTImporterTestBase::getToTuDecl(StringRef ToSrcCode,
-                                                      Language ToLang) {
-  ArgVector ToArgs = getArgVectorForLanguage(ToLang);
+                                                      TestLanguage ToLang) {
+  std::vector<std::string> ToArgs = getCommandLineArgsForLanguage(ToLang);
   assert(!ToAST);
   lazyInitToAST(ToLang, ToSrcCode, OutputFileName);
   return ToAST->getASTContext().getTranslationUnitDecl();
 }
 
-Decl *ASTImporterTestBase::Import(Decl *From, Language ToLang) {
+Decl *ASTImporterTestBase::Import(Decl *From, TestLanguage ToLang) {
   lazyInitToAST(ToLang, "", OutputFileName);
   TU *FromTU = findFromTU(From);
   assert(SharedStatePtr);
@@ -200,7 +201,7 @@ Decl *ASTImporterTestBase::Import(Decl *From, Language ToLang) {
 }
 
 llvm::Expected<Decl *> ASTImporterTestBase::importOrError(Decl *From,
-                                                          Language ToLang) {
+                                                          TestLanguage ToLang) {
   lazyInitToAST(ToLang, "", OutputFileName);
   TU *FromTU = findFromTU(From);
   assert(SharedStatePtr);
@@ -210,7 +211,7 @@ llvm::Expected<Decl *> ASTImporterTestBase::importOrError(Decl *From,
 }
 
 QualType ASTImporterTestBase::ImportType(QualType FromType, Decl *TUDecl,
-                                         Language ToLang) {
+                                         TestLanguage ToLang) {
   lazyInitToAST(ToLang, "", OutputFileName);
   TU *FromTU = findFromTU(TUDecl);
   assert(SharedStatePtr);
diff --git a/clang/unittests/AST/ASTImporterFixtures.h b/clang/unittests/AST/ASTImporterFixtures.h
index 3d9d64c95b2fb..619c3f590be4f 100644
--- a/clang/unittests/AST/ASTImporterFixtures.h
+++ b/clang/unittests/AST/ASTImporterFixtures.h
@@ -52,13 +52,14 @@ void createVirtualFileIfNeeded(ASTUnit *ToAST, StringRef FileName,
 class CompilerOptionSpecificTest : public ::testing::Test {
 protected:
   // Return the extra arguments appended to runtime options at compilation.
-  virtual ArgVector getExtraArgs() const { return ArgVector(); }
+  virtual std::vector<std::string> getExtraArgs() const { return {}; }
 
   // Returns the argument vector used for a specific language option, this set
   // can be tweaked by the test parameters.
-  ArgVector getArgVectorForLanguage(Language Lang) const {
-    ArgVector Args = getBasicRunOptionsForLanguage(Lang);
-    ArgVector ExtraArgs = getExtraArgs();
+  std::vector<std::string>
+  getCommandLineArgsForLanguage(TestLanguage Lang) const {
+    std::vector<std::string> Args = getCommandLineArgsForTesting(Lang);
+    std::vector<std::string> ExtraArgs = getExtraArgs();
     for (const auto &Arg : ExtraArgs) {
       Args.push_back(Arg);
     }
@@ -66,10 +67,13 @@ class CompilerOptionSpecificTest : public ::testing::Test {
   }
 };
 
-const auto DefaultTestArrayForRunOptions = std::array<ArgVector, 4>{
-    {ArgVector(), ArgVector{"-fdelayed-template-parsing"},
-     ArgVector{"-fms-compatibility"},
-     ArgVector{"-fdelayed-template-parsing", "-fms-compatibility"}}};
+const auto DefaultTestArrayForRunOptions =
+    std::array<std::vector<std::string>, 4>{
+        {std::vector<std::string>(),
+         std::vector<std::string>{"-fdelayed-template-parsing"},
+         std::vector<std::string>{"-fms-compatibility"},
+         std::vector<std::string>{"-fdelayed-template-parsing",
+                                  "-fms-compatibility"}}};
 
 const auto DefaultTestValuesForRunOptions =
     ::testing::ValuesIn(DefaultTestArrayForRunOptions);
@@ -111,7 +115,7 @@ class ASTImporterTestBase : public CompilerOptionSpecificTest {
     ImporterConstructor Creator;
     ASTImporter::ODRHandlingType ODRHandling;
 
-    TU(StringRef Code, StringRef FileName, ArgVector Args,
+    TU(StringRef Code, StringRef FileName, std::vector<std::string> Args,
        ImporterConstructor C = ImporterConstructor(),
        ASTImporter::ODRHandlingType ODRHandling =
            ASTImporter::ODRHandlingType::Conservative);
@@ -141,7 +145,8 @@ class ASTImporterTestBase : public CompilerOptionSpecificTest {
   // Initialize the shared state if not initialized already.
   void lazyInitSharedState(TranslationUnitDecl *ToTU);
 
-  void lazyInitToAST(Language ToLang, StringRef ToSrcCode, StringRef FileName);
+  void lazyInitToAST(TestLanguage ToLang, StringRef ToSrcCode,
+                     StringRef FileName);
 
 protected:
   std::shared_ptr<ASTImporterSharedState> SharedStatePtr;
@@ -157,32 +162,33 @@ class ASTImporterTestBase : public CompilerOptionSpecificTest {
   // of the identifier into the To context.
   // Must not be called more than once within the same test.
   std::tuple<Decl *, Decl *>
-  getImportedDecl(StringRef FromSrcCode, Language FromLang, StringRef ToSrcCode,
-                  Language ToLang, StringRef Identifier = DeclToImportID);
+  getImportedDecl(StringRef FromSrcCode, TestLanguage FromLang,
+                  StringRef ToSrcCode, TestLanguage ToLang,
+                  StringRef Identifier = DeclToImportID);
 
   // Creates a TU decl for the given source code which can be used as a From
   // context.  May be called several times in a given test (with different file
   // name).
-  TranslationUnitDecl *getTuDecl(StringRef SrcCode, Language Lang,
+  TranslationUnitDecl *getTuDecl(StringRef SrcCode, TestLanguage Lang,
                                  StringRef FileName = "input.cc");
 
   // Creates the To context with the given source code and returns the TU decl.
-  TranslationUnitDecl *getToTuDecl(StringRef ToSrcCode, Language ToLang);
+  TranslationUnitDecl *getToTuDecl(StringRef ToSrcCode, TestLanguage ToLang);
 
   // Import the given Decl into the ToCtx.
   // May be called several times in a given test.
   // The different instances of the param From may have different ASTContext.
-  Decl *Import(Decl *From, Language ToLang);
+  Decl *Import(Decl *From, TestLanguage ToLang);
 
-  template <class DeclT> DeclT *Import(DeclT *From, Language Lang) {
+  template <class DeclT> DeclT *Import(DeclT *From, TestLanguage Lang) {
     return cast_or_null<DeclT>(Import(cast<Decl>(From), Lang));
   }
 
   // Import the given Decl into the ToCtx.
   // Same as Import but returns the result of the import which can be an error.
-  llvm::Expected<Decl *> importOrError(Decl *From, Language ToLang);
+  llvm::Expected<Decl *> importOrError(Decl *From, TestLanguage ToLang);
 
-  QualType ImportType(QualType FromType, Decl *TUDecl, Language ToLang);
+  QualType ImportType(QualType FromType, Decl *TUDecl, TestLanguage ToLang);
 
   ASTImporterTestBase()
       : ODRHandling(ASTImporter::ODRHandlingType::Conservative) {}
@@ -191,9 +197,9 @@ class ASTImporterTestBase : public CompilerOptionSpecificTest {
 
 class ASTImporterOptionSpecificTestBase
     : public ASTImporterTestBase,
-      public ::testing::WithParamInterface<ArgVector> {
+      public ::testing::WithParamInterface<std::vector<std::string>> {
 protected:
-  ArgVector getExtraArgs() const override { return GetParam(); }
+  std::vector<std::string> getExtraArgs() const override { return GetParam(); }
 };
 
 template <class T>
diff --git a/clang/unittests/AST/ASTImporterODRStrategiesTest.cpp b/clang/unittests/AST/ASTImporterODRStrategiesTest.cpp
index 2bd62e23ea38d..9ae27a08bc76d 100644
--- a/clang/unittests/AST/ASTImporterODRStrategiesTest.cpp
+++ b/clang/unittests/AST/ASTImporterODRStrategiesTest.cpp
@@ -37,7 +37,7 @@ struct Function {
   BindableMatcher<Decl> getPattern() {
     return functionDecl(hasName("X"), unless(isImplicit()));
   }
-  Language getLang() { return Lang_C; }
+  TestLanguage getLang() { return Lang_C; }
 };
 
 struct Typedef {
@@ -45,7 +45,7 @@ struct Typedef {
   static constexpr auto *Definition = "typedef int X;";
   static constexpr auto *ConflictingDefinition = "typedef double X;";
   BindableMatcher<Decl> getPattern() { return typedefNameDecl(hasName("X")); }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 struct TypedefAlias {
@@ -53,7 +53,7 @@ struct TypedefAlias {
   static constexpr auto *Definition = "using X = int;";
   static constexpr auto *ConflictingDefinition = "using X = double;";
   BindableMatcher<Decl> getPattern() { return typedefNameDecl(hasName("X")); }
-  Language getLang() { return Lang_CXX11; }
+  TestLanguage getLang() { return Lang_CXX11; }
 };
 
 struct Enum {
@@ -61,7 +61,7 @@ struct Enum {
   static constexpr auto *Definition = "enum X { a, b };";
   static constexpr auto *ConflictingDefinition = "enum X { a, b, c };";
   BindableMatcher<Decl> getPattern() { return enumDecl(hasName("X")); }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 struct EnumClass {
@@ -69,7 +69,7 @@ struct EnumClass {
   static constexpr auto *Definition = "enum class X { a, b };";
   static constexpr auto *ConflictingDefinition = "enum class X { a, b, c };";
   BindableMatcher<Decl> getPattern() { return enumDecl(hasName("X")); }
-  Language getLang() { return Lang_CXX11; }
+  TestLanguage getLang() { return Lang_CXX11; }
 };
 
 struct EnumConstant {
@@ -77,7 +77,7 @@ struct EnumConstant {
   static constexpr auto *Definition = "enum E { X = 0 };";
   static constexpr auto *ConflictingDefinition = "enum E { X = 1 };";
   BindableMatcher<Decl> getPattern() { return enumConstantDecl(hasName("X")); }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 struct Class {
@@ -88,7 +88,7 @@ struct Class {
   BindableMatcher<Decl> getPattern() {
     return cxxRecordDecl(hasName("X"), unless(isImplicit()));
   }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 struct Variable {
@@ -98,7 +98,7 @@ struct Variable {
   static constexpr auto *Definition = "int X;";
   static constexpr auto *ConflictingDefinition = "float X;";
   BindableMatcher<Decl> getPattern() { return varDecl(hasName("X")); }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 struct ClassTemplate {
@@ -112,7 +112,7 @@ struct ClassTemplate {
   BindableMatcher<Decl> getPattern() {
     return classTemplateDecl(hasName("X"), unless(isImplicit()));
   }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 struct FunctionTemplate {
@@ -133,7 +133,7 @@ struct FunctionTemplate {
   }
   static std::string getDef0() { return Definition0; }
   static std::string getDef1() { return Definition1; }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 static const internal::VariadicDynCastAllOfMatcher<Decl, VarTemplateDecl>
@@ -152,7 +152,7 @@ struct VarTemplate {
       constexpr int X = 0;
       )";
   BindableMatcher<Decl> getPattern() { return varTemplateDecl(hasName("X")); }
-  Language getLang() { return Lang_CXX14; }
+  TestLanguage getLang() { return Lang_CXX14; }
 };
 
 struct ClassTemplateSpec {
@@ -175,7 +175,7 @@ struct ClassTemplateSpec {
   BindableMatcher<Decl> getPattern() {
     return classTemplateSpecializationDecl(hasName("X"), unless(isImplicit()));
   }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 // Function template specializations are all "full" specializations.
@@ -208,7 +208,7 @@ struct FunctionTemplateSpec {
   }
   static std::string getDef0() { return Definition0; }
   static std::string getDef1() { return Definition1; }
-  Language getLang() { return Lang_CXX; }
+  TestLanguage getLang() { return Lang_CXX; }
 };
 
 static const internal::VariadicDynCastAllOfMatcher<
@@ -230,7 +230,7 @@ struct VarTemplateSpec {
   BindableMatcher<Decl> getPattern() {
     return varTemplateSpecializationDecl(hasName("X"), unless(isImplicit()));
   }
-  Language getLang() { return Lang_CXX14; }
+  TestLanguage getLang() { return Lang_CXX14; }
 };
 
 template <typename TypeParam, ASTImporter::ODRHandlingType ODRHandlingParam>
@@ -252,7 +252,7 @@ struct ODRViolation : ASTImporterOptionSpecificTestBase {
     return TypeParam::ConflictingProtoDef;
   }
   static BindableMatcher<Decl> getPattern() { return TypeParam().getPattern(); }
-  static Language getLang() { return TypeParam().getLang(); }
+  static TestLanguage getLang() { return TypeParam().getLang(); }
 
   template <std::string (*ToTUContent)(), std::string (*FromTUContent)(),
             void (*ResultChecker)(llvm::Expected<Decl *> &, Decl *, Decl *)>
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 06f6aa199be75..5e70d28f884d2 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -28,8 +28,9 @@ using internal::BindableMatcher;
 using llvm::StringMap;
 
 // Base class for those tests which use the family of `testImport` functions.
-class TestImportBase : public CompilerOptionSpecificTest,
-                       public ::testing::WithParamInterface<ArgVector> {
+class TestImportBase
+    : public CompilerOptionSpecificTest,
+      public ::testing::WithParamInterface<std::vector<std::string>> {
 
   template <typename NodeType>
   llvm::Expected<NodeType> importNode(ASTUnit *From, ASTUnit *To,
@@ -62,8 +63,9 @@ class TestImportBase : public CompilerOptionSpecificTest,
 
   template <typename NodeType>
   testing::AssertionResult
-  testImport(const std::string &FromCode, const ArgVector &FromArgs,
-             const std::string &ToCode, const ArgVector &ToArgs,
+  testImport(const std::string &FromCode,
+             const std::vector<std::string> &FromArgs,
+             const std::string &ToCode, const std::vector<std::string> &ToArgs,
              MatchVerifier<NodeType> &Verifier,
              const BindableMatcher<NodeType> &SearchMatcher,
              const BindableMatcher<NodeType> &VerificationMatcher) {
@@ -110,8 +112,9 @@ class TestImportBase : public CompilerOptionSpecificTest,
 
   template <typename NodeType>
   testing::AssertionResult
-  testImport(const std::string &FromCode, const ArgVector &FromArgs,
-             const std::string &ToCode, const ArgVector &ToArgs,
+  testImport(const std::string &FromCode,
+             const std::vector<std::string> &FromArgs,
+             const std::string &ToCode, const std::vector<std::string> &ToArgs,
              MatchVerifier<NodeType> &Verifier,
              const BindableMatcher<NodeType> &VerificationMatcher) {
     return testImport(
@@ -122,7 +125,7 @@ class TestImportBase : public CompilerOptionSpecificTest,
   }
 
 protected:
-  ArgVector getExtraArgs() const override { return GetParam(); }
+  std::vector<std::string> getExtraArgs() const override { return GetParam(); }
 
 public:
 
@@ -130,12 +133,12 @@ class TestImportBase : public CompilerOptionSpecificTest,
   /// of "FromCode" virtual file is imported to "ToCode" virtual file.
   /// The verification is done by running AMatcher over the imported node.
   template <typename NodeType, typename MatcherType>
-  void testImport(const std::string &FromCode, Language FromLang,
-                  const std::string &ToCode, Language ToLang,
+  void testImport(const std::string &FromCode, TestLanguage FromLang,
+                  const std::string &ToCode, TestLanguage ToLang,
                   MatchVerifier<NodeType> &Verifier,
                   const MatcherType &AMatcher) {
-    ArgVector FromArgs = getArgVectorForLanguage(FromLang),
-              ToArgs = getArgVectorForLanguage(ToLang);
+    std::vector<std::string> FromArgs = getCommandLineArgsForLanguage(FromLang);
+    std::vector<std::string> ToArgs = getCommandLineArgsForLanguage(ToLang);
     EXPECT_TRUE(
         testImport(FromCode, FromArgs, ToCode, ToArgs, Verifier, AMatcher));
   }
@@ -162,14 +165,14 @@ class TestImportBase : public CompilerOptionSpecificTest,
 
   struct CodeEntry {
     std::string CodeSample;
-    Language Lang;
+    TestLanguage Lang;
   };
 
   using CodeFiles = StringMap<CodeEntry>;
 
   /// Builds an ASTUnit for one potential compile options set.
   SingleASTUnit createASTUnit(StringRef FileName, const CodeEntry &CE) const {
-    ArgVector Args = getArgVectorForLanguage(CE.Lang);
+    std::vector<std::string> Args = getCommandLineArgsForLanguage(CE.Lang);
     auto AST = tooling::buildASTFromCodeWithArgs(CE.CodeSample, Args, FileName);
     EXPECT_TRUE(AST.get());
     return AST;
@@ -5523,14 +5526,14 @@ TEST_P(ASTImporterOptionSpecificTestBase,
 }
 
 INSTANTIATE_TEST_CASE_P(ParameterizedTests, SVEBuiltins,
-                        ::testing::Values(ArgVector{"-target",
-                                                    "aarch64-linux-gnu"}), );
+                        ::testing::Values(std::vector<std::string>{
+                            "-target", "aarch64-linux-gnu"}), );
 
 INSTANTIATE_TEST_CASE_P(ParameterizedTests, DeclContextTest,
-                        ::testing::Values(ArgVector()), );
+                        ::testing::Values(std::vector<std::string>()), );
 
 INSTANTIATE_TEST_CASE_P(ParameterizedTests, CanonicalRedeclChain,
-                        ::testing::Values(ArgVector()), );
+                        ::testing::Values(std::vector<std::string>()), );
 
 TEST_P(ASTImporterOptionSpecificTestBase, LambdasAreDifferentiated) {
   Decl *FromTU = getTuDecl(
@@ -5982,9 +5985,9 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportExprOfAlignmentAttr) {
 }
 
 template <typename T>
-auto ExtendWithOptions(const T &Values, const ArgVector &Args) {
+auto ExtendWithOptions(const T &Values, const std::vector<std::string> &Args) {
   auto Copy = Values;
-  for (ArgVector &ArgV : Copy) {
+  for (std::vector<std::string> &ArgV : Copy) {
     for (const std::string &Arg : Args) {
       ArgV.push_back(Arg);
     }
@@ -6056,14 +6059,15 @@ INSTANTIATE_TEST_CASE_P(ParameterizedTests, ASTImporterLookupTableTest,
                         DefaultTestValuesForRunOptions, );
 
 INSTANTIATE_TEST_CASE_P(ParameterizedTests, ImportPath,
-                        ::testing::Values(ArgVector()), );
+                        ::testing::Values(std::vector<std::string>()), );
 
 INSTANTIATE_TEST_CASE_P(ParameterizedTests, ImportExpr,
                         DefaultTestValuesForRunOptions, );
 
 INSTANTIATE_TEST_CASE_P(ParameterizedTests, ImportFixedPointExpr,
                         ExtendWithOptions(DefaultTestArrayForRunOptions,
-                                          ArgVector{"-ffixed-point"}), );
+                                          std::vector<std::string>{
+                                              "-ffixed-point"}), );
 
 INSTANTIATE_TEST_CASE_P(ParameterizedTests, ImportType,
                         DefaultTestValuesForRunOptions, );
diff --git a/clang/unittests/AST/ASTImporterVisibilityTest.cpp b/clang/unittests/AST/ASTImporterVisibilityTest.cpp
index 00a307b8940b1..262402415658f 100644
--- a/clang/unittests/AST/ASTImporterVisibilityTest.cpp
+++ b/clang/unittests/AST/ASTImporterVisibilityTest.cpp
@@ -96,8 +96,8 @@ const auto *AnonCT = "namespace { template <class> class X; }";
 
 // First value in tuple: Compile options.
 // Second value in tuple: Source code to be used in the test.
-using ImportVisibilityChainParams =
-    ::testing::WithParamInterface<std::tuple<ArgVector, const char *>>;
+using ImportVisibilityChainParams = ::testing::WithParamInterface<
+    std::tuple<std::vector<std::string>, const char *>>;
 // Fixture to test the redecl chain of Decls with the same visibility. Gtest
 // makes it possible to have either value-parameterized or type-parameterized
 // fixtures. However, we cannot have both value- and type-parameterized test
@@ -109,7 +109,9 @@ class ImportVisibilityChain
     : public ASTImporterTestBase, public ImportVisibilityChainParams {
 protected:
   using DeclTy = typename PatternFactory::DeclTy;
-  ArgVector getExtraArgs() const override { return std::get<0>(GetParam()); }
+  std::vector<std::string> getExtraArgs() const override {
+    return std::get<0>(GetParam());
+  }
   std::string getCode() const { return std::get<1>(GetParam()); }
   BindableMatcher<Decl> getPattern() const { return PatternFactory()(); }
 
@@ -222,8 +224,8 @@ INSTANTIATE_TEST_CASE_P(ParameterizedTests, ImportClassTemplatesVisibilityChain,
 // functions are expected to be linked in a declaration chain.
 // One value of this tuple is combined with every value of compile options.
 // The test can have a single tuple as parameter only.
-using ImportVisibilityParams = ::testing::WithParamInterface<
-    std::tuple<ArgVector, std::tuple<const char *, const char *, bool>>>;
+using ImportVisibilityParams = ::testing::WithParamInterface<std::tuple<
+    std::vector<std::string>, std::tuple<const char *, const char *, bool>>>;
 
 template <typename PatternFactory>
 class ImportVisibility
@@ -231,7 +233,9 @@ class ImportVisibility
       public ImportVisibilityParams {
 protected:
   using DeclTy = typename PatternFactory::DeclTy;
-  ArgVector getExtraArgs() const override { return std::get<0>(GetParam()); }
+  std::vector<std::string> getExtraArgs() const override {
+    return std::get<0>(GetParam());
+  }
   std::string getCode0() const { return std::get<0>(std::get<1>(GetParam())); }
   std::string getCode1() const { return std::get<1>(std::get<1>(GetParam())); }
   bool shouldBeLinked() const { return std::get<2>(std::get<1>(GetParam())); }
diff --git a/clang/unittests/AST/Language.cpp b/clang/unittests/AST/Language.cpp
index eeb3303a12787..3dd9659fc00ee 100644
--- a/clang/unittests/AST/Language.cpp
+++ b/clang/unittests/AST/Language.cpp
@@ -11,43 +11,42 @@
 //===----------------------------------------------------------------------===//
 
 #include "Language.h"
+#include "llvm/Support/ErrorHandling.h"
 
 namespace clang {
-namespace ast_matchers {
 
-ArgVector getBasicRunOptionsForLanguage(Language Lang) {
-  ArgVector BasicArgs;
+std::vector<std::string> getCommandLineArgsForTesting(TestLanguage Lang) {
+  std::vector<std::string> Args;
   // Test with basic arguments.
   switch (Lang) {
   case Lang_C:
-    BasicArgs = {"-x", "c", "-std=c99"};
+    Args = {"-x", "c", "-std=c99"};
     break;
   case Lang_C89:
-    BasicArgs = {"-x", "c", "-std=c89"};
+    Args = {"-x", "c", "-std=c89"};
     break;
   case Lang_CXX:
-    BasicArgs = {"-std=c++98", "-frtti"};
+    Args = {"-std=c++98", "-frtti"};
     break;
   case Lang_CXX11:
-    BasicArgs = {"-std=c++11", "-frtti"};
+    Args = {"-std=c++11", "-frtti"};
     break;
   case Lang_CXX14:
-    BasicArgs = {"-std=c++14", "-frtti"};
+    Args = {"-std=c++14", "-frtti"};
     break;
   case Lang_CXX17:
-    BasicArgs = {"-std=c++17", "-frtti"};
+    Args = {"-std=c++17", "-frtti"};
     break;
   case Lang_CXX2a:
-    BasicArgs = {"-std=c++2a", "-frtti"};
+    Args = {"-std=c++2a", "-frtti"};
     break;
   case Lang_OBJCXX:
-    BasicArgs = {"-x", "objective-c++", "-frtti"};
+    Args = {"-x", "objective-c++", "-frtti"};
     break;
   case Lang_OpenCL:
     llvm_unreachable("Not implemented yet!");
   }
-  return BasicArgs;
+  return Args;
 }
 
-} // end namespace ast_matchers
 } // end namespace clang
diff --git a/clang/unittests/AST/Language.h b/clang/unittests/AST/Language.h
index 6ba40be743b75..da200ec8719f1 100644
--- a/clang/unittests/AST/Language.h
+++ b/clang/unittests/AST/Language.h
@@ -13,30 +13,25 @@
 #ifndef LLVM_CLANG_UNITTESTS_AST_LANGUAGE_H
 #define LLVM_CLANG_UNITTESTS_AST_LANGUAGE_H
 
-#include "llvm/Support/ErrorHandling.h"
 #include <vector>
 #include <string>
 
 namespace clang {
-namespace ast_matchers {
 
-typedef std::vector<std::string> ArgVector;
-
-enum Language {
-    Lang_C,
-    Lang_C89,
-    Lang_CXX,
-    Lang_CXX11,
-    Lang_CXX14,
-    Lang_CXX17,
-    Lang_CXX2a,
-    Lang_OpenCL,
-    Lang_OBJCXX
+enum TestLanguage {
+  Lang_C,
+  Lang_C89,
+  Lang_CXX,
+  Lang_CXX11,
+  Lang_CXX14,
+  Lang_CXX17,
+  Lang_CXX2a,
+  Lang_OpenCL,
+  Lang_OBJCXX
 };
 
-ArgVector getBasicRunOptionsForLanguage(Language Lang);
+std::vector<std::string> getCommandLineArgsForTesting(TestLanguage Lang);
 
-} // end namespace ast_matchers
 } // end namespace clang
 
 #endif
diff --git a/clang/unittests/AST/MatchVerifier.h b/clang/unittests/AST/MatchVerifier.h
index e8245cdcac466..9daf4ce39010d 100644
--- a/clang/unittests/AST/MatchVerifier.h
+++ b/clang/unittests/AST/MatchVerifier.h
@@ -41,17 +41,15 @@ class MatchVerifier : public MatchFinder::MatchCallback {
 
   template <typename MatcherType>
   testing::AssertionResult match(const std::string &Code,
-                                 const MatcherType &AMatcher,
-                                 Language L) {
+                                 const MatcherType &AMatcher, TestLanguage L) {
     std::vector<std::string> Args;
     return match(Code, AMatcher, Args, L);
   }
 
   template <typename MatcherType>
-  testing::AssertionResult match(const std::string &Code,
-                                 const MatcherType &AMatcher,
-                                 std::vector<std::string>& Args,
-                                 Language L);
+  testing::AssertionResult
+  match(const std::string &Code, const MatcherType &AMatcher,
+        std::vector<std::string> &Args, TestLanguage L);
 
   template <typename MatcherType>
   testing::AssertionResult match(const Decl *D, const MatcherType &AMatcher);
@@ -77,10 +75,12 @@ class MatchVerifier : public MatchFinder::MatchCallback {
 
 /// \brief Runs a matcher over some code, and returns the result of the
 /// verifier for the matched node.
-template <typename NodeType> template <typename MatcherType>
-testing::AssertionResult MatchVerifier<NodeType>::match(
-    const std::string &Code, const MatcherType &AMatcher,
-    std::vector<std::string>& Args, Language L) {
+template <typename NodeType>
+template <typename MatcherType>
+testing::AssertionResult
+MatchVerifier<NodeType>::match(const std::string &Code,
+                               const MatcherType &AMatcher,
+                               std::vector<std::string> &Args, TestLanguage L) {
   MatchFinder Finder;
   Finder.addMatcher(AMatcher.bind(""), this);
   std::unique_ptr<tooling::FrontendActionFactory> Factory(
diff --git a/clang/unittests/AST/StructuralEquivalenceTest.cpp b/clang/unittests/AST/StructuralEquivalenceTest.cpp
index 493b847fa5fa0..cadcc45c58854 100644
--- a/clang/unittests/AST/StructuralEquivalenceTest.cpp
+++ b/clang/unittests/AST/StructuralEquivalenceTest.cpp
@@ -23,12 +23,13 @@ struct StructuralEquivalenceTest : ::testing::Test {
   // snippets. To determine the returned node, a separate matcher is specified
   // for both snippets. The first matching node is returned.
   template <typename NodeType, typename MatcherType>
-  std::tuple<NodeType *, NodeType *> makeDecls(
-      const std::string &SrcCode0, const std::string &SrcCode1, Language Lang,
-      const MatcherType &Matcher0, const MatcherType &Matcher1) {
+  std::tuple<NodeType *, NodeType *>
+  makeDecls(const std::string &SrcCode0, const std::string &SrcCode1,
+            TestLanguage Lang, const MatcherType &Matcher0,
+            const MatcherType &Matcher1) {
     this->Code0 = SrcCode0;
     this->Code1 = SrcCode1;
-    ArgVector Args = getBasicRunOptionsForLanguage(Lang);
+    std::vector<std::string> Args = getCommandLineArgsForTesting(Lang);
 
     const char *const InputFileName = "input.cc";
 
@@ -43,11 +44,12 @@ struct StructuralEquivalenceTest : ::testing::Test {
     return std::make_tuple(D0, D1);
   }
 
-  std::tuple<TranslationUnitDecl *, TranslationUnitDecl *> makeTuDecls(
-      const std::string &SrcCode0, const std::string &SrcCode1, Language Lang) {
+  std::tuple<TranslationUnitDecl *, TranslationUnitDecl *>
+  makeTuDecls(const std::string &SrcCode0, const std::string &SrcCode1,
+              TestLanguage Lang) {
     this->Code0 = SrcCode0;
     this->Code1 = SrcCode1;
-    ArgVector Args = getBasicRunOptionsForLanguage(Lang);
+    std::vector<std::string> Args = getCommandLineArgsForTesting(Lang);
 
     const char *const InputFileName = "input.cc";
 
@@ -61,9 +63,9 @@ struct StructuralEquivalenceTest : ::testing::Test {
   // Get a pair of node pointers into the synthesized AST from the given code
   // snippets. The same matcher is used for both snippets.
   template <typename NodeType, typename MatcherType>
-  std::tuple<NodeType *, NodeType *> makeDecls(
-      const std::string &SrcCode0, const std::string &SrcCode1, Language Lang,
-      const MatcherType &AMatcher) {
+  std::tuple<NodeType *, NodeType *>
+  makeDecls(const std::string &SrcCode0, const std::string &SrcCode1,
+            TestLanguage Lang, const MatcherType &AMatcher) {
     return makeDecls<NodeType, MatcherType>(
           SrcCode0, SrcCode1, Lang, AMatcher, AMatcher);
   }
@@ -71,9 +73,9 @@ struct StructuralEquivalenceTest : ::testing::Test {
   // Get a pair of Decl pointers to the synthesized declarations from the given
   // code snippets. We search for the first NamedDecl with given name in both
   // snippets.
-  std::tuple<NamedDecl *, NamedDecl *> makeNamedDecls(
-      const std::string &SrcCode0, const std::string &SrcCode1,
-      Language Lang, const char *const Identifier = "foo") {
+  std::tuple<NamedDecl *, NamedDecl *>
+  makeNamedDecls(const std::string &SrcCode0, const std::string &SrcCode1,
+                 TestLanguage Lang, const char *const Identifier = "foo") {
     auto Matcher = namedDecl(hasName(Identifier));
     return makeDecls<NamedDecl>(SrcCode0, SrcCode1, Lang, Matcher);
   }

From 912502e8efa9c09fb3b4abce0840b242b731ff12 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 08:57:08 -0400
Subject: [PATCH 514/770] [AArch64][x86] add tests for FMA combines; NFC

---
 llvm/test/CodeGen/AArch64/fadd-combines.ll | 101 +++++++++++
 llvm/test/CodeGen/X86/fma_patterns.ll      | 195 +++++++++++++++++++++
 2 files changed, 296 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/fadd-combines.ll b/llvm/test/CodeGen/AArch64/fadd-combines.ll
index 640c1a1004627..3702cc540da38 100644
--- a/llvm/test/CodeGen/AArch64/fadd-combines.ll
+++ b/llvm/test/CodeGen/AArch64/fadd-combines.ll
@@ -192,5 +192,106 @@ define <2 x double> @fmul2_negated_vec(<2 x double> %a, <2 x double> %b, <2 x do
   ret <2 x double> %sub
 }
 
+; ((a*b) + (c*d)) + n1 --> (a*b) + ((c*d) + n1)
+
+define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, double %n1) nounwind {
+; CHECK-LABEL: fadd_fma_fmul_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul d2, d2, d3
+; CHECK-NEXT:    fmadd d0, d0, d1, d2
+; CHECK-NEXT:    fadd d0, d0, d4
+; CHECK-NEXT:    ret
+  %m1 = fmul fast double %a, %b
+  %m2 = fmul fast double %c, %d
+  %a1 = fadd fast double %m1, %m2
+  %a2 = fadd fast double %a1, %n1
+  ret double %a2
+}
+
+; Minimum FMF, commute final add operands, change type.
+
+define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) nounwind {
+; CHECK-LABEL: fadd_fma_fmul_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul s2, s2, s3
+; CHECK-NEXT:    fmadd s0, s0, s1, s2
+; CHECK-NEXT:    fadd s0, s4, s0
+; CHECK-NEXT:    ret
+  %m1 = fmul float %a, %b
+  %m2 = fmul float %c, %d
+  %a1 = fadd contract float %m1, %m2
+  %a2 = fadd contract float %n0, %a1
+  ret float %a2
+}
+
+; The final fadd can be folded with either 1 of the leading fmuls.
+
+define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind {
+; CHECK-LABEL: fadd_fma_fmul_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fmul v3.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmla v2.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fmla v3.2d, v5.2d, v4.2d
+; CHECK-NEXT:    fadd v0.2d, v2.2d, v3.2d
+; CHECK-NEXT:    ret
+  %m1 = fmul fast <2 x double> %x1, %x2
+  %m2 = fmul fast <2 x double> %x3, %x4
+  %m3 = fmul fast <2 x double> %x5, %x6
+  %m4 = fmul fast <2 x double> %x7, %x8
+  %a1 = fadd fast <2 x double> %m1, %m2
+  %a2 = fadd fast <2 x double> %m3, %m4
+  %a3 = fadd fast <2 x double> %a1, %a2
+  ret <2 x double> %a3
+}
+
+define float @fadd_fma_fmul_extra_use_1(float %a, float %b, float %c, float %d, float %n0, float* %p) nounwind {
+; CHECK-LABEL: fadd_fma_fmul_extra_use_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul s1, s0, s1
+; CHECK-NEXT:    fmadd s0, s2, s3, s1
+; CHECK-NEXT:    fadd s0, s4, s0
+; CHECK-NEXT:    str s1, [x0]
+; CHECK-NEXT:    ret
+  %m1 = fmul fast float %a, %b
+  store float %m1, float* %p
+  %m2 = fmul fast float %c, %d
+  %a1 = fadd fast float %m1, %m2
+  %a2 = fadd fast float %n0, %a1
+  ret float %a2
+}
+
+define float @fadd_fma_fmul_extra_use_2(float %a, float %b, float %c, float %d, float %n0, float* %p) nounwind {
+; CHECK-LABEL: fadd_fma_fmul_extra_use_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul s2, s2, s3
+; CHECK-NEXT:    fmadd s0, s0, s1, s2
+; CHECK-NEXT:    fadd s0, s4, s0
+; CHECK-NEXT:    str s2, [x0]
+; CHECK-NEXT:    ret
+  %m1 = fmul fast float %a, %b
+  %m2 = fmul fast float %c, %d
+  store float %m2, float* %p
+  %a1 = fadd fast float %m1, %m2
+  %a2 = fadd fast float %n0, %a1
+  ret float %a2
+}
+
+define float @fadd_fma_fmul_extra_use_3(float %a, float %b, float %c, float %d, float %n0, float* %p) nounwind {
+; CHECK-LABEL: fadd_fma_fmul_extra_use_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul s2, s2, s3
+; CHECK-NEXT:    fmadd s1, s0, s1, s2
+; CHECK-NEXT:    fadd s0, s4, s1
+; CHECK-NEXT:    str s1, [x0]
+; CHECK-NEXT:    ret
+  %m1 = fmul fast float %a, %b
+  %m2 = fmul fast float %c, %d
+  %a1 = fadd fast float %m1, %m2
+  store float %a1, float* %p
+  %a2 = fadd fast float %n0, %a1
+  ret float %a2
+}
+
 declare void @use(double)
 
diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll
index 32bcd48e1d108..7b9d511474807 100644
--- a/llvm/test/CodeGen/X86/fma_patterns.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns.ll
@@ -1794,4 +1794,199 @@ define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %
   ret <4 x double> %n
 }
 
+; ((a*b) + (c*d)) + n1 --> (a*b) + ((c*d) + n1)
+
+define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, double %n1) nounwind {
+; FMA-LABEL: fadd_fma_fmul_1:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
+; FMA-NEXT:    vfmadd231sd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT:    vaddsd %xmm4, %xmm2, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: fadd_fma_fmul_1:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
+; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
+; FMA4-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: fadd_fma_fmul_1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vfmadd231sd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT:    vaddsd %xmm4, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %m1 = fmul fast double %a, %b
+  %m2 = fmul fast double %c, %d
+  %a1 = fadd fast double %m1, %m2
+  %a2 = fadd fast double %a1, %n1
+  ret double %a2
+}
+
+; Minimum FMF, commute final add operands, change type.
+
+define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) nounwind {
+; FMA-LABEL: fadd_fma_fmul_2:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; FMA-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT:    vaddss %xmm2, %xmm4, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: fadd_fma_fmul_2:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
+; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: fadd_fma_fmul_2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT:    vaddss %xmm2, %xmm4, %xmm0
+; AVX512-NEXT:    retq
+  %m1 = fmul float %a, %b
+  %m2 = fmul float %c, %d
+  %a1 = fadd contract float %m1, %m2
+  %a2 = fadd contract float %n0, %a1
+  ret float %a2
+}
+
+; The final fadd can be folded with either 1 of the leading fmuls.
+
+define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind {
+; FMA-LABEL: fadd_fma_fmul_3:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmulpd %xmm3, %xmm2, %xmm2
+; FMA-NEXT:    vmulpd %xmm7, %xmm6, %xmm3
+; FMA-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT:    vfmadd231pd {{.*#+}} xmm3 = (xmm5 * xmm4) + xmm3
+; FMA-NEXT:    vaddpd %xmm3, %xmm2, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: fadd_fma_fmul_3:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmulpd %xmm3, %xmm2, %xmm2
+; FMA4-NEXT:    vmulpd %xmm7, %xmm6, %xmm3
+; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
+; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm1 = (xmm4 * xmm5) + xmm3
+; FMA4-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: fadd_fma_fmul_3:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmulpd %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vmulpd %xmm7, %xmm6, %xmm3
+; AVX512-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT:    vfmadd231pd {{.*#+}} xmm3 = (xmm5 * xmm4) + xmm3
+; AVX512-NEXT:    vaddpd %xmm3, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %m1 = fmul fast <2 x double> %x1, %x2
+  %m2 = fmul fast <2 x double> %x3, %x4
+  %m3 = fmul fast <2 x double> %x5, %x6
+  %m4 = fmul fast <2 x double> %x7, %x8
+  %a1 = fadd fast <2 x double> %m1, %m2
+  %a2 = fadd fast <2 x double> %m3, %m4
+  %a3 = fadd fast <2 x double> %a1, %a2
+  ret <2 x double> %a3
+}
+
+define float @fadd_fma_fmul_extra_use_1(float %a, float %b, float %c, float %d, float %n0, float* %p) nounwind {
+; FMA-LABEL: fadd_fma_fmul_extra_use_1:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; FMA-NEXT:    vmovss %xmm0, (%rdi)
+; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm0
+; FMA-NEXT:    vaddss %xmm2, %xmm4, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: fadd_fma_fmul_extra_use_1:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; FMA4-NEXT:    vmovss %xmm0, (%rdi)
+; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm2 * xmm3) + xmm0
+; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: fadd_fma_fmul_extra_use_1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovss %xmm0, (%rdi)
+; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm0
+; AVX512-NEXT:    vaddss %xmm2, %xmm4, %xmm0
+; AVX512-NEXT:    retq
+  %m1 = fmul fast float %a, %b
+  store float %m1, float* %p
+  %m2 = fmul fast float %c, %d
+  %a1 = fadd fast float %m1, %m2
+  %a2 = fadd fast float %n0, %a1
+  ret float %a2
+}
+
+define float @fadd_fma_fmul_extra_use_2(float %a, float %b, float %c, float %d, float %n0, float* %p) nounwind {
+; FMA-LABEL: fadd_fma_fmul_extra_use_2:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; FMA-NEXT:    vmovss %xmm2, (%rdi)
+; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: fadd_fma_fmul_extra_use_2:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; FMA4-NEXT:    vmovss %xmm2, (%rdi)
+; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
+; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: fadd_fma_fmul_extra_use_2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vmovss %xmm2, (%rdi)
+; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; AVX512-NEXT:    retq
+  %m1 = fmul fast float %a, %b
+  %m2 = fmul fast float %c, %d
+  store float %m2, float* %p
+  %a1 = fadd fast float %m1, %m2
+  %a2 = fadd fast float %n0, %a1
+  ret float %a2
+}
+
+define float @fadd_fma_fmul_extra_use_3(float %a, float %b, float %c, float %d, float %n0, float* %p) nounwind {
+; FMA-LABEL: fadd_fma_fmul_extra_use_3:
+; FMA:       # %bb.0:
+; FMA-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; FMA-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; FMA-NEXT:    vmovss %xmm2, (%rdi)
+; FMA-NEXT:    vaddss %xmm2, %xmm4, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: fadd_fma_fmul_extra_use_3:
+; FMA4:       # %bb.0:
+; FMA4-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
+; FMA4-NEXT:    vmovss %xmm0, (%rdi)
+; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: fadd_fma_fmul_extra_use_3:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; AVX512-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT:    vmovss %xmm2, (%rdi)
+; AVX512-NEXT:    vaddss %xmm2, %xmm4, %xmm0
+; AVX512-NEXT:    retq
+  %m1 = fmul fast float %a, %b
+  %m2 = fmul fast float %c, %d
+  %a1 = fadd fast float %m1, %m2
+  store float %a1, float* %p
+  %a2 = fadd fast float %n0, %a1
+  ret float %a2
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }

From b12fa146b55206b003b25cc5e550874ab0cc0d89 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 29 May 2020 13:44:06 +0100
Subject: [PATCH 515/770] TextAPIContext.h - remove unused MemoryBuffer.h
 include. NFC.

---
 llvm/lib/TextAPI/MachO/TextAPIContext.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/TextAPI/MachO/TextAPIContext.h b/llvm/lib/TextAPI/MachO/TextAPIContext.h
index 3df40f09f7f7f..217d1f5400ee8 100644
--- a/llvm/lib/TextAPI/MachO/TextAPIContext.h
+++ b/llvm/lib/TextAPI/MachO/TextAPIContext.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_TEXTAPI_MACHO_CONTEXT_H
 #define LLVM_TEXTAPI_MACHO_CONTEXT_H
 
-#include "llvm/Support/MemoryBuffer.h"
 #include <string>
 
 namespace llvm {

From 99a3b20452b16aa92e18e4f5d2c8c6f025aafbae Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 29 May 2020 13:56:45 +0100
Subject: [PATCH 516/770] TextStubCommon.h - move StringSwitch.h include to
 TextStubCommon.cpp. NFC.

Only TextStubCommon.cpp actually uses StringSwitch
---
 llvm/lib/TextAPI/MachO/TextStubCommon.cpp | 1 +
 llvm/lib/TextAPI/MachO/TextStubCommon.h   | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/TextAPI/MachO/TextStubCommon.cpp b/llvm/lib/TextAPI/MachO/TextStubCommon.cpp
index 21be654e130c0..2da0b11da831b 100644
--- a/llvm/lib/TextAPI/MachO/TextStubCommon.cpp
+++ b/llvm/lib/TextAPI/MachO/TextStubCommon.cpp
@@ -12,6 +12,7 @@
 
 #include "TextStubCommon.h"
 #include "TextAPIContext.h"
+#include "llvm/ADT/StringSwitch.h"
 
 using namespace llvm::MachO;
 
diff --git a/llvm/lib/TextAPI/MachO/TextStubCommon.h b/llvm/lib/TextAPI/MachO/TextStubCommon.h
index a558cbcec9fb6..f2cda50e297d0 100644
--- a/llvm/lib/TextAPI/MachO/TextStubCommon.h
+++ b/llvm/lib/TextAPI/MachO/TextStubCommon.h
@@ -14,7 +14,6 @@
 #define LLVM_TEXTAPI_TEXT_STUB_COMMON_H
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/TextAPI/MachO/Architecture.h"
 #include "llvm/TextAPI/MachO/ArchitectureSet.h"

From a9313282cd5413ed498dceb763ebba12ea5bdecd Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin@accesssoftek.com>
Date: Fri, 29 May 2020 19:27:28 +0700
Subject: [PATCH 517/770] [llvm-objcopy][ELF] Fix removing SHT_GROUP sections.

When an SHT_GROUP section is removed but other sections of the group are
kept, the SHF_GROUP flag of these sections should be dropped; otherwise,
the resulting ELF file will be malformed.

Differential Revision: https://reviews.llvm.org/D80511
---
 .../ELF/remove-section-group.test             | 33 +++++++++++++++++++
 llvm/tools/llvm-objcopy/ELF/Object.cpp        |  9 +++++
 llvm/tools/llvm-objcopy/ELF/Object.h          |  3 ++
 3 files changed, 45 insertions(+)
 create mode 100644 llvm/test/tools/llvm-objcopy/ELF/remove-section-group.test

diff --git a/llvm/test/tools/llvm-objcopy/ELF/remove-section-group.test b/llvm/test/tools/llvm-objcopy/ELF/remove-section-group.test
new file mode 100644
index 0000000000000..166fc3965f80e
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/remove-section-group.test
@@ -0,0 +1,33 @@
+## This checks that when the header section of a group is removed, the tool
+## drops the flag SHF_GROUP for preserved members of that group.
+
+# RUN: yaml2obj %s -o - \
+# RUN:   | llvm-objcopy -R .group - - \
+# RUN:   | llvm-readobj --sections - \
+# RUN:   | FileCheck %s
+
+# CHECK:      Name: .foo
+# CHECK-NEXT: Type: SHT_PROGBITS
+# CHECK-NEXT: Flags [
+# CHECK-NEXT:   SHF_ALLOC
+# CHECK-NEXT: ]
+
+--- !ELF
+FileHeader:
+  Class:      ELFCLASS64
+  Data:       ELFDATA2LSB
+  Type:       ET_REL
+  Machine:    EM_X86_64
+Sections:
+  - Name:     .group
+    Type:     SHT_GROUP
+    Info:     foo_grp
+    Members:
+      - SectionOrType:  GRP_COMDAT
+      - SectionOrType:  .foo
+  - Name:     .foo
+    Type:     SHT_PROGBITS
+    Flags:    [ SHF_ALLOC, SHF_GROUP ]
+Symbols:
+  - Name:     foo_grp
+    Section:  .group
diff --git a/llvm/tools/llvm-objcopy/ELF/Object.cpp b/llvm/tools/llvm-objcopy/ELF/Object.cpp
index 8c3ae25967221..2ceb479236499 100644
--- a/llvm/tools/llvm-objcopy/ELF/Object.cpp
+++ b/llvm/tools/llvm-objcopy/ELF/Object.cpp
@@ -65,6 +65,7 @@ void SectionBase::finalize() {}
 void SectionBase::markSymbols() {}
 void SectionBase::replaceSectionReferences(
     const DenseMap<SectionBase *, SectionBase *> &) {}
+void SectionBase::onRemove() {}
 
 template <class ELFT> void ELFWriter<ELFT>::writeShdr(const SectionBase &Sec) {
   uint8_t *B = Buf.getBufferStart() + Sec.HeaderOffset;
@@ -988,6 +989,13 @@ void GroupSection::replaceSectionReferences(
       Sec = To;
 }
 
+void GroupSection::onRemove() {
+  // As the header section of the group is removed, drop the Group flag in its
+  // former members.
+  for (SectionBase *Sec : GroupMembers)
+    Sec->Flags &= ~SHF_GROUP;
+}
+
 void Section::initialize(SectionTableRef SecTable) {
   if (Link == ELF::SHN_UNDEF)
     return;
@@ -1838,6 +1846,7 @@ Error Object::removeSections(bool AllowBrokenLinks,
   for (auto &RemoveSec : make_range(Iter, std::end(Sections))) {
     for (auto &Segment : Segments)
       Segment->removeSection(RemoveSec.get());
+    RemoveSec->onRemove();
     RemoveSections.insert(RemoveSec.get());
   }
 
diff --git a/llvm/tools/llvm-objcopy/ELF/Object.h b/llvm/tools/llvm-objcopy/ELF/Object.h
index 97702a66bc479..3ef1ec75352bb 100644
--- a/llvm/tools/llvm-objcopy/ELF/Object.h
+++ b/llvm/tools/llvm-objcopy/ELF/Object.h
@@ -424,6 +424,8 @@ class SectionBase {
   virtual void markSymbols();
   virtual void
   replaceSectionReferences(const DenseMap<SectionBase *, SectionBase *> &);
+  // Notify the section that it is subject to removal.
+  virtual void onRemove();
 };
 
 class Segment {
@@ -803,6 +805,7 @@ class GroupSection : public SectionBase {
   void markSymbols() override;
   void replaceSectionReferences(
       const DenseMap<SectionBase *, SectionBase *> &FromTo) override;
+  void onRemove() override;
 
   static bool classof(const SectionBase *S) {
     return S->OriginalType == ELF::SHT_GROUP;

From 5b875bf59b068f3db91c750b24030a7b2bd164aa Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin@accesssoftek.com>
Date: Fri, 29 May 2020 19:28:37 +0700
Subject: [PATCH 518/770] [llvm-objcopy][ELF] Fix removing a group member.

When a group member is removed, the corresponding record in the
SHT_GROUP section has to be deleted.

This fixes PR46064.

Differential Revision: https://reviews.llvm.org/D80568
---
 .../ELF/remove-section-in-group.test          | 36 +++++++++++++++++++
 llvm/tools/llvm-objcopy/ELF/Object.cpp        | 10 +++++-
 llvm/tools/llvm-objcopy/ELF/Object.h          |  3 ++
 3 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-objcopy/ELF/remove-section-in-group.test

diff --git a/llvm/test/tools/llvm-objcopy/ELF/remove-section-in-group.test b/llvm/test/tools/llvm-objcopy/ELF/remove-section-in-group.test
new file mode 100644
index 0000000000000..6b2e43d680339
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/remove-section-in-group.test
@@ -0,0 +1,36 @@
+## This checks that the group section is shrunk when its member is removed.
+
+# RUN: yaml2obj %s -o - \
+# RUN:   | llvm-objcopy -R .foo - - \
+# RUN:   | obj2yaml - \
+# RUN:   | FileCheck %s
+
+# CHECK:      - Name: .group
+# CHECK:        Members:
+# CHECK-NEXT:     - SectionOrType:  GRP_COMDAT
+# CHECK-NEXT:     - SectionOrType:  .bar
+# CHECK-NOT:      - SectionOrType:
+
+--- !ELF
+FileHeader:
+  Class:      ELFCLASS64
+  Data:       ELFDATA2LSB
+  Type:       ET_REL
+  Machine:    EM_X86_64
+Sections:
+  - Name:     .group
+    Type:     SHT_GROUP
+    Info:     foo_bar_grp
+    Members:
+      - SectionOrType:  GRP_COMDAT
+      - SectionOrType:  .foo
+      - SectionOrType:  .bar
+  - Name:     .foo
+    Type:     SHT_PROGBITS
+    Flags:    [ SHF_ALLOC, SHF_GROUP ]
+  - Name:     .bar
+    Type:     SHT_PROGBITS
+    Flags:    [ SHF_ALLOC, SHF_GROUP ]
+Symbols:
+  - Name:     foo_bar_grp
+    Section:  .group
diff --git a/llvm/tools/llvm-objcopy/ELF/Object.cpp b/llvm/tools/llvm-objcopy/ELF/Object.cpp
index 2ceb479236499..8bb4bbb40f940 100644
--- a/llvm/tools/llvm-objcopy/ELF/Object.cpp
+++ b/llvm/tools/llvm-objcopy/ELF/Object.cpp
@@ -112,7 +112,9 @@ void ELFSectionSizer<ELFT>::visit(RelocationSection &Sec) {
 template <class ELFT>
 void ELFSectionSizer<ELFT>::visit(GnuDebugLinkSection &Sec) {}
 
-template <class ELFT> void ELFSectionSizer<ELFT>::visit(GroupSection &Sec) {}
+template <class ELFT> void ELFSectionSizer<ELFT>::visit(GroupSection &Sec) {
+  Sec.Size = sizeof(Elf_Word) + Sec.GroupMembers.size() * sizeof(Elf_Word);
+}
 
 template <class ELFT>
 void ELFSectionSizer<ELFT>::visit(SectionIndexSection &Sec) {}
@@ -968,6 +970,12 @@ void GroupSection::finalize() {
   this->Link = SymTab->Index;
 }
 
+Error GroupSection::removeSectionReferences(
+    bool AllowBrokenLinks, function_ref<bool(const SectionBase *)> ToRemove) {
+  llvm::erase_if(GroupMembers, ToRemove);
+  return Error::success();
+}
+
 Error GroupSection::removeSymbols(function_ref<bool(const Symbol &)> ToRemove) {
   if (ToRemove(*Sym))
     return createStringError(llvm::errc::invalid_argument,
diff --git a/llvm/tools/llvm-objcopy/ELF/Object.h b/llvm/tools/llvm-objcopy/ELF/Object.h
index 3ef1ec75352bb..c7db57708fc3e 100644
--- a/llvm/tools/llvm-objcopy/ELF/Object.h
+++ b/llvm/tools/llvm-objcopy/ELF/Object.h
@@ -801,6 +801,9 @@ class GroupSection : public SectionBase {
   void accept(SectionVisitor &) const override;
   void accept(MutableSectionVisitor &Visitor) override;
   void finalize() override;
+  Error removeSectionReferences(
+      bool AllowBrokenLinks,
+      function_ref<bool(const SectionBase *)> ToRemove) override;
   Error removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
   void markSymbols() override;
   void replaceSectionReferences(

From 21dadd774f56778ef68c1ce307205dfbdacc793a Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 09:31:11 -0400
Subject: [PATCH 519/770] [DAGCombiner] avoid unnecessary indirection from
 SDNode/SDValue; NFCI

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 100 ++++++++----------
 1 file changed, 47 insertions(+), 53 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0176ae3a0ab5e..d54663f4ce784 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -862,7 +862,7 @@ bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
 /// it is profitable to do so.
 bool DAGCombiner::isOneUseSetCC(SDValue N) const {
   SDValue N0, N1, N2;
-  if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
+  if (isSetCCEquivalent(N, N0, N1, N2) && N.hasOneUse())
     return true;
   return false;
 }
@@ -1235,8 +1235,8 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
 
     // We are always replacing N0/N1's use in N and only need
     // additional replacements if there are additional uses.
-    Replace0 &= !N0->hasOneUse();
-    Replace1 &= (N0 != N1) && !N1->hasOneUse();
+    Replace0 &= !N0.hasOneUse();
+    Replace1 &= (N0 != N1) && !N1.hasOneUse();
 
     // Combine Op here so it is preserved past replacements.
     CombineTo(Op.getNode(), RV);
@@ -3587,12 +3587,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
 
     // Check for both (mul (shl X, C), Y)  and  (mul Y, (shl X, C)).
     if (N0.getOpcode() == ISD::SHL &&
-        isConstantOrConstantVector(N0.getOperand(1)) &&
-        N0.getNode()->hasOneUse()) {
+        isConstantOrConstantVector(N0.getOperand(1)) && N0.hasOneUse()) {
       Sh = N0; Y = N1;
     } else if (N1.getOpcode() == ISD::SHL &&
-               isConstantOrConstantVector(N1.getOperand(1)) &&
-               N1.getNode()->hasOneUse()) {
+               isConstantOrConstantVector(N1.getOperand(1)) && N1.hasOneUse()) {
       Sh = N1; Y = N0;
     }
 
@@ -5484,7 +5482,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
   if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
       std::swap(N0, N1);
   if (N0.getOpcode() == ISD::AND) {
-    if (!N0.getNode()->hasOneUse())
+    if (!N0.hasOneUse())
       return SDValue();
     ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
     // Also handle 0xffff since the LHS is guaranteed to have zeros there.
@@ -5497,7 +5495,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
   }
 
   if (N1.getOpcode() == ISD::AND) {
-    if (!N1.getNode()->hasOneUse())
+    if (!N1.hasOneUse())
       return SDValue();
     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
     if (!N11C || N11C->getZExtValue() != 0xFF)
@@ -5510,7 +5508,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
     std::swap(N0, N1);
   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
     return SDValue();
-  if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse())
+  if (!N0.hasOneUse() || !N1.hasOneUse())
     return SDValue();
 
   ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
@@ -5523,7 +5521,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
   // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
   SDValue N00 = N0->getOperand(0);
   if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
-    if (!N00.getNode()->hasOneUse())
+    if (!N00.hasOneUse())
       return SDValue();
     ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
     if (!N001C || N001C->getZExtValue() != 0xFF)
@@ -5534,7 +5532,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
 
   SDValue N10 = N1->getOperand(0);
   if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
-    if (!N10.getNode()->hasOneUse())
+    if (!N10.hasOneUse())
       return SDValue();
     ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
     // Also allow 0xFFFF since the bits will be shifted out. This is needed
@@ -5584,7 +5582,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
 /// ((x & 0x00ff0000) << 8) |
 /// ((x & 0xff000000) >> 8)
 static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
-  if (!N.getNode()->hasOneUse())
+  if (!N.hasOneUse())
     return false;
 
   unsigned Opc = N.getOpcode();
@@ -5698,7 +5696,7 @@ static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
     return SDValue();
   // TODO: this is too restrictive; lifting this restriction requires more tests
-  if (!N0->hasOneUse() || !N1->hasOneUse())
+  if (!N0.hasOneUse() || !N1.hasOneUse())
     return SDValue();
   ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
   ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
@@ -5810,7 +5808,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
   // (or (and X, C1), (and Y, C2))  -> (and (or X, Y), C3) if possible.
   if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
       // Don't increase # computations.
-      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+      (N0.hasOneUse() || N1.hasOneUse())) {
     // We can only do this xform if we know that bits from X that are set in C2
     // but not in C1 are already zero.  Likewise for Y.
     if (const ConstantSDNode *N0O1C =
@@ -5838,7 +5836,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
       N1.getOpcode() == ISD::AND &&
       N0.getOperand(0) == N1.getOperand(0) &&
       // Don't increase # computations.
-      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+      (N0.hasOneUse() || N1.hasOneUse())) {
     SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
                             N0.getOperand(1), N1.getOperand(1));
     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
@@ -6002,7 +6000,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
   auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
     return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
   };
-  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+  if (N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
       ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
     if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
                                                  {N1, N0.getOperand(1)})) {
@@ -7874,8 +7872,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   // Variant of version done on multiply, except mul by a power of 2 is turned
   // into a shift.
   if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
-      N0.getNode()->hasOneUse() &&
-      isConstantOrConstantVector(N1, /* No Opaques */ true) &&
+      N0.hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) &&
       isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
       TLI.isDesirableToCommuteWithShift(N, Level)) {
     SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
@@ -7886,7 +7883,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   }
 
   // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
-  if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
+  if (N0.getOpcode() == ISD::MUL && N0.hasOneUse() &&
       isConstantOrConstantVector(N1, /* No Opaques */ true) &&
       isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
     SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
@@ -8805,7 +8802,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
         TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
     // select (and Cond0, Cond1), X, Y
     //   -> select Cond0, (select Cond1, X, Y), Y
-    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
+    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
       SDValue Cond0 = N0->getOperand(0);
       SDValue Cond1 = N0->getOperand(1);
       SDValue InnerSelect =
@@ -8818,7 +8815,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
         recursivelyDeleteUnusedNodes(InnerSelect.getNode());
     }
     // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
-    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
+    if (N0.getOpcode() == ISD::OR && N0.hasOneUse()) {
       SDValue Cond0 = N0->getOperand(0);
       SDValue Cond1 = N0->getOperand(1);
       SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
@@ -8832,7 +8829,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
     }
 
     // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
-    if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
+    if (N1.getOpcode() == ISD::SELECT && N1.hasOneUse()) {
       SDValue N1_0 = N1->getOperand(0);
       SDValue N1_1 = N1->getOperand(1);
       SDValue N1_2 = N1->getOperand(2);
@@ -8851,7 +8848,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
       }
     }
     // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
-    if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
+    if (N2.getOpcode() == ISD::SELECT && N2.hasOneUse()) {
       SDValue N2_0 = N2->getOperand(0);
       SDValue N2_1 = N2->getOperand(1);
       SDValue N2_2 = N2->getOperand(2);
@@ -11018,7 +11015,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // creates this pattern) and before operation legalization after which
   // we need to be more careful about the vector instructions that we generate.
   if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
+      LegalTypes && !LegalOperations && N0.hasOneUse() && VT != MVT::i1) {
     EVT VecTy = N0.getOperand(0).getValueType();
     EVT ExTy = N0.getValueType();
     EVT TrTy = N->getValueType(0);
@@ -11397,7 +11394,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       (!LegalTypes ||
        (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
         TLI.isTypeLegal(VT.getVectorElementType()))) &&
-      N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
+      N0.getOpcode() == ISD::BUILD_VECTOR && N0.hasOneUse() &&
       cast<BuildVectorSDNode>(N0)->isConstant())
     return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
                                              VT.getVectorElementType());
@@ -11465,7 +11462,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   // This often reduces constant pool loads.
   if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
        (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
-      N0.getNode()->hasOneUse() && VT.isInteger() &&
+      N0.hasOneUse() && VT.isInteger() &&
       !VT.isVector() && !N0.getValueType().isVector()) {
     SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
     AddToWorklist(NewConv.getNode());
@@ -11514,7 +11511,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   //                     (xor (bitcast cst), (bitcast x)), 0),
   //                    signbit)
   //     (xor (bitcast cst) (build_pair flipbit, flipbit))
-  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
+  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.hasOneUse() &&
       isa<ConstantFPSDNode>(N0.getOperand(0)) &&
       VT.isInteger() && !VT.isVector()) {
     unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
@@ -11818,14 +11815,14 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   }
 
   // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
+  if (isContractableFMUL(N0) && (Aggressive || N0.hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N0.getOperand(0), N0.getOperand(1), N1, Flags);
   }
 
   // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
   // Note: Commutes FADD operands.
-  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
+  if (isContractableFMUL(N1) && (Aggressive || N1.hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N1.getOperand(0), N1.getOperand(1), N0, Flags);
   }
@@ -11867,7 +11864,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
     if (CanFuse &&
         N0.getOpcode() == PreferredFusedOpcode &&
         N0.getOperand(2).getOpcode() == ISD::FMUL &&
-        N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
+        N0.hasOneUse() && N0.getOperand(2).hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -11880,7 +11877,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
     if (CanFuse &&
         N1->getOpcode() == PreferredFusedOpcode &&
         N1.getOperand(2).getOpcode() == ISD::FMUL &&
-        N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
+        N1.hasOneUse() && N1.getOperand(2).hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          N1.getOperand(0), N1.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -12031,7 +12028,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
 
   // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
   auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
-    if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
+    if (isContractableFMUL(XY) && (Aggressive || XY.hasOneUse())) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
                          XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z),
                          Flags);
@@ -12042,7 +12039,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
   // Note: Commutes FSUB operands.
   auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
-    if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
+    if (isContractableFMUL(YZ) && (Aggressive || YZ.hasOneUse())) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
                          YZ.getOperand(1), X, Flags);
@@ -12071,7 +12068,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
 
   // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
   if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
-      (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
+      (Aggressive || (N0.hasOneUse() && N0.getOperand(0).hasOneUse()))) {
     SDValue N00 = N0.getOperand(0).getOperand(0);
     SDValue N01 = N0.getOperand(0).getOperand(1);
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -12168,8 +12165,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     // fold (fsub (fma x, y, (fmul u, v)), z)
     //   -> (fma x, y (fma u, v, (fneg z)))
     if (CanFuse && N0.getOpcode() == PreferredFusedOpcode &&
-        isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
-        N0.getOperand(2)->hasOneUse()) {
+        isContractableFMUL(N0.getOperand(2)) && N0.hasOneUse() &&
+        N0.getOperand(2).hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -12183,7 +12180,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     //   -> (fma (fneg y), z, (fma (fneg u), v, x))
     if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
         isContractableFMUL(N1.getOperand(2)) &&
-        N1->hasOneUse() && NoSignedZero) {
+        N1.hasOneUse() && NoSignedZero) {
       SDValue N20 = N1.getOperand(2).getOperand(0);
       SDValue N21 = N1.getOperand(2).getOperand(1);
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -12199,7 +12196,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
     //   -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
     if (N0.getOpcode() == PreferredFusedOpcode &&
-        N0->hasOneUse()) {
+        N0.hasOneUse()) {
       SDValue N02 = N0.getOperand(2);
       if (N02.getOpcode() == ISD::FP_EXTEND) {
         SDValue N020 = N02.getOperand(0);
@@ -12252,7 +12249,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     //   -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
     if (N1.getOpcode() == PreferredFusedOpcode &&
         N1.getOperand(2).getOpcode() == ISD::FP_EXTEND &&
-        N1->hasOneUse()) {
+        N1.hasOneUse()) {
       SDValue N120 = N1.getOperand(2).getOperand(0);
       if (isContractableFMUL(N120) &&
           TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
@@ -12349,7 +12346,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
   // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
   auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
-    if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
+    if (X.getOpcode() == ISD::FADD && (Aggressive || X.hasOneUse())) {
       if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
         if (C->isExactlyValue(+1.0))
           return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
@@ -12372,7 +12369,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
   // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
   auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
-    if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
+    if (X.getOpcode() == ISD::FSUB && (Aggressive || X.hasOneUse())) {
       if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
         if (C0->isExactlyValue(+1.0))
           return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -13538,7 +13535,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
   }
 
   // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
-  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
+  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.hasOneUse()) {
     SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
                               N0.getOperand(0), N1);
     AddToWorklist(Tmp.getNode());
@@ -13677,9 +13674,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
 
   // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
   // constant pool values.
-  if (!TLI.isFNegFree(VT) &&
-      N0.getOpcode() == ISD::BITCAST &&
-      N0.getNode()->hasOneUse()) {
+  if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
     SDValue Int = N0.getOperand(0);
     EVT IntVT = Int.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
@@ -13702,8 +13697,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
   }
 
   // (fneg (fmul c, x)) -> (fmul -c, x)
-  if (N0.getOpcode() == ISD::FMUL &&
-      (N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) {
+  if (N0.getOpcode() == ISD::FMUL && (N0.hasOneUse() || !TLI.isFNegFree(VT))) {
     ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
     if (CFP1) {
       APFloat CVal = CFP1->getValueAPF();
@@ -15598,7 +15592,7 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
   APInt Val;
 
   // If the add only has one use, this would be OK to do.
-  if (AddNode.getNode()->hasOneUse())
+  if (AddNode.hasOneUse())
     return true;
 
   // Walk all the users of the constant with which we're multiplying.
@@ -16846,7 +16840,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
   // truncating store.  We can do this even if this is already a truncstore.
   if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
-      && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+      && Value.hasOneUse() && ST->isUnindexed() &&
       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                             ST->getMemoryVT())) {
     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
@@ -18919,7 +18913,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
   unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
   if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
-      BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
+      BinOp.hasOneUse() && Extract->getOperand(0).hasOneUse()) {
     // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
     SDLoc DL(Extract);
     SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);
@@ -19345,13 +19339,13 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
   SDValue N0 = SVN->getOperand(0);
   SDValue N1 = SVN->getOperand(1);
 
-  if (!N0->hasOneUse())
+  if (!N0.hasOneUse())
     return SDValue();
 
   // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as
   // discussed above.
   if (!N1.isUndef()) {
-    if (!N1->hasOneUse())
+    if (!N1.hasOneUse())
       return SDValue();
 
     bool N0AnyConst = isAnyConstantBuildVector(N0);

From 9819976032c5af8d9109f2077e637c8303e4d6df Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Thu, 28 May 2020 17:42:42 +0300
Subject: [PATCH 520/770] [llvm-readobj] - Cleanup the
 DwarfCFIEH::PrinterContext class. NFCI.

It would be nice to switch from `reportError` to `reportUniqueWarnings`
in this class, but the class needs a cleanup first.

This patch:
1) Eliminates autos.
2) Removes code duplication.
3) Changes how the code works with `Expected<>`.
4) Introduces 2 new `using`s to make the code a bit shorter.

Differential revision: https://reviews.llvm.org/D80726
---
 llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h | 141 +++++++++-----------
 1 file changed, 64 insertions(+), 77 deletions(-)

diff --git a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
index 01800aba7cbe1..27942224053f1 100644
--- a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
+++ b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -30,11 +30,14 @@ namespace DwarfCFIEH {
 
 template <typename ELFT>
 class PrinterContext {
+  using Elf_Shdr = typename ELFT::Shdr;
+  using Elf_Phdr = typename ELFT::Phdr;
+
   ScopedPrinter &W;
   const object::ELFObjectFile<ELFT> *ObjF;
 
-  void printEHFrameHdr(const typename ELFT::Phdr *EHFramePHdr) const;
-  void printEHFrame(const typename ELFT::Shdr *EHFrameShdr) const;
+  void printEHFrameHdr(const Elf_Phdr *EHFramePHdr) const;
+  void printEHFrame(const Elf_Shdr *EHFrameShdr) const;
 
 public:
   PrinterContext(ScopedPrinter &W, const object::ELFObjectFile<ELFT> *ObjF)
@@ -44,13 +47,14 @@ class PrinterContext {
 };
 
 template <class ELFT>
-static const typename object::ELFObjectFile<ELFT>::Elf_Shdr *
+static const typename ELFT::Shdr *
 findSectionByAddress(const object::ELFObjectFile<ELFT> *ObjF, uint64_t Addr) {
-  auto Sections = ObjF->getELFFile()->sections();
-  if (Error E = Sections.takeError())
-    reportError(std::move(E), ObjF->getFileName());
+  Expected<typename ELFT::ShdrRange> SectionsOrErr =
+      ObjF->getELFFile()->sections();
+  if (!SectionsOrErr)
+    reportError(SectionsOrErr.takeError(), ObjF->getFileName());
 
-  for (const auto &Shdr : *Sections)
+  for (const typename ELFT::Shdr &Shdr : *SectionsOrErr)
     if (Shdr.sh_addr == Addr)
       return &Shdr;
   return nullptr;
@@ -60,37 +64,38 @@ template <typename ELFT>
 void PrinterContext<ELFT>::printUnwindInformation() const {
   const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
 
-  auto PHs = Obj->program_headers();
-  if (Error E = PHs.takeError())
-    reportError(std::move(E), ObjF->getFileName());
+  Expected<typename ELFT::PhdrRange> PhdrsOrErr = Obj->program_headers();
+  if (!PhdrsOrErr)
+    reportError(PhdrsOrErr.takeError(), ObjF->getFileName());
 
-  for (const auto &Phdr : *PHs) {
-    if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) {
-      if (Phdr.p_memsz != Phdr.p_filesz)
-        reportError(object::createError(
-                        "p_memsz does not match p_filesz for GNU_EH_FRAME"),
-                    ObjF->getFileName());
-      printEHFrameHdr(&Phdr);
-      break;
-    }
-  }
+  for (const Elf_Phdr &Phdr : *PhdrsOrErr) {
+    if (Phdr.p_type != ELF::PT_GNU_EH_FRAME)
+      continue;
 
-  auto Sections = Obj->sections();
-  if (Error E = Sections.takeError())
-    reportError(std::move(E), ObjF->getFileName());
+    if (Phdr.p_memsz != Phdr.p_filesz)
+      reportError(object::createError(
+                      "p_memsz does not match p_filesz for GNU_EH_FRAME"),
+                  ObjF->getFileName());
+    printEHFrameHdr(&Phdr);
+    break;
+  }
 
-  for (const auto &Shdr : *Sections) {
-    auto SectionName = Obj->getSectionName(&Shdr);
-    if (Error E = SectionName.takeError())
-      reportError(std::move(E), ObjF->getFileName());
+  Expected<typename ELFT::ShdrRange> SectionsOrErr =
+      ObjF->getELFFile()->sections();
+  if (!SectionsOrErr)
+    reportError(SectionsOrErr.takeError(), ObjF->getFileName());
 
-    if (*SectionName == ".eh_frame")
+  for (const Elf_Shdr &Shdr : *SectionsOrErr) {
+    Expected<StringRef> NameOrErr = Obj->getSectionName(&Shdr);
+    if (!NameOrErr)
+      reportError(NameOrErr.takeError(), ObjF->getFileName());
+    if (*NameOrErr == ".eh_frame")
       printEHFrame(&Shdr);
   }
 }
 
 template <typename ELFT>
-void PrinterContext<ELFT>::printEHFrameHdr(const typename ELFT::Phdr *EHFramePHdr) const {
+void PrinterContext<ELFT>::printEHFrameHdr(const Elf_Phdr *EHFramePHdr) const {
   DictScope L(W, "EHFrameHeader");
   uint64_t EHFrameHdrAddress = EHFramePHdr->p_vaddr;
   W.startLine() << format("Address: 0x%" PRIx64 "\n", EHFrameHdrAddress);
@@ -98,14 +103,12 @@ void PrinterContext<ELFT>::printEHFrameHdr(const typename ELFT::Phdr *EHFramePHd
   W.startLine() << format("Size: 0x%" PRIx64 "\n", (uint64_t)EHFramePHdr->p_memsz);
 
   const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
-  const typename ELFT::Shdr *EHFrameHdrShdr =
-      findSectionByAddress(ObjF, EHFramePHdr->p_vaddr);
-  if (EHFrameHdrShdr) {
-    auto SectionName = Obj->getSectionName(EHFrameHdrShdr);
-    if (Error E = SectionName.takeError())
-      reportError(std::move(E), ObjF->getFileName());
-
-    W.printString("Corresponding Section", *SectionName);
+  if (const Elf_Shdr *EHFrameHdr =
+          findSectionByAddress(ObjF, EHFramePHdr->p_vaddr)) {
+    Expected<StringRef> NameOrErr = Obj->getSectionName(EHFrameHdr);
+    if (!NameOrErr)
+      reportError(NameOrErr.takeError(), ObjF->getFileName());
+    W.printString("Corresponding Section", *NameOrErr);
   }
 
   Expected<ArrayRef<uint8_t>> Content = Obj->getSegmentContents(EHFramePHdr);
@@ -170,8 +173,7 @@ void PrinterContext<ELFT>::printEHFrameHdr(const typename ELFT::Phdr *EHFramePHd
 }
 
 template <typename ELFT>
-void PrinterContext<ELFT>::printEHFrame(
-    const typename ELFT::Shdr *EHFrameShdr) const {
+void PrinterContext<ELFT>::printEHFrame(const Elf_Shdr *EHFrameShdr) const {
   uint64_t Address = EHFrameShdr->sh_addr;
   uint64_t ShOffset = EHFrameShdr->sh_offset;
   W.startLine() << format(".eh_frame section at offset 0x%" PRIx64
@@ -179,12 +181,12 @@ void PrinterContext<ELFT>::printEHFrame(
                           ShOffset, Address);
   W.indent();
 
-  const object::ELFFile<ELFT> *Obj = ObjF->getELFFile();
-  auto Result = Obj->getSectionContents(EHFrameShdr);
-  if (Error E = Result.takeError())
-    reportError(std::move(E), ObjF->getFileName());
+  Expected<ArrayRef<uint8_t>> DataOrErr =
+      ObjF->getELFFile()->getSectionContents(EHFrameShdr);
+  if (!DataOrErr)
+    reportError(DataOrErr.takeError(), ObjF->getFileName());
 
-  DWARFDataExtractor DE(*Result,
+  DWARFDataExtractor DE(*DataOrErr,
                         ELFT::TargetEndianness == support::endianness::little,
                         ELFT::Is64Bits ? 8 : 4);
   DWARFDebugFrame EHFrame(Triple::ArchType(ObjF->getArch()), /*IsEH=*/true,
@@ -192,11 +194,10 @@ void PrinterContext<ELFT>::printEHFrame(
   if (Error E = EHFrame.parse(DE))
     reportError(std::move(E), ObjF->getFileName());
 
-  for (const auto &Entry : EHFrame) {
-    if (const auto *CIE = dyn_cast<dwarf::CIE>(&Entry)) {
+  for (const dwarf::FrameEntry &Entry : EHFrame) {
+    if (const dwarf::CIE *CIE = dyn_cast<dwarf::CIE>(&Entry)) {
       W.startLine() << format("[0x%" PRIx64 "] CIE length=%" PRIu64 "\n",
-                              Address + CIE->getOffset(),
-                              CIE->getLength());
+                              Address + CIE->getOffset(), CIE->getLength());
       W.indent();
 
       W.printNumber("version", CIE->getVersion());
@@ -204,47 +205,33 @@ void PrinterContext<ELFT>::printEHFrame(
       W.printNumber("code_alignment_factor", CIE->getCodeAlignmentFactor());
       W.printNumber("data_alignment_factor", CIE->getDataAlignmentFactor());
       W.printNumber("return_address_register", CIE->getReturnAddressRegister());
-
-      W.getOStream() << "\n";
-      W.startLine() << "Program:\n";
-      W.indent();
-      CIE->cfis().dump(W.getOStream(), nullptr, W.getIndentLevel());
-      W.unindent();
-
-      W.unindent();
-      W.getOStream() << "\n";
-
-    } else if (const auto *FDE = dyn_cast<dwarf::FDE>(&Entry)) {
+    } else {
+      const dwarf::FDE *FDE = cast<dwarf::FDE>(&Entry);
       W.startLine() << format("[0x%" PRIx64 "] FDE length=%" PRIu64
                               " cie=[0x%" PRIx64 "]\n",
-                              Address + FDE->getOffset(),
-                              FDE->getLength(),
+                              Address + FDE->getOffset(), FDE->getLength(),
                               Address + FDE->getLinkedCIE()->getOffset());
       W.indent();
 
       W.startLine() << format("initial_location: 0x%" PRIx64 "\n",
                               FDE->getInitialLocation());
-      W.startLine()
-        << format("address_range: 0x%" PRIx64 " (end : 0x%" PRIx64 ")\n",
-                  FDE->getAddressRange(),
-                  FDE->getInitialLocation() + FDE->getAddressRange());
-
-      W.getOStream() << "\n";
-      W.startLine() << "Program:\n";
-      W.indent();
-      FDE->cfis().dump(W.getOStream(), nullptr, W.getIndentLevel());
-      W.unindent();
-
-      W.unindent();
-      W.getOStream() << "\n";
-    } else {
-      llvm_unreachable("unexpected DWARF frame kind");
+      W.startLine() << format(
+          "address_range: 0x%" PRIx64 " (end : 0x%" PRIx64 ")\n",
+          FDE->getAddressRange(),
+          FDE->getInitialLocation() + FDE->getAddressRange());
     }
+
+    W.getOStream() << "\n";
+    W.startLine() << "Program:\n";
+    W.indent();
+    Entry.cfis().dump(W.getOStream(), nullptr, W.getIndentLevel());
+    W.unindent();
+    W.unindent();
+    W.getOStream() << "\n";
   }
 
   W.unindent();
 }
-
 }
 }
 

From d8f2814c913847b1d0e9167dce5973eea3600c7e Mon Sep 17 00:00:00 2001
From: Hendrik Greving <hgreving@google.com>
Date: Wed, 27 May 2020 17:12:58 -0700
Subject: [PATCH 521/770] [ModuloSchedule] Allow illegal phis to be moved
 across stages.

Fixes a trivial but impactful bug where we did not move illegal phis across stages. This
led to incorrect mappings in certain cases.
---
 llvm/lib/CodeGen/ModuloSchedule.cpp | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index aa3599675c91f..a4b994b158069 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -1629,18 +1629,21 @@ void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
     MachineInstr *MI = &*I++;
     if (MI->isPHI()) {
       // This is an illegal PHI. If we move any instructions using an illegal
-      // PHI, we need to create a legal Phi
-      Register PhiR = MI->getOperand(0).getReg();
-      auto RC = MRI.getRegClass(PhiR);
-      Register NR = MRI.createVirtualRegister(RC);
-      MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(), DebugLoc(),
-                                 TII->get(TargetOpcode::PHI), NR)
-                             .addReg(PhiR)
-                             .addMBB(SourceBB);
-      BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
-      CanonicalMIs[NI] = CanonicalMIs[MI];
-      Remaps[PhiR] = NR;
-      continue;
+      // PHI, we need to create a legal Phi.
+      if (getStage(MI) != Stage) {
+        // The legal Phi is not necessary if the illegal phi's stage
+        // is being moved.
+        Register PhiR = MI->getOperand(0).getReg();
+        auto RC = MRI.getRegClass(PhiR);
+        Register NR = MRI.createVirtualRegister(RC);
+        MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
+                                   DebugLoc(), TII->get(TargetOpcode::PHI), NR)
+                               .addReg(PhiR)
+                               .addMBB(SourceBB);
+        BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
+        CanonicalMIs[NI] = CanonicalMIs[MI];
+        Remaps[PhiR] = NR;
+      }
     }
     if (getStage(MI) != Stage)
       continue;

From bd03ef19beb8a3476d5cd9f744c5fba5ca287c51 Mon Sep 17 00:00:00 2001
From: Gabor Marton <gabor.marton@ericsson.com>
Date: Mon, 30 Mar 2020 17:47:48 +0200
Subject: [PATCH 522/770] [analyzer] ApiModeling: Add buffer size arg
 constraint

Summary:
Introducing a new argument constraint to confine buffer sizes. It is typical in
C APIs that a parameter represents a buffer and another param holds the size of
the buffer (or the size of the data we want to handle from the buffer).

Reviewers: NoQ, Szelethus, Charusso, steakhal

Subscribers: whisperity, xazax.hun, baloghadamsoftware, szepet, rnkovacs, a.sidorin, mikhail.ramalho, donat.nagy, dkrupp, gamesh411, ASDenysPetrov, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D77066
---
 .../Core/PathSensitive/DynamicSize.h          | 15 ++++
 .../Checkers/CheckPlacementNew.cpp            | 22 +-----
 .../Checkers/StdLibraryFunctionsChecker.cpp   | 77 ++++++++++++++++---
 clang/lib/StaticAnalyzer/Core/DynamicSize.cpp | 23 ++++++
 .../std-c-library-functions-arg-constraints.c | 27 +++++++
 5 files changed, 134 insertions(+), 30 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/DynamicSize.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/DynamicSize.h
index b48914c53d82f..398f9b6ac33a4 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/DynamicSize.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/DynamicSize.h
@@ -32,6 +32,21 @@ DefinedOrUnknownSVal getDynamicElementCount(ProgramStateRef State,
                                             SValBuilder &SVB,
                                             QualType ElementTy);
 
+/// Get the dynamic size for a symbolic value that represents a buffer. If
+/// the value is offset into the underlying buffer, we consider that too.
+/// Returns an SVal that represents the size; this is Unknown if the
+/// engine cannot deduce the size.
+/// E.g.
+///   char buf[3];
+///   (buf); // size is 3
+///   (buf + 1); // size is 2
+///   (buf + 3); // size is 0
+///   (buf + 4); // size is -1
+///
+///   char *bufptr;
+///   (bufptr) // size is unknown
+SVal getDynamicSizeWithOffset(ProgramStateRef State, const SVal &BufV);
+
 } // namespace ento
 } // namespace clang
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckPlacementNew.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckPlacementNew.cpp
index fec9fb59b2eb5..dc9cd717be9e9 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckPlacementNew.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckPlacementNew.cpp
@@ -63,28 +63,8 @@ class PlacementNewChecker : public Checker<check::PreStmt<CXXNewExpr>> {
 
 SVal PlacementNewChecker::getExtentSizeOfPlace(const CXXNewExpr *NE,
                                                CheckerContext &C) const {
-  ProgramStateRef State = C.getState();
   const Expr *Place = NE->getPlacementArg(0);
-
-  const MemRegion *MRegion = C.getSVal(Place).getAsRegion();
-  if (!MRegion)
-    return UnknownVal();
-  RegionOffset Offset = MRegion->getAsOffset();
-  if (Offset.hasSymbolicOffset())
-    return UnknownVal();
-  const MemRegion *BaseRegion = MRegion->getBaseRegion();
-  if (!BaseRegion)
-    return UnknownVal();
-
-  SValBuilder &SvalBuilder = C.getSValBuilder();
-  NonLoc OffsetInBytes = SvalBuilder.makeArrayIndex(
-      Offset.getOffset() / C.getASTContext().getCharWidth());
-  DefinedOrUnknownSVal ExtentInBytes =
-      getDynamicSize(State, BaseRegion, SvalBuilder);
-
-  return SvalBuilder.evalBinOp(State, BinaryOperator::Opcode::BO_Sub,
-                               ExtentInBytes, OffsetInBytes,
-                               SvalBuilder.getArrayIndexType());
+  return getDynamicSizeWithOffset(C.getState(), C.getSVal(Place));
 }
 
 SVal PlacementNewChecker::getExtentSizeOfNewTarget(const CXXNewExpr *NE,
diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
index aefcad3745968..f661f29948b60 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
@@ -56,6 +56,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicSize.h"
 
 using namespace clang;
 using namespace clang::ento;
@@ -108,7 +109,8 @@ class StdLibraryFunctionsChecker
     /// Apply the effects of the constraint on the given program state. If null
     /// is returned then the constraint is not feasible.
     virtual ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call,
-                                  const Summary &Summary) const = 0;
+                                  const Summary &Summary,
+                                  CheckerContext &C) const = 0;
     virtual ValueConstraintPtr negate() const {
       llvm_unreachable("Not implemented");
     };
@@ -143,7 +145,8 @@ class StdLibraryFunctionsChecker
                                        const Summary &Summary) const;
   public:
     ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call,
-                          const Summary &Summary) const override {
+                          const Summary &Summary,
+                          CheckerContext &C) const override {
       switch (Kind) {
       case OutOfRange:
         return applyAsOutOfRange(State, Call, Summary);
@@ -178,7 +181,8 @@ class StdLibraryFunctionsChecker
     ArgNo getOtherArgNo() const { return OtherArgN; }
     BinaryOperator::Opcode getOpcode() const { return Opcode; }
     ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call,
-                          const Summary &Summary) const override;
+                          const Summary &Summary,
+                          CheckerContext &C) const override;
   };
 
   class NotNullConstraint : public ValueConstraint {
@@ -188,7 +192,8 @@ class StdLibraryFunctionsChecker
 
   public:
     ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call,
-                          const Summary &Summary) const override {
+                          const Summary &Summary,
+                          CheckerContext &C) const override {
       SVal V = getArgSVal(Call, getArgNo());
       if (V.isUndef())
         return State;
@@ -207,6 +212,51 @@ class StdLibraryFunctionsChecker
     }
   };
 
+  // Represents a buffer argument with an additional size argument.
+  // E.g. the first two arguments here:
+  //   ctime_s(char *buffer, rsize_t bufsz, const time_t *time);
+  class BufferSizeConstraint : public ValueConstraint {
+    // The argument which holds the size of the buffer.
+    ArgNo SizeArgN;
+    // The operator we use in apply. This is negated in negate().
+    BinaryOperator::Opcode Op = BO_LE;
+
+  public:
+    BufferSizeConstraint(ArgNo Buffer, ArgNo BufSize)
+        : ValueConstraint(Buffer), SizeArgN(BufSize) {}
+
+    ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call,
+                          const Summary &Summary,
+                          CheckerContext &C) const override {
+      // The buffer argument.
+      SVal BufV = getArgSVal(Call, getArgNo());
+      // The size argument.
+      SVal SizeV = getArgSVal(Call, SizeArgN);
+      // The dynamic size of the buffer argument, got from the analyzer engine.
+      SVal BufDynSize = getDynamicSizeWithOffset(State, BufV);
+
+      SValBuilder &SvalBuilder = C.getSValBuilder();
+      SVal Feasible = SvalBuilder.evalBinOp(State, Op, SizeV, BufDynSize,
+                                            SvalBuilder.getContext().BoolTy);
+      if (auto F = Feasible.getAs<DefinedOrUnknownSVal>())
+        return State->assume(*F, true);
+
+      // We can get here only if the size argument or the dynamic size is
+      // undefined. But the dynamic size should never be undefined, only
+      // unknown. So, here, the size of the argument is undefined, i.e. we
+      // cannot apply the constraint. Actually, other checkers like
+      // CallAndMessage should catch this situation earlier, because we call a
+      // function with an uninitialized argument.
+      llvm_unreachable("Size argument or the dynamic size is Undefined");
+    }
+
+    ValueConstraintPtr negate() const override {
+      BufferSizeConstraint Tmp(*this);
+      Tmp.Op = BinaryOperator::negateComparisonOp(Op);
+      return std::make_shared<BufferSizeConstraint>(Tmp);
+    }
+  };
+
   /// The complete list of constraints that defines a single branch.
   typedef std::vector<ValueConstraintPtr> ConstraintSet;
 
@@ -416,8 +466,8 @@ ProgramStateRef StdLibraryFunctionsChecker::RangeConstraint::applyAsWithinRange(
 }
 
 ProgramStateRef StdLibraryFunctionsChecker::ComparisonConstraint::apply(
-    ProgramStateRef State, const CallEvent &Call,
-    const Summary &Summary) const {
+    ProgramStateRef State, const CallEvent &Call, const Summary &Summary,
+    CheckerContext &C) const {
 
   ProgramStateManager &Mgr = State->getStateManager();
   SValBuilder &SVB = Mgr.getSValBuilder();
@@ -448,8 +498,8 @@ void StdLibraryFunctionsChecker::checkPreCall(const CallEvent &Call,
 
   ProgramStateRef NewState = State;
   for (const ValueConstraintPtr& VC : Summary.ArgConstraints) {
-    ProgramStateRef SuccessSt = VC->apply(NewState, Call, Summary);
-    ProgramStateRef FailureSt = VC->negate()->apply(NewState, Call, Summary);
+    ProgramStateRef SuccessSt = VC->apply(NewState, Call, Summary, C);
+    ProgramStateRef FailureSt = VC->negate()->apply(NewState, Call, Summary, C);
     // The argument constraint is not satisfied.
     if (FailureSt && !SuccessSt) {
       if (ExplodedNode *N = C.generateErrorNode(NewState))
@@ -482,7 +532,7 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call,
   for (const auto &VRS : Summary.CaseConstraints) {
     ProgramStateRef NewState = State;
     for (const auto &VR: VRS) {
-      NewState = VR->apply(NewState, Call, Summary);
+      NewState = VR->apply(NewState, Call, Summary, C);
       if (!NewState)
         break;
     }
@@ -694,6 +744,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                               IntRangeVector Ranges) {
     return std::make_shared<RangeConstraint>(ArgN, Kind, Ranges);
   };
+  auto BufferSize = [](ArgNo BufArgN, ArgNo SizeArgN) {
+    return std::make_shared<BufferSizeConstraint>(BufArgN, SizeArgN);
+  };
   struct {
     auto operator()(RangeKind Kind, IntRangeVector Ranges) {
       return std::make_shared<RangeConstraint>(Ret, Kind, Ranges);
@@ -929,6 +982,12 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                                     RetType{IntTy}, EvalCallAsPure)
                                 .ArgConstraint(NotNull(ArgNo(0)))
                                 .ArgConstraint(NotNull(ArgNo(1))));
+    addToFunctionSummaryMap(
+        "__buf_size_arg_constraint",
+        Summary(ArgTypes{ConstVoidPtrTy, SizeTy}, RetType{IntTy},
+                EvalCallAsPure)
+            .ArgConstraint(
+                BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1))));
   }
 }
 
diff --git a/clang/lib/StaticAnalyzer/Core/DynamicSize.cpp b/clang/lib/StaticAnalyzer/Core/DynamicSize.cpp
index f90c29c52f0fb..8b2172db445ce 100644
--- a/clang/lib/StaticAnalyzer/Core/DynamicSize.cpp
+++ b/clang/lib/StaticAnalyzer/Core/DynamicSize.cpp
@@ -44,5 +44,28 @@ DefinedOrUnknownSVal getDynamicElementCount(ProgramStateRef State,
   return DivisionV.castAs<DefinedOrUnknownSVal>();
 }
 
+SVal getDynamicSizeWithOffset(ProgramStateRef State, const SVal &BufV) {
+  SValBuilder &SvalBuilder = State->getStateManager().getSValBuilder();
+  const MemRegion *MRegion = BufV.getAsRegion();
+  if (!MRegion)
+    return UnknownVal();
+  RegionOffset Offset = MRegion->getAsOffset();
+  if (Offset.hasSymbolicOffset())
+    return UnknownVal();
+  const MemRegion *BaseRegion = MRegion->getBaseRegion();
+  if (!BaseRegion)
+    return UnknownVal();
+
+  NonLoc OffsetInBytes = SvalBuilder.makeArrayIndex(
+      Offset.getOffset() /
+      MRegion->getMemRegionManager().getContext().getCharWidth());
+  DefinedOrUnknownSVal ExtentInBytes =
+      getDynamicSize(State, BaseRegion, SvalBuilder);
+
+  return SvalBuilder.evalBinOp(State, BinaryOperator::Opcode::BO_Sub,
+                               ExtentInBytes, OffsetInBytes,
+                               SvalBuilder.getArrayIndexType());
+}
+
 } // namespace ento
 } // namespace clang
diff --git a/clang/test/Analysis/std-c-library-functions-arg-constraints.c b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
index 00960e42fb551..c59e4429f4194 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-constraints.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
@@ -122,3 +122,30 @@ void test_arg_constraint_on_variadic_fun() {
   // bugpath-warning{{Function argument constraint is not satisfied}} \
   // bugpath-note{{Function argument constraint is not satisfied}}
 }
+
+int __buf_size_arg_constraint(const void *, size_t);
+void test_buf_size_concrete() {
+  char buf[3];                       // bugpath-note{{'buf' initialized here}}
+  __buf_size_arg_constraint(buf, 4); // \
+  // report-warning{{Function argument constraint is not satisfied}} \
+  // bugpath-warning{{Function argument constraint is not satisfied}} \
+  // bugpath-note{{Function argument constraint is not satisfied}}
+}
+void test_buf_size_symbolic(int s) {
+  char buf[3];
+  __buf_size_arg_constraint(buf, s);
+  clang_analyzer_eval(s <= 3); // \
+  // report-warning{{TRUE}} \
+  // bugpath-warning{{TRUE}} \
+  // bugpath-note{{TRUE}} \
+  // bugpath-note{{'s' is <= 3}}
+}
+void test_buf_size_symbolic_and_offset(int s) {
+  char buf[3];
+  __buf_size_arg_constraint(buf + 1, s);
+  clang_analyzer_eval(s <= 2); // \
+  // report-warning{{TRUE}} \
+  // bugpath-warning{{TRUE}} \
+  // bugpath-note{{TRUE}} \
+  // bugpath-note{{'s' is <= 2}}
+}

From 205085d4ccf9c367ba70de9d8f0dd74d6f567c24 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Wed, 27 May 2020 11:41:25 +0100
Subject: [PATCH 523/770] [CodeGen] Fix warnings in LowerToPredicatedOp

When creating a new vector type based on another vector type we
should pass in the element count instead of the number of elements
and scalable flag separately.

I encountered this warning whilst compiling this test:

  CodeGen/AArch64/sve-intrinsics-int-compares.ll

Differential revision: https://reviews.llvm.org/D80621
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 579905d748eac..81b50346437c7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7734,8 +7734,8 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
          Op.getOperand(1).getValueType().isScalableVector() &&
          "Only scalable vectors are supported");
 
-  auto PredTy = VT.getVectorVT(*DAG.getContext(), MVT::i1,
-                               VT.getVectorNumElements(), true);
+  auto PredTy =
+      VT.getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorElementCount());
   SDValue Mask = getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
 
   SmallVector<SDValue, 4> Operands = {Mask};

From 41928c97b6a17264938fc765a6a0656d8b6e86ed Mon Sep 17 00:00:00 2001
From: Gabor Marton <gabor.marton@ericsson.com>
Date: Tue, 31 Mar 2020 17:41:10 +0200
Subject: [PATCH 524/770] [analyzer] ApiModeling: Add buffer size arg
 constraint with multiplier involved

Summary:
Further develop the buffer size argument constraint so it can handle sizes
that are obtained by multiplying two arguments.

Reviewers: Szelethus, NoQ, steakhal

Subscribers: whisperity, xazax.hun, baloghadamsoftware, szepet, rnkovacs, a.sidorin, mikhail.ramalho, donat.nagy, dkrupp, gamesh411, Charusso, ASDenysPetrov, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D77148
---
 .../Checkers/StdLibraryFunctionsChecker.cpp   | 29 +++++++++++++++++--
 .../std-c-library-functions-arg-constraints.c | 24 +++++++++++++++
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
index f661f29948b60..bd2f505849af3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
@@ -215,9 +215,16 @@ class StdLibraryFunctionsChecker
   // Represents a buffer argument with an additional size argument.
   // E.g. the first two arguments here:
   //   ctime_s(char *buffer, rsize_t bufsz, const time_t *time);
+  // Another example:
+  //   size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
+  //   // Here, ptr is the buffer, and its minimum size is `size * nmemb`.
   class BufferSizeConstraint : public ValueConstraint {
     // The argument which holds the size of the buffer.
     ArgNo SizeArgN;
+    // The argument which is a multiplier to size. This is set in case of
+    // `fread` like functions where the size is computed as a multiplication of
+    // two arguments.
+    llvm::Optional<ArgNo> SizeMultiplierArgN;
     // The operator we use in apply. This is negated in negate().
     BinaryOperator::Opcode Op = BO_LE;
 
@@ -225,17 +232,27 @@ class StdLibraryFunctionsChecker
     BufferSizeConstraint(ArgNo Buffer, ArgNo BufSize)
         : ValueConstraint(Buffer), SizeArgN(BufSize) {}
 
+    BufferSizeConstraint(ArgNo Buffer, ArgNo BufSize, ArgNo BufSizeMultiplier)
+        : ValueConstraint(Buffer), SizeArgN(BufSize),
+          SizeMultiplierArgN(BufSizeMultiplier) {}
+
     ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call,
                           const Summary &Summary,
                           CheckerContext &C) const override {
+      SValBuilder &SvalBuilder = C.getSValBuilder();
       // The buffer argument.
       SVal BufV = getArgSVal(Call, getArgNo());
       // The size argument.
       SVal SizeV = getArgSVal(Call, SizeArgN);
+      // Multiply with another argument if given.
+      if (SizeMultiplierArgN) {
+        SVal SizeMulV = getArgSVal(Call, *SizeMultiplierArgN);
+        SizeV = SvalBuilder.evalBinOp(State, BO_Mul, SizeV, SizeMulV,
+                                      Summary.getArgType(SizeArgN));
+      }
       // The dynamic size of the buffer argument, got from the analyzer engine.
       SVal BufDynSize = getDynamicSizeWithOffset(State, BufV);
 
-      SValBuilder &SvalBuilder = C.getSValBuilder();
       SVal Feasible = SvalBuilder.evalBinOp(State, Op, SizeV, BufDynSize,
                                             SvalBuilder.getContext().BoolTy);
       if (auto F = Feasible.getAs<DefinedOrUnknownSVal>())
@@ -744,8 +761,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                               IntRangeVector Ranges) {
     return std::make_shared<RangeConstraint>(ArgN, Kind, Ranges);
   };
-  auto BufferSize = [](ArgNo BufArgN, ArgNo SizeArgN) {
-    return std::make_shared<BufferSizeConstraint>(BufArgN, SizeArgN);
+  auto BufferSize = [](auto... Args) {
+    return std::make_shared<BufferSizeConstraint>(Args...);
   };
   struct {
     auto operator()(RangeKind Kind, IntRangeVector Ranges) {
@@ -988,6 +1005,12 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                 EvalCallAsPure)
             .ArgConstraint(
                 BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1))));
+    addToFunctionSummaryMap(
+        "__buf_size_arg_constraint_mul",
+        Summary(ArgTypes{ConstVoidPtrTy, SizeTy, SizeTy}, RetType{IntTy},
+                EvalCallAsPure)
+            .ArgConstraint(BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1),
+                                      /*BufSizeMultiplier=*/ArgNo(2))));
   }
 }
 
diff --git a/clang/test/Analysis/std-c-library-functions-arg-constraints.c b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
index c59e4429f4194..60338128ec89d 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-constraints.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
@@ -149,3 +149,27 @@ void test_buf_size_symbolic_and_offset(int s) {
   // bugpath-note{{TRUE}} \
   // bugpath-note{{'s' is <= 2}}
 }
+int __buf_size_arg_constraint_mul(const void *, size_t, size_t);
+void test_buf_size_concrete_with_multiplication() {
+  short buf[3];         // bugpath-note{{'buf' initialized here}}
+  __buf_size_arg_constraint_mul(buf, 4, sizeof(short)); // \
+  // report-warning{{Function argument constraint is not satisfied}} \
+  // bugpath-warning{{Function argument constraint is not satisfied}} \
+  // bugpath-note{{Function argument constraint is not satisfied}}
+}
+void test_buf_size_symbolic_with_multiplication(size_t s) {
+  short buf[3];
+  __buf_size_arg_constraint_mul(buf, s, sizeof(short));
+  clang_analyzer_eval(s * sizeof(short) <= 6); // \
+  // report-warning{{TRUE}} \
+  // bugpath-warning{{TRUE}} \
+  // bugpath-note{{TRUE}}
+}
+void test_buf_size_symbolic_and_offset_with_multiplication(size_t s) {
+  short buf[3];
+  __buf_size_arg_constraint_mul(buf + 1, s, sizeof(short));
+  clang_analyzer_eval(s * sizeof(short) <= 4); // \
+  // report-warning{{TRUE}} \
+  // bugpath-warning{{TRUE}} \
+  // bugpath-note{{TRUE}}
+}

From 0609704760d0a94cccdbd243d0d750d74ee6c459 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Thu, 28 May 2020 16:33:44 +0100
Subject: [PATCH 525/770] [SVE] Remove getNumElements() calls in
 visitGetElementPtrInst

Replace calls to getNumElements() with getElementCount() in order
to avoid warnings for scalable vectors. The warnings were discovered
by this existing test:

  test/CodeGen/AArch64/sve-gep.ll

Differential revision: https://reviews.llvm.org/D80782
---
 llvm/lib/IR/Verifier.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index cb96c7ae515a3..677dc02102562 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -3429,16 +3429,16 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
   if (auto *GEPVTy = dyn_cast<VectorType>(GEP.getType())) {
     // Additional checks for vector GEPs.
-    unsigned GEPWidth = GEPVTy->getNumElements();
+    ElementCount GEPWidth = GEPVTy->getElementCount();
     if (GEP.getPointerOperandType()->isVectorTy())
       Assert(
           GEPWidth ==
-              cast<VectorType>(GEP.getPointerOperandType())->getNumElements(),
+              cast<VectorType>(GEP.getPointerOperandType())->getElementCount(),
           "Vector GEP result width doesn't match operand's", &GEP);
     for (Value *Idx : Idxs) {
       Type *IndexTy = Idx->getType();
       if (auto *IndexVTy = dyn_cast<VectorType>(IndexTy)) {
-        unsigned IndexWidth = IndexVTy->getNumElements();
+        ElementCount IndexWidth = IndexVTy->getElementCount();
         Assert(IndexWidth == GEPWidth, "Invalid GEP index vector width", &GEP);
       }
       Assert(IndexTy->isIntOrIntVectorTy(),

From 16fef6d0b46f0494c502f8fe416fe841f7b000f6 Mon Sep 17 00:00:00 2001
From: Pushpinder Singh <Pushpinder.Singh@amd.com>
Date: Fri, 29 May 2020 01:22:48 -0400
Subject: [PATCH 526/770] Fix build failure when source is read only

The cmake configure step fails when it tries to set up the target for
llvm_vcsrevision_h. This happens only when the source is checked out using
`repo` on a read-only filesystem, because cmake tries to create the
`.git/logs/HEAD` file.

This patch:
  1. Recovers from failure gracefully.
  2. Ensures that VCSRevision.h is successfully created and updated
     in the above scenarios.

Differential Revision: https://reviews.llvm.org/D79400
---
 llvm/cmake/modules/AddLLVM.cmake         |  8 +++++++-
 llvm/include/llvm/Support/CMakeLists.txt | 13 ++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 9f14561fe0a6f..f16f63c32c95f 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -2118,7 +2118,13 @@ function(find_first_existing_vc_file path out_var)
         get_filename_component(git_dir ${git_output} ABSOLUTE BASE_DIR ${path})
         # Some branchless cases (e.g. 'repo') may not yet have .git/logs/HEAD
         if (NOT EXISTS "${git_dir}/logs/HEAD")
-          file(WRITE "${git_dir}/logs/HEAD" "")
+          execute_process(COMMAND ${CMAKE_COMMAND} -E touch HEAD
+            WORKING_DIRECTORY "${git_dir}/logs"
+            RESULT_VARIABLE touch_head_result
+            ERROR_QUIET)
+          if (NOT touch_head_result EQUAL 0)
+            return()
+          endif()
         endif()
         set(${out_var} "${git_dir}/logs/HEAD" PARENT_SCOPE)
       endif()
diff --git a/llvm/include/llvm/Support/CMakeLists.txt b/llvm/include/llvm/Support/CMakeLists.txt
index 680be8fdf3911..da8a4da443edf 100644
--- a/llvm/include/llvm/Support/CMakeLists.txt
+++ b/llvm/include/llvm/Support/CMakeLists.txt
@@ -5,12 +5,19 @@ set(version_inc "${CMAKE_CURRENT_BINARY_DIR}/VCSRevision.h")
 
 set(generate_vcs_version_script "${LLVM_CMAKE_PATH}/GenerateVersionFromVCS.cmake")
 
-if(llvm_vc AND LLVM_APPEND_VC_REV)
+if(LLVM_APPEND_VC_REV)
   set(llvm_source_dir ${LLVM_MAIN_SRC_DIR})
+
+  # A fake version file and is not expected to exist. It is being used to
+  # force regeneration of VCSRevision.h for source directory with no write
+  # permission available.
+  if (NOT llvm_vc)
+    set(fake_version_inc "${CMAKE_CURRENT_BINARY_DIR}/__FakeVCSRevision.h")
+  endif()
 endif()
 
 # Create custom target to generate the VC revision include.
-add_custom_command(OUTPUT "${version_inc}"
+add_custom_command(OUTPUT "${version_inc}" "${fake_version_inc}"
   DEPENDS "${llvm_vc}" "${generate_vcs_version_script}"
   COMMAND ${CMAKE_COMMAND} "-DNAMES=LLVM"
                            "-DLLVM_SOURCE_DIR=${llvm_source_dir}"
@@ -22,5 +29,5 @@ set_source_files_properties("${version_inc}"
   PROPERTIES GENERATED TRUE
              HEADER_FILE_ONLY TRUE)
 
-add_custom_target(llvm_vcsrevision_h DEPENDS "${version_inc}")
+add_custom_target(llvm_vcsrevision_h ALL DEPENDS "${version_inc}" "${fake_version_inc}")
 set_target_properties(llvm_vcsrevision_h PROPERTIES FOLDER "Misc")

From 16506d789084fd037fc61d442da43dd5242872b7 Mon Sep 17 00:00:00 2001
From: Gabor Marton <gabor.marton@ericsson.com>
Date: Tue, 7 Apr 2020 17:57:50 +0200
Subject: [PATCH 527/770] [analyzer] StdLibraryFunctionsChecker: Add sanity
 checks for constraints

Summary:
Once we have found a matching FunctionDecl for the given summary, we
validate the given constraints against that FunctionDecl. E.g. we
validate that a NotNull constraint is applied only to arguments that
have pointer types.

This is needed because when we matched the signature of the summary we
were working with incomplete function types, i.e. some intricate type
could have been marked as `Irrelevant` in the signature.

Reviewers: NoQ, Szelethus, balazske

Subscribers: whisperity, xazax.hun, baloghadamsoftware, szepet, rnkovacs, a.sidorin, mikhail.ramalho, donat.nagy, dkrupp, gamesh411, Charusso, steakhal, ASDenysPetrov, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D77658
---
 .../Checkers/StdLibraryFunctionsChecker.cpp   | 204 +++++++++++++-----
 1 file changed, 146 insertions(+), 58 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
index bd2f505849af3..578f6ad46b849 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
@@ -64,10 +64,8 @@ using namespace clang::ento;
 namespace {
 class StdLibraryFunctionsChecker
     : public Checker<check::PreCall, check::PostCall, eval::Call> {
-  /// Below is a series of typedefs necessary to define function specs.
-  /// We avoid nesting types here because each additional qualifier
-  /// would need to be repeated in every function spec.
-  struct Summary;
+
+  class Summary;
 
   /// Specify how much the analyzer engine should entrust modeling this function
   /// to us. If he doesn't, he performs additional invalidations.
@@ -114,10 +112,27 @@ class StdLibraryFunctionsChecker
     virtual ValueConstraintPtr negate() const {
       llvm_unreachable("Not implemented");
     };
+
+    // Check whether the constraint is malformed or not. It is malformed if the
+    // specified argument has a mismatch with the given FunctionDecl (e.g. the
+    // arg number is out-of-range of the function's argument list).
+    bool checkValidity(const FunctionDecl *FD) const {
+      const bool ValidArg = ArgN == Ret || ArgN < FD->getNumParams();
+      assert(ValidArg && "Arg out of range!");
+      if (!ValidArg)
+        return false;
+      // Subclasses may further refine the validation.
+      return checkSpecificValidity(FD);
+    }
     ArgNo getArgNo() const { return ArgN; }
 
   protected:
     ArgNo ArgN; // Argument to which we apply the constraint.
+
+    /// Do polymorphic sanity check on the constraint.
+    virtual bool checkSpecificValidity(const FunctionDecl *FD) const {
+      return true;
+    }
   };
 
   /// Given a range, should the argument stay inside or outside this range?
@@ -168,6 +183,14 @@ class StdLibraryFunctionsChecker
       }
       return std::make_shared<RangeConstraint>(Tmp);
     }
+
+    bool checkSpecificValidity(const FunctionDecl *FD) const override {
+      const bool ValidArg =
+          getArgType(FD, ArgN)->isIntegralType(FD->getASTContext());
+      assert(ValidArg &&
+             "This constraint should be applied on an integral type");
+      return ValidArg;
+    }
   };
 
   class ComparisonConstraint : public ValueConstraint {
@@ -210,6 +233,13 @@ class StdLibraryFunctionsChecker
       Tmp.CannotBeNull = !this->CannotBeNull;
       return std::make_shared<NotNullConstraint>(Tmp);
     }
+
+    bool checkSpecificValidity(const FunctionDecl *FD) const override {
+      const bool ValidArg = getArgType(FD, ArgN)->isPointerType();
+      assert(ValidArg &&
+             "This constraint should be applied only on a pointer type");
+      return ValidArg;
+    }
   };
 
   // Represents a buffer argument with an additional size argument.
@@ -278,11 +308,52 @@ class StdLibraryFunctionsChecker
   typedef std::vector<ValueConstraintPtr> ConstraintSet;
 
   using ArgTypes = std::vector<QualType>;
+
+  // A placeholder type, we use it whenever we do not care about the concrete
+  // type in a Signature.
+  const QualType Irrelevant{};
+  bool static isIrrelevant(QualType T) { return T.isNull(); }
+
+  // The signature of a function we want to describe with a summary. This is a
+  // concessive signature, meaning there may be irrelevant types in the
+  // signature which we do not check against a function with concrete types.
+  struct Signature {
+    const ArgTypes ArgTys;
+    const QualType RetTy;
+    Signature(ArgTypes ArgTys, QualType RetTy) : ArgTys(ArgTys), RetTy(RetTy) {
+      assertRetTypeSuitableForSignature(RetTy);
+      for (size_t I = 0, E = ArgTys.size(); I != E; ++I) {
+        QualType ArgTy = ArgTys[I];
+        assertArgTypeSuitableForSignature(ArgTy);
+      }
+    }
+    bool matches(const FunctionDecl *FD) const;
+
+  private:
+    static void assertArgTypeSuitableForSignature(QualType T) {
+      assert((T.isNull() || !T->isVoidType()) &&
+             "We should have no void types in the spec");
+      assert((T.isNull() || T.isCanonical()) &&
+             "We should only have canonical types in the spec");
+    }
+    static void assertRetTypeSuitableForSignature(QualType T) {
+      assert((T.isNull() || T.isCanonical()) &&
+             "We should only have canonical types in the spec");
+    }
+  };
+
+  static QualType getArgType(const FunctionDecl *FD, ArgNo ArgN) {
+    assert(FD && "Function must be set");
+    QualType T = (ArgN == Ret)
+                     ? FD->getReturnType().getCanonicalType()
+                     : FD->getParamDecl(ArgN)->getType().getCanonicalType();
+    return T;
+  }
+
   using Cases = std::vector<ConstraintSet>;
 
-  /// Includes information about
-  ///   * function prototype (which is necessary to
-  ///     ensure we're modeling the right function and casting values properly),
+  /// A summary includes information about
+  ///   * function prototype (signature)
   ///   * approach to invalidation,
   ///   * a list of branches - a list of list of ranges -
   ///     A branch represents a path in the exploded graph of a function (which
@@ -299,15 +370,28 @@ class StdLibraryFunctionsChecker
   ///   * a list of argument constraints, that must be true on every branch.
   ///     If these constraints are not satisfied that means a fatal error
   ///     usually resulting in undefined behaviour.
-  struct Summary {
-    const ArgTypes ArgTys;
-    const QualType RetTy;
+  ///
+  /// Application of a summary:
+  ///   The signature and argument constraints together contain information
+  ///   about which functions are handled by the summary. The signature can use
+  ///   "wildcards", i.e. Irrelevant types. Irrelevant type of a parameter in
+  ///   a signature means that type is not compared to the type of the parameter
+  ///   in the found FunctionDecl. Argument constraints may specify additional
+  ///   rules for the given parameter's type, those rules are checked once the
+  ///   signature is matched.
+  class Summary {
+    const Signature Sign;
     const InvalidationKind InvalidationKd;
     Cases CaseConstraints;
     ConstraintSet ArgConstraints;
 
+    // The function to which the summary applies. This is set after lookup and
+    // match to the signature.
+    const FunctionDecl *FD = nullptr;
+
+  public:
     Summary(ArgTypes ArgTys, QualType RetTy, InvalidationKind InvalidationKd)
-        : ArgTys(ArgTys), RetTy(RetTy), InvalidationKd(InvalidationKd) {}
+        : Sign(ArgTys, RetTy), InvalidationKd(InvalidationKd) {}
 
     Summary &Case(ConstraintSet&& CS) {
       CaseConstraints.push_back(std::move(CS));
@@ -318,24 +402,38 @@ class StdLibraryFunctionsChecker
       return *this;
     }
 
-  private:
-    static void assertTypeSuitableForSummary(QualType T) {
-      assert(!T->isVoidType() &&
-             "We should have had no significant void types in the spec");
-      assert(T.isCanonical() &&
-             "We should only have canonical types in the spec");
-    }
+    InvalidationKind getInvalidationKd() const { return InvalidationKd; }
+    const Cases &getCaseConstraints() const { return CaseConstraints; }
+    const ConstraintSet &getArgConstraints() const { return ArgConstraints; }
 
-  public:
     QualType getArgType(ArgNo ArgN) const {
-      QualType T = (ArgN == Ret) ? RetTy : ArgTys[ArgN];
-      assertTypeSuitableForSummary(T);
-      return T;
+      return StdLibraryFunctionsChecker::getArgType(FD, ArgN);
     }
 
-    /// Try our best to figure out if the summary's signature matches
-    /// *the* library function to which this specification applies.
-    bool matchesSignature(const FunctionDecl *FD) const;
+    // Returns true if the summary should be applied to the given function.
+    // And if yes then store the function declaration.
+    bool matchesAndSet(const FunctionDecl *FD) {
+      bool Result = Sign.matches(FD) && validateByConstraints(FD);
+      if (Result) {
+        assert(!this->FD && "FD must not be set more than once");
+        this->FD = FD;
+      }
+      return Result;
+    }
+
+  private:
+    // Once we know the exact type of the function then do sanity check on all
+    // the given constraints.
+    bool validateByConstraints(const FunctionDecl *FD) const {
+      for (const ConstraintSet &Case : CaseConstraints)
+        for (const ValueConstraintPtr &Constraint : Case)
+          if (!Constraint->checkValidity(FD))
+            return false;
+      for (const ValueConstraintPtr &Constraint : ArgConstraints)
+        if (!Constraint->checkValidity(FD))
+          return false;
+      return true;
+    }
   };
 
   // The map of all functions supported by the checker. It is initialized
@@ -345,11 +443,6 @@ class StdLibraryFunctionsChecker
 
   mutable std::unique_ptr<BugType> BT_InvalidArg;
 
-  // Auxiliary functions to support ArgNo within all structures
-  // in a unified manner.
-  static QualType getArgType(const Summary &Summary, ArgNo ArgN) {
-    return Summary.getArgType(ArgN);
-  }
   static SVal getArgSVal(const CallEvent &Call, ArgNo ArgN) {
     return ArgN == Ret ? Call.getReturnValue() : Call.getArgSVal(ArgN);
   }
@@ -406,7 +499,7 @@ ProgramStateRef StdLibraryFunctionsChecker::RangeConstraint::applyAsOutOfRange(
   SValBuilder &SVB = Mgr.getSValBuilder();
   BasicValueFactory &BVF = SVB.getBasicValueFactory();
   ConstraintManager &CM = Mgr.getConstraintManager();
-  QualType T = getArgType(Summary, getArgNo());
+  QualType T = Summary.getArgType(getArgNo());
   SVal V = getArgSVal(Call, getArgNo());
 
   if (auto N = V.getAs<NonLoc>()) {
@@ -433,7 +526,7 @@ ProgramStateRef StdLibraryFunctionsChecker::RangeConstraint::applyAsWithinRange(
   SValBuilder &SVB = Mgr.getSValBuilder();
   BasicValueFactory &BVF = SVB.getBasicValueFactory();
   ConstraintManager &CM = Mgr.getConstraintManager();
-  QualType T = getArgType(Summary, getArgNo());
+  QualType T = Summary.getArgType(getArgNo());
   SVal V = getArgSVal(Call, getArgNo());
 
   // "WithinRange R" is treated as "outside [T_MIN, T_MAX] \ R".
@@ -489,13 +582,13 @@ ProgramStateRef StdLibraryFunctionsChecker::ComparisonConstraint::apply(
   ProgramStateManager &Mgr = State->getStateManager();
   SValBuilder &SVB = Mgr.getSValBuilder();
   QualType CondT = SVB.getConditionType();
-  QualType T = getArgType(Summary, getArgNo());
+  QualType T = Summary.getArgType(getArgNo());
   SVal V = getArgSVal(Call, getArgNo());
 
   BinaryOperator::Opcode Op = getOpcode();
   ArgNo OtherArg = getOtherArgNo();
   SVal OtherV = getArgSVal(Call, OtherArg);
-  QualType OtherT = getArgType(Summary, OtherArg);
+  QualType OtherT = Summary.getArgType(OtherArg);
   // Note: we avoid integral promotion for comparison.
   OtherV = SVB.evalCast(OtherV, T, OtherT);
   if (auto CompV = SVB.evalBinOp(State, Op, V, OtherV, CondT)
@@ -514,9 +607,10 @@ void StdLibraryFunctionsChecker::checkPreCall(const CallEvent &Call,
   ProgramStateRef State = C.getState();
 
   ProgramStateRef NewState = State;
-  for (const ValueConstraintPtr& VC : Summary.ArgConstraints) {
-    ProgramStateRef SuccessSt = VC->apply(NewState, Call, Summary, C);
-    ProgramStateRef FailureSt = VC->negate()->apply(NewState, Call, Summary, C);
+  for (const ValueConstraintPtr &Constraint : Summary.getArgConstraints()) {
+    ProgramStateRef SuccessSt = Constraint->apply(NewState, Call, Summary, C);
+    ProgramStateRef FailureSt =
+        Constraint->negate()->apply(NewState, Call, Summary, C);
     // The argument constraint is not satisfied.
     if (FailureSt && !SuccessSt) {
       if (ExplodedNode *N = C.generateErrorNode(NewState))
@@ -546,10 +640,10 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call,
   ProgramStateRef State = C.getState();
 
   // Apply case/branch specifications.
-  for (const auto &VRS : Summary.CaseConstraints) {
+  for (const ConstraintSet &Case : Summary.getCaseConstraints()) {
     ProgramStateRef NewState = State;
-    for (const auto &VR: VRS) {
-      NewState = VR->apply(NewState, Call, Summary, C);
+    for (const ValueConstraintPtr &Constraint : Case) {
+      NewState = Constraint->apply(NewState, Call, Summary, C);
       if (!NewState)
         break;
     }
@@ -566,7 +660,7 @@ bool StdLibraryFunctionsChecker::evalCall(const CallEvent &Call,
     return false;
 
   const Summary &Summary = *FoundSummary;
-  switch (Summary.InvalidationKd) {
+  switch (Summary.getInvalidationKd()) {
   case EvalCallAsPure: {
     ProgramStateRef State = C.getState();
     const LocationContext *LC = C.getLocationContext();
@@ -585,27 +679,23 @@ bool StdLibraryFunctionsChecker::evalCall(const CallEvent &Call,
   llvm_unreachable("Unknown invalidation kind!");
 }
 
-bool StdLibraryFunctionsChecker::Summary::matchesSignature(
+bool StdLibraryFunctionsChecker::Signature::matches(
     const FunctionDecl *FD) const {
   // Check number of arguments:
   if (FD->param_size() != ArgTys.size())
     return false;
 
-  // Check return type if relevant:
-  if (!RetTy.isNull() && RetTy != FD->getReturnType().getCanonicalType())
-    return false;
+  // Check return type.
+  if (!isIrrelevant(RetTy))
+    if (RetTy != FD->getReturnType().getCanonicalType())
+      return false;
 
-  // Check argument types when relevant:
+  // Check argument types.
   for (size_t I = 0, E = ArgTys.size(); I != E; ++I) {
-    QualType FormalT = ArgTys[I];
-    // Null type marks irrelevant arguments.
-    if (FormalT.isNull())
+    QualType ArgTy = ArgTys[I];
+    if (isIrrelevant(ArgTy))
       continue;
-
-    assertTypeSuitableForSummary(FormalT);
-
-    QualType ActualT = FD->getParamDecl(I)->getType().getCanonicalType();
-    if (ActualT != FormalT)
+    if (ArgTy != FD->getParamDecl(I)->getType().getCanonicalType())
       return false;
   }
 
@@ -651,8 +741,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
   // of function summary for common cases (eg. ssize_t could be int or long
   // or long long, so three summary variants would be enough).
   // Of course, function variants are also useful for C++ overloads.
-  const QualType
-      Irrelevant{}; // A placeholder, whenever we do not care about the type.
   const QualType IntTy = ACtx.IntTy;
   const QualType LongTy = ACtx.LongTy;
   const QualType LongLongTy = ACtx.LongLongTy;
@@ -702,14 +790,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
     // Add a summary to a FunctionDecl found by lookup. The lookup is performed
     // by the given Name, and in the global scope. The summary will be attached
     // to the found FunctionDecl only if the signatures match.
-    void operator()(StringRef Name, const Summary &S) {
+    void operator()(StringRef Name, Summary S) {
       IdentifierInfo &II = ACtx.Idents.get(Name);
       auto LookupRes = ACtx.getTranslationUnitDecl()->lookup(&II);
       if (LookupRes.size() == 0)
         return;
       for (Decl *D : LookupRes) {
         if (auto *FD = dyn_cast<FunctionDecl>(D)) {
-          if (S.matchesSignature(FD)) {
+          if (S.matchesAndSet(FD)) {
             auto Res = Map.insert({FD->getCanonicalDecl(), S});
             assert(Res.second && "Function already has a summary set!");
             (void)Res;

From 0e265e315784b4e47f984f8ed9fb7586130bacdc Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr@gmail.com>
Date: Fri, 29 May 2020 16:47:23 +0200
Subject: [PATCH 528/770] Move unittest helpers to a shared location

Summary:
unittests/AST/Language.h defines some helpers that we would like to
reuse in other tests, for example, in tests for syntax trees.

Reviewers: sammccall

Reviewed By: sammccall

Subscribers: mgorny, martong, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80792
---
 .../clang/Testing/CommandLineArgs.h}                   | 10 +++++-----
 clang/include/clang/module.modulemap                   |  6 ++++++
 clang/lib/CMakeLists.txt                               |  1 +
 clang/lib/Testing/CMakeLists.txt                       |  7 +++++++
 .../Language.cpp => lib/Testing/CommandLineArgs.cpp}   |  8 ++------
 clang/unittests/AST/ASTImporterFixtures.h              |  2 +-
 clang/unittests/AST/CMakeLists.txt                     |  2 +-
 clang/unittests/AST/MatchVerifier.h                    |  2 +-
 clang/unittests/AST/StructuralEquivalenceTest.cpp      |  2 +-
 9 files changed, 25 insertions(+), 15 deletions(-)
 rename clang/{unittests/AST/Language.h => include/clang/Testing/CommandLineArgs.h} (76%)
 create mode 100644 clang/lib/Testing/CMakeLists.txt
 rename clang/{unittests/AST/Language.cpp => lib/Testing/CommandLineArgs.cpp} (83%)

diff --git a/clang/unittests/AST/Language.h b/clang/include/clang/Testing/CommandLineArgs.h
similarity index 76%
rename from clang/unittests/AST/Language.h
rename to clang/include/clang/Testing/CommandLineArgs.h
index da200ec8719f1..0d2267f63ac54 100644
--- a/clang/unittests/AST/Language.h
+++ b/clang/include/clang/Testing/CommandLineArgs.h
@@ -1,4 +1,4 @@
-//===------ unittest/AST/Language.h - AST unit test support ---------------===//
+//===--- CommandLineArgs.h ------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,15 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file defines language options for AST unittests.
+//  This file defines language options for Clang unittests.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_UNITTESTS_AST_LANGUAGE_H
-#define LLVM_CLANG_UNITTESTS_AST_LANGUAGE_H
+#ifndef LLVM_CLANG_TESTING_COMMANDLINEARGS_H
+#define LLVM_CLANG_TESTING_COMMANDLINEARGS_H
 
-#include <vector>
 #include <string>
+#include <vector>
 
 namespace clang {
 
diff --git a/clang/include/clang/module.modulemap b/clang/include/clang/module.modulemap
index 15f891c153401..7549ff2e3bcd6 100644
--- a/clang/include/clang/module.modulemap
+++ b/clang/include/clang/module.modulemap
@@ -152,6 +152,12 @@ module Clang_StaticAnalyzer_Frontend {
   module * { export * }
 }
 
+module Clang_Testing {
+  requires cplusplus
+  umbrella "Testing"
+  module * { export * }
+}
+
 module Clang_Tooling {
   requires cplusplus umbrella "Tooling" module * { export * }
   // FIXME: Exclude these headers to avoid pulling all of the AST matchers
diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt
index 0c03f5972b093..c2b6a5a4d5d4f 100644
--- a/clang/lib/CMakeLists.txt
+++ b/clang/lib/CMakeLists.txt
@@ -24,3 +24,4 @@ if(CLANG_ENABLE_STATIC_ANALYZER)
   add_subdirectory(StaticAnalyzer)
 endif()
 add_subdirectory(Format)
+add_subdirectory(Testing)
diff --git a/clang/lib/Testing/CMakeLists.txt b/clang/lib/Testing/CMakeLists.txt
new file mode 100644
index 0000000000000..4a8d436aab664
--- /dev/null
+++ b/clang/lib/Testing/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+add_clang_library(clangTesting
+  CommandLineArgs.cpp
+  )
diff --git a/clang/unittests/AST/Language.cpp b/clang/lib/Testing/CommandLineArgs.cpp
similarity index 83%
rename from clang/unittests/AST/Language.cpp
rename to clang/lib/Testing/CommandLineArgs.cpp
index 3dd9659fc00ee..58a61dade2eae 100644
--- a/clang/unittests/AST/Language.cpp
+++ b/clang/lib/Testing/CommandLineArgs.cpp
@@ -1,16 +1,12 @@
-//===------ unittest/AST/Language.cpp - AST unit test support -------------===//
+//===--- CommandLineArgs.cpp ----------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-//  This file defines language options for AST unittests.
-//
-//===----------------------------------------------------------------------===//
 
-#include "Language.h"
+#include "clang/Testing/CommandLineArgs.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace clang {
diff --git a/clang/unittests/AST/ASTImporterFixtures.h b/clang/unittests/AST/ASTImporterFixtures.h
index 619c3f590be4f..4fbceef393844 100644
--- a/clang/unittests/AST/ASTImporterFixtures.h
+++ b/clang/unittests/AST/ASTImporterFixtures.h
@@ -19,11 +19,11 @@
 #include "clang/AST/ASTImporter.h"
 #include "clang/AST/ASTImporterSharedState.h"
 #include "clang/Frontend/ASTUnit.h"
+#include "clang/Testing/CommandLineArgs.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 
 #include "DeclMatcher.h"
-#include "Language.h"
 
 #include <sstream>
 
diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt
index 5d9ff5b97dbd4..2e750ac9ea925 100644
--- a/clang/unittests/AST/CMakeLists.txt
+++ b/clang/unittests/AST/CMakeLists.txt
@@ -26,7 +26,6 @@ add_clang_unittest(ASTTests
   DeclTest.cpp
   EvaluateAsRValueTest.cpp
   ExternalASTSourceTest.cpp
-  Language.cpp
   NamedDeclPrinterTest.cpp
   RecursiveASTVisitorTest.cpp
   SizelessTypesTest.cpp
@@ -42,6 +41,7 @@ clang_target_link_libraries(ASTTests
   clangBasic
   clangFrontend
   clangSerialization
+  clangTesting
   clangTooling
   )
 
diff --git a/clang/unittests/AST/MatchVerifier.h b/clang/unittests/AST/MatchVerifier.h
index 9daf4ce39010d..865b03a7e80e4 100644
--- a/clang/unittests/AST/MatchVerifier.h
+++ b/clang/unittests/AST/MatchVerifier.h
@@ -21,8 +21,8 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Testing/CommandLineArgs.h"
 #include "clang/Tooling/Tooling.h"
-#include "Language.h"
 #include "gtest/gtest.h"
 
 namespace clang {
diff --git a/clang/unittests/AST/StructuralEquivalenceTest.cpp b/clang/unittests/AST/StructuralEquivalenceTest.cpp
index cadcc45c58854..3cb4afedd4db4 100644
--- a/clang/unittests/AST/StructuralEquivalenceTest.cpp
+++ b/clang/unittests/AST/StructuralEquivalenceTest.cpp
@@ -2,10 +2,10 @@
 #include "clang/AST/ASTStructuralEquivalence.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Frontend/ASTUnit.h"
+#include "clang/Testing/CommandLineArgs.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Support/Host.h"
 
-#include "Language.h"
 #include "DeclMatcher.h"
 
 #include "gtest/gtest.h"

From cf6cc662eeee2b1416430f517850be9032788e39 Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Fri, 29 May 2020 15:41:37 +0300
Subject: [PATCH 529/770] [OpenMP][SYCL] Improve diagnosing of unsupported
 types usage

Summary:
A diagnostic is emitted if a declaration of an unsupported type is used
inside device code.
Memcpy operations for structs containing a member with an unsupported type
are allowed. Fixed a crash on an attempt to emit a diagnostic outside of a
function.

The approach is generalized between SYCL and OpenMP.
The CUDA/OMP deferred diagnostic interface is going to be used for the SYCL
device.

Reviewers: rsmith, rjmccall, ABataev, erichkeane, bader, jdoerfert, aaron.ballman

Reviewed By: jdoerfert

Subscribers: guansong, sstefan1, yaxunl, mgorny, bader, ebevhan, Anastasia, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D74387
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  4 +-
 clang/include/clang/Sema/Sema.h               | 42 +++++++-
 clang/lib/Sema/CMakeLists.txt                 |  1 +
 clang/lib/Sema/Sema.cpp                       | 46 +++++++++
 clang/lib/Sema/SemaDecl.cpp                   |  7 +-
 clang/lib/Sema/SemaDeclCXX.cpp                |  3 +
 clang/lib/Sema/SemaExpr.cpp                   | 24 ++---
 clang/lib/Sema/SemaOpenMP.cpp                 | 52 ++++------
 clang/lib/Sema/SemaSYCL.cpp                   | 49 ++++++++++
 clang/lib/Sema/SemaType.cpp                   |  1 +
 clang/test/Headers/nvptx_device_math_sin.c    |  6 +-
 clang/test/Headers/nvptx_device_math_sin.cpp  |  6 +-
 .../OpenMP/nvptx_unsupported_type_codegen.cpp |  8 --
 .../nvptx_unsupported_type_messages.cpp       | 72 +++++++++++++-
 clang/test/SemaSYCL/float128.cpp              | 96 +++++++++++++++++++
 15 files changed, 347 insertions(+), 70 deletions(-)
 create mode 100644 clang/lib/Sema/SemaSYCL.cpp
 create mode 100644 clang/test/SemaSYCL/float128.cpp

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 845e329033c39..63af9f42dfd33 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10204,8 +10204,8 @@ def err_omp_invariant_or_linear_dependency : Error<
   "expected loop invariant expression or '<invariant1> * %0 + <invariant2>' kind of expression">;
 def err_omp_wrong_dependency_iterator_type : Error<
   "expected an integer or a pointer type of the outer loop counter '%0' for non-rectangular nests">;
-def err_omp_unsupported_type : Error <
-  "host requires %0 bit size %1 type support, but device '%2' does not support it">;
+def err_device_unsupported_type : Error <
+  "%0 requires %1 bit size %2 type support, but device '%3' does not support it">;
 def err_omp_lambda_capture_in_declare_target_not_to : Error<
   "variable captured in declare target region must appear in a to clause">;
 def err_omp_device_type_mismatch : Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index dc7ee2ddd0b89..594c6e03aa38f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -9868,10 +9868,6 @@ class Sema final {
   /// Pop OpenMP function region for non-capturing function.
   void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI);
 
-  /// Check if the expression is allowed to be used in expressions for the
-  /// OpenMP devices.
-  void checkOpenMPDeviceExpr(const Expr *E);
-
   /// Checks if a type or a declaration is disabled due to the owning extension
   /// being disabled, and emits diagnostic messages if it is disabled.
   /// \param D type or declaration to be checked.
@@ -11654,6 +11650,10 @@ class Sema final {
 
   DeviceDiagBuilder targetDiag(SourceLocation Loc, unsigned DiagID);
 
+  /// Check if the expression is allowed to be used in expressions for the
+  /// offloading devices.
+  void checkDeviceDecl(const ValueDecl *D, SourceLocation Loc);
+
   enum CUDAFunctionTarget {
     CFT_Device,
     CFT_Global,
@@ -12396,6 +12396,40 @@ class Sema final {
     ConstructorDestructor,
     BuiltinFunction
   };
+  /// Creates a DeviceDiagBuilder that emits the diagnostic if the current
+  /// context is "used as device code".
+  ///
+  /// - If CurLexicalContext is a kernel function or it is known that the
+  ///   function will be emitted for the device, emits the diagnostics
+  ///   immediately.
+  /// - If CurLexicalContext is a function and we are compiling
+  ///   for the device, but we don't know that this function will be codegen'ed
+  ///   for devive yet, creates a diagnostic which is emitted if and when we
+  ///   realize that the function will be codegen'ed.
+  ///
+  /// Example usage:
+  ///
+  /// Diagnose __float128 type usage only from SYCL device code if the current
+  /// target doesn't support it
+  /// if (!S.Context.getTargetInfo().hasFloat128Type() &&
+  ///     S.getLangOpts().SYCLIsDevice)
+  ///   SYCLDiagIfDeviceCode(Loc, diag::err_type_unsupported) << "__float128";
+  DeviceDiagBuilder SYCLDiagIfDeviceCode(SourceLocation Loc, unsigned DiagID);
+
+  /// Check whether we're allowed to call Callee from the current context.
+  ///
+  /// - If the call is never allowed in a semantically-correct program
+  ///   emits an error and returns false.
+  ///
+  /// - If the call is allowed in semantically-correct programs, but only if
+  ///   it's never codegen'ed, creates a deferred diagnostic to be emitted if
+  ///   and when the caller is codegen'ed, and returns true.
+  ///
+  /// - Otherwise, returns true without emitting any diagnostics.
+  ///
+  /// Adds Callee to DeviceCallGraph if we don't know if its caller will be
+  /// codegen'ed yet.
+  bool checkSYCLDeviceFunction(SourceLocation Loc, FunctionDecl *Callee);
 };
 
 /// RAII object that enters a new expression evaluation context.
diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt
index 71def7129beb9..b59fc30882f96 100644
--- a/clang/lib/Sema/CMakeLists.txt
+++ b/clang/lib/Sema/CMakeLists.txt
@@ -61,6 +61,7 @@ add_clang_library(clangSema
   SemaStmt.cpp
   SemaStmtAsm.cpp
   SemaStmtAttr.cpp
+  SemaSYCL.cpp
   SemaTemplate.cpp
   SemaTemplateDeduction.cpp
   SemaTemplateInstantiate.cpp
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index b3aeb10184672..8c11a1a59e9c9 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1698,10 +1698,56 @@ Sema::DeviceDiagBuilder Sema::targetDiag(SourceLocation Loc, unsigned DiagID) {
   if (getLangOpts().CUDA)
     return getLangOpts().CUDAIsDevice ? CUDADiagIfDeviceCode(Loc, DiagID)
                                       : CUDADiagIfHostCode(Loc, DiagID);
+
+  if (getLangOpts().SYCLIsDevice)
+    return SYCLDiagIfDeviceCode(Loc, DiagID);
+
   return DeviceDiagBuilder(DeviceDiagBuilder::K_Immediate, Loc, DiagID,
                            getCurFunctionDecl(), *this);
 }
 
+void Sema::checkDeviceDecl(const ValueDecl *D, SourceLocation Loc) {
+  if (isUnevaluatedContext())
+    return;
+
+  Decl *C = cast<Decl>(getCurLexicalContext());
+
+  // Memcpy operations for structs containing a member with unsupported type
+  // are ok, though.
+  if (const auto *MD = dyn_cast<CXXMethodDecl>(C)) {
+    if ((MD->isCopyAssignmentOperator() || MD->isMoveAssignmentOperator()) &&
+        MD->isTrivial())
+      return;
+
+    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(MD))
+      if (Ctor->isCopyOrMoveConstructor() && Ctor->isTrivial())
+        return;
+  }
+
+  auto CheckType = [&](QualType Ty) {
+    if ((Ty->isFloat16Type() && !Context.getTargetInfo().hasFloat16Type()) ||
+        ((Ty->isFloat128Type() ||
+          (Ty->isRealFloatingType() && Context.getTypeSize(Ty) == 128)) &&
+         !Context.getTargetInfo().hasFloat128Type()) ||
+        (Ty->isIntegerType() && Context.getTypeSize(Ty) == 128 &&
+         !Context.getTargetInfo().hasInt128Type())) {
+      targetDiag(Loc, diag::err_device_unsupported_type)
+          << D << static_cast<unsigned>(Context.getTypeSize(Ty)) << Ty
+          << Context.getTargetInfo().getTriple().str();
+      targetDiag(D->getLocation(), diag::note_defined_here) << D;
+    }
+  };
+
+  QualType Ty = D->getType();
+  CheckType(Ty);
+
+  if (const auto *FPTy = dyn_cast<FunctionProtoType>(Ty)) {
+    for (const auto &ParamTy : FPTy->param_types())
+      CheckType(ParamTy);
+    CheckType(FPTy->getReturnType());
+  }
+}
+
 /// Looks through the macro-expansion chain for the given
 /// location, looking for a macro expansion with the given name.
 /// If one is found, returns true and sets the location to that
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 6fe48c860864b..76754adbf20bd 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -14439,7 +14439,7 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body,
     DiscardCleanupsInEvaluationContext();
   }
 
-  if (LangOpts.OpenMP || LangOpts.CUDA) {
+  if (LangOpts.OpenMP || LangOpts.CUDA || LangOpts.SYCLIsDevice) {
     auto ES = getEmissionStatus(FD);
     if (ES == Sema::FunctionEmissionStatus::Emitted ||
         ES == Sema::FunctionEmissionStatus::Unknown)
@@ -18119,6 +18119,11 @@ Decl *Sema::getObjCDeclContext() const {
 
 Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD,
                                                      bool Final) {
+  // SYCL functions can be template, so we check if they have appropriate
+  // attribute prior to checking if it is a template.
+  if (LangOpts.SYCLIsDevice && FD->hasAttr<SYCLKernelAttr>())
+    return FunctionEmissionStatus::Emitted;
+
   // Templates are emitted when they're instantiated.
   if (FD->isDependentContext())
     return FunctionEmissionStatus::TemplateDiscarded;
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 3f1121c0e9b28..cedd9437e0012 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -14915,6 +14915,9 @@ Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType,
   MarkFunctionReferenced(ConstructLoc, Constructor);
   if (getLangOpts().CUDA && !CheckCUDACall(ConstructLoc, Constructor))
     return ExprError();
+  if (getLangOpts().SYCLIsDevice &&
+      !checkSYCLDeviceFunction(ConstructLoc, Constructor))
+    return ExprError();
 
   return CheckForImmediateInvocation(
       CXXConstructExpr::Create(
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 4063289711cc5..63f71d81e0474 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -293,6 +293,9 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef<SourceLocation> Locs,
 
     if (getLangOpts().CUDA && !CheckCUDACall(Loc, FD))
       return true;
+
+    if (getLangOpts().SYCLIsDevice && !checkSYCLDeviceFunction(Loc, FD))
+      return true;
   }
 
   if (auto *MD = dyn_cast<CXXMethodDecl>(D)) {
@@ -352,6 +355,10 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef<SourceLocation> Locs,
 
   diagnoseUseOfInternalDeclInInlineFunction(*this, D, Loc);
 
+  if (LangOpts.SYCLIsDevice || (LangOpts.OpenMP && LangOpts.OpenMPIsDevice))
+    if (const auto *VD = dyn_cast<ValueDecl>(D))
+      checkDeviceDecl(VD, Loc);
+
   if (isa<ParmVarDecl>(D) && isa<RequiresExprBodyDecl>(D->getDeclContext()) &&
       !isUnevaluatedContext()) {
     // C++ [expr.prim.req.nested] p3
@@ -13511,14 +13518,6 @@ ExprResult Sema::CreateBuiltinBinOp(SourceLocation OpLoc,
     }
   }
 
-  // Diagnose operations on the unsupported types for OpenMP device compilation.
-  if (getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice) {
-    if (Opc != BO_Assign && Opc != BO_Comma) {
-      checkOpenMPDeviceExpr(LHSExpr);
-      checkOpenMPDeviceExpr(RHSExpr);
-    }
-  }
-
   switch (Opc) {
   case BO_Assign:
     ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, QualType());
@@ -14131,12 +14130,6 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
                        << Input.get()->getSourceRange());
     }
   }
-  // Diagnose operations on the unsupported types for OpenMP device compilation.
-  if (getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice) {
-    if (UnaryOperator::isIncrementDecrementOp(Opc) ||
-        UnaryOperator::isArithmeticOp(Opc))
-      checkOpenMPDeviceExpr(InputExpr);
-  }
 
   switch (Opc) {
   case UO_PreInc:
@@ -16395,6 +16388,9 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
   if (getLangOpts().CUDA)
     CheckCUDACall(Loc, Func);
 
+  if (getLangOpts().SYCLIsDevice)
+    checkSYCLDeviceFunction(Loc, Func);
+
   // If we need a definition, try to create one.
   if (NeedDefinition && !Func->getBody()) {
     runWithSufficientStackSpace(Loc, [&] {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index a60a047db0e7a..17b585862639d 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -1832,23 +1832,28 @@ Sema::DeviceDiagBuilder Sema::diagIfOpenMPDeviceCode(SourceLocation Loc,
                                                      unsigned DiagID) {
   assert(LangOpts.OpenMP && LangOpts.OpenMPIsDevice &&
          "Expected OpenMP device compilation.");
-  FunctionEmissionStatus FES = getEmissionStatus(getCurFunctionDecl());
+
+  FunctionDecl *FD = getCurFunctionDecl();
   DeviceDiagBuilder::Kind Kind = DeviceDiagBuilder::K_Nop;
-  switch (FES) {
-  case FunctionEmissionStatus::Emitted:
-    Kind = DeviceDiagBuilder::K_Immediate;
-    break;
-  case FunctionEmissionStatus::Unknown:
-    Kind = isOpenMPDeviceDelayedContext(*this) ? DeviceDiagBuilder::K_Deferred
-                                               : DeviceDiagBuilder::K_Immediate;
-    break;
-  case FunctionEmissionStatus::TemplateDiscarded:
-  case FunctionEmissionStatus::OMPDiscarded:
-    Kind = DeviceDiagBuilder::K_Nop;
-    break;
-  case FunctionEmissionStatus::CUDADiscarded:
-    llvm_unreachable("CUDADiscarded unexpected in OpenMP device compilation");
-    break;
+  if (FD) {
+    FunctionEmissionStatus FES = getEmissionStatus(FD);
+    switch (FES) {
+    case FunctionEmissionStatus::Emitted:
+      Kind = DeviceDiagBuilder::K_Immediate;
+      break;
+    case FunctionEmissionStatus::Unknown:
+      Kind = isOpenMPDeviceDelayedContext(*this)
+                 ? DeviceDiagBuilder::K_Deferred
+                 : DeviceDiagBuilder::K_Immediate;
+      break;
+    case FunctionEmissionStatus::TemplateDiscarded:
+    case FunctionEmissionStatus::OMPDiscarded:
+      Kind = DeviceDiagBuilder::K_Nop;
+      break;
+    case FunctionEmissionStatus::CUDADiscarded:
+      llvm_unreachable("CUDADiscarded unexpected in OpenMP device compilation");
+      break;
+    }
   }
 
   return DeviceDiagBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
@@ -1877,21 +1882,6 @@ Sema::DeviceDiagBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc,
   return DeviceDiagBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
 }
 
-void Sema::checkOpenMPDeviceExpr(const Expr *E) {
-  assert(getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice &&
-         "OpenMP device compilation mode is expected.");
-  QualType Ty = E->getType();
-  if ((Ty->isFloat16Type() && !Context.getTargetInfo().hasFloat16Type()) ||
-      ((Ty->isFloat128Type() ||
-        (Ty->isRealFloatingType() && Context.getTypeSize(Ty) == 128)) &&
-       !Context.getTargetInfo().hasFloat128Type()) ||
-      (Ty->isIntegerType() && Context.getTypeSize(Ty) == 128 &&
-       !Context.getTargetInfo().hasInt128Type()))
-    targetDiag(E->getExprLoc(), diag::err_omp_unsupported_type)
-        << static_cast<unsigned>(Context.getTypeSize(Ty)) << Ty
-        << Context.getTargetInfo().getTriple().str() << E->getSourceRange();
-}
-
 static OpenMPDefaultmapClauseKind
 getVariableCategoryFromDecl(const LangOptions &LO, const ValueDecl *VD) {
   if (LO.OpenMP <= 45) {
diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
new file mode 100644
index 0000000000000..db7603b42f7b6
--- /dev/null
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -0,0 +1,49 @@
+//===- SemaSYCL.cpp - Semantic Analysis for SYCL constructs ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This implements Semantic Analysis for SYCL constructs.
+//===----------------------------------------------------------------------===//
+
+#include "clang/Sema/Sema.h"
+#include "clang/Sema/SemaDiagnostic.h"
+
+using namespace clang;
+
+// -----------------------------------------------------------------------------
+// SYCL device specific diagnostics implementation
+// -----------------------------------------------------------------------------
+
+Sema::DeviceDiagBuilder Sema::SYCLDiagIfDeviceCode(SourceLocation Loc,
+                                                   unsigned DiagID) {
+  assert(getLangOpts().SYCLIsDevice &&
+         "Should only be called during SYCL compilation");
+  FunctionDecl *FD = dyn_cast<FunctionDecl>(getCurLexicalContext());
+  DeviceDiagBuilder::Kind DiagKind = [this, FD] {
+    if (!FD)
+      return DeviceDiagBuilder::K_Nop;
+    if (getEmissionStatus(FD) == Sema::FunctionEmissionStatus::Emitted)
+      return DeviceDiagBuilder::K_ImmediateWithCallStack;
+    return DeviceDiagBuilder::K_Deferred;
+  }();
+  return DeviceDiagBuilder(DiagKind, Loc, DiagID, FD, *this);
+}
+
+bool Sema::checkSYCLDeviceFunction(SourceLocation Loc, FunctionDecl *Callee) {
+  assert(getLangOpts().SYCLIsDevice &&
+         "Should only be called during SYCL compilation");
+  assert(Callee && "Callee may not be null.");
+
+  // Errors in unevaluated context don't need to be generated,
+  // so we can safely skip them.
+  if (isUnevaluatedContext() || isConstantEvaluated())
+    return true;
+
+  DeviceDiagBuilder::Kind DiagKind = DeviceDiagBuilder::K_Nop;
+
+  return DiagKind != DeviceDiagBuilder::K_Immediate &&
+         DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack;
+}
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 1822951266f5a..fc4a23157bca1 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -1530,6 +1530,7 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) {
     break;
   case DeclSpec::TST_float128:
     if (!S.Context.getTargetInfo().hasFloat128Type() &&
+        !S.getLangOpts().SYCLIsDevice &&
         !(S.getLangOpts().OpenMP && S.getLangOpts().OpenMPIsDevice))
       S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported)
         << "__float128";
diff --git a/clang/test/Headers/nvptx_device_math_sin.c b/clang/test/Headers/nvptx_device_math_sin.c
index 83de8b02444a0..92692912789aa 100644
--- a/clang/test/Headers/nvptx_device_math_sin.c
+++ b/clang/test/Headers/nvptx_device_math_sin.c
@@ -7,7 +7,7 @@
 
 #include <math.h>
 
-double math(float f, double d, long double ld) {
+double math(float f, double d) {
   double r = 0;
 // SLOW:  call float @__nv_sinf(float
 // FAST:  call fast float @__nv_fast_sinf(float
@@ -20,8 +20,8 @@ double math(float f, double d, long double ld) {
 
 long double foo(float f, double d, long double ld) {
   double r = ld;
-  r += math(f, d, ld);
+  r += math(f, d);
 #pragma omp target map(r)
-  { r += math(f, d, ld); }
+  { r += math(f, d); }
   return r;
 }
diff --git a/clang/test/Headers/nvptx_device_math_sin.cpp b/clang/test/Headers/nvptx_device_math_sin.cpp
index ba5f6fc483d9d..7c6f102cd2501 100644
--- a/clang/test/Headers/nvptx_device_math_sin.cpp
+++ b/clang/test/Headers/nvptx_device_math_sin.cpp
@@ -7,7 +7,7 @@
 
 #include <cmath>
 
-double math(float f, double d, long double ld) {
+double math(float f, double d) {
   double r = 0;
 // SLOW:  call float @__nv_sinf(float
 // FAST:  call fast float @__nv_fast_sinf(float
@@ -20,8 +20,8 @@ double math(float f, double d, long double ld) {
 
 long double foo(float f, double d, long double ld) {
   double r = ld;
-  r += math(f, d, ld);
+  r += math(f, d);
 #pragma omp target map(r)
-  { r += math(f, d, ld); }
+  { r += math(f, d); }
   return r;
 }
diff --git a/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp b/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp
index 0e5abba943b1d..34d0087406daf 100644
--- a/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp
@@ -71,11 +71,3 @@ void baz1() {
 }
 #pragma omp end declare target
 
-BIGTYPE foo(BIGTYPE f) {
-#pragma omp target map(f)
-  f = 1;
-  return f;
-}
-
-// CHECK: define weak void @__omp_offloading_{{.+}}foo{{.+}}_l75([[BIGTYPE:.+]]*
-// CHECK: store [[BIGTYPE]] {{0xL00000000000000003FFF000000000000|0xM3FF00000000000000000000000000000}}, [[BIGTYPE]]* %
diff --git a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
index bffb014c5d321..22ce8175fd05a 100644
--- a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
+++ b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
@@ -7,18 +7,23 @@
 struct T {
   char a;
 #ifndef _ARCH_PPC
+  // expected-note@+1 {{'f' defined here}}
   __float128 f;
 #else
+  // expected-note@+1 {{'f' defined here}}
   long double f;
 #endif
   char c;
   T() : a(12), f(15) {}
 #ifndef _ARCH_PPC
-// expected-error@+4 {{host requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+// expected-error@+5 {{'f' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 #else
-// expected-error@+2 {{host requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+// expected-error@+3 {{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 #endif
-  T &operator+(T &b) { f += b.a; return *this;}
+  T &operator+(T &b) {
+    f += b.a;
+    return *this;
+  }
 };
 
 struct T1 {
@@ -27,19 +32,36 @@ struct T1 {
   __int128 f1;
   char c;
   T1() : a(12), f(15) {}
-  T1 &operator/(T1 &b) { f /= b.a; return *this;}
+  T1 &operator/(T1 &b) {
+    f /= b.a;
+    return *this;
+  }
 };
 
+#ifndef _ARCH_PPC
+// expected-note@+1 {{'boo' defined here}}
+void boo(__float128 A) { return; }
+#else
+// expected-note@+1 {{'boo' defined here}}
+void boo(long double A) { return; }
+#endif
 #pragma omp declare target
 T a = T();
 T f = a;
 void foo(T a = T()) {
   a = a + f; // expected-note {{called by 'foo'}}
+#ifndef _ARCH_PPC
+// expected-error@+4 {{'boo' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+#else
+// expected-error@+2 {{'boo' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+#endif
+  boo(0);
   return;
 }
 T bar() {
   return T();
 }
+
 void baz() {
   T t = bar();
 }
@@ -56,3 +78,45 @@ void baz1() {
   T1 t = bar1();
 }
 #pragma omp end declare target
+
+#ifndef _ARCH_PPC
+// expected-note@+1 3{{'f' defined here}}
+__float128 foo1(__float128 f) {
+#pragma omp target map(f)
+  // expected-error@+1 3{{'f' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  f = 1;
+  return f;
+}
+#else
+// expected-note@+1 3{{'f' defined here}}
+long double foo1(long double f) {
+#pragma omp target map(f)
+  // expected-error@+1 3{{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  f = 1;
+  return f;
+}
+#endif
+
+T foo3() {
+  T S;
+#pragma omp target map(S)
+  S.a = 1;
+  return S;
+}
+
+// Allow all sorts of stuff on host
+#ifndef _ARCH_PPC
+__float128 q, b;
+__float128 c = q + b;
+#else
+long double q, b;
+long double c = q + b;
+#endif
+
+void hostFoo() {
+  boo(c - b);
+}
+
+long double qa, qb;
+decltype(qa + qb) qc;
+double qd[sizeof(-(-(qc * 2)))];
diff --git a/clang/test/SemaSYCL/float128.cpp b/clang/test/SemaSYCL/float128.cpp
new file mode 100644
index 0000000000000..d2d520b5b12dc
--- /dev/null
+++ b/clang/test/SemaSYCL/float128.cpp
@@ -0,0 +1,96 @@
+// RUN: %clang_cc1 -triple spir64 -fsycl -fsycl-is-device -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsycl -fsycl-is-device -fsyntax-only %s
+
+typedef __float128 BIGTY;
+
+template <class T>
+class Z {
+public:
+  // expected-note@+1 {{'field' defined here}}
+  T field;
+  // expected-note@+1 2{{'field1' defined here}}
+  __float128 field1;
+  using BIGTYPE = __float128;
+  // expected-note@+1 {{'bigfield' defined here}}
+  BIGTYPE bigfield;
+};
+
+void host_ok(void) {
+  __float128 A;
+  int B = sizeof(__float128);
+  Z<__float128> C;
+  C.field1 = A;
+}
+
+void usage() {
+  // expected-note@+1 3{{'A' defined here}}
+  __float128 A;
+  Z<__float128> C;
+  // expected-error@+2 {{'A' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+  // expected-error@+1 {{'field1' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+  C.field1 = A;
+  // expected-error@+1 {{'bigfield' requires 128 bit size 'Z::BIGTYPE' (aka '__float128') type support, but device 'spir64' does not support it}}
+  C.bigfield += 1.0;
+
+  // expected-error@+1 {{'A' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+  auto foo1 = [=]() {
+    __float128 AA;
+    // expected-note@+2 {{'BB' defined here}}
+    // expected-error@+1 {{'A' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+    auto BB = A;
+    // expected-error@+1 {{'BB' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+    BB += 1;
+  };
+
+  // expected-note@+1 {{called by 'usage'}}
+  foo1();
+}
+
+template <typename t>
+void foo2(){};
+
+// expected-note@+3 {{'P' defined here}}
+// expected-error@+2 {{'P' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+// expected-note@+1 2{{'foo' defined here}}
+__float128 foo(__float128 P) { return P; }
+
+template <typename Name, typename Func>
+__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
+  // expected-note@+1 5{{called by 'kernel}}
+  kernelFunc();
+}
+
+int main() {
+  // expected-note@+1 {{'CapturedToDevice' defined here}}
+  __float128 CapturedToDevice = 1;
+  host_ok();
+  kernel<class variables>([=]() {
+    decltype(CapturedToDevice) D;
+    // expected-error@+1 {{'CapturedToDevice' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+    auto C = CapturedToDevice;
+    Z<__float128> S;
+    // expected-error@+1 {{'field1' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+    S.field1 += 1;
+    // expected-error@+1 {{'field' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+    S.field = 1;
+  });
+
+  kernel<class functions>([=]() {
+    // expected-note@+1 2{{called by 'operator()'}}
+    usage();
+    // expected-note@+1 {{'BBBB' defined here}}
+    BIGTY BBBB;
+    // expected-note@+3 {{called by 'operator()'}}
+    // expected-error@+2 2{{'foo' requires 128 bit size '__float128' type support, but device 'spir64' does not support it}}
+    // expected-error@+1 {{'BBBB' requires 128 bit size 'BIGTY' (aka '__float128') type support, but device 'spir64' does not support it}}
+    auto A = foo(BBBB);
+  });
+
+  kernel<class ok>([=]() {
+    Z<__float128> S;
+    foo2<__float128>();
+    auto A = sizeof(CapturedToDevice);
+  });
+
+  return 0;
+}

From 0384446c7c2458b12ce3ef1c2bdf438af1f78ad7 Mon Sep 17 00:00:00 2001
From: Pushpinder Singh <Pushpinder.Singh@amd.com>
Date: Fri, 29 May 2020 05:39:43 -0400
Subject: [PATCH 530/770] Remove SVN logic from find_first_existing_vc_file

As LLVM has moved from SVN to git, there is no need to
keep SVN related code. Also, this code piece was never used.

Differential Revision: https://reviews.llvm.org/D79400
---
 llvm/cmake/modules/AddLLVM.cmake | 51 ++++++++++++--------------------
 1 file changed, 19 insertions(+), 32 deletions(-)

diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index f16f63c32c95f..f4dbd364262b9 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -2094,40 +2094,27 @@ function(find_first_existing_vc_file path out_var)
   if(NOT EXISTS "${path}")
     return()
   endif()
-  if(EXISTS "${path}/.svn")
-    set(svn_files
-      "${path}/.svn/wc.db"   # SVN 1.7
-      "${path}/.svn/entries" # SVN 1.6
-    )
-    foreach(file IN LISTS svn_files)
-      if(EXISTS "${file}")
-        set(${out_var} "${file}" PARENT_SCOPE)
-        return()
-      endif()
-    endforeach()
-  else()
-    find_package(Git)
-    if(GIT_FOUND)
-      execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --git-dir
-        WORKING_DIRECTORY ${path}
-        RESULT_VARIABLE git_result
-        OUTPUT_VARIABLE git_output
-        ERROR_QUIET)
-      if(git_result EQUAL 0)
-        string(STRIP "${git_output}" git_output)
-        get_filename_component(git_dir ${git_output} ABSOLUTE BASE_DIR ${path})
-        # Some branchless cases (e.g. 'repo') may not yet have .git/logs/HEAD
-        if (NOT EXISTS "${git_dir}/logs/HEAD")
-          execute_process(COMMAND ${CMAKE_COMMAND} -E touch HEAD
-            WORKING_DIRECTORY "${git_dir}/logs"
-            RESULT_VARIABLE touch_head_result
-            ERROR_QUIET)
-          if (NOT touch_head_result EQUAL 0)
-            return()
-          endif()
+  find_package(Git)
+  if(GIT_FOUND)
+    execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --git-dir
+      WORKING_DIRECTORY ${path}
+      RESULT_VARIABLE git_result
+      OUTPUT_VARIABLE git_output
+      ERROR_QUIET)
+    if(git_result EQUAL 0)
+      string(STRIP "${git_output}" git_output)
+      get_filename_component(git_dir ${git_output} ABSOLUTE BASE_DIR ${path})
+      # Some branchless cases (e.g. 'repo') may not yet have .git/logs/HEAD
+      if (NOT EXISTS "${git_dir}/logs/HEAD")
+        execute_process(COMMAND ${CMAKE_COMMAND} -E touch HEAD
+          WORKING_DIRECTORY "${git_dir}/logs"
+          RESULT_VARIABLE touch_head_result
+          ERROR_QUIET)
+        if (NOT touch_head_result EQUAL 0)
+          return()
         endif()
-        set(${out_var} "${git_dir}/logs/HEAD" PARENT_SCOPE)
       endif()
+      set(${out_var} "${git_dir}/logs/HEAD" PARENT_SCOPE)
     endif()
   endif()
 endfunction()

From 34cfed24ebd3a2a9f286877d142a68dbf2c42c96 Mon Sep 17 00:00:00 2001
From: diggerlin <digger.llvm@gmail.com>
Date: Fri, 29 May 2020 11:08:51 -0400
Subject: [PATCH 531/770] [AIX][XCOFF] add symbol priority for the llvm-objdump
 -D -symbol-description

SUMMARY:
When two symbols have the same address, llvm-objdump -D -symbol-description will select the symbol to display based on the following rules:

1. Use a Label symbol first if there is one.
2. If there is no Label symbol, use a symbol which has a Storage Mapping Class.
3. If more than one symbol has a Storage Mapping Class, TC0 has the lowest priority; the other Storage Mapping Classes are compared based on their value.

Reviewers: James Henderson ,hubert.reinterpretcast,

Differential Revision: https://reviews.llvm.org/D78387
---
 .../llvm/MC/MCDisassembler/MCDisassembler.h   |  9 ++--
 llvm/lib/MC/MCDisassembler/MCDisassembler.cpp | 53 +++++++++++++++++++
 .../XCOFF/disassemble-symbol-description.test |  2 +-
 llvm/unittests/MC/CMakeLists.txt              |  1 +
 llvm/unittests/MC/MCDisassemblerTest.cpp      | 49 +++++++++++++++++
 5 files changed, 110 insertions(+), 4 deletions(-)
 create mode 100644 llvm/unittests/MC/MCDisassemblerTest.cpp

diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
index 2d7efc8acc22f..cddf325994f2d 100644
--- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
+++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
@@ -26,6 +26,8 @@ struct XCOFFSymbolInfo {
   XCOFFSymbolInfo(Optional<XCOFF::StorageMappingClass> Smc,
                   Optional<uint32_t> Idx, bool Label)
       : StorageMappingClass(Smc), Index(Idx), IsLabel(Label) {}
+
+  bool operator<(const XCOFFSymbolInfo &SymInfo) const;
 };
 
 struct SymbolInfoTy {
@@ -53,9 +55,10 @@ struct SymbolInfoTy {
     assert(P1.IsXCOFF == P2.IsXCOFF &&
            "P1.IsXCOFF should be equal to P2.IsXCOFF.");
     if (P1.IsXCOFF)
-      return std::tie(P1.Addr, P1.Name) < std::tie(P2.Addr, P2.Name);
-    else
-      return std::tie(P1.Addr, P1.Name, P1.Type) <
+      return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Name) <
+             std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name);
+
+    return std::tie(P1.Addr, P1.Name, P1.Type) <
              std::tie(P2.Addr, P2.Name, P2.Type);
   }
 };
diff --git a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
index 373916fbed785..40ffd1fc5b73e 100644
--- a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -43,3 +43,56 @@ void MCDisassembler::tryAddingPcLoadReferenceComment(int64_t Value,
 void MCDisassembler::setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer) {
   Symbolizer = std::move(Symzer);
 }
+
+#define SMC_PCASE(A, P)                                                         \
+  case XCOFF::XMC_##A:                                                         \
+    return P;
+
+uint8_t getSMCPriority(XCOFF::StorageMappingClass SMC) {
+  switch (SMC) {
+    SMC_PCASE(PR, 1)
+    SMC_PCASE(RO, 1)
+    SMC_PCASE(DB, 1)
+    SMC_PCASE(GL, 1)
+    SMC_PCASE(XO, 1)
+    SMC_PCASE(SV, 1)
+    SMC_PCASE(SV64, 1)
+    SMC_PCASE(SV3264, 1)
+    SMC_PCASE(TI, 1)
+    SMC_PCASE(TB, 1)
+    SMC_PCASE(RW, 1)
+    SMC_PCASE(TC0, 0)
+    SMC_PCASE(TC, 1)
+    SMC_PCASE(TD, 1)
+    SMC_PCASE(DS, 1)
+    SMC_PCASE(UA, 1)
+    SMC_PCASE(BS, 1)
+    SMC_PCASE(UC, 1)
+    SMC_PCASE(TL, 1)
+    SMC_PCASE(UL, 1)
+    SMC_PCASE(TE, 1)
+#undef SMC_PCASE
+  }
+  return 0;
+}
+
+/// The function is for symbol sorting when symbols have the same address.
+/// The symbols in the same section are sorted in ascending order.
+/// llvm-objdump -D will choose the highest priority symbol to display when
+/// there are symbols with the same address.
+bool XCOFFSymbolInfo::operator<(const XCOFFSymbolInfo &SymInfo) const {
+  // Label symbols have higher priority than non-label symbols.
+  if (IsLabel != SymInfo.IsLabel)
+    return SymInfo.IsLabel;
+
+  // Symbols with a StorageMappingClass have higher priority than those without.
+  if (StorageMappingClass.hasValue() != SymInfo.StorageMappingClass.hasValue())
+    return SymInfo.StorageMappingClass.hasValue();
+
+  if (StorageMappingClass.hasValue()) {
+    return getSMCPriority(StorageMappingClass.getValue()) <
+           getSMCPriority(SymInfo.StorageMappingClass.getValue());
+  }
+
+  return false;
+}
diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test
index c92772b8cbad6..71c5958fa916a 100644
--- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test
+++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test
@@ -25,7 +25,7 @@
 COMMON: Inputs/xcoff-section-headers.o:	file format aixcoff-rs6000
 COMMON: Disassembly of section .text:
 PLAIN:      00000000 <.text>:
-DESC:       00000000 (idx: 4) .text:
+DESC:       00000000 (idx: 16) .func: 
 COMMON-NEXT:        0: 80 62 00 04                  	lwz 3, 4(2)
 RELOC:                              00000002:  R_TOC        (idx: 26) a[TC]
 COMMON-NEXT:        4: 80 63 00 00                  	lwz 3, 0(3)
diff --git a/llvm/unittests/MC/CMakeLists.txt b/llvm/unittests/MC/CMakeLists.txt
index 48c06183ab99d..3827a2b4fc435 100644
--- a/llvm/unittests/MC/CMakeLists.txt
+++ b/llvm/unittests/MC/CMakeLists.txt
@@ -17,5 +17,6 @@ add_llvm_unittest(MCTests
   MCInstPrinter.cpp
   StringTableBuilderTest.cpp
   TargetRegistry.cpp
+  MCDisassemblerTest.cpp
   )
 
diff --git a/llvm/unittests/MC/MCDisassemblerTest.cpp b/llvm/unittests/MC/MCDisassemblerTest.cpp
new file mode 100644
index 0000000000000..07d13cfa24c4e
--- /dev/null
+++ b/llvm/unittests/MC/MCDisassemblerTest.cpp
@@ -0,0 +1,49 @@
+//===- MCDisassemblerTest.cpp - Tests for MCDisassembler.cpp --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+TEST(MCDisassembler, XCOFFSymbolPriorityTest) {
+
+  SymbolInfoTy SIT1(0x100000, "sym1", None, 1, false);
+  SymbolInfoTy SIT2(0x110000, "sym2", None, 2, false);
+  SymbolInfoTy SIT3(0x120000, ".func", XCOFF::XMC_PR, 3, true);
+  SymbolInfoTy SIT4(0x120000, ".text", XCOFF::XMC_PR, 4, false);
+  SymbolInfoTy SIT5(0x130000, "TOC", XCOFF::XMC_TC0, 5, false);
+  SymbolInfoTy SIT6(0x130000, "func", XCOFF::XMC_TC, 6, false);
+
+  // Test that higher addresses would appear later than lower ones when symbols
+  // are sorted in ascending order.
+  EXPECT_TRUE(SIT1 < SIT2);
+  EXPECT_FALSE(SIT2 < SIT1);
+
+  // Test that symbols with a StorageMappingClass have higher priority than those
+  // without.
+  EXPECT_TRUE(SIT2 < SIT5);
+  EXPECT_FALSE(SIT5 < SIT2);
+
+  // Test that symbols with a TC0 StorageMappingClass have lower priority than those
+  // with some other StorageMappingClass.
+  EXPECT_TRUE(SIT5 < SIT6);
+  EXPECT_FALSE(SIT6 < SIT5);
+
+  // Test label symbols have higher priority than non-label symbols.
+  EXPECT_TRUE(SIT4 < SIT3);
+  EXPECT_FALSE(SIT3 < SIT4);
+
+  // Test symbols comparing with themselves.
+  EXPECT_FALSE(SIT1 < SIT1);
+  EXPECT_FALSE(SIT2 < SIT2);
+  EXPECT_FALSE(SIT3 < SIT3);
+  EXPECT_FALSE(SIT4 < SIT4);
+  EXPECT_FALSE(SIT5 < SIT5);
+  EXPECT_FALSE(SIT6 < SIT6);
+}

From 747c574b94595c6209185c87e79f51bff46fd4d8 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 29 May 2020 14:45:08 +0100
Subject: [PATCH 532/770] [ARM] Extra MVE VMLAV reduction patterns

These patterns for i8 and i16 VMLAs were missing. They arise from
legalized vector.reduce.add.v8i16 and vector.reduce.add.v16i8, and
although the instruction works differently (the mul and add are
performed in a higher precision), I believe it is OK because only an
i8/i16 are demanded from them, and so the results will be the same. At
least, they pass any testing I can think to run on them.

There are some tests that end up looking worse, but are quite artificial
due to passing half vector types through a call boundary. I would not
expect the vmull to realistically come up like that, and a vmlava is
likely better a lot of the time.

Differential Revision: https://reviews.llvm.org/D80524
---
 llvm/lib/Target/ARM/ARMInstrMVE.td            | 18 ++++++++---
 llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll | 32 +++++++++----------
 2 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index a5ea45bdaf152..4f72730d73be2 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1019,22 +1019,32 @@ def ARMVMLALVAu      : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>;
 let Predicates = [HasMVEInt] in {
   def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
             (i32 (MVE_VMLADAVu32 $src1, $src2))>;
-  def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
-            (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
-  def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
-            (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+  def : Pat<(i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
+            (i32 (MVE_VMLADAVu16 $src1, $src2))>;
   def : Pat<(i32 (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
             (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
   def : Pat<(i32 (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
             (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+  def : Pat<(i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
+            (i32 (MVE_VMLADAVu8 $src1, $src2))>;
+  def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+            (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+  def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+            (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
 
   def : Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
                                           (i32 tGPREven:$src3))),
             (i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>;
+  def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
+                                          (i32 tGPREven:$src3))),
+            (i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>;
   def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
             (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
   def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
             (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+  def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
+                                          (i32 tGPREven:$src3))),
+            (i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>;
   def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
             (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
   def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index 0716f585baf60..67a0075a7245a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -135,8 +135,7 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: add_v8i16_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i16 q0, q0, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vmlav.u16 r0, q0, q1
 ; CHECK-NEXT:    uxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -438,8 +437,9 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: add_v8i8_v8i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullb.u8 q0, q0, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vmlav.u16 r0, q0, q1
 ; CHECK-NEXT:    uxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -453,8 +453,9 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: add_v8i8_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullb.s8 q0, q0, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmlav.u16 r0, q0, q1
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -468,8 +469,7 @@ entry:
 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i8 q0, q0, q1
-; CHECK-NEXT:    vaddv.u8 r0, q0
+; CHECK-NEXT:    vmlav.u8 r0, q0, q1
 ; CHECK-NEXT:    uxtb r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1086,8 +1086,7 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
 ; CHECK-LABEL: add_v8i16_v8i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i16 q0, q0, q1
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vmlava.u16 r0, q0, q1
 ; CHECK-NEXT:    uxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1408,8 +1407,9 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullb.u8 q0, q0, q1
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vmlava.u16 r0, q0, q1
 ; CHECK-NEXT:    uxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1424,8 +1424,9 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmullb.s8 q0, q0, q1
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmlava.u16 r0, q0, q1
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1440,8 +1441,7 @@ entry:
 define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
 ; CHECK-LABEL: add_v16i8_v16i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.i8 q0, q0, q1
-; CHECK-NEXT:    vaddva.u8 r0, q0
+; CHECK-NEXT:    vmlava.u8 r0, q0, q1
 ; CHECK-NEXT:    uxtb r0, r0
 ; CHECK-NEXT:    bx lr
 entry:

From 68c50708d1f2b9aee3f10ec710df0b1387f701e5 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Fri, 29 May 2020 08:25:35 -0700
Subject: [PATCH 533/770] unwind: use a more portable endianness check in EHABI

The ARM specific code was trying to determine endianness using the
`__LITTLE_ENDIAN__` macro which is not guaranteed to be defined.
When it is not defined, libunwind builds the big-endian code even
when the compiler is building for a little-endian target.

This change allows building libunwind with the `musl-gcc` toolchain
which does not define `__LITTLE_ENDIAN__`.  Use `__BYTE_ORDER__`
instead.

Patch by Idan Freiberg!
---
 libunwind/src/Unwind-EHABI.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/libunwind/src/Unwind-EHABI.cpp b/libunwind/src/Unwind-EHABI.cpp
index a8a64cbfd7e5e..32b5cbc3be92e 100644
--- a/libunwind/src/Unwind-EHABI.cpp
+++ b/libunwind/src/Unwind-EHABI.cpp
@@ -31,10 +31,12 @@ namespace {
 // signinficant byte.
 uint8_t getByte(const uint32_t* data, size_t offset) {
   const uint8_t* byteData = reinterpret_cast<const uint8_t*>(data);
-#ifdef __LITTLE_ENDIAN__
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
   return byteData[(offset & ~(size_t)0x03) + (3 - (offset & (size_t)0x03))];
-#else
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   return byteData[offset];
+#else
+#error "Unable to determine endianess"
 #endif
 }
 
@@ -943,10 +945,12 @@ _Unwind_VRS_Pop(_Unwind_Context *context, _Unwind_VRS_RegClass regclass,
         // SP is only 32-bit aligned so don't copy 64-bit at a time.
         uint64_t w0 = *sp++;
         uint64_t w1 = *sp++;
-#ifdef __LITTLE_ENDIAN__
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         uint64_t value = (w1 << 32) | w0;
-#else
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
         uint64_t value = (w0 << 32) | w1;
+#else
+#error "Unable to determine endianess"
 #endif
         if (_Unwind_VRS_Set(context, regclass, i, representation, &value) !=
             _UVRSR_OK)

From 977f00123a6d94c634d22356cae1da2a22f0e3df Mon Sep 17 00:00:00 2001
From: Fred Riss <friss@apple.com>
Date: Fri, 29 May 2020 08:29:11 -0700
Subject: [PATCH 534/770] [lldb/test] Fix TestAppleSimulatorOSType when
 multiple runtimes are installed

One can have multiple simulator runtimes installed, supporting
various generations of OSs. The logic in TestAppleSimulatorOSType
might select a runtime older than the one targeted by the current
tools, preventing the executable from running. This commit changes
the test to look for the most recent runtime available instead.
---
 .../test/API/tools/lldb-server/TestAppleSimulatorOSType.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
index 86b54dd3e8e5e..a259ef66832b8 100644
--- a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
+++ b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
@@ -18,6 +18,7 @@ def check_simulator_ostype(self, sdk, platform, arch='x86_64'):
         sim_devices = json.loads(sim_devices_str)['devices']
         # Find an available simulator for the requested platform
         deviceUDID = None
+        deviceRuntime = None
         for simulator in sim_devices:
             if isinstance(simulator,dict):
                 runtime = simulator['name']
@@ -32,9 +33,11 @@ def check_simulator_ostype(self, sdk, platform, arch='x86_64'):
                     continue
                 if 'isAvailable' in device and device['isAvailable'] != True:
                     continue
+                if deviceRuntime and runtime < deviceRuntime:
+                    continue
                 deviceUDID = device['udid']
-                break
-            if deviceUDID != None:
+                deviceRuntime = runtime
+                # Stop searching in this runtime
                 break
 
         # Launch the process using simctl

From 9e0b52e2e68412a9a2add18697f4246e5e5ee5e3 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Fri, 29 May 2020 16:35:21 +0100
Subject: [PATCH 535/770] [AMDGPU] Remove duplicate test cases

The two "2sin" test cases were identical to the "sin_2x" test cases just
above.
---
 llvm/test/CodeGen/AMDGPU/llvm.sin.ll | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
index 45b4e874da0e4..c5736cd1d2324 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -80,34 +80,6 @@ define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x)
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_safe_2sin_f32:
-; GCN: v_add_f32
-; GCN: v_mul_f32
-; SICIVI: v_fract_f32
-; GFX9-NOT: v_fract_f32
-; GCN: v_sin_f32
-; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
-   %y = fmul float 2.0, %x
-   %sin = call float @llvm.sin.f32(float %y)
-   store float %sin, float addrspace(1)* %out
-   ret void
-}
-
-; FUNC-LABEL: {{^}}test_unsafe_2sin_f32:
-; GCN: 0x3ea2f983
-; GCN: v_mul_f32
-; SICIVI: v_fract_f32
-; GFX9-NOT: v_fract_f32
-; GCN: v_sin_f32
-; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
-   %y = fmul float 2.0, %x
-   %sin = call float @llvm.sin.f32(float %y)
-   store float %sin, float addrspace(1)* %out
-   ret void
-}
-
 ; FUNC-LABEL: {{^}}sin_v4f32:
 ; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}

From 634258b80606c4bb8192077239a089ae5842781a Mon Sep 17 00:00:00 2001
From: Gabor Marton <gabor.marton@ericsson.com>
Date: Fri, 15 May 2020 11:25:40 +0200
Subject: [PATCH 536/770] [analyzer] StdLibraryFunctionsChecker: Add support to
 lookup types

Summary:
In this patch I am trying to get rid of the `Irrelevant` types from the
signatures of the functions from the standard C library. For that I've
introduced `lookupType()` to be able to look up arbitrary types in the global
scope. This makes it possible to define the signatures precisely.

Note 1) `fread`'s signature is now fixed to have the proper `FILE *restrict`
type when C99 is the language.
Note 2) There are still existing `Irrelevant` types, but they are all from
POSIX. I am planning to address those together with the missing POSIX functions
(in D79433).

Reviewers: xazax.hun, NoQ, Szelethus, balazske

Subscribers: whisperity, baloghadamsoftware, szepet, rnkovacs, a.sidorin, mikhail.ramalho, donat.nagy, dkrupp, gamesh411, Charusso, steakhal, ASDenysPetrov, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80016
---
 .../Checkers/StdLibraryFunctionsChecker.cpp   | 68 +++++++++++++++----
 .../std-c-library-functions-arg-constraints.c |  2 +-
 .../Analysis/std-c-library-functions-lookup.c | 19 ++++++
 .../std-c-library-functions-lookup.cpp        | 23 +++++++
 clang/test/Analysis/std-c-library-functions.c |  6 +-
 5 files changed, 102 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/Analysis/std-c-library-functions-lookup.c
 create mode 100644 clang/test/Analysis/std-c-library-functions-lookup.cpp

diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
index 578f6ad46b849..6feae56502f1f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
@@ -725,6 +725,26 @@ StdLibraryFunctionsChecker::findFunctionSummary(const CallEvent &Call,
   return findFunctionSummary(FD, C);
 }
 
+llvm::Optional<QualType> lookupType(StringRef Name, const ASTContext &ACtx) {
+  IdentifierInfo &II = ACtx.Idents.get(Name);
+  auto LookupRes = ACtx.getTranslationUnitDecl()->lookup(&II);
+  if (LookupRes.size() == 0)
+    return None;
+
+  // Prioritize typedef declarations.
+  // This is needed in case of C struct typedefs. E.g.:
+  //   typedef struct FILE FILE;
+  // In this case, we have a RecordDecl 'struct FILE' with the name 'FILE' and
+  // we have a TypedefDecl with the name 'FILE'.
+  for (Decl *D : LookupRes) {
+    if (auto *TD = dyn_cast<TypedefNameDecl>(D))
+      return ACtx.getTypeDeclType(TD).getCanonicalType();
+  }
+  assert(LookupRes.size() == 1 && "Type identifier should be unique");
+  auto *D = cast<TypeDecl>(LookupRes.front());
+  return ACtx.getTypeDeclType(D).getCanonicalType();
+}
+
 void StdLibraryFunctionsChecker::initFunctionSummaries(
     CheckerContext &C) const {
   if (!FunctionSummaryMap.empty())
@@ -747,13 +767,16 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
   const QualType SizeTy = ACtx.getSizeType();
   const QualType VoidPtrTy = ACtx.VoidPtrTy; // void *
   const QualType VoidPtrRestrictTy =
-      ACtx.getRestrictType(VoidPtrTy); // void *restrict
+      ACtx.getLangOpts().C99 ? ACtx.getRestrictType(VoidPtrTy) // void *restrict
+                             : VoidPtrTy;
   const QualType ConstVoidPtrTy =
       ACtx.getPointerType(ACtx.VoidTy.withConst()); // const void *
   const QualType ConstCharPtrTy =
       ACtx.getPointerType(ACtx.CharTy.withConst()); // const char *
   const QualType ConstVoidPtrRestrictTy =
-      ACtx.getRestrictType(ConstVoidPtrTy); // const void *restrict
+      ACtx.getLangOpts().C99
+          ? ACtx.getRestrictType(ConstVoidPtrTy) // const void *restrict
+          : ConstVoidPtrTy;
 
   const RangeInt IntMax = BVF.getMaxValue(IntTy).getLimitedValue();
   const RangeInt LongMax = BVF.getMaxValue(LongTy).getLimitedValue();
@@ -871,10 +894,20 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
     return std::make_shared<NotNullConstraint>(ArgN);
   };
 
+  Optional<QualType> FileTy = lookupType("FILE", ACtx);
+  Optional<QualType> FilePtrTy, FilePtrRestrictTy;
+  if (FileTy) {
+    // FILE *
+    FilePtrTy = ACtx.getPointerType(*FileTy);
+    // FILE *restrict
+    FilePtrRestrictTy =
+        ACtx.getLangOpts().C99 ? ACtx.getRestrictType(*FilePtrTy) : *FilePtrTy;
+  }
+
   using RetType = QualType;
   // Templates for summaries that are reused by many functions.
   auto Getc = [&]() {
-    return Summary(ArgTypes{Irrelevant}, RetType{IntTy}, NoEvalCall)
+    return Summary(ArgTypes{*FilePtrTy}, RetType{IntTy}, NoEvalCall)
         .Case({ReturnValueCondition(WithinRange,
                                     {{EOFv, EOFv}, {0, UCharRangeMax}})});
   };
@@ -885,17 +918,18 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                ReturnValueCondition(WithinRange, Range(-1, Max))});
   };
   auto Fread = [&]() {
-    return Summary(ArgTypes{VoidPtrRestrictTy, Irrelevant, SizeTy, Irrelevant},
-                   RetType{SizeTy}, NoEvalCall)
+    return Summary(
+               ArgTypes{VoidPtrRestrictTy, SizeTy, SizeTy, *FilePtrRestrictTy},
+               RetType{SizeTy}, NoEvalCall)
         .Case({
             ReturnValueCondition(LessThanOrEq, ArgNo(2)),
         })
         .ArgConstraint(NotNull(ArgNo(0)));
   };
   auto Fwrite = [&]() {
-    return Summary(
-               ArgTypes{ConstVoidPtrRestrictTy, Irrelevant, SizeTy, Irrelevant},
-               RetType{SizeTy}, NoEvalCall)
+    return Summary(ArgTypes{ConstVoidPtrRestrictTy, SizeTy, SizeTy,
+                            *FilePtrRestrictTy},
+                   RetType{SizeTy}, NoEvalCall)
         .Case({
             ReturnValueCondition(LessThanOrEq, ArgNo(2)),
         })
@@ -1042,23 +1076,33 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
                  ReturnValueCondition(WithinRange, SingleValue(0))}));
 
   // The getc() family of functions that returns either a char or an EOF.
-  addToFunctionSummaryMap("getc", Getc());
-  addToFunctionSummaryMap("fgetc", Getc());
+  if (FilePtrTy) {
+    addToFunctionSummaryMap("getc", Getc());
+    addToFunctionSummaryMap("fgetc", Getc());
+  }
   addToFunctionSummaryMap(
       "getchar", Summary(ArgTypes{}, RetType{IntTy}, NoEvalCall)
                      .Case({ReturnValueCondition(
                          WithinRange, {{EOFv, EOFv}, {0, UCharRangeMax}})}));
 
   // read()-like functions that never return more than buffer size.
+  if (FilePtrRestrictTy) {
+    addToFunctionSummaryMap("fread", Fread());
+    addToFunctionSummaryMap("fwrite", Fwrite());
+  }
+
   // We are not sure how ssize_t is defined on every platform, so we
   // provide three variants that should cover common cases.
+  // FIXME these are actually defined by POSIX and not by the C standard, we
+  // should handle them together with the rest of the POSIX functions.
   addToFunctionSummaryMap("read", {Read(IntTy, IntMax), Read(LongTy, LongMax),
                                    Read(LongLongTy, LongLongMax)});
   addToFunctionSummaryMap("write", {Read(IntTy, IntMax), Read(LongTy, LongMax),
                                     Read(LongLongTy, LongLongMax)});
-  addToFunctionSummaryMap("fread", Fread());
-  addToFunctionSummaryMap("fwrite", Fwrite());
+
   // getline()-like functions either fail or read at least the delimiter.
+  // FIXME these are actually defined by POSIX and not by the C standard, we
+  // should handle them together with the rest of the POSIX functions.
   addToFunctionSummaryMap("getline",
                           {Getline(IntTy, IntMax), Getline(LongTy, LongMax),
                            Getline(LongLongTy, LongLongMax)});
diff --git a/clang/test/Analysis/std-c-library-functions-arg-constraints.c b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
index 60338128ec89d..b99248d337b34 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-constraints.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
@@ -64,7 +64,7 @@ void test_alnum_symbolic2(int x) {
 
 typedef struct FILE FILE;
 typedef typeof(sizeof(int)) size_t;
-size_t fread(void *restrict, size_t, size_t, FILE *);
+size_t fread(void *restrict, size_t, size_t, FILE *restrict);
 void test_notnull_concrete(FILE *fp) {
   fread(0, sizeof(int), 10, fp); // \
   // report-warning{{Function argument constraint is not satisfied}} \
diff --git a/clang/test/Analysis/std-c-library-functions-lookup.c b/clang/test/Analysis/std-c-library-functions-lookup.c
new file mode 100644
index 0000000000000..495562a2a5a47
--- /dev/null
+++ b/clang/test/Analysis/std-c-library-functions-lookup.c
@@ -0,0 +1,19 @@
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=apiModeling.StdCLibraryFunctions \
+// RUN:   -analyzer-config apiModeling.StdCLibraryFunctions:DisplayLoadedSummaries=true \
+// RUN:   -analyzer-checker=debug.ExprInspection \
+// RUN:   -analyzer-config eagerly-assume=false \
+// RUN:   -triple i686-unknown-linux 2>&1 | FileCheck %s
+
+// CHECK: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict)
+
+typedef typeof(sizeof(int)) size_t;
+typedef struct FILE FILE;
+size_t fread(void *restrict, size_t, size_t, FILE *restrict);
+
+// Must have at least one call expression to initialize the summary map.
+int bar(void);
+void foo() {
+  bar();
+}
diff --git a/clang/test/Analysis/std-c-library-functions-lookup.cpp b/clang/test/Analysis/std-c-library-functions-lookup.cpp
new file mode 100644
index 0000000000000..888ab27d501fc
--- /dev/null
+++ b/clang/test/Analysis/std-c-library-functions-lookup.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=apiModeling.StdCLibraryFunctions \
+// RUN:   -analyzer-config apiModeling.StdCLibraryFunctions:DisplayLoadedSummaries=true \
+// RUN:   -analyzer-checker=debug.ExprInspection \
+// RUN:   -analyzer-config eagerly-assume=false \
+// RUN:   -triple i686-unknown-linux 2>&1 | FileCheck %s
+
+//      CHECK: Loaded summary for: size_t fread(void *, size_t, size_t, FILE *)
+//  CHECK-NOT: Loaded summary for: size_t fread(void *, size_t, size_t, MyFile *)
+
+typedef unsigned int size_t;
+typedef struct FILE FILE;
+size_t fread(void *, size_t, size_t, FILE *);
+
+struct MyFile;
+size_t fread(void *, size_t, size_t, MyFile *);
+
+// Must have at least one call expression to initialize the summary map.
+int bar(void);
+void foo() {
+  bar();
+}
diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c
index c4762d74ea633..dbe7d102bb623 100644
--- a/clang/test/Analysis/std-c-library-functions.c
+++ b/clang/test/Analysis/std-c-library-functions.c
@@ -53,10 +53,10 @@
 // CHECK-NEXT: Loaded summary for: int getc(FILE *)
 // CHECK-NEXT: Loaded summary for: int fgetc(FILE *)
 // CHECK-NEXT: Loaded summary for: int getchar()
+// CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict)
+// CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict)
 // CHECK-NEXT: Loaded summary for: ssize_t read(int, void *, size_t)
 // CHECK-NEXT: Loaded summary for: ssize_t write(int, const void *, size_t)
-// CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *)
-// CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict)
 // CHECK-NEXT: Loaded summary for: ssize_t getline(char **, size_t *, FILE *)
 
 void clang_analyzer_eval(int);
@@ -104,7 +104,7 @@ void test_read_write(int fd, char *buf) {
   }
 }
 
-size_t fread(void *restrict, size_t, size_t, FILE *);
+size_t fread(void *restrict, size_t, size_t, FILE *restrict);
 size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict);
 void test_fread_fwrite(FILE *fp, int *buf) {
 

From d8a78889f633c5a60f0a1c7cb60bf9b18817d9d7 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Wed, 27 May 2020 15:00:00 +0100
Subject: [PATCH 537/770] [CodeGen] Fix warning in visitShuffleVector

Make sure we only ask for the number of elements after we've
bailed out for scalable vectors.

Differential revision: https://reviews.llvm.org/D80632
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d40f7f92c4cb0..1dd5cfe1b49fa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3458,7 +3458,6 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
   EVT SrcVT = Src1.getValueType();
-  unsigned SrcNumElts = SrcVT.getVectorNumElements();
 
   if (all_of(Mask, [](int Elem) { return Elem == 0; }) &&
       VT.isScalableVector()) {
@@ -3475,6 +3474,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
   // for targets that support a SPLAT_VECTOR for non-scalable vector types.
   assert(!VT.isScalableVector() && "Unsupported scalable vector shuffle");
 
+  unsigned SrcNumElts = SrcVT.getVectorNumElements();
   unsigned MaskNumElts = Mask.size();
 
   if (SrcNumElts == MaskNumElts) {

From a38788201e44c5eb1abf3c71d0cad22d291dc9a3 Mon Sep 17 00:00:00 2001
From: "Kevin P. Neal" <kevin.neal@sas.com>
Date: Fri, 29 May 2020 12:11:57 -0400
Subject: [PATCH 538/770] Fix errors in use of strictfp attribute.

Errors spotted with use of: https://reviews.llvm.org/D68233
---
 .../test/CodeGen/AArch64/strict-fp-int-promote.ll | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/strict-fp-int-promote.ll b/llvm/test/CodeGen/AArch64/strict-fp-int-promote.ll
index 296f058d19288..6a47fee824255 100644
--- a/llvm/test/CodeGen/AArch64/strict-fp-int-promote.ll
+++ b/llvm/test/CodeGen/AArch64/strict-fp-int-promote.ll
@@ -29,9 +29,9 @@ define i32 @test() #0 {
 ; SUBOPTIMAL-NEXT:    and w0, w8, #0x1
 ; SUBOPTIMAL-NEXT:    ret
 entry:
-  %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  %conv1 = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict")
+  %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %conv1 = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict") #1
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
 }
@@ -57,11 +57,12 @@ define i32 @test2() #0 {
 ; SUBOPTIMAL-NEXT:    and w0, w8, #0x1
 ; SUBOPTIMAL-NEXT:    ret
 entry:
-  %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  %conv1 = call float @llvm.experimental.constrained.uitofp.f32.i16(i16 1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict")
+  %conv = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %conv1 = call float @llvm.experimental.constrained.uitofp.f32.i16(i16 1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(float %conv, float %conv1, metadata !"oeq", metadata !"fpexcept.strict") #1
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
 }
 
-attributes #0 = { noinline optnone }
+attributes #0 = { strictfp noinline optnone }
+attributes #1 = { strictfp }

From 66d1899e2ffd2581f774ecf51ced4a325e7d004b Mon Sep 17 00:00:00 2001
From: "Kevin P. Neal" <kevin.neal@sas.com>
Date: Fri, 29 May 2020 12:15:54 -0400
Subject: [PATCH 539/770] Fix errors in use of strictfp attribute.

Errors spotted with use of: https://reviews.llvm.org/D68233
---
 llvm/test/CodeGen/PowerPC/fp-strict-f128.ll   |  38 ++---
 llvm/test/CodeGen/PowerPC/fp-strict-minmax.ll |  18 ++-
 llvm/test/CodeGen/PowerPC/fp-strict.ll        | 152 +++++++++---------
 .../ppcf128-constrained-fp-intrinsics.ll      | 143 ++++++++--------
 4 files changed, 180 insertions(+), 171 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-f128.ll b/llvm/test/CodeGen/PowerPC/fp-strict-f128.ll
index 2f92382f47095..21ddb799141d0 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict-f128.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-f128.ll
@@ -9,7 +9,7 @@ declare fp128 @llvm.experimental.constrained.fdiv.f128(fp128, fp128, metadata, m
 declare fp128 @llvm.experimental.constrained.fma.f128(fp128, fp128, fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
 
-define fp128 @fadd_f128(fp128 %f1, fp128 %f2) {
+define fp128 @fadd_f128(fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fadd_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsaddqp v2, v2, v3
@@ -17,11 +17,11 @@ define fp128 @fadd_f128(fp128 %f1, fp128 %f2) {
   %res = call fp128 @llvm.experimental.constrained.fadd.f128(
                         fp128 %f1, fp128 %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret fp128 %res
 }
 
-define fp128 @fsub_f128(fp128 %f1, fp128 %f2) {
+define fp128 @fsub_f128(fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fsub_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xssubqp v2, v2, v3
@@ -29,11 +29,11 @@ define fp128 @fsub_f128(fp128 %f1, fp128 %f2) {
   %res = call fp128 @llvm.experimental.constrained.fsub.f128(
                         fp128 %f1, fp128 %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret fp128 %res
 }
 
-define fp128 @fmul_f128(fp128 %f1, fp128 %f2) {
+define fp128 @fmul_f128(fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fmul_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmulqp v2, v2, v3
@@ -41,11 +41,11 @@ define fp128 @fmul_f128(fp128 %f1, fp128 %f2) {
   %res = call fp128 @llvm.experimental.constrained.fmul.f128(
                         fp128 %f1, fp128 %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret fp128 %res
 }
 
-define fp128 @fdiv_f128(fp128 %f1, fp128 %f2) {
+define fp128 @fdiv_f128(fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fdiv_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsdivqp v2, v2, v3
@@ -53,11 +53,11 @@ define fp128 @fdiv_f128(fp128 %f1, fp128 %f2) {
   %res = call fp128 @llvm.experimental.constrained.fdiv.f128(
                         fp128 %f1, fp128 %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret fp128 %res
 }
 
-define fp128 @fmadd_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
+define fp128 @fmadd_f128(fp128 %f0, fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fmadd_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmaddqp v4, v2, v3
@@ -66,11 +66,11 @@ define fp128 @fmadd_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
   %res = call fp128 @llvm.experimental.constrained.fma.f128(
                         fp128 %f0, fp128 %f1, fp128 %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret fp128 %res
 }
 
-define fp128 @fmsub_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
+define fp128 @fmsub_f128(fp128 %f0, fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fmsub_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmsubqp v4, v2, v3
@@ -80,11 +80,11 @@ define fp128 @fmsub_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
   %res = call fp128 @llvm.experimental.constrained.fma.f128(
                         fp128 %f0, fp128 %f1, fp128 %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret fp128 %res
 }
 
-define fp128 @fnmadd_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
+define fp128 @fnmadd_f128(fp128 %f0, fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fnmadd_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsnmaddqp v4, v2, v3
@@ -93,12 +93,12 @@ define fp128 @fnmadd_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
   %fma = call fp128 @llvm.experimental.constrained.fma.f128(
                         fp128 %f0, fp128 %f1, fp128 %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg fp128 %fma
   ret fp128 %res
 }
 
-define fp128 @fnmsub_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
+define fp128 @fnmsub_f128(fp128 %f0, fp128 %f1, fp128 %f2) #0 {
 ; CHECK-LABEL: fnmsub_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsnmsubqp v4, v2, v3
@@ -108,13 +108,13 @@ define fp128 @fnmsub_f128(fp128 %f0, fp128 %f1, fp128 %f2) {
   %fma = call fp128 @llvm.experimental.constrained.fma.f128(
                         fp128 %f0, fp128 %f1, fp128 %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg fp128 %fma
   ret fp128 %res
 }
 
 
-define fp128 @fsqrt_f128(fp128 %f1) {
+define fp128 @fsqrt_f128(fp128 %f1) #0 {
 ; CHECK-LABEL: fsqrt_f128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xssqrtqp v2, v2
@@ -122,6 +122,8 @@ define fp128 @fsqrt_f128(fp128 %f1) {
   %res = call fp128 @llvm.experimental.constrained.sqrt.f128(
                         fp128 %f1,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret fp128 %res
 }
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-minmax.ll b/llvm/test/CodeGen/PowerPC/fp-strict-minmax.ll
index 14e8be5d7d39d..7663708e6b4e9 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict-minmax.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-minmax.ll
@@ -9,47 +9,49 @@ declare <2 x double> @llvm.experimental.constrained.maxnum.v2f64(<2 x double>, <
 declare <4 x float> @llvm.experimental.constrained.minnum.v4f32(<4 x float>, <4 x float>, metadata)
 declare <2 x double> @llvm.experimental.constrained.minnum.v2f64(<2 x double>, <2 x double>, metadata)
 
-define <4 x float> @fmaxnum_v4f32(<4 x float> %vf0, <4 x float> %vf1) {
+define <4 x float> @fmaxnum_v4f32(<4 x float> %vf0, <4 x float> %vf1) #0 {
 ; CHECK-LABEL: fmaxnum_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmaxsp v2, v2, v3
 ; CHECK-NEXT:    blr
   %res = call <4 x float> @llvm.experimental.constrained.maxnum.v4f32(
                         <4 x float> %vf0, <4 x float> %vf1,
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
-define <2 x double> @fmaxnum_v2f64(<2 x double> %vf0, <2 x double> %vf1) {
+define <2 x double> @fmaxnum_v2f64(<2 x double> %vf0, <2 x double> %vf1) #0 {
 ; CHECK-LABEL: fmaxnum_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmaxdp v2, v2, v3
 ; CHECK-NEXT:    blr
   %res = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64(
                         <2 x double> %vf0, <2 x double> %vf1,
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
 
-define <4 x float> @fminnum_v4f32(<4 x float> %vf0, <4 x float> %vf1) {
+define <4 x float> @fminnum_v4f32(<4 x float> %vf0, <4 x float> %vf1) #0 {
 ; CHECK-LABEL: fminnum_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvminsp v2, v2, v3
 ; CHECK-NEXT:    blr
   %res = call <4 x float> @llvm.experimental.constrained.minnum.v4f32(
                         <4 x float> %vf0, <4 x float> %vf1,
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
-define <2 x double> @fminnum_v2f64(<2 x double> %vf0, <2 x double> %vf1) {
+define <2 x double> @fminnum_v2f64(<2 x double> %vf0, <2 x double> %vf1) #0 {
 ; CHECK-LABEL: fminnum_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmindp v2, v2, v3
 ; CHECK-NEXT:    blr
   %res = call <2 x double> @llvm.experimental.constrained.minnum.v2f64(
                         <2 x double> %vf0, <2 x double> %vf1,
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict.ll b/llvm/test/CodeGen/PowerPC/fp-strict.ll
index 743f68029be97..04e6f967a2b8b 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict.ll
@@ -33,7 +33,7 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadat
 declare <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata)
 
-define float @fadd_f32(float %f1, float %f2) {
+define float @fadd_f32(float %f1, float %f2) #0 {
 ; CHECK-LABEL: fadd_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsaddsp f1, f1, f2
@@ -46,11 +46,11 @@ define float @fadd_f32(float %f1, float %f2) {
   %res = call float @llvm.experimental.constrained.fadd.f32(
                         float %f1, float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %res
 }
 
-define double @fadd_f64(double %f1, double %f2) {
+define double @fadd_f64(double %f1, double %f2) #0 {
 ; CHECK-LABEL: fadd_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsadddp f1, f1, f2
@@ -63,11 +63,11 @@ define double @fadd_f64(double %f1, double %f2) {
   %res = call double @llvm.experimental.constrained.fadd.f64(
                         double %f1, double %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %res
 }
 
-define <4 x float> @fadd_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fadd_v4f32(<4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fadd_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvaddsp v2, v2, v3
@@ -101,11 +101,11 @@ define <4 x float> @fadd_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
   %res = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(
                         <4 x float> %vf1, <4 x float> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
-define <2 x double> @fadd_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fadd_v2f64(<2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fadd_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvadddp v2, v2, v3
@@ -119,11 +119,11 @@ define <2 x double> @fadd_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
   %res = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(
                         <2 x double> %vf1, <2 x double> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
-define float @fsub_f32(float %f1, float %f2) {
+define float @fsub_f32(float %f1, float %f2) #0 {
 ; CHECK-LABEL: fsub_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xssubsp f1, f1, f2
@@ -137,11 +137,11 @@ define float @fsub_f32(float %f1, float %f2) {
   %res = call float @llvm.experimental.constrained.fsub.f32(
                         float %f1, float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %res;
 }
 
-define double @fsub_f64(double %f1, double %f2) {
+define double @fsub_f64(double %f1, double %f2) #0 {
 ; CHECK-LABEL: fsub_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xssubdp f1, f1, f2
@@ -155,11 +155,11 @@ define double @fsub_f64(double %f1, double %f2) {
   %res = call double @llvm.experimental.constrained.fsub.f64(
                         double %f1, double %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %res;
 }
 
-define <4 x float> @fsub_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fsub_v4f32(<4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fsub_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvsubsp v2, v2, v3
@@ -193,11 +193,11 @@ define <4 x float> @fsub_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
   %res = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(
                         <4 x float> %vf1, <4 x float> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res;
 }
 
-define <2 x double> @fsub_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fsub_v2f64(<2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fsub_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvsubdp v2, v2, v3
@@ -211,11 +211,11 @@ define <2 x double> @fsub_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
   %res = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(
                         <2 x double> %vf1, <2 x double> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res;
 }
 
-define float @fmul_f32(float %f1, float %f2) {
+define float @fmul_f32(float %f1, float %f2) #0 {
 ; CHECK-LABEL: fmul_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmulsp f1, f1, f2
@@ -229,11 +229,11 @@ define float @fmul_f32(float %f1, float %f2) {
   %res = call float @llvm.experimental.constrained.fmul.f32(
                         float %f1, float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %res;
 }
 
-define double @fmul_f64(double %f1, double %f2) {
+define double @fmul_f64(double %f1, double %f2) #0 {
 ; CHECK-LABEL: fmul_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmuldp f1, f1, f2
@@ -247,11 +247,11 @@ define double @fmul_f64(double %f1, double %f2) {
   %res = call double @llvm.experimental.constrained.fmul.f64(
                         double %f1, double %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %res;
 }
 
-define <4 x float> @fmul_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fmul_v4f32(<4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fmul_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmulsp v2, v2, v3
@@ -285,11 +285,11 @@ define <4 x float> @fmul_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
   %res = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(
                         <4 x float> %vf1, <4 x float> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res;
 }
 
-define <2 x double> @fmul_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fmul_v2f64(<2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fmul_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmuldp v2, v2, v3
@@ -303,11 +303,11 @@ define <2 x double> @fmul_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
   %res = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(
                         <2 x double> %vf1, <2 x double> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res;
 }
 
-define float @fdiv_f32(float %f1, float %f2) {
+define float @fdiv_f32(float %f1, float %f2) #0 {
 ; CHECK-LABEL: fdiv_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsdivsp f1, f1, f2
@@ -321,11 +321,11 @@ define float @fdiv_f32(float %f1, float %f2) {
   %res = call float @llvm.experimental.constrained.fdiv.f32(
                         float %f1, float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %res;
 }
 
-define double @fdiv_f64(double %f1, double %f2) {
+define double @fdiv_f64(double %f1, double %f2) #0 {
 ; CHECK-LABEL: fdiv_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsdivdp f1, f1, f2
@@ -339,11 +339,11 @@ define double @fdiv_f64(double %f1, double %f2) {
   %res = call double @llvm.experimental.constrained.fdiv.f64(
                         double %f1, double %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %res;
 }
 
-define <4 x float> @fdiv_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fdiv_v4f32(<4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fdiv_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvdivsp v2, v2, v3
@@ -377,11 +377,11 @@ define <4 x float> @fdiv_v4f32(<4 x float> %vf1, <4 x float> %vf2) {
   %res = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(
                         <4 x float> %vf1, <4 x float> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
-define <2 x double> @fdiv_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fdiv_v2f64(<2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fdiv_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvdivdp v2, v2, v3
@@ -395,11 +395,11 @@ define <2 x double> @fdiv_v2f64(<2 x double> %vf1, <2 x double> %vf2) {
   %res = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(
                         <2 x double> %vf1, <2 x double> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
-define double @no_fma_fold(double %f1, double %f2, double %f3) {
+define double @no_fma_fold(double %f1, double %f2, double %f3) #0 {
 ; CHECK-LABEL: no_fma_fold:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmuldp f0, f1, f2
@@ -414,15 +414,15 @@ define double @no_fma_fold(double %f1, double %f2, double %f3) {
   %mul = call double @llvm.experimental.constrained.fmul.f64(
                         double %f1, double %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %add = call double @llvm.experimental.constrained.fadd.f64(
                         double %mul, double %f3,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %add
 }
 
-define float @fmadd_f32(float %f0, float %f1, float %f2) {
+define float @fmadd_f32(float %f0, float %f1, float %f2) #0 {
 ; CHECK-LABEL: fmadd_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmaddasp f3, f1, f2
@@ -436,11 +436,11 @@ define float @fmadd_f32(float %f0, float %f1, float %f2) {
   %res = call float @llvm.experimental.constrained.fma.f32(
                         float %f0, float %f1, float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %res
 }
 
-define double @fmadd_f64(double %f0, double %f1, double %f2) {
+define double @fmadd_f64(double %f0, double %f1, double %f2) #0 {
 ; CHECK-LABEL: fmadd_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmaddadp f3, f1, f2
@@ -454,11 +454,11 @@ define double @fmadd_f64(double %f0, double %f1, double %f2) {
   %res = call double @llvm.experimental.constrained.fma.f64(
                         double %f0, double %f1, double %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %res
 }
 
-define <4 x float> @fmadd_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fmadd_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fmadd_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmaddasp v4, v2, v3
@@ -499,11 +499,11 @@ define <4 x float> @fmadd_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float>
   %res = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
                         <4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
-define <2 x double> @fmadd_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fmadd_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fmadd_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmaddadp v4, v2, v3
@@ -518,11 +518,11 @@ define <2 x double> @fmadd_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x doub
   %res = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
                         <2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
-define float @fmsub_f32(float %f0, float %f1, float %f2) {
+define float @fmsub_f32(float %f0, float %f1, float %f2) #0 {
 ; CHECK-LABEL: fmsub_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmsubasp f3, f1, f2
@@ -537,11 +537,11 @@ define float @fmsub_f32(float %f0, float %f1, float %f2) {
   %res = call float @llvm.experimental.constrained.fma.f32(
                         float %f0, float %f1, float %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %res
 }
 
-define double @fmsub_f64(double %f0, double %f1, double %f2) {
+define double @fmsub_f64(double %f0, double %f1, double %f2) #0 {
 ; CHECK-LABEL: fmsub_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsmsubadp f3, f1, f2
@@ -556,11 +556,11 @@ define double @fmsub_f64(double %f0, double %f1, double %f2) {
   %res = call double @llvm.experimental.constrained.fma.f64(
                         double %f0, double %f1, double %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %res
 }
 
-define <4 x float> @fmsub_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fmsub_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fmsub_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmsubasp v4, v2, v3
@@ -605,11 +605,11 @@ define <4 x float> @fmsub_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float>
   %res = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
                         <4 x float> %vf0, <4 x float> %vf1, <4 x float> %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
-define <2 x double> @fmsub_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fmsub_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fmsub_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmsubadp v4, v2, v3
@@ -625,11 +625,11 @@ define <2 x double> @fmsub_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x doub
   %res = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
                         <2 x double> %vf0, <2 x double> %vf1, <2 x double> %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
-define float @fnmadd_f32(float %f0, float %f1, float %f2) {
+define float @fnmadd_f32(float %f0, float %f1, float %f2) #0 {
 ; CHECK-LABEL: fnmadd_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsnmaddasp f3, f1, f2
@@ -643,12 +643,12 @@ define float @fnmadd_f32(float %f0, float %f1, float %f2) {
   %fma = call float @llvm.experimental.constrained.fma.f32(
                         float %f0, float %f1, float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg float %fma
   ret float %res
 }
 
-define double @fnmadd_f64(double %f0, double %f1, double %f2) {
+define double @fnmadd_f64(double %f0, double %f1, double %f2) #0 {
 ; CHECK-LABEL: fnmadd_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsnmaddadp f3, f1, f2
@@ -662,12 +662,12 @@ define double @fnmadd_f64(double %f0, double %f1, double %f2) {
   %fma = call double @llvm.experimental.constrained.fma.f64(
                         double %f0, double %f1, double %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg double %fma
   ret double %res
 }
 
-define <4 x float> @fnmadd_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fnmadd_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fnmadd_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvmaddasp v4, v2, v3
@@ -711,12 +711,12 @@ define <4 x float> @fnmadd_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float>
   %fma = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
                         <4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg <4 x float> %fma
   ret <4 x float> %res
 }
 
-define <2 x double> @fnmadd_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fnmadd_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fnmadd_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvnmaddadp v4, v2, v3
@@ -731,12 +731,12 @@ define <2 x double> @fnmadd_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x dou
   %fma = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
                         <2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg <2 x double> %fma
   ret <2 x double> %res
 }
 
-define float @fnmsub_f32(float %f0, float %f1, float %f2) {
+define float @fnmsub_f32(float %f0, float %f1, float %f2) #0 {
 ; CHECK-LABEL: fnmsub_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsnmsubasp f3, f1, f2
@@ -751,12 +751,12 @@ define float @fnmsub_f32(float %f0, float %f1, float %f2) {
   %fma = call float @llvm.experimental.constrained.fma.f32(
                         float %f0, float %f1, float %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg float %fma
   ret float %res
 }
 
-define double @fnmsub_f64(double %f0, double %f1, double %f2) {
+define double @fnmsub_f64(double %f0, double %f1, double %f2) #0 {
 ; CHECK-LABEL: fnmsub_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xsnmsubadp f3, f1, f2
@@ -771,12 +771,12 @@ define double @fnmsub_f64(double %f0, double %f1, double %f2) {
   %fma = call double @llvm.experimental.constrained.fma.f64(
                         double %f0, double %f1, double %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg double %fma
   ret double %res
 }
 
-define <4 x float> @fnmsub_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) {
+define <4 x float> @fnmsub_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float> %vf2) #0 {
 ; CHECK-LABEL: fnmsub_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvnmsubasp v4, v2, v3
@@ -822,12 +822,12 @@ define <4 x float> @fnmsub_v4f32(<4 x float> %vf0, <4 x float> %vf1, <4 x float>
   %fma = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
                         <4 x float> %vf0, <4 x float> %vf1, <4 x float> %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg <4 x float> %fma
   ret <4 x float> %res
 }
 
-define <2 x double> @fnmsub_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) {
+define <2 x double> @fnmsub_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x double> %vf2) #0 {
 ; CHECK-LABEL: fnmsub_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvnmsubadp v4, v2, v3
@@ -843,12 +843,12 @@ define <2 x double> @fnmsub_v2f64(<2 x double> %vf0, <2 x double> %vf1, <2 x dou
   %fma = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
                         <2 x double> %vf0, <2 x double> %vf1, <2 x double> %neg,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   %res = fneg <2 x double> %fma
   ret <2 x double> %res
 }
 
-define float @fsqrt_f32(float %f1) {
+define float @fsqrt_f32(float %f1) #0 {
 ; CHECK-LABEL: fsqrt_f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xssqrtsp f1, f1
@@ -861,11 +861,11 @@ define float @fsqrt_f32(float %f1) {
   %res = call float @llvm.experimental.constrained.sqrt.f32(
                         float %f1,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %res
 }
 
-define double @fsqrt_f64(double %f1) {
+define double @fsqrt_f64(double %f1) #0 {
 ; CHECK-LABEL: fsqrt_f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xssqrtdp f1, f1
@@ -878,11 +878,11 @@ define double @fsqrt_f64(double %f1) {
   %res = call double @llvm.experimental.constrained.sqrt.f64(
                         double %f1,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %res
 }
 
-define <4 x float> @fsqrt_v4f32(<4 x float> %vf1) {
+define <4 x float> @fsqrt_v4f32(<4 x float> %vf1) #0 {
 ; CHECK-LABEL: fsqrt_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvsqrtsp v2, v2
@@ -910,11 +910,11 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %vf1) {
   %res = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(
                         <4 x float> %vf1,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
-define <2 x double> @fsqrt_v2f64(<2 x double> %vf1) {
+define <2 x double> @fsqrt_v2f64(<2 x double> %vf1) #0 {
 ; CHECK-LABEL: fsqrt_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvsqrtdp v2, v2
@@ -928,6 +928,8 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %vf1) {
   %res = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(
                         <2 x double> %vf1,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
index 8284607482a36..c9d9cf870e49f 100644
--- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -O3 -mtriple=powerpc64le-linux-gnu -mcpu=pwr9 < %s | FileCheck --check-prefix=PC64LE9 %s
 ; RUN: llc -O3 -mtriple=powerpc64-linux-gnu < %s | FileCheck --check-prefix=PC64 %s
 
-define ppc_fp128 @test_fadd_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_fadd_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_fadd_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -44,11 +44,11 @@ entry:
                     ppc_fp128 %first,
                     ppc_fp128 %second,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %add
 }
 
-define ppc_fp128 @test_fsub_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_fsub_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_fsub_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -89,11 +89,11 @@ entry:
                     ppc_fp128 %first,
                     ppc_fp128 %second,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %sub
 }
 
-define ppc_fp128 @test_fmul_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_fmul_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_fmul_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -134,11 +134,11 @@ entry:
                     ppc_fp128 %first,
                     ppc_fp128 %second,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %mul
 }
 
-define ppc_fp128 @test_fdiv_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_fdiv_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_fdiv_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -179,11 +179,11 @@ entry:
                     ppc_fp128 %first,
                     ppc_fp128 %second,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %div
 }
 
-define ppc_fp128 @test_frem_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_frem_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_frem_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -224,11 +224,11 @@ entry:
                     ppc_fp128 %first,
                     ppc_fp128 %second,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %rem
 }
 
-define ppc_fp128 @test_fma_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second, ppc_fp128 %third) nounwind {
+define ppc_fp128 @test_fma_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second, ppc_fp128 %third) #0 {
 ; PC64LE-LABEL: test_fma_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -270,11 +270,11 @@ entry:
                     ppc_fp128 %second,
                     ppc_fp128 %third,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %add
 }
 
-define ppc_fp128 @test_sqrt_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_sqrt_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_sqrt_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -314,11 +314,11 @@ entry:
   %sqrt = call ppc_fp128 @llvm.experimental.constrained.sqrt.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %sqrt
 }
 
-define ppc_fp128 @test_pow_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_pow_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_pow_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -359,11 +359,11 @@ entry:
                     ppc_fp128 %first,
                     ppc_fp128 %second,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %pow
 }
 
-define ppc_fp128 @test_powi_ppc_fp128(ppc_fp128 %first, i32 %second) nounwind {
+define ppc_fp128 @test_powi_ppc_fp128(ppc_fp128 %first, i32 %second) #0 {
 ; PC64LE-LABEL: test_powi_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -407,11 +407,11 @@ entry:
                     ppc_fp128 %first,
                     i32 %second,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %powi
 }
 
-define ppc_fp128 @test_sin_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_sin_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_sin_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -451,11 +451,11 @@ entry:
   %sin = call ppc_fp128 @llvm.experimental.constrained.sin.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %sin
 }
 
-define ppc_fp128 @test_cos_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_cos_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_cos_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -495,11 +495,11 @@ entry:
   %cos = call ppc_fp128 @llvm.experimental.constrained.cos.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %cos
 }
 
-define ppc_fp128 @test_exp_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_exp_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_exp_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -539,11 +539,11 @@ entry:
   %exp = call ppc_fp128 @llvm.experimental.constrained.exp.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %exp
 }
 
-define ppc_fp128 @test_exp2_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_exp2_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_exp2_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -583,11 +583,11 @@ entry:
   %exp2 = call ppc_fp128 @llvm.experimental.constrained.exp2.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %exp2
 }
 
-define ppc_fp128 @test_log_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_log_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_log_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -627,11 +627,11 @@ entry:
   %log = call ppc_fp128 @llvm.experimental.constrained.log.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %log
 }
 
-define ppc_fp128 @test_log2_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_log2_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_log2_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -671,11 +671,11 @@ entry:
   %log2 = call ppc_fp128 @llvm.experimental.constrained.log2.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %log2
 }
 
-define ppc_fp128 @test_log10_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_log10_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_log10_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -715,11 +715,11 @@ entry:
   %log10 = call ppc_fp128 @llvm.experimental.constrained.log10.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %log10
 }
 
-define ppc_fp128 @test_rint_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_rint_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_rint_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -759,11 +759,11 @@ entry:
   %rint = call ppc_fp128 @llvm.experimental.constrained.rint.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %rint
 }
 
-define ppc_fp128 @test_nearbyint_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_nearbyint_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_nearbyint_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -803,11 +803,11 @@ entry:
   %nearbyint = call ppc_fp128 @llvm.experimental.constrained.nearbyint.ppcf128(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %nearbyint
 }
 
-define ppc_fp128 @test_maxnum_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_maxnum_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_maxnum_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -847,11 +847,11 @@ entry:
   %maxnum = call ppc_fp128 @llvm.experimental.constrained.maxnum.ppcf128(
                     ppc_fp128 %first,
                     ppc_fp128 %second,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %maxnum
 }
 
-define ppc_fp128 @test_minnum_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind {
+define ppc_fp128 @test_minnum_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) #0 {
 ; PC64LE-LABEL: test_minnum_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -891,11 +891,11 @@ entry:
   %minnum = call ppc_fp128 @llvm.experimental.constrained.minnum.ppcf128(
                     ppc_fp128 %first,
                     ppc_fp128 %second,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %minnum
 }
 
-define ppc_fp128 @test_ceil_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_ceil_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_ceil_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -934,11 +934,11 @@ define ppc_fp128 @test_ceil_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %ceil = call ppc_fp128 @llvm.experimental.constrained.ceil.ppcf128(
                     ppc_fp128 %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %ceil
 }
 
-define ppc_fp128 @test_floor_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_floor_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_floor_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -977,11 +977,11 @@ define ppc_fp128 @test_floor_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %floor = call ppc_fp128 @llvm.experimental.constrained.floor.ppcf128(
                     ppc_fp128 %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %floor
 }
 
-define ppc_fp128 @test_round_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_round_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_round_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -1020,11 +1020,11 @@ define ppc_fp128 @test_round_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %round = call ppc_fp128 @llvm.experimental.constrained.round.ppcf128(
                     ppc_fp128 %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %round
 }
 
-define ppc_fp128 @test_trunc_ppc_fp128(ppc_fp128 %first) nounwind {
+define ppc_fp128 @test_trunc_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_trunc_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -1063,11 +1063,11 @@ define ppc_fp128 @test_trunc_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %trunc = call ppc_fp128 @llvm.experimental.constrained.trunc.ppcf128(
                     ppc_fp128 %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %trunc
 }
 
-define float @test_fptrunc_ppc_fp128_f32(ppc_fp128 %first) nounwind {
+define float @test_fptrunc_ppc_fp128_f32(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_fptrunc_ppc_fp128_f32:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    frsp 1, 1
@@ -1086,11 +1086,11 @@ entry:
   %fptrunc = call float @llvm.experimental.constrained.fptrunc.ppcf128.f32(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret float %fptrunc
 }
 
-define double @test_fptrunc_ppc_fp128_f64(ppc_fp128 %first) nounwind {
+define double @test_fptrunc_ppc_fp128_f64(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_fptrunc_ppc_fp128_f64:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    blr
@@ -1106,11 +1106,11 @@ entry:
   %fptrunc = call double @llvm.experimental.constrained.fptrunc.ppcf128.f64(
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret double %fptrunc
 }
 
-define ppc_fp128 @test_fpext_ppc_fp128_f32(float %first) nounwind {
+define ppc_fp128 @test_fpext_ppc_fp128_f32(float %first) #0 {
 ; PC64LE-LABEL: test_fpext_ppc_fp128_f32:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    xxlxor 2, 2, 2
@@ -1129,11 +1129,11 @@ define ppc_fp128 @test_fpext_ppc_fp128_f32(float %first) nounwind {
 entry:
   %fpext = call ppc_fp128 @llvm.experimental.constrained.fpext.f32.ppcf128(
                     float %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %fpext
 }
 
-define ppc_fp128 @test_fpext_ppc_fp128_f64(double %first) nounwind {
+define ppc_fp128 @test_fpext_ppc_fp128_f64(double %first) #0 {
 ; PC64LE-LABEL: test_fpext_ppc_fp128_f64:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    xxlxor 2, 2, 2
@@ -1152,11 +1152,11 @@ define ppc_fp128 @test_fpext_ppc_fp128_f64(double %first) nounwind {
 entry:
   %fpext = call ppc_fp128 @llvm.experimental.constrained.fpext.f64.ppcf128(
                     double %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret ppc_fp128 %fpext
 }
 
-define i64 @test_fptosi_ppc_i64_ppc_fp128(ppc_fp128 %first) nounwind {
+define i64 @test_fptosi_ppc_i64_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_fptosi_ppc_i64_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -1195,11 +1195,11 @@ define i64 @test_fptosi_ppc_i64_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %fpext = call i64 @llvm.experimental.constrained.fptosi.i64.ppcf128(
                     ppc_fp128 %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret i64 %fpext
 }
 
-define i32 @test_fptosi_ppc_i32_ppc_fp128(ppc_fp128 %first) nounwind {
+define i32 @test_fptosi_ppc_i32_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_fptosi_ppc_i32_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -1238,11 +1238,11 @@ define i32 @test_fptosi_ppc_i32_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %fpext = call i32 @llvm.experimental.constrained.fptosi.i32.ppcf128(
                     ppc_fp128  %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret i32 %fpext
 }
 
-define i64 @test_fptoui_ppc_i64_ppc_fp128(ppc_fp128 %first) nounwind {
+define i64 @test_fptoui_ppc_i64_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_fptoui_ppc_i64_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -1281,11 +1281,11 @@ define i64 @test_fptoui_ppc_i64_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %fpext = call i64 @llvm.experimental.constrained.fptoui.i64.ppcf128(
                     ppc_fp128   %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret i64 %fpext
 }
 
-define i32 @test_fptoui_ppc_i32_ppc_fp128(ppc_fp128 %first) nounwind {
+define i32 @test_fptoui_ppc_i32_ppc_fp128(ppc_fp128 %first) #0 {
 ; PC64LE-LABEL: test_fptoui_ppc_i32_ppc_fp128:
 ; PC64LE:       # %bb.0: # %entry
 ; PC64LE-NEXT:    mflr 0
@@ -1324,13 +1324,13 @@ define i32 @test_fptoui_ppc_i32_ppc_fp128(ppc_fp128 %first) nounwind {
 entry:
   %fpext = call i32 @llvm.experimental.constrained.fptoui.i32.ppcf128(
                     ppc_fp128   %first,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   ret i32 %fpext
 }
 
 ; Test that resultant libcalls retain order even when their non-strict FLOP form could be
 ; trivially optimized into differing sequences.
-define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %result) nounwind {
+define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %result) #0 {
 ; PC64LE-LABEL: test_constrained_libcall_multichain:
 ; PC64LE:       # %bb.0:
 ; PC64LE-NEXT:    mflr 0
@@ -1490,7 +1490,7 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
   %load = load float, float* %firstptr
   %first = call ppc_fp128 @llvm.experimental.constrained.fpext.f32.ppcf128(
                     float %load,
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   store ppc_fp128 %first, ppc_fp128* %result
 
   ; For unconstrained FLOPs, these next two FP instructions would necessarily
@@ -1499,14 +1499,14 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
                     ppc_fp128 %first,
                     ppc_fp128 %first,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   %stridx1 = getelementptr ppc_fp128, ppc_fp128* %result, i32 1
   store ppc_fp128 %fadd, ppc_fp128* %stridx1
   %fmul = call ppc_fp128 @llvm.experimental.constrained.fmul.ppcf128(
                     ppc_fp128 %fadd,
                     ppc_fp128 %fadd,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   %stridx2 = getelementptr ppc_fp128, ppc_fp128* %stridx1, i32 1
   store ppc_fp128 %fadd, ppc_fp128* %stridx2
 
@@ -1517,17 +1517,20 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
                     ppc_fp128 %first,
                     i32 2,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   %tinypow = call float @llvm.experimental.constrained.fptrunc.ppcf128.f32(
                     ppc_fp128 %powi,
                     metadata !"round.dynamic",
-                    metadata !"fpexcept.strict")
+                    metadata !"fpexcept.strict") #1
   store float %tinypow, float* %firstptr
   %stridxn1 = getelementptr ppc_fp128, ppc_fp128* %result, i32 -1
   store ppc_fp128 %powi, ppc_fp128* %stridxn1
   ret void
 }
 
+attributes #0 = { nounwind strictfp }
+attributes #1 = { strictfp }
+
 declare ppc_fp128 @llvm.experimental.constrained.fadd.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.ceil.ppcf128(ppc_fp128, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.cos.ppcf128(ppc_fp128, metadata, metadata)

From c21a4f84b0e83aaf8f5390f6299a8857197f96c4 Mon Sep 17 00:00:00 2001
From: "Kevin P. Neal" <kevin.neal@sas.com>
Date: Fri, 29 May 2020 12:17:23 -0400
Subject: [PATCH 540/770] Fix errors in use of strictfp attribute.

Errors spotted with use of: https://reviews.llvm.org/D68233
---
 llvm/test/CodeGen/SystemZ/fp-strict-alias.ll  |  5 +++-
 llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll | 30 +++++++++----------
 llvm/test/CodeGen/SystemZ/fp-strict-cmp-05.ll |  8 ++---
 .../test/CodeGen/SystemZ/fp-strict-cmps-04.ll |  8 ++---
 .../test/CodeGen/SystemZ/fp-strict-cmps-05.ll |  8 ++---
 .../CodeGen/SystemZ/vec-strict-conv-02.ll     |  6 ++--
 6 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll b/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll
index 5091c3dc97bad..ae1f748d62514 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll
@@ -349,7 +349,10 @@ define void @f17(float %in, float* %out) #0 {
 ; CHECK: sqebr
 ; CHECK: ste
 ; CHECK: jg bar
-  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %sqrt = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %in,
+                        metadata !"round.dynamic",
+                        metadata !"fpexcept.ignore") #0
   store float %sqrt, float* %out, align 4
   tail call void @bar() #0
   ret void
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll
index b62532ed6f7c8..de052935a41f9 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-04.ll
@@ -111,7 +111,7 @@ exit:
 }
 
 ; Subtraction also provides a zero-based CC value.
-define float @f5(float %a, float %b, float *%dest) {
+define float @f5(float %a, float %b, float *%dest) #0 {
 ; CHECK-LABEL: f5:
 ; CHECK: seb %f0, 0(%r2)
 ; CHECK-NEXT: bnher %r14
@@ -144,7 +144,7 @@ define float @f6(float %dummy, float %a, float *%dest) #0 {
 ; CHECK-NEXT: bhr %r14
 ; CHECK: br %r14
 entry:
-  %res = call float @llvm.fabs.f32(float %a)
+  %res = call float @llvm.fabs.f32(float %a) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %res, float 0.0,
                                                metadata !"ogt",
@@ -167,7 +167,7 @@ define float @f7(float %dummy, float %a, float *%dest) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %abs = call float @llvm.fabs.f32(float %a)
+  %abs = call float @llvm.fabs.f32(float %a) #0
   %res = fneg float %abs
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %res, float 0.0,
@@ -308,7 +308,7 @@ define float @f12(float %dummy, float %val) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %ret = call float asm "blah $1", "=f,{f0}"(float %val)
+  %ret = call float asm "blah $1", "=f,{f0}"(float %val) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %val, float 0.0,
                                                metadata !"olt",
@@ -316,7 +316,7 @@ entry:
   br i1 %cmp, label %exit, label %store
 
 store:
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   br label %exit
 
 exit:
@@ -333,7 +333,7 @@ define double @f13(double %dummy, double %val) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %ret = call double asm "blah $1", "=f,{f0}"(double %val)
+  %ret = call double asm "blah $1", "=f,{f0}"(double %val) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f64(
                                                double %val, double 0.0,
                                                metadata !"olt",
@@ -341,7 +341,7 @@ entry:
   br i1 %cmp, label %exit, label %store
 
 store:
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   br label %exit
 
 exit:
@@ -374,7 +374,7 @@ entry:
   br i1 %cmp, label %exit, label %store
 
 store:
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   br label %exit
 
 exit:
@@ -392,7 +392,7 @@ define float @f15(float %val, float %dummy) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %ret = call float asm "blah $1", "=f,{f2}"(float %val)
+  %ret = call float asm "blah $1", "=f,{f2}"(float %val) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %val, float 0.0,
                                                metadata !"olt",
@@ -400,7 +400,7 @@ entry:
   br i1 %cmp, label %exit, label %store
 
 store:
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   br label %exit
 
 exit:
@@ -418,7 +418,7 @@ define double @f16(double %val, double %dummy) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %ret = call double asm "blah $1", "=f,{f2}"(double %val)
+  %ret = call double asm "blah $1", "=f,{f2}"(double %val) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f64(
                                                double %val, double 0.0,
                                                metadata !"olt",
@@ -426,7 +426,7 @@ entry:
   br i1 %cmp, label %exit, label %store
 
 store:
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   br label %exit
 
 exit:
@@ -471,7 +471,7 @@ entry:
                         float %a, float %b,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict") #0
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %res, float 0.0,
                                                metadata !"oeq",
@@ -498,7 +498,7 @@ define float @f19(float %dummy, float %val) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %ret = call float asm sideeffect "blah $1", "=f,{f0}"(float %val)
+  %ret = call float asm sideeffect "blah $1", "=f,{f0}"(float %val) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %val, float 0.0,
                                                metadata !"olt",
@@ -506,7 +506,7 @@ entry:
   br i1 %cmp, label %exit, label %store
 
 store:
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   br label %exit
 
 exit:
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-05.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-05.ll
index 590705a5504e7..6fcf46685ee9e 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-cmp-05.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmp-05.ll
@@ -41,7 +41,7 @@ define float @f3(float %a, float %b, float %f) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK: ltebr
 ; CHECK-NEXT: ber %r14
-  %abs = call float @llvm.fabs.f32(float %f)
+  %abs = call float @llvm.fabs.f32(float %f) #0
   %neg = fneg float %abs
   %cond = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %neg, float 0.0,
@@ -57,7 +57,7 @@ define double @f4(double %a, double %b, double %f) #0 {
 ; CHECK-LABEL: f4:
 ; CHECK: ltdbr
 ; CHECK-NEXT: ber %r14
-  %abs = call double @llvm.fabs.f64(double %f)
+  %abs = call double @llvm.fabs.f64(double %f) #0
   %neg = fneg double %abs
   %cond = call i1 @llvm.experimental.constrained.fcmp.f64(
                                                double %neg, double 0.0,
@@ -73,7 +73,7 @@ define float @f5(float %a, float %b, float %f) #0 {
 ; CHECK-LABEL: f5:
 ; CHECK: ltebr
 ; CHECK-NEXT: ber %r14
-  %abs = call float @llvm.fabs.f32(float %f)
+  %abs = call float @llvm.fabs.f32(float %f) #0
   %cond = call i1 @llvm.experimental.constrained.fcmp.f32(
                                                float %abs, float 0.0,
                                                metadata !"oeq",
@@ -87,7 +87,7 @@ define double @f6(double %a, double %b, double %f) #0 {
 ; CHECK-LABEL: f6:
 ; CHECK: ltdbr
 ; CHECK-NEXT: ber %r14
-  %abs = call double @llvm.fabs.f64(double %f)
+  %abs = call double @llvm.fabs.f64(double %f) #0
   %cond = call i1 @llvm.experimental.constrained.fcmp.f64(
                                                double %abs, double 0.0,
                                                metadata !"oeq",
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll
index 47c0d5caa37a6..c251fd772d203 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-04.ll
@@ -46,7 +46,7 @@ define float @f6(float %dummy, float %a, float *%dest) #0 {
 ; CHECK-NEXT: bhr %r14
 ; CHECK: br %r14
 entry:
-  %res = call float @llvm.fabs.f32(float %a)
+  %res = call float @llvm.fabs.f32(float %a) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmps.f32(
                                                float %res, float 0.0,
                                                metadata !"ogt",
@@ -70,7 +70,7 @@ define float @f7(float %dummy, float %a, float *%dest) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %abs = call float @llvm.fabs.f32(float %a)
+  %abs = call float @llvm.fabs.f32(float %a) #0
   %res = fneg float %abs
   %cmp = call i1 @llvm.experimental.constrained.fcmps.f32(
                                                float %res, float 0.0,
@@ -122,7 +122,7 @@ define float @f12(float %dummy, float %val) #0 {
 ; CHECK-NEXT: blr %r14
 ; CHECK: br %r14
 entry:
-  %ret = call float asm "blah $1", "=f,{f0}"(float %val)
+  %ret = call float asm "blah $1", "=f,{f0}"(float %val) #0
   %cmp = call i1 @llvm.experimental.constrained.fcmps.f32(
                                                float %val, float 0.0,
                                                metadata !"olt",
@@ -130,7 +130,7 @@ entry:
   br i1 %cmp, label %exit, label %store
 
 store:
-  call void asm sideeffect "blah", ""()
+  call void asm sideeffect "blah", ""() #0
   br label %exit
 
 exit:
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-05.ll b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-05.ll
index ac677b29619bb..8dfe66b5d46cc 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-cmps-05.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-cmps-05.ll
@@ -41,7 +41,7 @@ define float @f3(float %a, float %b, float %f) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK: kebr
 ; CHECK-NEXT: ber %r14
-  %abs = call float @llvm.fabs.f32(float %f)
+  %abs = call float @llvm.fabs.f32(float %f) #0
   %neg = fneg float %abs
   %cond = call i1 @llvm.experimental.constrained.fcmps.f32(
                                                float %neg, float 0.0,
@@ -57,7 +57,7 @@ define double @f4(double %a, double %b, double %f) #0 {
 ; CHECK-LABEL: f4:
 ; CHECK: kdbr
 ; CHECK-NEXT: ber %r14
-  %abs = call double @llvm.fabs.f64(double %f)
+  %abs = call double @llvm.fabs.f64(double %f) #0
   %neg = fneg double %abs
   %cond = call i1 @llvm.experimental.constrained.fcmps.f64(
                                                double %neg, double 0.0,
@@ -73,7 +73,7 @@ define float @f5(float %a, float %b, float %f) #0 {
 ; CHECK-LABEL: f5:
 ; CHECK: kebr
 ; CHECK-NEXT: ber %r14
-  %abs = call float @llvm.fabs.f32(float %f)
+  %abs = call float @llvm.fabs.f32(float %f) #0
   %cond = call i1 @llvm.experimental.constrained.fcmps.f32(
                                                float %abs, float 0.0,
                                                metadata !"oeq",
@@ -87,7 +87,7 @@ define double @f6(double %a, double %b, double %f) #0 {
 ; CHECK-LABEL: f6:
 ; CHECK: kdbr
 ; CHECK-NEXT: ber %r14
-  %abs = call double @llvm.fabs.f64(double %f)
+  %abs = call double @llvm.fabs.f64(double %f) #0
   %cond = call i1 @llvm.experimental.constrained.fcmps.f64(
                                                double %abs, double 0.0,
                                                metadata !"oeq",
diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-conv-02.ll b/llvm/test/CodeGen/SystemZ/vec-strict-conv-02.ll
index d4590a57d3edf..3f561654c3646 100644
--- a/llvm/test/CodeGen/SystemZ/vec-strict-conv-02.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-strict-conv-02.ll
@@ -9,7 +9,7 @@ declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float
 declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
 
 ; Test cases where both elements of a v2f64 are converted to f32s.
-define void @f1(<2 x double> %val, <2 x float> *%ptr) {
+define void @f1(<2 x double> %val, <2 x float> *%ptr) #0 {
 ; CHECK-LABEL: f1:
 ; CHECK: vledb {{%v[0-9]+}}, %v24, 0, 0
 ; CHECK: br %r14
@@ -35,7 +35,7 @@ define float @f2(<2 x double> %vec) #0 {
 }
 
 ; Test cases where even elements of a v4f32 are converted to f64s.
-define <2 x double> @f3(<4 x float> %vec) {
+define <2 x double> @f3(<4 x float> %vec) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK: vldeb %v24, {{%v[0-9]+}}
 ; CHECK: br %r14
@@ -47,7 +47,7 @@ define <2 x double> @f3(<4 x float> %vec) {
 }
 
 ; Test conversion of an f32 in a vector register to an f64.
-define double @f4(<4 x float> %vec) {
+define double @f4(<4 x float> %vec) #0 {
 ; CHECK-LABEL: f4:
 ; CHECK: wldeb %f0, %v24
 ; CHECK: br %r14

From f6a6de288bfb23e45ab2558a9c163132cfe7579a Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Thu, 28 May 2020 15:30:23 -0700
Subject: [PATCH 541/770] GlobalISel: fix CombinerHelper::matchEqualDefs()

This matcher always returned true for different results of the
same instruction.

Differential Revision:
---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  9 ++++
 .../postlegalizercombiner-select.mir          | 44 +++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index a3291a6a9712a..fbcd4c6f9d9f2 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1549,6 +1549,15 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
   if (!I2)
     return false;
 
+  // Handle a case like this:
+  //
+  // %0:_(s64), %1:_(s64) = G_UNMERGE_VALUES %2:_(<2 x s64>)
+  //
+  // Even though %0 and %1 are produced by the same instruction they are not
+  // the same values.
+  if (I1 == I2)
+    return MOP1.getReg() == MOP2.getReg();
+
   // If we have an instruction which loads or stores, we can't guarantee that
   // it is identical.
   //
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir
new file mode 100644
index 0000000000000..89f58e1e76871
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name:            select_from_different_results_of_unmerge_values
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: select_from_different_results_of_unmerge_values
+    ; GCN: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; GCN: [[DEF1:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+    ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>)
+    ; GCN: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[DEF1]](s1), [[UV]], [[UV1]]
+    ; GCN: $vgpr0 = COPY [[SELECT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %2:_(<2 x s32>) = G_IMPLICIT_DEF
+    %4:_(s1) = G_IMPLICIT_DEF
+    %0:_(s32), %1:_(s32) = G_UNMERGE_VALUES %2:_(<2 x s32>)
+    %3:_(s32) = G_SELECT %4:_(s1), %0:_, %1:_
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+---
+name:            select_from_same_results_of_unmerge_values
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: select_from_same_results_of_unmerge_values
+    ; GCN: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>)
+    ; GCN: $vgpr0 = COPY [[UV]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %2:_(<2 x s32>) = G_IMPLICIT_DEF
+    %4:_(s1) = G_IMPLICIT_DEF
+    %0:_(s32), %1:_(s32) = G_UNMERGE_VALUES %2:_(<2 x s32>)
+    %3:_(s32) = G_SELECT %4:_(s1), %0:_, %0:_
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...

From cd74ccc965e773a3f0c31cd6bb46de318cefdca9 Mon Sep 17 00:00:00 2001
From: "Kevin P. Neal" <kevin.neal@sas.com>
Date: Fri, 29 May 2020 12:19:33 -0400
Subject: [PATCH 542/770] [X86] Fix errors in use of strictfp attribute.

Errors spotted with use of: https://reviews.llvm.org/D68233
---
 llvm/test/CodeGen/X86/fp-intrinsics.ll        |  3 +-
 llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll |  2 +-
 .../CodeGen/X86/fp-strict-scalar-round.ll     | 20 +++----
 llvm/test/CodeGen/X86/vec-strict-128.ll       |  2 +-
 llvm/test/CodeGen/X86/vec-strict-256.ll       | 20 +++----
 llvm/test/CodeGen/X86/vec-strict-512.ll       | 20 +++----
 .../CodeGen/X86/vec-strict-fptoint-128.ll     | 52 +++++++++----------
 .../CodeGen/X86/vec-strict-fptoint-256.ll     | 40 +++++++-------
 .../CodeGen/X86/vec-strict-fptoint-512.ll     | 40 +++++++-------
 llvm/test/CodeGen/X86/vec-strict-round-128.ll | 20 +++----
 .../vector-constrained-fp-intrinsics-flags.ll |  2 +
 .../CodeGen/X86/vector-half-conversions.ll    |  2 +-
 12 files changed, 113 insertions(+), 110 deletions(-)

diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 27f198168e385..657731c231c6b 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -1762,13 +1762,14 @@ entry:
   ret i64 %result
 }
 
-define i64 @f26(float %x) {
+define i64 @f26(float %x) #0 {
 ; X87-LABEL: f26:
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    subl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll llrintf
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
index 7bee1340a774d..7f9e57d94f730 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
@@ -4206,7 +4206,7 @@ define void @foo(float %0, float %1) #0 {
   br i1 %3, label %4, label %5
 
 4:                                                ; preds = %2
-  tail call void @bar()
+  tail call void @bar() #0
   br label %5
 
 5:                                                ; preds = %4, %2
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
index 26137bd76a9f5..da05e8be432eb 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
@@ -54,7 +54,7 @@ define float @fceil32(float %f) #0 {
 ; AVX-X64-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
 ; AVX-X64-NEXT:    retq
   %res = call float @llvm.experimental.constrained.ceil.f32(
-                        float %f, metadata !"fpexcept.strict")
+                        float %f, metadata !"fpexcept.strict") #0
   ret float %res
 }
 
@@ -107,7 +107,7 @@ define double @fceilf64(double %f) #0 {
 ; AVX-X64-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
 ; AVX-X64-NEXT:    retq
   %res = call double @llvm.experimental.constrained.ceil.f64(
-                        double %f, metadata !"fpexcept.strict")
+                        double %f, metadata !"fpexcept.strict") #0
   ret double %res
 }
 
@@ -148,7 +148,7 @@ define float @ffloor32(float %f) #0 {
 ; AVX-X64-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
 ; AVX-X64-NEXT:    retq
   %res = call float @llvm.experimental.constrained.floor.f32(
-                        float %f, metadata !"fpexcept.strict")
+                        float %f, metadata !"fpexcept.strict") #0
   ret float %res
 }
 
@@ -201,7 +201,7 @@ define double @ffloorf64(double %f) #0 {
 ; AVX-X64-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
 ; AVX-X64-NEXT:    retq
   %res = call double @llvm.experimental.constrained.floor.f64(
-                        double %f, metadata !"fpexcept.strict")
+                        double %f, metadata !"fpexcept.strict") #0
   ret double %res
 }
 
@@ -242,7 +242,7 @@ define float @ftrunc32(float %f) #0 {
 ; AVX-X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX-X64-NEXT:    retq
   %res = call float @llvm.experimental.constrained.trunc.f32(
-                        float %f, metadata !"fpexcept.strict")
+                        float %f, metadata !"fpexcept.strict") #0
   ret float %res
 }
 
@@ -295,7 +295,7 @@ define double @ftruncf64(double %f) #0 {
 ; AVX-X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX-X64-NEXT:    retq
   %res = call double @llvm.experimental.constrained.trunc.f64(
-                        double %f, metadata !"fpexcept.strict")
+                        double %f, metadata !"fpexcept.strict") #0
   ret double %res
 }
 
@@ -337,7 +337,7 @@ define float @frint32(float %f) #0 {
 ; AVX-X64-NEXT:    retq
   %res = call float @llvm.experimental.constrained.rint.f32(
                         float %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret float %res
 }
 
@@ -391,7 +391,7 @@ define double @frintf64(double %f) #0 {
 ; AVX-X64-NEXT:    retq
   %res = call double @llvm.experimental.constrained.rint.f64(
                         double %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret double %res
 }
 
@@ -433,7 +433,7 @@ define float @fnearbyint32(float %f) #0 {
 ; AVX-X64-NEXT:    retq
   %res = call float @llvm.experimental.constrained.nearbyint.f32(
                         float %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret float %res
 }
 
@@ -487,7 +487,7 @@ define double @fnearbyintf64(double %f) #0 {
 ; AVX-X64-NEXT:    retq
   %res = call double @llvm.experimental.constrained.nearbyint.f64(
                         double %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret double %res
 }
 
diff --git a/llvm/test/CodeGen/X86/vec-strict-128.ll b/llvm/test/CodeGen/X86/vec-strict-128.ll
index 98162a1da9a9d..4fecd2631dc01 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128.ll
@@ -199,7 +199,7 @@ define <4 x float> @f11(<2 x double> %a0, <4 x float> %a1) #0 {
   %ext = extractelement <2 x double> %a0, i32 0
   %cvt = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %ext,
                                                                    metadata !"round.dynamic",
-                                                                   metadata !"fpexcept.strict")
+                                                                   metadata !"fpexcept.strict") #0
   %res = insertelement <4 x float> %a1, float %cvt, i32 0
   ret <4 x float> %res
 }
diff --git a/llvm/test/CodeGen/X86/vec-strict-256.ll b/llvm/test/CodeGen/X86/vec-strict-256.ll
index 97980a4f05da0..5945e6c1bc66e 100644
--- a/llvm/test/CodeGen/X86/vec-strict-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-256.ll
@@ -194,7 +194,7 @@ define <8 x float> @fceilv8f32(<8 x float> %f) #0 {
 ; CHECK-NEXT:    vroundps $10, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.experimental.constrained.ceil.v8f32(
-                          <8 x float> %f, metadata !"fpexcept.strict")
+                          <8 x float> %f, metadata !"fpexcept.strict") #0
   ret <8 x float> %res
 }
 
@@ -204,7 +204,7 @@ define <4 x double> @fceilv4f64(<4 x double> %f) #0 {
 ; CHECK-NEXT:    vroundpd $10, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.experimental.constrained.ceil.v4f64(
-                        <4 x double> %f, metadata !"fpexcept.strict")
+                        <4 x double> %f, metadata !"fpexcept.strict") #0
   ret <4 x double> %res
 }
 
@@ -214,7 +214,7 @@ define <8 x float> @ffloorv8f32(<8 x float> %f) #0 {
 ; CHECK-NEXT:    vroundps $9, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.experimental.constrained.floor.v8f32(
-                          <8 x float> %f, metadata !"fpexcept.strict")
+                          <8 x float> %f, metadata !"fpexcept.strict") #0
   ret <8 x float> %res
 }
 
@@ -224,7 +224,7 @@ define <4 x double> @ffloorv4f64(<4 x double> %f) #0 {
 ; CHECK-NEXT:    vroundpd $9, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.experimental.constrained.floor.v4f64(
-                        <4 x double> %f, metadata !"fpexcept.strict")
+                        <4 x double> %f, metadata !"fpexcept.strict") #0
   ret <4 x double> %res
 }
 
@@ -235,7 +235,7 @@ define <8 x float> @ftruncv8f32(<8 x float> %f) #0 {
 ; CHECK-NEXT:    vroundps $11, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.experimental.constrained.trunc.v8f32(
-                          <8 x float> %f, metadata !"fpexcept.strict")
+                          <8 x float> %f, metadata !"fpexcept.strict") #0
   ret <8 x float> %res
 }
 
@@ -245,7 +245,7 @@ define <4 x double> @ftruncv4f64(<4 x double> %f) #0 {
 ; CHECK-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.experimental.constrained.trunc.v4f64(
-                        <4 x double> %f, metadata !"fpexcept.strict")
+                        <4 x double> %f, metadata !"fpexcept.strict") #0
   ret <4 x double> %res
 }
 
@@ -257,7 +257,7 @@ define <8 x float> @frintv8f32(<8 x float> %f) #0 {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.experimental.constrained.rint.v8f32(
                           <8 x float> %f,
-                          metadata !"round.dynamic", metadata !"fpexcept.strict")
+                          metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <8 x float> %res
 }
 
@@ -268,7 +268,7 @@ define <4 x double> @frintv4f64(<4 x double> %f) #0 {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.experimental.constrained.rint.v4f64(
                         <4 x double> %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <4 x double> %res
 }
 
@@ -280,7 +280,7 @@ define <8 x float> @fnearbyintv8f32(<8 x float> %f) #0 {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(
                           <8 x float> %f,
-                          metadata !"round.dynamic", metadata !"fpexcept.strict")
+                          metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <8 x float> %res
 }
 
@@ -291,7 +291,7 @@ define <4 x double> @fnearbyintv4f64(<4 x double> %f) #0 {
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(
                         <4 x double> %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <4 x double> %res
 }
 
diff --git a/llvm/test/CodeGen/X86/vec-strict-512.ll b/llvm/test/CodeGen/X86/vec-strict-512.ll
index b2a2c7efbdf63..2cafd74af4953 100644
--- a/llvm/test/CodeGen/X86/vec-strict-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-512.ll
@@ -191,7 +191,7 @@ define <16 x float> @strict_vector_fceil_v16f32(<16 x float> %f) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vrndscaleps $10, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <16 x float> @llvm.experimental.constrained.ceil.v16f32(<16 x float> %f, metadata !"fpexcept.strict")
+  %res = call <16 x float> @llvm.experimental.constrained.ceil.v16f32(<16 x float> %f, metadata !"fpexcept.strict") #0
   ret <16 x float> %res
 }
 
@@ -200,7 +200,7 @@ define <8 x double> @strict_vector_fceil_v8f64(<8 x double> %f) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vrndscalepd $10, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <8 x double> @llvm.experimental.constrained.ceil.v8f64(<8 x double> %f, metadata !"fpexcept.strict")
+  %res = call <8 x double> @llvm.experimental.constrained.ceil.v8f64(<8 x double> %f, metadata !"fpexcept.strict") #0
   ret <8 x double> %res
 }
 
@@ -209,7 +209,7 @@ define <16 x float> @strict_vector_ffloor_v16f32(<16 x float> %f) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vrndscaleps $9, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <16 x float> @llvm.experimental.constrained.floor.v16f32(<16 x float> %f, metadata !"fpexcept.strict")
+  %res = call <16 x float> @llvm.experimental.constrained.floor.v16f32(<16 x float> %f, metadata !"fpexcept.strict") #0
   ret <16 x float> %res
 }
 
@@ -218,7 +218,7 @@ define <8 x double> @strict_vector_ffloor_v8f64(<8 x double> %f) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vrndscalepd $9, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <8 x double> @llvm.experimental.constrained.floor.v8f64(<8 x double> %f, metadata !"fpexcept.strict")
+  %res = call <8 x double> @llvm.experimental.constrained.floor.v8f64(<8 x double> %f, metadata !"fpexcept.strict") #0
   ret <8 x double> %res
 }
 
@@ -227,7 +227,7 @@ define <16 x float> @strict_vector_ftrunc_v16f32(<16 x float> %f) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <16 x float> @llvm.experimental.constrained.trunc.v16f32(<16 x float> %f, metadata !"fpexcept.strict")
+  %res = call <16 x float> @llvm.experimental.constrained.trunc.v16f32(<16 x float> %f, metadata !"fpexcept.strict") #0
   ret <16 x float> %res
 }
 
@@ -236,7 +236,7 @@ define <8 x double> @strict_vector_ftrunc_v8f64(<8 x double> %f) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
-  %res = call <8 x double> @llvm.experimental.constrained.trunc.v8f64(<8 x double> %f, metadata !"fpexcept.strict")
+  %res = call <8 x double> @llvm.experimental.constrained.trunc.v8f64(<8 x double> %f, metadata !"fpexcept.strict") #0
   ret <8 x double> %res
 }
 
@@ -246,7 +246,7 @@ define <16 x float> @strict_vector_frint_v16f32(<16 x float> %f) #0 {
 ; CHECK-NEXT:    vrndscaleps $4, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x float> @llvm.experimental.constrained.rint.v16f32(<16 x float> %f,
-                             metadata !"round.dynamic", metadata !"fpexcept.strict")
+                             metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <16 x float> %res
 }
 
@@ -256,7 +256,7 @@ define <8 x double> @strict_vector_frint_v8f64(<8 x double> %f) #0 {
 ; CHECK-NEXT:    vrndscalepd $4, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x double> @llvm.experimental.constrained.rint.v8f64(<8 x double> %f,
-                            metadata !"round.dynamic", metadata !"fpexcept.strict")
+                            metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <8 x double> %res
 }
 
@@ -266,7 +266,7 @@ define <16 x float> @strict_vector_fnearbyint_v16f32(<16 x float> %f) #0 {
 ; CHECK-NEXT:    vrndscaleps $12, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float> %f,
-                             metadata !"round.dynamic", metadata !"fpexcept.strict")
+                             metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <16 x float> %res
 }
 
@@ -276,7 +276,7 @@ define <8 x double> @strict_vector_fnearbyint_v8f64(<8 x double> %f) #0 {
 ; CHECK-NEXT:    vrndscalepd $12, %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> %f,
-                             metadata !"round.dynamic", metadata !"fpexcept.strict")
+                             metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <8 x double> %res
 }
 
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index b3a91e657c427..af9663d7798f9 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -208,7 +208,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i64> %ret
 }
 
@@ -526,7 +526,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i64> %ret
 }
 
@@ -711,7 +711,7 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i64> %ret
 }
 
@@ -1042,7 +1042,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i64> %ret
 }
 
@@ -1082,7 +1082,7 @@ define <2 x i32> @strict_vector_fptosi_v2f64_to_v2i32(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttpd2dq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i32> %ret
 }
 
@@ -1191,7 +1191,7 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i32> %ret
 }
 
@@ -1238,7 +1238,7 @@ define <2 x i32> @strict_vector_fptosi_v2f32_to_v2i32(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i32> %ret
 }
 
@@ -1349,7 +1349,7 @@ define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i32> %ret
 }
 
@@ -1396,7 +1396,7 @@ define <2 x i16> @strict_vector_fptosi_v2f64_to_v2i16(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i16> %ret
 }
 
@@ -1443,7 +1443,7 @@ define <2 x i16> @strict_vector_fptoui_v2f64_to_v2i16(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i16> %ret
 }
 
@@ -1497,7 +1497,7 @@ define <2 x i16> @strict_vector_fptosi_v2f32_to_v2i16(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i16> %ret
 }
 
@@ -1551,7 +1551,7 @@ define <2 x i16> @strict_vector_fptoui_v2f32_to_v2i16(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i16> %ret
 }
 
@@ -1602,7 +1602,7 @@ define <2 x i8> @strict_vector_fptosi_v2f64_to_v2i8(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovdb %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i8> %ret
 }
 
@@ -1653,7 +1653,7 @@ define <2 x i8> @strict_vector_fptoui_v2f64_to_v2i8(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovdb %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i8> %ret
 }
 
@@ -1711,7 +1711,7 @@ define <2 x i8> @strict_vector_fptosi_v2f32_to_v2i8(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovdb %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i8> %ret
 }
 
@@ -1769,7 +1769,7 @@ define <2 x i8> @strict_vector_fptoui_v2f32_to_v2i8(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovdb %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i8> %ret
 }
 
@@ -1894,7 +1894,7 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i1> %ret
 }
 
@@ -2122,7 +2122,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f64(<2 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i1> %ret
 }
 
@@ -2270,7 +2270,7 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i1> %ret
 }
 
@@ -2518,7 +2518,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovm2q %k0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f32(<2 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <2 x i1> %ret
 }
 
@@ -2558,7 +2558,7 @@ define <4 x i32> @strict_vector_fptosi_v4f32_to_v4i32(<4 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i32> %ret
 }
 
@@ -2630,7 +2630,7 @@ define <4 x i32> @strict_vector_fptoui_v4f32_to_v4i32(<4 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i32> %ret
 }
 
@@ -2682,7 +2682,7 @@ define <4 x i8> @strict_vector_fptosi_v4f32_to_v4i8(<4 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovdb %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i8> %ret
 }
 
@@ -2734,7 +2734,7 @@ define <4 x i8> @strict_vector_fptoui_v4f32_to_v4i8(<4 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovdb %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i8> %ret
 }
 
@@ -2787,7 +2787,7 @@ define <4 x i1> @strict_vector_fptosi_v4f32_to_v4i1(<4 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i1> %ret
 }
 
@@ -2844,7 +2844,7 @@ define <4 x i1> @strict_vector_fptoui_v4f32_to_v4i1(<4 x float> %a) #0 {
 ; AVX512VLDQ-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512VLDQ-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i1> %ret
 }
 
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index 00aee49f64cb0..52313174fed79 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -212,7 +212,7 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512DQVL-NEXT:    vcvttpd2qq %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i64> %ret
 }
 
@@ -595,7 +595,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512DQVL-NEXT:    vcvttpd2uqq %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i64> %ret
 }
 
@@ -774,7 +774,7 @@ define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vcvttps2qq %xmm0, %ymm0
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i64> %ret
 }
 
@@ -1157,7 +1157,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vcvttps2uqq %xmm0, %ymm0
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i64> %ret
 }
 
@@ -1168,7 +1168,7 @@ define <4 x i32> @strict_vector_fptosi_v4f64_to_v4i32(<4 x double> %a) #0 {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i32> %ret
 }
 
@@ -1218,7 +1218,7 @@ define <4 x i32> @strict_vector_fptoui_v4f64_to_v4i32(<4 x double> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i32> %ret
 }
 
@@ -1230,7 +1230,7 @@ define <4 x i16> @strict_vector_fptosi_v4f64_to_v4i16(<4 x double> %a) #0 {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i16> %ret
 }
 
@@ -1242,7 +1242,7 @@ define <4 x i16> @strict_vector_fptoui_v4f64_to_v4i16(<4 x double> %a) #0 {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i16> %ret
 }
 
@@ -1285,7 +1285,7 @@ define <4 x i8> @strict_vector_fptosi_v4f64_to_v4i8(<4 x double> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i8> %ret
 }
 
@@ -1328,7 +1328,7 @@ define <4 x i8> @strict_vector_fptoui_v4f64_to_v4i8(<4 x double> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i8> %ret
 }
 
@@ -1374,7 +1374,7 @@ define <4 x i1> @strict_vector_fptosi_v4f64_to_v4i1(<4 x double> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i1> %ret
 }
 
@@ -1424,7 +1424,7 @@ define <4 x i1> @strict_vector_fptoui_v4f64_to_v4i1(<4 x double> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f64(<4 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <4 x i1> %ret
 }
 
@@ -1434,7 +1434,7 @@ define <8 x i32> @strict_vector_fptosi_v8f32_to_v8i32(<8 x float> %a) #0 {
 ; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i32> %ret
 }
 
@@ -1476,7 +1476,7 @@ define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vcvttps2udq %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i32> %ret
 }
 
@@ -1519,7 +1519,7 @@ define <8 x i16> @strict_vector_fptosi_v8f32_to_v8i16(<8 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i16> %ret
 }
 
@@ -1562,7 +1562,7 @@ define <8 x i16> @strict_vector_fptoui_v8f32_to_v8i16(<8 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i16> %ret
 }
 
@@ -1604,7 +1604,7 @@ define <8 x i8> @strict_vector_fptosi_v8f32_to_v8i8(<8 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i8> %ret
 }
 
@@ -1646,7 +1646,7 @@ define <8 x i8> @strict_vector_fptoui_v8f32_to_v8i8(<8 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i8> %ret
 }
 
@@ -1698,7 +1698,7 @@ define <8 x i1> @strict_vector_fptosi_v8f32_to_v8i1(<8 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i1> %ret
 }
 
@@ -1754,7 +1754,7 @@ define <8 x i1> @strict_vector_fptoui_v8f32_to_v8i1(<8 x float> %a) #0 {
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i1> %ret
 }
 
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
index 06464ea1cb810..52ba8deec1c4b 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -129,7 +129,7 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i64> %ret
 }
 
@@ -340,7 +340,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i64> %ret
 }
 
@@ -443,7 +443,7 @@ define <8 x i64> @strict_vector_fptosi_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i64> %ret
 }
 
@@ -654,7 +654,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f32(<8 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i64> %ret
 }
 
@@ -664,7 +664,7 @@ define <8 x i32> @strict_vector_fptosi_v8f64_to_v8i32(<8 x double> %a) #0 {
 ; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i32> %ret
 }
 
@@ -674,7 +674,7 @@ define <8 x i32> @strict_vector_fptoui_v8f64_to_v8i32(<8 x double> %a) #0 {
 ; CHECK-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i32> %ret
 }
 
@@ -694,7 +694,7 @@ define <8 x i16> @strict_vector_fptosi_v8f64_to_v8i16(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i16> %ret
 }
 
@@ -714,7 +714,7 @@ define <8 x i16> @strict_vector_fptoui_v8f64_to_v8i16(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i16> %ret
 }
 
@@ -733,7 +733,7 @@ define <8 x i8> @strict_vector_fptosi_v8f64_to_v8i8(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i8> %ret
 }
 
@@ -752,7 +752,7 @@ define <8 x i8> @strict_vector_fptoui_v8f64_to_v8i8(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i8> %ret
 }
 
@@ -777,7 +777,7 @@ define <8 x i1> @strict_vector_fptosi_v8f64_to_v8i1(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i1> %ret
 }
 
@@ -804,7 +804,7 @@ define <8 x i1> @strict_vector_fptoui_v8f64_to_v8i1(<8 x double> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f64(<8 x double> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <8 x i1> %ret
 }
 
@@ -814,7 +814,7 @@ define <16 x i32> @strict_vector_fptosi_v16f32_to_v16i32(<16 x float> %a) #0 {
 ; CHECK-NEXT:    vcvttps2dq %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i32> @llvm.experimental.constrained.fptosi.v16i32.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i32> %ret
 }
 
@@ -824,7 +824,7 @@ define <16 x i32> @strict_vector_fptoui_v16f32_to_v16i32(<16 x float> %a) #0 {
 ; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i32> @llvm.experimental.constrained.fptoui.v16i32.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i32> %ret
 }
 
@@ -835,7 +835,7 @@ define <16 x i16> @strict_vector_fptosi_v16f32_to_v16i16(<16 x float> %a) #0 {
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i16> @llvm.experimental.constrained.fptosi.v16i16.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i16> %ret
 }
 
@@ -846,7 +846,7 @@ define <16 x i16> @strict_vector_fptoui_v16f32_to_v16i16(<16 x float> %a) #0 {
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i16> @llvm.experimental.constrained.fptoui.v16i16.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i16> %ret
 }
 
@@ -858,7 +858,7 @@ define <16 x i8> @strict_vector_fptosi_v16f32_to_v16i8(<16 x float> %a) #0 {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i8> @llvm.experimental.constrained.fptosi.v16i8.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i8> %ret
 }
 
@@ -870,7 +870,7 @@ define <16 x i8> @strict_vector_fptoui_v16f32_to_v16i8(<16 x float> %a) #0 {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i8> @llvm.experimental.constrained.fptoui.v16i8.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i8> %ret
 }
 
@@ -893,7 +893,7 @@ define <16 x i1> @strict_vector_fptosi_v16f32_to_v16i1(<16 x float> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i1> @llvm.experimental.constrained.fptosi.v16i1.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i1> %ret
 }
 
@@ -918,7 +918,7 @@ define <16 x i1> @strict_vector_fptoui_v16f32_to_v16i1(<16 x float> %a) #0 {
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    ret{{[l|q]}}
   %ret = call <16 x i1> @llvm.experimental.constrained.fptoui.v16i1.v16f32(<16 x float> %a,
-                                              metadata !"fpexcept.strict")
+                                              metadata !"fpexcept.strict") #0
   ret <16 x i1> %ret
 }
 
diff --git a/llvm/test/CodeGen/X86/vec-strict-round-128.ll b/llvm/test/CodeGen/X86/vec-strict-round-128.ll
index 3f8b4f84a136d..1f7507cc02bc5 100644
--- a/llvm/test/CodeGen/X86/vec-strict-round-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-round-128.ll
@@ -28,7 +28,7 @@ define <4 x float> @fceilv4f32(<4 x float> %f) #0 {
 ; AVX-NEXT:    vroundps $10, %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.experimental.constrained.ceil.v4f32(
-                          <4 x float> %f, metadata !"fpexcept.strict")
+                          <4 x float> %f, metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
@@ -43,7 +43,7 @@ define <2 x double> @fceilv2f64(<2 x double> %f) #0 {
 ; AVX-NEXT:    vroundpd $10, %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(
-                        <2 x double> %f, metadata !"fpexcept.strict")
+                        <2 x double> %f, metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
@@ -58,7 +58,7 @@ define <4 x float> @ffloorv4f32(<4 x float> %f) #0 {
 ; AVX-NEXT:    vroundps $9, %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.experimental.constrained.floor.v4f32(
-                          <4 x float> %f, metadata !"fpexcept.strict")
+                          <4 x float> %f, metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
@@ -73,7 +73,7 @@ define <2 x double> @ffloorv2f64(<2 x double> %f) #0 {
 ; AVX-NEXT:    vroundpd $9, %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.experimental.constrained.floor.v2f64(
-                        <2 x double> %f, metadata !"fpexcept.strict")
+                        <2 x double> %f, metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
@@ -88,7 +88,7 @@ define <4 x float> @ftruncv4f32(<4 x float> %f) #0 {
 ; AVX-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.experimental.constrained.trunc.v4f32(
-                          <4 x float> %f, metadata !"fpexcept.strict")
+                          <4 x float> %f, metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
@@ -103,7 +103,7 @@ define <2 x double> @ftruncv2f64(<2 x double> %f) #0 {
 ; AVX-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.experimental.constrained.trunc.v2f64(
-                        <2 x double> %f, metadata !"fpexcept.strict")
+                        <2 x double> %f, metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
@@ -119,7 +119,7 @@ define <4 x float> @frintv4f32(<4 x float> %f) #0 {
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.experimental.constrained.rint.v4f32(
                           <4 x float> %f,
-                          metadata !"round.dynamic", metadata !"fpexcept.strict")
+                          metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
@@ -135,7 +135,7 @@ define <2 x double> @frintv2f64(<2 x double> %f) #0 {
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.experimental.constrained.rint.v2f64(
                         <2 x double> %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
@@ -151,7 +151,7 @@ define <4 x float> @fnearbyintv4f32(<4 x float> %f) #0 {
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(
                           <4 x float> %f,
-                          metadata !"round.dynamic", metadata !"fpexcept.strict")
+                          metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <4 x float> %res
 }
 
@@ -167,7 +167,7 @@ define <2 x double> @fnearbyintv2f64(<2 x double> %f) #0 {
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(
                         <2 x double> %f,
-                        metadata !"round.dynamic", metadata !"fpexcept.strict")
+                        metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <2 x double> %res
 }
 
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
index b1ef02c855a17..4dbb6d888febb 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
@@ -54,6 +54,8 @@ entry:
   ret <4 x double> %add
 }
 
+attributes #0 = { strictfp }
+
 declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index f1436324c2caf..85d5f580974cf 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -222,7 +222,7 @@ define <4 x float> @load_cvt_4i16_to_4f32_constrained(<4 x i16>* %a0) nounwind s
   ret <4 x float> %3
 }
 
-define <4 x float> @load_cvt_8i16_to_4f32_constrained(<8 x i16>* %a0) nounwind {
+define <4 x float> @load_cvt_8i16_to_4f32_constrained(<8 x i16>* %a0) nounwind strictfp {
 ; ALL-LABEL: load_cvt_8i16_to_4f32_constrained:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vcvtph2ps (%rdi), %xmm0

From 40c08367e411a178404d4f01a82f651188f2ed01 Mon Sep 17 00:00:00 2001
From: Guozhi Wei <carrot@google.com>
Date: Fri, 29 May 2020 09:31:26 -0700
Subject: [PATCH 543/770] [DAGCombiner] Add command line options to guard store
 width reduction optimizations

As discussed in the thread http://lists.llvm.org/pipermail/llvm-dev/2020-May/141838.html,
the width of some bit-field accesses can be reduced by ReduceLoadOpStoreWidth,
while others can't. If two accesses are very close and the width of the first
one is reduced but the second is not, the wide load of the second access will
be stalled for a long time.

This patch adds command line options to guard ReduceLoadOpStoreWidth and
ShrinkLoadReplaceStoreWithStore, so users can use them to disable these
store width reduction optimizations.

Differential Revision: https://reviews.llvm.org/D80745
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 +++++++++-
 llvm/test/CodeGen/X86/clear-bitfield.ll       | 30 +++++++++++++++++++
 llvm/test/CodeGen/X86/disable-shrink-store.ll | 18 +++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/X86/clear-bitfield.ll
 create mode 100644 llvm/test/CodeGen/X86/disable-shrink-store.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d54663f4ce784..43bcf2e118882 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -125,6 +125,16 @@ static cl::opt<unsigned> StoreMergeDependenceLimit(
     cl::desc("Limit the number of times for the same StoreNode and RootNode "
              "to bail out in store merging dependence check"));
 
+static cl::opt<bool> EnableReduceLoadOpStoreWidth(
+    "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
+    cl::desc("DAG combiner enable reducing the width of load/op/store "
+             "sequence"));
+
+static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
+    "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
+    cl::desc("DAG combiner enable load/<replace bytes>/store with "
+             "a narrower store"));
+
 namespace {
 
   class DAGCombiner {
@@ -15423,7 +15433,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
   // Y is known to provide just those bytes.  If so, we try to replace the
   // load + replace + store sequence with a single (narrower) store, which makes
   // the load dead.
-  if (Opc == ISD::OR) {
+  if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
     std::pair<unsigned, unsigned> MaskedLoad;
     MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
     if (MaskedLoad.first)
@@ -15439,6 +15449,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
         return NewST;
   }
 
+  if (!EnableReduceLoadOpStoreWidth)
+    return SDValue();
+
   if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
       Value.getOperand(1).getOpcode() != ISD::Constant)
     return SDValue();
diff --git a/llvm/test/CodeGen/X86/clear-bitfield.ll b/llvm/test/CodeGen/X86/clear-bitfield.ll
new file mode 100644
index 0000000000000..01c35becefab9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/clear-bitfield.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -combiner-reduce-load-op-store-width=false | FileCheck %s
+
+%struct.bit_fields = type { i32 }
+
+define void @clear_b1(%struct.bit_fields* %ptr) {
+; CHECK-LABEL: clear_b1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andl $-2, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast %struct.bit_fields* %ptr to i32*
+  %bf.load = load i32, i32* %0
+  %bf.clear = and i32 %bf.load, -2
+  store i32 %bf.clear, i32* %0
+  ret void
+}
+
+define void @clear16(%struct.bit_fields* %ptr) {
+; CHECK-LABEL: clear16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andw $-2, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast %struct.bit_fields* %ptr to i16*
+  %bf.load = load i16, i16* %0
+  %bf.clear = and i16 %bf.load, -2
+  store i16 %bf.clear, i16* %0
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/disable-shrink-store.ll b/llvm/test/CodeGen/X86/disable-shrink-store.ll
new file mode 100644
index 0000000000000..871fc47dbfdb2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/disable-shrink-store.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -combiner-shrink-load-replace-store-with-store=false | FileCheck %s
+
+define void @shrink(i16* %ptr) {
+; CHECK-LABEL: shrink:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    orl $25600, %eax # imm = 0x6400
+; CHECK-NEXT:    movw %ax, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %val = load i16, i16* %ptr
+  %masked_val = and i16 %val, 255
+  %replaced_val = or i16 %masked_val, 25600
+  store i16 %replaced_val, i16* %ptr
+  ret void
+}
+

From 31a8e27e1805e039d699afa890702e26a93e0b40 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Fri, 29 May 2020 09:43:02 -0700
Subject: [PATCH 544/770] Let @skipUnlessThreadSanitizer imply @skipIfAsan

Don't run tests that use thread sanitizer inside an address-sanitized
LLDB. The tests don't support that configuration. Incidentally they
were skipped on green dragon for a different reason, so this hasn't
come up there before.
---
 lldb/packages/Python/lldbsuite/test/decorators.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index b94b672e44999..d02c9468cdbcd 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -719,6 +719,9 @@ def skipUnlessThreadSanitizer(func):
     """Decorate the item to skip test unless Clang -fsanitize=thread is supported."""
 
     def is_compiler_clang_with_thread_sanitizer(self):
+        if is_running_under_asan():
+            return "Thread sanitizer tests are disabled when runing under ASAN"
+
         compiler_path = self.getCompiler()
         compiler = os.path.basename(compiler_path)
         if not compiler.startswith("clang"):

From c1a33299768f8404dd7b83a6cfef7c1155786c6a Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Fri, 29 May 2020 09:45:43 -0700
Subject: [PATCH 545/770] Let @skipUnlessUndefinedBehaviorSanitizer imply
 @skipIfAsan

Don't run tests that use undefined behavior sanitizer inside an
address-sanitized LLDB. The tests don't support that
configuration. Incidentally they were skipped on green dragon for a
different reason, so this hasn't come up there before.
---
 lldb/packages/Python/lldbsuite/test/decorators.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index d02c9468cdbcd..0ef2f931542c6 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -745,6 +745,9 @@ def skipUnlessUndefinedBehaviorSanitizer(func):
     """Decorate the item to skip test unless -fsanitize=undefined is supported."""
 
     def is_compiler_clang_with_ubsan(self):
+        if is_running_under_asan():
+            return "Undefined behavior sanitizer tests are disabled when runing under ASAN"
+
         # Write out a temp file which exhibits UB.
         inputf = tempfile.NamedTemporaryFile(suffix='.c', mode='w')
         inputf.write('int main() { int x = 0; return x / x; }\n')

From a01c0049b1a3583604259ae7bbe9761fc123adae Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 29 May 2020 16:26:44 +0100
Subject: [PATCH 546/770] [ConstantFolding] Constant folding for integer vector
 reduce intrinsics

This adds constant folding for all the integer vector reduce intrinsics,
provided that the argument is a constant vector. zeroinitializer always
produces 0 for all intrinsics, and other values can be handled with
APInt operators.

Differential Revision: https://reviews.llvm.org/D80516
---
 llvm/lib/Analysis/ConstantFolding.cpp         |  84 +++
 .../Analysis/ConstantFolding/vecreduce.ll     | 481 ++++++++++++++++++
 2 files changed, 565 insertions(+)
 create mode 100644 llvm/test/Analysis/ConstantFolding/vecreduce.ll

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 53f1c144c546a..88d9cc5403eb9 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1428,6 +1428,15 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::smul_fix_sat:
   case Intrinsic::bitreverse:
   case Intrinsic::is_constant:
+  case Intrinsic::experimental_vector_reduce_add:
+  case Intrinsic::experimental_vector_reduce_mul:
+  case Intrinsic::experimental_vector_reduce_and:
+  case Intrinsic::experimental_vector_reduce_or:
+  case Intrinsic::experimental_vector_reduce_xor:
+  case Intrinsic::experimental_vector_reduce_smin:
+  case Intrinsic::experimental_vector_reduce_smax:
+  case Intrinsic::experimental_vector_reduce_umin:
+  case Intrinsic::experimental_vector_reduce_umax:
     return true;
 
   // Floating point operations cannot be folded in strictfp functions in
@@ -1647,6 +1656,53 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V,
   return GetConstantFoldFPValue(V, Ty);
 }
 
+Constant *ConstantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
+  FixedVectorType *VT = dyn_cast<FixedVectorType>(Op->getType());
+  if (!VT)
+    return nullptr;
+  ConstantInt *CI = dyn_cast<ConstantInt>(Op->getAggregateElement(0U));
+  if (!CI)
+    return nullptr;
+  APInt Acc = CI->getValue();
+
+  for (unsigned I = 1; I < VT->getNumElements(); I++) {
+    if (!(CI = dyn_cast<ConstantInt>(Op->getAggregateElement(I))))
+      return nullptr;
+    const APInt &X = CI->getValue();
+    switch (IID) {
+    case Intrinsic::experimental_vector_reduce_add:
+      Acc = Acc + X;
+      break;
+    case Intrinsic::experimental_vector_reduce_mul:
+      Acc = Acc * X;
+      break;
+    case Intrinsic::experimental_vector_reduce_and:
+      Acc = Acc & X;
+      break;
+    case Intrinsic::experimental_vector_reduce_or:
+      Acc = Acc | X;
+      break;
+    case Intrinsic::experimental_vector_reduce_xor:
+      Acc = Acc ^ X;
+      break;
+    case Intrinsic::experimental_vector_reduce_smin:
+      Acc = APIntOps::smin(Acc, X);
+      break;
+    case Intrinsic::experimental_vector_reduce_smax:
+      Acc = APIntOps::smax(Acc, X);
+      break;
+    case Intrinsic::experimental_vector_reduce_umin:
+      Acc = APIntOps::umin(Acc, X);
+      break;
+    case Intrinsic::experimental_vector_reduce_umax:
+      Acc = APIntOps::umax(Acc, X);
+      break;
+    }
+  }
+
+  return ConstantInt::get(Op->getContext(), Acc);
+}
+
 /// Attempt to fold an SSE floating point to integer conversion of a constant
 /// floating point. If roundTowardZero is false, the default IEEE rounding is
 /// used (toward nearest, ties to even). This matches the behavior of the
@@ -2086,12 +2142,40 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     }
   }
 
+  if (isa<ConstantAggregateZero>(Operands[0])) {
+    switch (IntrinsicID) {
+    default: break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_umax:
+      return ConstantInt::get(Ty, 0);
+    }
+  }
+
   // Support ConstantVector in case we have an Undef in the top.
   if (isa<ConstantVector>(Operands[0]) ||
       isa<ConstantDataVector>(Operands[0])) {
     auto *Op = cast<Constant>(Operands[0]);
     switch (IntrinsicID) {
     default: break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_umax:
+      if (Constant *C = ConstantFoldVectorReduce(IntrinsicID, Op))
+        return C;
+      break;
     case Intrinsic::x86_sse_cvtss2si:
     case Intrinsic::x86_sse_cvtss2si64:
     case Intrinsic::x86_sse2_cvtsd2si:
diff --git a/llvm/test/Analysis/ConstantFolding/vecreduce.ll b/llvm/test/Analysis/ConstantFolding/vecreduce.ll
new file mode 100644
index 0000000000000..4d6ba1b45db10
--- /dev/null
+++ b/llvm/test/Analysis/ConstantFolding/vecreduce.ll
@@ -0,0 +1,481 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -constprop -S | FileCheck %s
+
+declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a)
+
+
+define i32 @add_0() {
+; CHECK-LABEL: @add_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @add_1() {
+; CHECK-LABEL: @add_1(
+; CHECK-NEXT:    ret i32 8
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @add_inc() {
+; CHECK-LABEL: @add_inc(
+; CHECK-NEXT:    ret i32 18
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @add_1v() {
+; CHECK-LABEL: @add_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @add_undef() {
+; CHECK-LABEL: @add_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @add_undef1() {
+; CHECK-LABEL: @add_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+
+define i32 @mul_0() {
+; CHECK-LABEL: @mul_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @mul_1() {
+; CHECK-LABEL: @mul_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @mul_inc() {
+; CHECK-LABEL: @mul_inc(
+; CHECK-NEXT:    ret i32 40320
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @mul_1v() {
+; CHECK-LABEL: @mul_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @mul_undef() {
+; CHECK-LABEL: @mul_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @mul_undef1() {
+; CHECK-LABEL: @mul_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+define i32 @and_0() {
+; CHECK-LABEL: @and_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @and_1() {
+; CHECK-LABEL: @and_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @and_inc() {
+; CHECK-LABEL: @and_inc(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @and_1v() {
+; CHECK-LABEL: @and_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @and_undef() {
+; CHECK-LABEL: @and_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @and_undef1() {
+; CHECK-LABEL: @and_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+define i32 @or_0() {
+; CHECK-LABEL: @or_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @or_1() {
+; CHECK-LABEL: @or_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @or_inc() {
+; CHECK-LABEL: @or_inc(
+; CHECK-NEXT:    ret i32 -1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @or_1v() {
+; CHECK-LABEL: @or_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @or_undef() {
+; CHECK-LABEL: @or_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @or_undef1() {
+; CHECK-LABEL: @or_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+define i32 @xor_0() {
+; CHECK-LABEL: @xor_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @xor_1() {
+; CHECK-LABEL: @xor_1(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @xor_inc() {
+; CHECK-LABEL: @xor_inc(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @xor_1v() {
+; CHECK-LABEL: @xor_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @xor_undef() {
+; CHECK-LABEL: @xor_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @xor_undef1() {
+; CHECK-LABEL: @xor_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+define i32 @smin_0() {
+; CHECK-LABEL: @smin_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @smin_1() {
+; CHECK-LABEL: @smin_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @smin_inc() {
+; CHECK-LABEL: @smin_inc(
+; CHECK-NEXT:    ret i32 -6
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @smin_1v() {
+; CHECK-LABEL: @smin_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @smin_undef() {
+; CHECK-LABEL: @smin_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @smin_undef1() {
+; CHECK-LABEL: @smin_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+define i32 @smax_0() {
+; CHECK-LABEL: @smax_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @smax_1() {
+; CHECK-LABEL: @smax_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @smax_inc() {
+; CHECK-LABEL: @smax_inc(
+; CHECK-NEXT:    ret i32 8
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @smax_1v() {
+; CHECK-LABEL: @smax_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @smax_undef() {
+; CHECK-LABEL: @smax_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @smax_undef1() {
+; CHECK-LABEL: @smax_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+define i32 @umin_0() {
+; CHECK-LABEL: @umin_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @umin_1() {
+; CHECK-LABEL: @umin_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @umin_inc() {
+; CHECK-LABEL: @umin_inc(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @umin_1v() {
+; CHECK-LABEL: @umin_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @umin_undef() {
+; CHECK-LABEL: @umin_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @umin_undef1() {
+; CHECK-LABEL: @umin_undef1(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+
+define i32 @umax_0() {
+; CHECK-LABEL: @umax_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @umax_1() {
+; CHECK-LABEL: @umax_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @umax_inc() {
+; CHECK-LABEL: @umax_inc(
+; CHECK-NEXT:    ret i32 -3
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 -3, i32 5, i32 7, i32 2, i32 4, i32 -6, i32 8>)
+  ret i32 %x
+}
+
+define i32 @umax_1v() {
+; CHECK-LABEL: @umax_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @umax_undef() {
+; CHECK-LABEL: @umax_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
+  ret i32 %x
+}
+
+define i32 @umax_undef1d() {
+; CHECK-LABEL: @umax_undef1d(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}

From 8c2d2d971b2a221786e29380666d9900adeaee6a Mon Sep 17 00:00:00 2001
From: Paul Robinson <paul.robinson@sony.com>
Date: Thu, 28 May 2020 13:19:52 -0700
Subject: [PATCH 547/770] Preserve DbgLoc when DeadArgumentElimination rewrites
 a 'ret'.

Fixes PR46002.
---
 llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp         | 3 ++-
 llvm/test/Transforms/DeadArgElim/dbginfo-preserve-dbgloc.ll | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 5dc1a6ff63279..00fcba40c36f4 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1056,7 +1056,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
         }
         // Replace the return instruction with one returning the new return
         // value (possibly 0 if we became void).
-        ReturnInst::Create(F->getContext(), RetVal, RI);
+        auto *NewRet = ReturnInst::Create(F->getContext(), RetVal, RI);
+        NewRet->setDebugLoc(RI->getDebugLoc());
         BB.getInstList().erase(RI);
       }
 
diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo-preserve-dbgloc.ll b/llvm/test/Transforms/DeadArgElim/dbginfo-preserve-dbgloc.ll
index de5082d99bf5a..ab7645ecf6106 100644
--- a/llvm/test/Transforms/DeadArgElim/dbginfo-preserve-dbgloc.ll
+++ b/llvm/test/Transforms/DeadArgElim/dbginfo-preserve-dbgloc.ll
@@ -63,6 +63,7 @@ if.end3:                                          ; preds = %if.end
 
 cleanup:                                          ; preds = %if.end3, %if.then2, %if.then
   %retval.0 = phi i1 [ false, %if.then2 ], [ true, %if.end3 ], [ false, %if.then ]
+; CHECK: ret void, !dbg
   ret i1 %retval.0, !dbg !56
 }
 

From 0b21c6706a48c013d2b3fc4a258a0f7437b0781a Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 12:18:34 -0400
Subject: [PATCH 548/770] [LoopVectorize] auto-generate complete test checks;
 NFC

---
 .../X86/invariant-store-vectorization.ll      | 181 ++++++++++++------
 1 file changed, 124 insertions(+), 57 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index 69f578cf789eb..6ba61396afd40 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -6,39 +6,106 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; first test checks that loop with a reduction and a uniform store gets
 ; vectorized.
-; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction
-; CHECK-LABEL: vector.memcheck:
-; CHECK:    found.conflict
 
-; CHECK-LABEL: vector.body:
-; CHECK:         %vec.phi = phi <16 x i32>  [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK:         %wide.load = load <16 x i32>
-; CHECK:         [[ADD]] = add <16 x i32> %vec.phi, %wide.load
-; CHECK:         store i32 %ntrunc, i32* %a
-; CHECK-NOT:     store i32 %ntrunc, i32* %a
-; CHECK:         %index.next = add i64 %index, 64
-
-; CHECK-LABEL: middle.block:
-; CHECK:         %rdx.shuf = shufflevector <16 x i32>
 define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 64
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX3:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX3]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775744
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI7:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP10]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP11]] = add <16 x i32> [[VEC_PHI5]], [[WIDE_LOAD8]]
+; CHECK-NEXT:    [[TMP12]] = add <16 x i32> [[VEC_PHI6]], [[WIDE_LOAD9]]
+; CHECK-NEXT:    [[TMP13]] = add <16 x i32> [[VEC_PHI7]], [[WIDE_LOAD10]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]]
+; CHECK-NEXT:    [[BIN_RDX12:%.*]] = add <16 x i32> [[TMP13]], [[BIN_RDX11]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[BIN_RDX12]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX13:%.*]] = add <16 x i32> [[BIN_RDX12]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF14:%.*]] = shufflevector <16 x i32> [[BIN_RDX13]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX15:%.*]] = add <16 x i32> [[BIN_RDX13]], [[RDX_SHUF14]]
+; CHECK-NEXT:    [[RDX_SHUF16:%.*]] = shufflevector <16 x i32> [[BIN_RDX15]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX17:%.*]] = add <16 x i32> [[BIN_RDX15]], [[RDX_SHUF16]]
+; CHECK-NEXT:    [[RDX_SHUF18:%.*]] = shufflevector <16 x i32> [[BIN_RDX17]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX19:%.*]] = add <16 x i32> [[BIN_RDX17]], [[RDX_SHUF18]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i32> [[BIN_RDX19]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T1]], align 8
+; CHECK-NEXT:    [[T3]] = add i32 [[T0]], [[T2]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7
+; CHECK:       for.end:
+; CHECK-NEXT:    [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[T4]]
+;
 entry:
   %ntrunc = trunc i64 %n to i32
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
-  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %tmp2 = load i32, i32* %tmp1, align 8
-  %tmp3 = add i32 %tmp0, %tmp2
+  %t0 = phi i32 [ %t3, %for.body ], [ 0, %entry ]
+  %t1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %t2 = load i32, i32* %t1, align 8
+  %t3 = add i32 %t0, %t2
   store i32 %ntrunc, i32* %a
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
 
 for.end:                                          ; preds = %for.body
-  %tmp4 = phi i32 [ %tmp3, %for.body ]
-  ret i32 %tmp4
+  %t4 = phi i32 [ %t3, %for.body ]
+  ret i32 %t4
 }
 
 ; Conditional store
@@ -64,22 +131,22 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b,
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT7]], <16 x i32*> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !8, !noalias !11
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT8]], <16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT6]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT6]], <16 x i32*> [[BROADCAST_SPLAT8]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13
@@ -91,10 +158,10 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b,
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[T1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
 ; CHECK:       cond_store:
 ; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
@@ -112,10 +179,10 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %tmp2 = load i32, i32* %tmp1, align 8
-  %cmp = icmp eq i32 %tmp2, %k
-  store i32 %ntrunc, i32* %tmp1
+  %t1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %t2 = load i32, i32* %t1, align 8
+  %cmp = icmp eq i32 %t2, %k
+  store i32 %ntrunc, i32* %t1
   br i1 %cmp, label %cond_store, label %latch
 
 cond_store:
@@ -162,25 +229,25 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32*
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT16]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT18]], <16 x i32*> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !15, !noalias !18
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT17]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
-; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT17]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !21
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT19]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23
@@ -192,15 +259,15 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32*
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
-; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[T1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[T1]], align 4
 ; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
 ; CHECK:       cond_store:
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 8
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[T4:%.*]] = load i32, i32* [[T3]], align 8
+; CHECK-NEXT:    store i32 [[T4]], i32* [[A]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
@@ -215,16 +282,16 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
-  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
-  %tmp2 = load i32, i32* %tmp1, align 8
-  %cmp = icmp eq i32 %tmp2, %k
-  store i32 %ntrunc, i32* %tmp1
+  %t1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %t2 = load i32, i32* %t1, align 8
+  %cmp = icmp eq i32 %t2, %k
+  store i32 %ntrunc, i32* %t1
   br i1 %cmp, label %cond_store, label %latch
 
 cond_store:
-  %tmp3 = getelementptr inbounds i32, i32* %c, i64 %i
-  %tmp4 = load i32, i32* %tmp3, align 8
-  store i32 %tmp4, i32* %a
+  %t3 = getelementptr inbounds i32, i32* %c, i64 %i
+  %t4 = load i32, i32* %t3, align 8
+  store i32 %t4, i32* %a
   br label %latch
 
 latch:

From 9d1f95bf9f55715ecc127d6f425696ff94c614fe Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 12:19:46 -0400
Subject: [PATCH 549/770] [LoopVectorize] regenerate test checks; NFC

Align attributes are now visible.
---
 .../LoopVectorize/X86/load-deref-pred.ll      | 448 +++++++++---------
 1 file changed, 224 insertions(+), 224 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 3c7dae3d71fb8..c33e728f78296 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -17,7 +17,7 @@ declare void @init(i32*)
 define i32 @test_explicit_pred(i64 %len) {
 ; CHECK-LABEL: @test_explicit_pred(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -116,7 +116,7 @@ define i32 @test_explicit_pred(i64 %len) {
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -156,7 +156,7 @@ loop_exit:
 define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @test_explicit_pred_generic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -200,34 +200,34 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -283,11 +283,11 @@ define i32 @test_explicit_pred_generic(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -331,7 +331,7 @@ loop_exit:
 define i32 @test_invariant_address(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @test_invariant_address(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -375,66 +375,66 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP65:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP67:%.*]] = load i32, i32* [[BASE]]
+; CHECK-NEXT:    [[TMP64:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP65:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP67:%.*]] = load i32, i32* [[BASE]], align 4
 ; CHECK-NEXT:    [[TMP68:%.*]] = insertelement <4 x i32> undef, i32 [[TMP64]], i32 0
 ; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1
 ; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2
 ; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3
-; CHECK-NEXT:    [[TMP72:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP73:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP74:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* [[BASE]]
+; CHECK-NEXT:    [[TMP72:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP73:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP74:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* [[BASE]], align 4
 ; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> undef, i32 [[TMP72]], i32 0
 ; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2
 ; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3
-; CHECK-NEXT:    [[TMP80:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP82:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP83:%.*]] = load i32, i32* [[BASE]]
+; CHECK-NEXT:    [[TMP80:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP82:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP83:%.*]] = load i32, i32* [[BASE]], align 4
 ; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <4 x i32> undef, i32 [[TMP80]], i32 0
 ; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
 ; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
 ; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
-; CHECK-NEXT:    [[TMP88:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP89:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP90:%.*]] = load i32, i32* [[BASE]]
-; CHECK-NEXT:    [[TMP91:%.*]] = load i32, i32* [[BASE]]
+; CHECK-NEXT:    [[TMP88:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP89:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP90:%.*]] = load i32, i32* [[BASE]], align 4
+; CHECK-NEXT:    [[TMP91:%.*]] = load i32, i32* [[BASE]], align 4
 ; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> undef, i32 [[TMP88]], i32 0
 ; CHECK-NEXT:    [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
 ; CHECK-NEXT:    [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
@@ -474,10 +474,10 @@ define i32 @test_invariant_address(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[BASE]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[BASE]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -517,7 +517,7 @@ loop_exit:
 define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @test_step_narrower_than_access(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -561,34 +561,34 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -599,7 +599,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP65:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = bitcast i16* [[TMP66]] to i32*
-; CHECK-NEXT:    [[TMP68:%.*]] = load i32, i32* [[TMP67]]
+; CHECK-NEXT:    [[TMP68:%.*]] = load i32, i32* [[TMP67]], align 4
 ; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i32> undef, i32 [[TMP68]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK:       pred.load.continue:
@@ -610,7 +610,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32*
-; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* [[TMP74]]
+; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* [[TMP74]], align 4
 ; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
 ; CHECK:       pred.load.continue5:
@@ -621,7 +621,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32*
-; CHECK-NEXT:    [[TMP82:%.*]] = load i32, i32* [[TMP81]]
+; CHECK-NEXT:    [[TMP82:%.*]] = load i32, i32* [[TMP81]], align 4
 ; CHECK-NEXT:    [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
 ; CHECK:       pred.load.continue7:
@@ -632,7 +632,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32*
-; CHECK-NEXT:    [[TMP89:%.*]] = load i32, i32* [[TMP88]]
+; CHECK-NEXT:    [[TMP89:%.*]] = load i32, i32* [[TMP88]], align 4
 ; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
 ; CHECK:       pred.load.continue9:
@@ -643,7 +643,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32*
-; CHECK-NEXT:    [[TMP96:%.*]] = load i32, i32* [[TMP95]]
+; CHECK-NEXT:    [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x i32> undef, i32 [[TMP96]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
 ; CHECK:       pred.load.continue11:
@@ -654,7 +654,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32*
-; CHECK-NEXT:    [[TMP103:%.*]] = load i32, i32* [[TMP102]]
+; CHECK-NEXT:    [[TMP103:%.*]] = load i32, i32* [[TMP102]], align 4
 ; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
 ; CHECK:       pred.load.continue13:
@@ -665,7 +665,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32*
-; CHECK-NEXT:    [[TMP110:%.*]] = load i32, i32* [[TMP109]]
+; CHECK-NEXT:    [[TMP110:%.*]] = load i32, i32* [[TMP109]], align 4
 ; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
 ; CHECK:       pred.load.continue15:
@@ -676,7 +676,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32*
-; CHECK-NEXT:    [[TMP117:%.*]] = load i32, i32* [[TMP116]]
+; CHECK-NEXT:    [[TMP117:%.*]] = load i32, i32* [[TMP116]], align 4
 ; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP117]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
 ; CHECK:       pred.load.continue17:
@@ -687,7 +687,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32*
-; CHECK-NEXT:    [[TMP124:%.*]] = load i32, i32* [[TMP123]]
+; CHECK-NEXT:    [[TMP124:%.*]] = load i32, i32* [[TMP123]], align 4
 ; CHECK-NEXT:    [[TMP125:%.*]] = insertelement <4 x i32> undef, i32 [[TMP124]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
 ; CHECK:       pred.load.continue19:
@@ -698,7 +698,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP128:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32*
-; CHECK-NEXT:    [[TMP131:%.*]] = load i32, i32* [[TMP130]]
+; CHECK-NEXT:    [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4
 ; CHECK-NEXT:    [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
 ; CHECK:       pred.load.continue21:
@@ -709,7 +709,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32*
-; CHECK-NEXT:    [[TMP138:%.*]] = load i32, i32* [[TMP137]]
+; CHECK-NEXT:    [[TMP138:%.*]] = load i32, i32* [[TMP137]], align 4
 ; CHECK-NEXT:    [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE23]]
 ; CHECK:       pred.load.continue23:
@@ -720,7 +720,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32*
-; CHECK-NEXT:    [[TMP145:%.*]] = load i32, i32* [[TMP144]]
+; CHECK-NEXT:    [[TMP145:%.*]] = load i32, i32* [[TMP144]], align 4
 ; CHECK-NEXT:    [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE25]]
 ; CHECK:       pred.load.continue25:
@@ -731,7 +731,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32*
-; CHECK-NEXT:    [[TMP152:%.*]] = load i32, i32* [[TMP151]]
+; CHECK-NEXT:    [[TMP152:%.*]] = load i32, i32* [[TMP151]], align 4
 ; CHECK-NEXT:    [[TMP153:%.*]] = insertelement <4 x i32> undef, i32 [[TMP152]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE27]]
 ; CHECK:       pred.load.continue27:
@@ -742,7 +742,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP156:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP157:%.*]] = getelementptr inbounds i16, i16* [[TMP156]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32*
-; CHECK-NEXT:    [[TMP159:%.*]] = load i32, i32* [[TMP158]]
+; CHECK-NEXT:    [[TMP159:%.*]] = load i32, i32* [[TMP158]], align 4
 ; CHECK-NEXT:    [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE29]]
 ; CHECK:       pred.load.continue29:
@@ -753,7 +753,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32*
-; CHECK-NEXT:    [[TMP166:%.*]] = load i32, i32* [[TMP165]]
+; CHECK-NEXT:    [[TMP166:%.*]] = load i32, i32* [[TMP165]], align 4
 ; CHECK-NEXT:    [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE31]]
 ; CHECK:       pred.load.continue31:
@@ -764,7 +764,7 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32*
-; CHECK-NEXT:    [[TMP173:%.*]] = load i32, i32* [[TMP172]]
+; CHECK-NEXT:    [[TMP173:%.*]] = load i32, i32* [[TMP172]], align 4
 ; CHECK-NEXT:    [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE33]]
 ; CHECK:       pred.load.continue33:
@@ -804,13 +804,13 @@ define i32 @test_step_narrower_than_access(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[BASE_I16P:%.*]] = bitcast i32* [[BASE]] to i16*
 ; CHECK-NEXT:    [[ADDR_I16P:%.*]] = getelementptr inbounds i16, i16* [[BASE_I16P]], i64 [[IV]]
 ; CHECK-NEXT:    [[ADDR:%.*]] = bitcast i16* [[ADDR_I16P]] to i32*
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -852,7 +852,7 @@ loop_exit:
 define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
 ; CHECK-LABEL: @test_max_trip_count(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    [[MIN_CMP:%.*]] = icmp ult i64 4096, [[N:%.*]]
@@ -902,34 +902,34 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
-; CHECK-NEXT:    [[TMP36:%.*]] = load i1, i1* [[TMP20]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> undef, i1 [[TMP33]], i32 0
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2
 ; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
-; CHECK-NEXT:    [[TMP44:%.*]] = load i1, i1* [[TMP24]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> undef, i1 [[TMP41]], i32 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2
 ; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
-; CHECK-NEXT:    [[TMP52:%.*]] = load i1, i1* [[TMP28]]
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> undef, i1 [[TMP49]], i32 0
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2
 ; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
-; CHECK-NEXT:    [[TMP60:%.*]] = load i1, i1* [[TMP32]]
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> undef, i1 [[TMP57]], i32 0
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
@@ -985,11 +985,11 @@ define i32 @test_max_trip_count(i64 %len, i1* %test_base, i64 %n) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1034,7 +1034,7 @@ loop_exit:
 define i32 @test_non_zero_start(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @test_non_zero_start(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1079,34 +1079,34 @@ define i32 @test_non_zero_start(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1162,11 +1162,11 @@ define i32 @test_non_zero_start(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1206,7 +1206,7 @@ loop_exit:
 define i32 @neg_out_of_bounds_start(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @neg_out_of_bounds_start(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1215,11 +1215,11 @@ define i32 @neg_out_of_bounds_start(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1261,7 +1261,7 @@ loop_exit:
 define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @test_non_unit_stride(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1306,34 +1306,34 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1342,7 +1342,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
 ; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[TMP65]]
+; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* [[TMP65]], align 4
 ; CHECK-NEXT:    [[TMP67:%.*]] = insertelement <4 x i32> undef, i32 [[TMP66]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK:       pred.load.continue:
@@ -1351,7 +1351,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
 ; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP71:%.*]] = load i32, i32* [[TMP70]]
+; CHECK-NEXT:    [[TMP71:%.*]] = load i32, i32* [[TMP70]], align 4
 ; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
 ; CHECK:       pred.load.continue5:
@@ -1360,7 +1360,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]]
 ; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP76:%.*]] = load i32, i32* [[TMP75]]
+; CHECK-NEXT:    [[TMP76:%.*]] = load i32, i32* [[TMP75]], align 4
 ; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
 ; CHECK:       pred.load.continue7:
@@ -1369,7 +1369,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
 ; CHECK:       pred.load.if8:
 ; CHECK-NEXT:    [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* [[TMP80]]
+; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* [[TMP80]], align 4
 ; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
 ; CHECK:       pred.load.continue9:
@@ -1378,7 +1378,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
 ; CHECK:       pred.load.if10:
 ; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP86:%.*]] = load i32, i32* [[TMP85]]
+; CHECK-NEXT:    [[TMP86:%.*]] = load i32, i32* [[TMP85]], align 4
 ; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <4 x i32> undef, i32 [[TMP86]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
 ; CHECK:       pred.load.continue11:
@@ -1387,7 +1387,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
 ; CHECK:       pred.load.if12:
 ; CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP91:%.*]] = load i32, i32* [[TMP90]]
+; CHECK-NEXT:    [[TMP91:%.*]] = load i32, i32* [[TMP90]], align 4
 ; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
 ; CHECK:       pred.load.continue13:
@@ -1396,7 +1396,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
 ; CHECK:       pred.load.if14:
 ; CHECK-NEXT:    [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP96:%.*]] = load i32, i32* [[TMP95]]
+; CHECK-NEXT:    [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
 ; CHECK:       pred.load.continue15:
@@ -1405,7 +1405,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
 ; CHECK:       pred.load.if16:
 ; CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP101:%.*]] = load i32, i32* [[TMP100]]
+; CHECK-NEXT:    [[TMP101:%.*]] = load i32, i32* [[TMP100]], align 4
 ; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
 ; CHECK:       pred.load.continue17:
@@ -1414,7 +1414,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
 ; CHECK:       pred.load.if18:
 ; CHECK-NEXT:    [[TMP105:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP106:%.*]] = load i32, i32* [[TMP105]]
+; CHECK-NEXT:    [[TMP106:%.*]] = load i32, i32* [[TMP105]], align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = insertelement <4 x i32> undef, i32 [[TMP106]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
 ; CHECK:       pred.load.continue19:
@@ -1423,7 +1423,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
 ; CHECK:       pred.load.if20:
 ; CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP111:%.*]] = load i32, i32* [[TMP110]]
+; CHECK-NEXT:    [[TMP111:%.*]] = load i32, i32* [[TMP110]], align 4
 ; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
 ; CHECK:       pred.load.continue21:
@@ -1432,7 +1432,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]]
 ; CHECK:       pred.load.if22:
 ; CHECK-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP116:%.*]] = load i32, i32* [[TMP115]]
+; CHECK-NEXT:    [[TMP116:%.*]] = load i32, i32* [[TMP115]], align 4
 ; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE23]]
 ; CHECK:       pred.load.continue23:
@@ -1441,7 +1441,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]]
 ; CHECK:       pred.load.if24:
 ; CHECK-NEXT:    [[TMP120:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP121:%.*]] = load i32, i32* [[TMP120]]
+; CHECK-NEXT:    [[TMP121:%.*]] = load i32, i32* [[TMP120]], align 4
 ; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE25]]
 ; CHECK:       pred.load.continue25:
@@ -1450,7 +1450,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]]
 ; CHECK:       pred.load.if26:
 ; CHECK-NEXT:    [[TMP125:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP126:%.*]] = load i32, i32* [[TMP125]]
+; CHECK-NEXT:    [[TMP126:%.*]] = load i32, i32* [[TMP125]], align 4
 ; CHECK-NEXT:    [[TMP127:%.*]] = insertelement <4 x i32> undef, i32 [[TMP126]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE27]]
 ; CHECK:       pred.load.continue27:
@@ -1459,7 +1459,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]]
 ; CHECK:       pred.load.if28:
 ; CHECK-NEXT:    [[TMP130:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP131:%.*]] = load i32, i32* [[TMP130]]
+; CHECK-NEXT:    [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4
 ; CHECK-NEXT:    [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE29]]
 ; CHECK:       pred.load.continue29:
@@ -1468,7 +1468,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]]
 ; CHECK:       pred.load.if30:
 ; CHECK-NEXT:    [[TMP135:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP136:%.*]] = load i32, i32* [[TMP135]]
+; CHECK-NEXT:    [[TMP136:%.*]] = load i32, i32* [[TMP135]], align 4
 ; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE31]]
 ; CHECK:       pred.load.continue31:
@@ -1477,7 +1477,7 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]]
 ; CHECK:       pred.load.if32:
 ; CHECK-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP141:%.*]] = load i32, i32* [[TMP140]]
+; CHECK-NEXT:    [[TMP141:%.*]] = load i32, i32* [[TMP140]], align 4
 ; CHECK-NEXT:    [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE33]]
 ; CHECK:       pred.load.continue33:
@@ -1517,11 +1517,11 @@ define i32 @test_non_unit_stride(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 2
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1561,7 +1561,7 @@ loop_exit:
 define i32 @neg_off_by_many(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @neg_off_by_many(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [1024 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [1024 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1605,34 +1605,34 @@ define i32 @neg_off_by_many(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1688,11 +1688,11 @@ define i32 @neg_off_by_many(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1732,7 +1732,7 @@ loop_exit:
 define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @neg_off_by_one_iteration(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4095 x i32]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4095 x i32], align 4
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4095 x i32]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1776,34 +1776,34 @@ define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -1859,11 +1859,11 @@ define i32 @neg_off_by_one_iteration(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
@@ -1903,7 +1903,7 @@ loop_exit:
 define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) {
 ; CHECK-LABEL: @neg_off_by_one_byte(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [16383 x i8]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [16383 x i8], align 1
 ; CHECK-NEXT:    [[BASE:%.*]] = bitcast [16383 x i8]* [[ALLOCA]] to i32*
 ; CHECK-NEXT:    call void @init(i32* [[BASE]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -1947,34 +1947,34 @@ define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]]
-; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]]
-; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]]
-; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> undef, i1 [[TMP32]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3
-; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i1> undef, i1 [[TMP40]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3
-; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]]
-; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]]
-; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]]
-; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i1> undef, i1 [[TMP48]], i32 0
 ; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1
 ; CHECK-NEXT:    [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3
-; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]]
-; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]]
-; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]]
+; CHECK-NEXT:    [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i1> undef, i1 [[TMP56]], i32 0
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
@@ -2030,11 +2030,11 @@ define i32 @neg_off_by_one_byte(i64 %len, i1* %test_base) {
 ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]]
+; CHECK-NEXT:    [[EARLYCND:%.*]] = load i1, i1* [[TEST_ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]], align 4
 ; CHECK-NEXT:    br label [[LATCH]]
 ; CHECK:       latch:
 ; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]

From 5e9427322721a24f23d73dd1627fc8848c9dcba1 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 12:29:09 -0400
Subject: [PATCH 550/770] [LoopVectorize] auto-generate complete checks; NFC

---
 .../LoopVectorize/X86/tail_loop_folding.ll    | 288 +++++++++++++++---
 1 file changed, 242 insertions(+), 46 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
index 7fc8d518e560d..6f8d28fee0504 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -1,28 +1,59 @@
-; RUN: opt < %s -loop-vectorize -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck -check-prefix=PREDFLAG %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=CHECK,PREDFLAG
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
-; CHECK-LABEL: tail_folding_enabled(
-; CHECK:  vector.body:
-; CHECK:  %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
-; CHECK:  %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
-; CHECK:  %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load
-; CHECK:  call void @llvm.masked.store.v8i32.p0v8i32(
-; CHECK:  %index.next = add i64 %index, 8
-; CHECK:  %12 = icmp eq i64 %index.next, 432
-; CHECK:  br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0
-; PREDFLAG-LABEL: tail_folding_enabled(
-; PREDFLAG:  vector.body:
-; PREDFLAG:  %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
-; PREDFLAG:  %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
-; PREDFLAG:  %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG:  call void @llvm.masked.store.v8i32.p0v8i32(
-; PREDFLAG:  %index.next = add i64 %index, 8
-; PREDFLAG:  %12 = icmp eq i64 %index.next, 432
-; PREDFLAG:  br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0
+; CHECK-LABEL: @tail_folding_enabled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2
+;
 entry:
   br label %for.body
 
@@ -44,20 +75,141 @@ for.body:
 }
 
 define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
-; CHECK-LABEL: tail_folding_disabled(
-; CHECK:      vector.body:
-; CHECK-NOT:  @llvm.masked.load.v8i32.p0v8i32(
-; CHECK-NOT:  @llvm.masked.store.v8i32.p0v8i32(
-; CHECK:      br i1 %44, label {{.*}}, label %vector.body
-; PREDFLAG-LABEL: tail_folding_disabled(
-; PREDFLAG:  vector.body:
-; PREDFLAG:  %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
-; PREDFLAG:  %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
-; PREDFLAG:  %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG:  call void @llvm.masked.store.v8i32.p0v8i32(
-; PREDFLAG:  %index.next = add i64 %index, 8
-; PREDFLAG:  %12 = icmp eq i64 %index.next, 432
-; PREDFLAG:  br i1 %12, label %middle.block, label %vector.body, !llvm.loop !4
+; DEFAULT-LABEL: @tail_folding_disabled(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
+; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
+; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
+; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
+; DEFAULT-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
+; DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
+; DEFAULT-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
+; DEFAULT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
+; DEFAULT-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
+; DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP1]]
+; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP2]]
+; DEFAULT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
+; DEFAULT-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i32>, <8 x i32>* [[TMP21]], align 4
+; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 8
+; DEFAULT-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP23]], align 4
+; DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 16
+; DEFAULT-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP25]], align 4
+; DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 24
+; DEFAULT-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
+; DEFAULT-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP27]], align 4
+; DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <8 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
+; DEFAULT-NEXT:    [[TMP29:%.*]] = add nsw <8 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD1]]
+; DEFAULT-NEXT:    [[TMP30:%.*]] = add nsw <8 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
+; DEFAULT-NEXT:    [[TMP31:%.*]] = add nsw <8 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
+; DEFAULT-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
+; DEFAULT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
+; DEFAULT-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 0
+; DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>*
+; DEFAULT-NEXT:    store <8 x i32> [[TMP28]], <8 x i32>* [[TMP37]], align 4
+; DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 8
+; DEFAULT-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>*
+; DEFAULT-NEXT:    store <8 x i32> [[TMP29]], <8 x i32>* [[TMP39]], align 4
+; DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 16
+; DEFAULT-NEXT:    [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>*
+; DEFAULT-NEXT:    store <8 x i32> [[TMP30]], <8 x i32>* [[TMP41]], align 4
+; DEFAULT-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 24
+; DEFAULT-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
+; DEFAULT-NEXT:    store <8 x i32> [[TMP31]], <8 x i32>* [[TMP43]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; DEFAULT-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 416
+; DEFAULT-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 430, 416
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 416, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.cond.cleanup:
+; DEFAULT-NEXT:    ret void
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; DEFAULT-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; DEFAULT-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; DEFAULT-NEXT:    [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; DEFAULT-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP46]], [[TMP45]]
+; DEFAULT-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; DEFAULT-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
+; DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
+;
+; PREDFLAG-LABEL: @tail_folding_disabled(
+; PREDFLAG-NEXT:  entry:
+; PREDFLAG-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDFLAG:       vector.ph:
+; PREDFLAG-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDFLAG:       vector.body:
+; PREDFLAG-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDFLAG-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; PREDFLAG-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; PREDFLAG-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; PREDFLAG-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; PREDFLAG-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; PREDFLAG-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; PREDFLAG-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; PREDFLAG-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; PREDFLAG-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
+; PREDFLAG-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; PREDFLAG-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; PREDFLAG-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; PREDFLAG-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; PREDFLAG-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; PREDFLAG-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; PREDFLAG-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP2]])
+; PREDFLAG-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; PREDFLAG-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
+; PREDFLAG-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; PREDFLAG:       middle.block:
+; PREDFLAG-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; PREDFLAG:       scalar.ph:
+; PREDFLAG-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PREDFLAG-NEXT:    br label [[FOR_BODY:%.*]]
+; PREDFLAG:       for.cond.cleanup:
+; PREDFLAG-NEXT:    ret void
+; PREDFLAG:       for.body:
+; PREDFLAG-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PREDFLAG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; PREDFLAG-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; PREDFLAG-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; PREDFLAG-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; PREDFLAG-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+; PREDFLAG-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; PREDFLAG-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; PREDFLAG-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; PREDFLAG-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
+; PREDFLAG-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
+;
 entry:
   br label %for.body
 
@@ -87,30 +239,74 @@ for.body:
 ;   return sum;
 ; }
 ;
+
 define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) #0 {
 ; CHECK-LABEL: @reduction_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], 7
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[ACCUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ [[ACCUM:%.*]], %vector.body ]
-; CHECK:         [[ICMPULE:%.*]] = icmp ule <8 x i64>
-; CHECK:         [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
-; CHECK:         [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]]
-; CHECK-NEXT:    [[ACCUM]] = add <8 x i32> [[ADD]], [[ACCUM_PHI]]
-; CHECK:         [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <8 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP8]], i32 4, <8 x i1> [[TMP6]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP6]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP13]] = add <8 x i32> [[TMP12]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP15:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP13]], <8 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[LIVEOUT]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[LIVEOUT]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP15]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX5:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
 ; CHECK-NEXT:    [[RDX_SHUF6:%.*]] = shufflevector <8 x i32> [[BIN_RDX5]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX7:%.*]] = add <8 x i32> [[BIN_RDX5]], [[RDX_SHUF6]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[BIN_RDX7]], i32 0
-; CHECK-NEXT:    br i1 true, label %for.cond.cleanup, label %scalar.ph
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[SUM_0:%.*]] = phi i32 [ [[SUM_1:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP18]]
+; CHECK-NEXT:    [[SUM_1]] = add nuw nsw i32 [[ADD]], [[SUM_0]]
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !7
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ {{.*}}, %for.body ], [ [[TMP17]], %middle.block ]
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_1_LCSSA]]
 ;
 entry:

From f78eecbb93ca0e6f740a001e2325704dca920f00 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 13:02:45 -0400
Subject: [PATCH 551/770] [LoopVectorize] regenerate test checks; NFC

Align attributes are now visible.
---
 .../Transforms/LoopVectorize/X86/imprecise-through-phis.ll  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index 720a0cc4700d6..16020c6daa7c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -17,7 +17,7 @@ define double @sumIfScalar(double* nocapture readonly %arr) {
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
 ; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
-; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
 ; CHECK-NEXT:    [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
 ; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
 ; CHECK:       do.add:
@@ -72,7 +72,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; SSE-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
 ; SSE-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
 ; SSE-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
-; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
 ; SSE-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
 ; SSE-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
 ; SSE:       do.add:
@@ -125,7 +125,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; AVX-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
 ; AVX-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
 ; AVX-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
-; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
 ; AVX-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
 ; AVX-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
 ; AVX:       do.add:

From b82a95f8bc91976a9ba663f8fa2edf15708b5c0f Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 09:47:43 -0700
Subject: [PATCH 552/770] [SVE] Eliminate calls to default-false
 VectorType::get() from polly

Reviewers: bollu, efriedma, david-arm, fpetrogalli, gchatelet

Reviewed By: fpetrogalli

Subscribers: tschuett, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80341
---
 polly/lib/CodeGen/BlockGenerators.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
index 5a64cc86ea1d2..bf6c96b8091d3 100644
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -1025,7 +1025,7 @@ Value *VectorBlockGenerator::getVectorValue(ScopStmt &Stmt, Value *Old,
 
   int Width = getVectorWidth();
 
-  Value *Vector = UndefValue::get(VectorType::get(Old->getType(), Width));
+  Value *Vector = UndefValue::get(FixedVectorType::get(Old->getType(), Width));
 
   for (int Lane = 0; Lane < Width; Lane++)
     Vector = Builder.CreateInsertElement(
@@ -1042,9 +1042,9 @@ Type *VectorBlockGenerator::getVectorPtrTy(const Value *Val, int Width) {
   assert(PointerTy && "PointerType expected");
 
   Type *ScalarType = PointerTy->getElementType();
-  VectorType *VectorType = VectorType::get(ScalarType, Width);
+  auto *FVTy = FixedVectorType::get(ScalarType, Width);
 
-  return PointerType::getUnqual(VectorType);
+  return PointerType::getUnqual(FVTy);
 }
 
 Value *VectorBlockGenerator::generateStrideOneLoad(
@@ -1093,7 +1093,7 @@ Value *VectorBlockGenerator::generateStrideZeroLoad(
     ScalarLoad->setAlignment(Align(8));
 
   Constant *SplatVector = Constant::getNullValue(
-      VectorType::get(Builder.getInt32Ty(), getVectorWidth()));
+      FixedVectorType::get(Builder.getInt32Ty(), getVectorWidth()));
 
   Value *VectorLoad = Builder.CreateShuffleVector(
       ScalarLoad, ScalarLoad, SplatVector, Load->getName() + "_p_splat");
@@ -1105,10 +1105,10 @@ Value *VectorBlockGenerator::generateUnknownStrideLoad(
     __isl_keep isl_id_to_ast_expr *NewAccesses) {
   int VectorWidth = getVectorWidth();
   auto *Pointer = Load->getPointerOperand();
-  VectorType *VectorType = VectorType::get(
+  auto *FVTy = FixedVectorType::get(
       dyn_cast<PointerType>(Pointer->getType())->getElementType(), VectorWidth);
 
-  Value *Vector = UndefValue::get(VectorType);
+  Value *Vector = UndefValue::get(FVTy);
 
   for (int i = 0; i < VectorWidth; i++) {
     Value *NewPointer = generateLocationAccessed(Stmt, Load, ScalarMaps[i],
@@ -1167,7 +1167,7 @@ void VectorBlockGenerator::copyUnaryInst(ScopStmt &Stmt, UnaryInstruction *Inst,
   assert(isa<CastInst>(Inst) && "Can not generate vector code for instruction");
 
   const CastInst *Cast = dyn_cast<CastInst>(Inst);
-  VectorType *DestType = VectorType::get(Inst->getType(), VectorWidth);
+  auto *DestType = FixedVectorType::get(Inst->getType(), VectorWidth);
   VectorMap[Inst] = Builder.CreateCast(Cast->getOpcode(), NewOperand, DestType);
 }
 
@@ -1277,8 +1277,8 @@ void VectorBlockGenerator::copyInstScalarized(
     return;
 
   // Make the result available as vector value.
-  VectorType *VectorType = VectorType::get(Inst->getType(), VectorWidth);
-  Value *Vector = UndefValue::get(VectorType);
+  auto *FVTy = FixedVectorType::get(Inst->getType(), VectorWidth);
+  Value *Vector = UndefValue::get(FVTy);
 
   for (int i = 0; i < VectorWidth; i++)
     Vector = Builder.CreateInsertElement(Vector, ScalarMaps[i][Inst],
@@ -1344,7 +1344,7 @@ void VectorBlockGenerator::generateScalarVectorLoads(
                                              Address->getName() + "_p_vec_p");
     auto *Val = Builder.CreateLoad(VectorPtr, Address->getName() + ".reload");
     Constant *SplatVector = Constant::getNullValue(
-        VectorType::get(Builder.getInt32Ty(), getVectorWidth()));
+        FixedVectorType::get(Builder.getInt32Ty(), getVectorWidth()));
 
     Value *VectorVal = Builder.CreateShuffleVector(
         Val, Val, SplatVector, Address->getName() + "_p_splat");

From 26604d06b6fe10bb047a3620cf306be8648dcc20 Mon Sep 17 00:00:00 2001
From: Xiangling Liao <Xiangling.Liao@ibm.com>
Date: Fri, 29 May 2020 11:41:10 -0400
Subject: [PATCH 553/770] [AIX] Emit AvailableExternally Linkage on AIX

Our strategy on AIX is to not use -u to suppress undefined symbols, so
we need to emit .extern for symbols with AvailableExternally
linkage.

Differential Revision: https://reviews.llvm.org/D80642
---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |  7 +++-
 .../CodeGen/TargetLoweringObjectFileImpl.cpp  |  8 ++--
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     |  2 +-
 llvm/lib/Target/TargetLoweringObjectFile.cpp  |  2 +-
 .../aix-available-externally-linkage.ll       | 39 +++++++++++++++++++
 5 files changed, 50 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-available-externally-linkage.ll

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 3be48935f2ab8..0aaf5a487c158 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -438,8 +438,13 @@ void AsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
       return;
     }
     LLVM_FALLTHROUGH;
-  case GlobalValue::AppendingLinkage:
   case GlobalValue::AvailableExternallyLinkage:
+    if (MAI->hasDotExternDirective()) {
+      OutStreamer->emitSymbolAttribute(GVSym, MCSA_Extern);
+      return;
+    }
+    LLVM_FALLTHROUGH;
+  case GlobalValue::AppendingLinkage:
     llvm_unreachable("Should never emit this");
   }
   llvm_unreachable("Unknown linkage type!");
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 586de4fd97f0d..68df50f1a87ae 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1983,7 +1983,7 @@ TargetLoweringObjectFileXCOFF::getTargetSymbol(const GlobalValue *GV,
   // function entry point. We choose to always return a function descriptor
   // here.
   if (const GlobalObject *GO = dyn_cast<GlobalObject>(GV)) {
-    if (GO->isDeclaration())
+    if (GO->isDeclarationForLinker())
       return cast<MCSectionXCOFF>(getSectionForExternalReference(GO, TM))
           ->getQualNameSymbol();
 
@@ -2011,7 +2011,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getExplicitSectionGlobal(
 
 MCSection *TargetLoweringObjectFileXCOFF::getSectionForExternalReference(
     const GlobalObject *GO, const TargetMachine &TM) const {
-  assert(GO->isDeclaration() &&
+  assert(GO->isDeclarationForLinker() &&
          "Tried to get ER section for a defined global.");
 
   SmallString<128> Name;
@@ -2133,6 +2133,7 @@ XCOFF::StorageClass TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(
     return XCOFF::C_HIDEXT;
   case GlobalValue::ExternalLinkage:
   case GlobalValue::CommonLinkage:
+  case GlobalValue::AvailableExternallyLinkage:
     return XCOFF::C_EXT;
   case GlobalValue::ExternalWeakLinkage:
   case GlobalValue::LinkOnceAnyLinkage:
@@ -2143,9 +2144,6 @@ XCOFF::StorageClass TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(
   case GlobalValue::AppendingLinkage:
     report_fatal_error(
         "There is no mapping that implements AppendingLinkage for XCOFF.");
-  case GlobalValue::AvailableExternallyLinkage:
-    report_fatal_error("unhandled AvailableExternallyLinkage when mapping "
-                       "linkage to StorageClass");
   }
   llvm_unreachable("Unknown linkage type!");
 }
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 84f870f4a584a..051c800600c5a 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1728,7 +1728,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
 
   auto setCsectAlignment = [this](const GlobalObject *GO) {
     // Declarations have 0 alignment which is set by default.
-    if (GO->isDeclaration())
+    if (GO->isDeclarationForLinker())
       return;
 
     SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM);
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index bd256600c5fec..eea0aeea2c458 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -143,7 +143,7 @@ void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
 /// may be overridden by the target implementation.
 SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
                                                        const TargetMachine &TM){
-  assert(!GO->isDeclaration() && !GO->hasAvailableExternallyLinkage() &&
+  assert(!GO->isDeclarationForLinker() &&
          "Can only be used for global definitions");
 
   // Functions are classified as text sections.
diff --git a/llvm/test/CodeGen/PowerPC/aix-available-externally-linkage.ll b/llvm/test/CodeGen/PowerPC/aix-available-externally-linkage.ll
new file mode 100644
index 0000000000000..a0201f6351502
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-available-externally-linkage.ll
@@ -0,0 +1,39 @@
+; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr4 \
+; RUN:     -mattr=-altivec < %s | \
+; RUN:   FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr4 \
+; RUN:     -mattr=-altivec < %s | \
+; RUN:   FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr4 \
+; RUN:     -mattr=-altivec -filetype=obj -o %t.o < %s
+; RUN: llvm-readobj --symbols %t.o | \
+; RUN:   FileCheck --check-prefix=XCOFF32 %s
+
+; RUN: not --crash llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:     -mcpu=pwr4 -mattr=-altivec -filetype=obj -o %t.o 2>&1 < %s | \
+; RUN:   FileCheck --check-prefix=XCOFF64 %s
+; XCOFF64: LLVM ERROR: 64-bit XCOFF object files are not supported yet.
+
+@_ZN3Foo1aE = available_externally constant i32 -1
+
+; CHECK: .extern  _ZN3Foo1aE[UA]
+
+; XCOFF32:          Index: [[#Index:]]{{.*}}{{[[:space:]] *}}Name: _ZN3Foo1aE
+; XCOFF32-NEXT:     Value (RelocatableAddress): 0x0
+; XCOFF32-NEXT:     Section: N_UNDEF
+; XCOFF32-NEXT:     Type: 0x0
+; XCOFF32-NEXT:     StorageClass: C_EXT (0x2)
+; XCOFF32-NEXT:     NumberOfAuxEntries: 1
+; XCOFF32-NEXT:     CSECT Auxiliary Entry {
+; XCOFF32-NEXT:       Index: [[#Index+1]]
+; XCOFF32-NEXT:       SectionLen: 0
+; XCOFF32-NEXT:       ParameterHashIndex: 0x0
+; XCOFF32-NEXT:       TypeChkSectNum: 0x0
+; XCOFF32-NEXT:       SymbolAlignmentLog2: 0
+; XCOFF32-NEXT:       SymbolType: XTY_ER (0x0)
+; XCOFF32-NEXT:       StorageMappingClass: XMC_UA (0x4)
+; XCOFF32-NEXT:       StabInfoIndex: 0x0
+; XCOFF32-NEXT:       StabSectNum: 0x0
+; XCOFF32-NEXT:     }

From db653ff6b777f9133793c21c48a46912bc1a77df Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 13:13:27 -0400
Subject: [PATCH 554/770] [LoopVectorize] auto-generate complete test checks;
 NFC

---
 .../LoopVectorize/X86/reduction-fastmath.ll   | 219 +++++++++++++++---
 1 file changed, 191 insertions(+), 28 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index 1146e31ec2588..9063c5116cca6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -1,10 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define float @reduction_sum_float_ieee(i32 %n, float* %array) {
-; CHECK-LABEL: define float @reduction_sum_float_ieee(
+; CHECK-LABEL: @reduction_sum_float_ieee(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
+; CHECK-NEXT:    br i1 [[ENTRY_COND]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_EXIT:%.*]]
+; CHECK:       loop.preheader:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ 0.000000e+00, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[SUM_INC]] = fadd float [[SUM]], [[VALUE]]
+; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
+; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       loop.exit.loopexit:
+; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop.exit:
+; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[SUM_LCSSA]]
+;
 entry:
   %entry.cond = icmp ne i32 0, 4096
   br i1 %entry.cond, label %loop, label %loop.exit
@@ -21,18 +43,66 @@ loop:
 
 loop.exit:
   %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
-; CHECK-NOT: %wide.load = load <4 x float>, <4 x float>*
-; CHECK: ret float %sum.lcssa
   ret float %sum.lcssa
 }
 
 define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
-; CHECK-LABEL: define float @reduction_sum_float_fastmath(
-; CHECK: fadd fast <4 x float>
-; CHECK: fadd fast <4 x float>
-; CHECK: fadd fast <4 x float>
-; CHECK: fadd fast <4 x float>
-; CHECK: fadd fast <4 x float>
+; CHECK-LABEL: @reduction_sum_float_fastmath(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
+; CHECK-NEXT:    br i1 [[ENTRY_COND]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_EXIT:%.*]]
+; CHECK:       loop.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP9]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <4 x float> [[BIN_RDX3]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[BIN_RDX3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[BIN_RDX5]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]]
+; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
+; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop !2
+; CHECK:       loop.exit.loopexit:
+; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop.exit:
+; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[SUM_LCSSA]]
+;
 entry:
   %entry.cond = icmp ne i32 0, 4096
   br i1 %entry.cond, label %loop, label %loop.exit
@@ -49,19 +119,66 @@ loop:
 
 loop.exit:
   %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
-; CHECK: ret float %sum.lcssa
   ret float %sum.lcssa
 }
 
 define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
-; CHECK-LABEL: define float @reduction_sum_float_only_reassoc(
-; CHECK-NOT: fadd fast
-; CHECK: fadd reassoc <4 x float>
-; CHECK: fadd reassoc <4 x float>
-; CHECK: fadd reassoc <4 x float>
-; CHECK: fadd reassoc <4 x float>
-; CHECK: fadd reassoc <4 x float>
-
+; CHECK-LABEL: @reduction_sum_float_only_reassoc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
+; CHECK-NEXT:    br i1 [[ENTRY_COND]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_EXIT:%.*]]
+; CHECK:       loop.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP9]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd reassoc <4 x float> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <4 x float> [[BIN_RDX3]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = fadd reassoc <4 x float> [[BIN_RDX3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[BIN_RDX5]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[SUM_INC]] = fadd reassoc float [[SUM]], [[VALUE]]
+; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
+; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop !5
+; CHECK:       loop.exit.loopexit:
+; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop.exit:
+; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[SUM_LCSSA]]
+;
 entry:
   %entry.cond = icmp ne i32 0, 4096
   br i1 %entry.cond, label %loop, label %loop.exit
@@ -78,19 +195,66 @@ loop:
 
 loop.exit:
   %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
-; CHECK: ret float %sum.lcssa
   ret float %sum.lcssa
 }
 
 define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %array) {
-; CHECK-LABEL: define float @reduction_sum_float_only_reassoc_and_contract(
-; CHECK-NOT: fadd fast
-; CHECK: fadd reassoc contract <4 x float>
-; CHECK: fadd reassoc contract <4 x float>
-; CHECK: fadd reassoc contract <4 x float>
-; CHECK: fadd reassoc contract <4 x float>
-; CHECK: fadd reassoc contract <4 x float>
-
+; CHECK-LABEL: @reduction_sum_float_only_reassoc_and_contract(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ENTRY_COND:%.*]] = icmp ne i32 0, 4096
+; CHECK-NEXT:    br i1 [[ENTRY_COND]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_EXIT:%.*]]
+; CHECK:       loop.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, float* [[ARRAY:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, float* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP9]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd reassoc contract <4 x float> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <4 x float> [[BIN_RDX3]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = fadd reassoc contract <4 x float> [[BIN_RDX3]], [[RDX_SHUF4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[BIN_RDX5]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4096, 4096
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ADDRESS:%.*]] = getelementptr float, float* [[ARRAY]], i32 [[IDX]]
+; CHECK-NEXT:    [[VALUE:%.*]] = load float, float* [[ADDRESS]], align 4
+; CHECK-NEXT:    [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]]
+; CHECK-NEXT:    [[IDX_INC]] = add i32 [[IDX]], 1
+; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
+; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop !7
+; CHECK:       loop.exit.loopexit:
+; CHECK-NEXT:    [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop.exit:
+; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret float [[SUM_LCSSA]]
+;
 entry:
   %entry.cond = icmp ne i32 0, 4096
   br i1 %entry.cond, label %loop, label %loop.exit
@@ -107,6 +271,5 @@ loop:
 
 loop.exit:
   %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
-; CHECK: ret float %sum.lcssa
   ret float %sum.lcssa
 }

From dfc8244c24631169630399a640ab526acd678346 Mon Sep 17 00:00:00 2001
From: Ehud Katz <ehudkatz@gmail.com>
Date: Fri, 29 May 2020 20:12:44 +0300
Subject: [PATCH 555/770] [PrintSCC] Fix printing a basic-block without a name

Print a basic-block as an operand to handle the case where it has no
name.

Differential Revision: https://reviews.llvm.org/D80552
---
 llvm/test/Other/print-cfg-sccs.ll | 27 +++++++++++++++++++++++++++
 llvm/tools/opt/PrintSCC.cpp       |  7 ++++---
 2 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Other/print-cfg-sccs.ll

diff --git a/llvm/test/Other/print-cfg-sccs.ll b/llvm/test/Other/print-cfg-sccs.ll
new file mode 100644
index 0000000000000..43e885476bca8
--- /dev/null
+++ b/llvm/test/Other/print-cfg-sccs.ll
@@ -0,0 +1,27 @@
+; RUN: opt -print-cfg-sccs -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: SCCs for Function test in PostOrder:
+; CHECK-NEXT: SCC #1 : %exit,
+; CHECK-NEXT: SCC #2 : %0,
+; CHECK-NEXT: SCC #3 : %3,
+; CHECK-NEXT: SCC #4 : %2, %1,
+; CHECK-NEXT: SCC #5 : %entry,
+define void @test(i1 %cond) {
+entry:
+  br i1 %cond, label %0, label %1
+
+0:
+  br label %exit
+
+1:
+  br label %2
+
+2:
+  br i1 %cond, label %1, label %3
+
+3:
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/tools/opt/PrintSCC.cpp b/llvm/tools/opt/PrintSCC.cpp
index 5ab4a00552f39..1ca52745ff400 100644
--- a/llvm/tools/opt/PrintSCC.cpp
+++ b/llvm/tools/opt/PrintSCC.cpp
@@ -76,9 +76,10 @@ bool CFGSCC::runOnFunction(Function &F) {
   for (scc_iterator<Function*> SCCI = scc_begin(&F); !SCCI.isAtEnd(); ++SCCI) {
     const std::vector<BasicBlock *> &nextSCC = *SCCI;
     errs() << "\nSCC #" << ++sccNum << " : ";
-    for (std::vector<BasicBlock*>::const_iterator I = nextSCC.begin(),
-           E = nextSCC.end(); I != E; ++I)
-      errs() << (*I)->getName() << ", ";
+    for (BasicBlock *BB : nextSCC) {
+      BB->printAsOperand(errs(), false);
+      errs() << ", ";
+    }
     if (nextSCC.size() == 1 && SCCI.hasCycle())
       errs() << " (Has self-loop).";
   }

From dbb5979d158cd7c49fdb31a03a4a73dfb402cf66 Mon Sep 17 00:00:00 2001
From: Anchu Rajendran <asudhaku@amd.com>
Date: Thu, 28 May 2020 11:16:04 +0530
Subject: [PATCH 556/770] [MLIR][OpenMP] Defined master operation in OpenMP
 Dialect

Summary:
Implemented the basic changes for defining the master operation in the
OpenMP dialect. It uses the generic parser and printer.

Reviewed By: kiranchandramohan, ftynse

Differential Revision: https://reviews.llvm.org/D80689
---
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 18 +++++++++++++++---
 mlir/test/Dialect/OpenMP/ops.mlir             | 11 ++++++++++-
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 27b2110bf71ed..78b56cac13534 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -94,7 +94,7 @@ def ParallelOp : OpenMP_Op<"parallel", [AttrSizedOperandSegments]> {
 }
 
 def TerminatorOp : OpenMP_Op<"terminator", [Terminator]> {
-  let summary = "terminator for OpenMP regions.";
+  let summary = "terminator for OpenMP regions";
   let description = [{
     A terminator operation for regions that appear in the body of OpenMP
     operation.  These regions are not expected to return any value so the
@@ -102,8 +102,7 @@ def TerminatorOp : OpenMP_Op<"terminator", [Terminator]> {
     enclosing op.
   }];
 
-  let parser = [{ return success(); }];
-  let printer = [{ p << getOperationName(); }];
+  let assemblyFormat = "attr-dict";
 }
 
 //===----------------------------------------------------------------------===//
@@ -137,6 +136,19 @@ def FlushOp : OpenMP_Op<"flush"> {
   let assemblyFormat = "attr-dict ($varList^ `:` type($varList))?";
 }
 
+//===----------------------------------------------------------------------===//
+// 2.16 master Construct
+//===----------------------------------------------------------------------===//
+def MasterOp : OpenMP_Op<"master"> {
+  let summary = "master construct";
+  let description = [{
+    The master construct specifies a structured block that is executed by
+    the master thread of the team.
+  }];
+
+  let regions = (region AnyRegion:$region);
+}
+
 //===----------------------------------------------------------------------===//
 // 2.17.2 barrier Construct
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index bffc82417761e..e780cebd93fa3 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -6,6 +6,15 @@ func @omp_barrier() -> () {
   return
 }
 
+func @omp_master() -> () {
+  // CHECK: omp.master
+  "omp.master" ()({
+    // CHECK: omp.terminator
+    omp.terminator
+  }):()->()
+  return
+}
+
 func @omp_taskwait() -> () {
   // CHECK: omp.taskwait
   omp.taskwait
@@ -42,7 +51,7 @@ func @omp_terminator() -> () {
 }
 
 func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : si32) -> () {
-  // CHECK: omp_parallel
+  // CHECK: omp.parallel
   "omp.parallel" (%if_cond, %num_threads, %data_var, %data_var, %data_var, %data_var) ({
 
   // test without if condition

From 87e4ad4d5ce1a231fae257faaada8badcfc22d43 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Fri, 29 May 2020 00:15:09 -0700
Subject: [PATCH 557/770] [X86] Remove isel pattern for
 MMX_X86movdq2q+simple_load. Replace with DAG combine to loadmmx.

Only 64 bits will be loaded, not the whole 128 bits. We can
just combine it to plain mmx load. This has the side effect of
enabling isel load folding for it.

This is part of my desire to get rid of isel patterns that shrink loads.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++++++++++++
 llvm/lib/Target/X86/X86InstrMMX.td      |  3 ---
 llvm/test/CodeGen/X86/mmx-fold-load.ll  | 31 +++++++++++++++++++++++--
 3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8ec958338c024..a1121600346f2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47736,6 +47736,27 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
   return DAG.getBitcast(VT, Cvt);
 }
 
+static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
+  SDValue Src = N->getOperand(0);
+
+  // Turn MOVDQ2Q+simple_load into an mmx load.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
+
+    if (LN->isSimple()) {
+      SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
+                                  LN->getBasePtr(),
+                                  LN->getPointerInfo(),
+                                  LN->getOriginalAlign(),
+                                  LN->getMemOperand()->getFlags());
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
+      return NewLd;
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -47898,6 +47919,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
   case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+  case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 2880be6cb8f38..83eddaa05f4ae 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -568,9 +568,6 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
 def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
           (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
 
-def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
-          (x86mmx (MMX_MOVQ64rm addr:$src))>;
-
 def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
                              (i64 (bitconvert (x86mmx VR64:$src)))))),
           (MMX_MOVQ2DQrr VR64:$src)>;
diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index 5ad2d50c1d133..5de16c0ca082e 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
 
 define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
 ; X86-LABEL: t0:
@@ -616,3 +616,30 @@ entry:
 
 declare void @llvm.lifetime.start(i64, i8* nocapture)
 declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+; Make sure we shrink this vector load and fold it.
+define x86_mmx @vec_load(<4 x float>* %x) {
+; X86-LABEL: vec_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
+; X86-NEXT:    paddsb %mm0, %mm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: vec_load:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
+; X64-NEXT:    paddsb %mm0, %mm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    retq
+  %z = load <4 x float>, <4 x float>* %x
+  %y = extractelement <4 x float> %z, i32 0
+  %a = insertelement <2 x float> undef, float %y, i32 0
+  %b = insertelement <2 x float> %a, float %y, i32 1
+  %c = bitcast <2 x float> %b to x86_mmx
+  %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
+  ret x86_mmx %d
+}
+
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
+

From b637519eef01d48301620b6537c52ec72978f553 Mon Sep 17 00:00:00 2001
From: Anna Bulanova <anna.bulanova@huawei.com>
Date: Fri, 29 May 2020 13:09:52 -0400
Subject: [PATCH 558/770] [SVE] Replace deprecated call in
 changeVectorElementTypeToInteger

Summary:
Replace getVectorNumElements with getVectorElementCount;
gets rid of the warnings in several tests

Reviewers: sdesmalen, kmclaughlin, dancgr, efriedma, each, andwar, rengolin

Reviewed By: efriedma

Subscribers: tschuett, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80746
---
 llvm/include/llvm/CodeGen/ValueTypes.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h
index c6f8a813ca333..e4d8a04a3340b 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.h
+++ b/llvm/include/llvm/CodeGen/ValueTypes.h
@@ -97,8 +97,7 @@ namespace llvm {
       MVT EltTy = getSimpleVT().getVectorElementType();
       unsigned BitWidth = EltTy.getSizeInBits();
       MVT IntTy = MVT::getIntegerVT(BitWidth);
-      MVT VecTy = MVT::getVectorVT(IntTy, getVectorNumElements(),
-                                   isScalableVector());
+      MVT VecTy = MVT::getVectorVT(IntTy, getVectorElementCount());
       assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
              "Simple vector VT not representable by simple integer vector VT!");
       return VecTy;

From 5c7aca6a4c8c93f1af3cdac676151f8d7e28a410 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Fri, 29 May 2020 10:38:21 -0700
Subject: [PATCH 559/770] [X86] Ignore large code model in
 X86FastISel::X86MaterializeFP in 32-bit mode

Large code model doesn't mean anything to 32-bit mode. But nothing
prevents it from being set. Ignore it to avoid generating 64-bit mode
only instructions.

Differential Revision: https://reviews.llvm.org/D80768
---
 llvm/lib/Target/X86/X86FastISel.cpp          |  3 +-
 llvm/test/CodeGen/X86/fast-isel-constpool.ll | 65 +++++++++++---------
 2 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 5a51e249f5cec..c5ae67fe081dc 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3787,7 +3787,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
   unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
 
-  if (CM == CodeModel::Large) {
+  // Large code model only applies to 64-bit mode.
+  if (Subtarget->is64Bit() && CM == CodeModel::Large) {
     unsigned AddrReg = createResultReg(&X86::GR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
             AddrReg)
diff --git a/llvm/test/CodeGen/X86/fast-isel-constpool.ll b/llvm/test/CodeGen/X86/fast-isel-constpool.ll
index 706674c584baf..f1aacc7ceab4c 100644
--- a/llvm/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-constpool.ll
@@ -6,9 +6,10 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX
 ; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large -mattr=avx512f < %s | FileCheck %s --check-prefix=LARGE_AVX
 
-; This large code mode shouldn't mean anything on x86 but it currently
-; generates 64-bit only instructions and will assert in the encoder.
-; RUN: llc -mtriple=i686-apple-darwin -fast-isel -code-model=large -mattr=sse2 < %s | FileCheck %s --check-prefix=X86-LARGE
+; This large code mode shouldn't mean anything on x86 but it used to
+; generate 64-bit only instructions and asserted in the encoder.
+; -show-mc-encoding here to assert if this breaks again.
+; RUN: llc -mtriple=i686-apple-darwin -fast-isel -code-model=large -mattr=sse2 -show-mc-encoding < %s | FileCheck %s --check-prefix=X86-LARGE
 
 ; Make sure fast isel uses rip-relative addressing for the small code model.
 define float @constpool_float(float %x) {
@@ -38,14 +39,16 @@ define float @constpool_float(float %x) {
 ;
 ; X86-LARGE-LABEL: constpool_float:
 ; X86-LARGE:       ## %bb.0:
-; X86-LARGE-NEXT:    pushl %eax
+; X86-LARGE-NEXT:    pushl %eax ## encoding: [0x50]
 ; X86-LARGE-NEXT:    .cfi_def_cfa_offset 8
-; X86-LARGE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-LARGE-NEXT:    addss LCPI0_0, %xmm0
-; X86-LARGE-NEXT:    movss %xmm0, (%esp)
-; X86-LARGE-NEXT:    flds (%esp)
-; X86-LARGE-NEXT:    popl %eax
-; X86-LARGE-NEXT:    retl
+; X86-LARGE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
+; X86-LARGE-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; X86-LARGE-NEXT:    addss LCPI0_0, %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
+; X86-LARGE-NEXT:    ## fixup A - offset: 4, value: LCPI0_0, kind: FK_Data_4
+; X86-LARGE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
+; X86-LARGE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
+; X86-LARGE-NEXT:    popl %eax ## encoding: [0x58]
+; X86-LARGE-NEXT:    retl ## encoding: [0xc3]
 
   %1 = fadd float %x, 16.50e+01
   ret float %1
@@ -78,13 +81,15 @@ define double @constpool_double(double %x) nounwind {
 ;
 ; X86-LARGE-LABEL: constpool_double:
 ; X86-LARGE:       ## %bb.0:
-; X86-LARGE-NEXT:    subl $12, %esp
-; X86-LARGE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-LARGE-NEXT:    addsd LCPI1_0, %xmm0
-; X86-LARGE-NEXT:    movsd %xmm0, (%esp)
-; X86-LARGE-NEXT:    fldl (%esp)
-; X86-LARGE-NEXT:    addl $12, %esp
-; X86-LARGE-NEXT:    retl
+; X86-LARGE-NEXT:    subl $12, %esp ## encoding: [0x83,0xec,0x0c]
+; X86-LARGE-NEXT:    movsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xf2,0x0f,0x10,0x44,0x24,0x10]
+; X86-LARGE-NEXT:    ## xmm0 = mem[0],zero
+; X86-LARGE-NEXT:    addsd LCPI1_0, %xmm0 ## encoding: [0xf2,0x0f,0x58,0x05,A,A,A,A]
+; X86-LARGE-NEXT:    ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4
+; X86-LARGE-NEXT:    movsd %xmm0, (%esp) ## encoding: [0xf2,0x0f,0x11,0x04,0x24]
+; X86-LARGE-NEXT:    fldl (%esp) ## encoding: [0xdd,0x04,0x24]
+; X86-LARGE-NEXT:    addl $12, %esp ## encoding: [0x83,0xc4,0x0c]
+; X86-LARGE-NEXT:    retl ## encoding: [0xc3]
 
   %1 = fadd double %x, 8.500000e-01
   ret double %1
@@ -123,12 +128,13 @@ define void @constpool_float_no_fp_args(float* %x) nounwind {
 ;
 ; X86-LARGE-LABEL: constpool_float_no_fp_args:
 ; X86-LARGE:       ## %bb.0:
-; X86-LARGE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-LARGE-NEXT:    movabsq $LCPI2_0, %rcx
-; X86-LARGE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-LARGE-NEXT:    addss (%eax), %xmm0
-; X86-LARGE-NEXT:    movss %xmm0, (%eax)
-; X86-LARGE-NEXT:    retl
+; X86-LARGE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-LARGE-NEXT:    movss LCPI2_0, %xmm0 ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
+; X86-LARGE-NEXT:    ## fixup A - offset: 4, value: LCPI2_0, kind: FK_Data_4
+; X86-LARGE-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; X86-LARGE-NEXT:    addss (%eax), %xmm0 ## encoding: [0xf3,0x0f,0x58,0x00]
+; X86-LARGE-NEXT:    movss %xmm0, (%eax) ## encoding: [0xf3,0x0f,0x11,0x00]
+; X86-LARGE-NEXT:    retl ## encoding: [0xc3]
   %a = load float, float* %x
   %b = fadd float %a, 16.50e+01
   store float %b, float* %x
@@ -168,12 +174,13 @@ define void @constpool_double_no_fp_args(double* %x) nounwind {
 ;
 ; X86-LARGE-LABEL: constpool_double_no_fp_args:
 ; X86-LARGE:       ## %bb.0:
-; X86-LARGE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-LARGE-NEXT:    movabsq $LCPI3_0, %rcx
-; X86-LARGE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-LARGE-NEXT:    addsd (%eax), %xmm0
-; X86-LARGE-NEXT:    movsd %xmm0, (%eax)
-; X86-LARGE-NEXT:    retl
+; X86-LARGE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-LARGE-NEXT:    movsd LCPI3_0, %xmm0 ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X86-LARGE-NEXT:    ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4
+; X86-LARGE-NEXT:    ## xmm0 = mem[0],zero
+; X86-LARGE-NEXT:    addsd (%eax), %xmm0 ## encoding: [0xf2,0x0f,0x58,0x00]
+; X86-LARGE-NEXT:    movsd %xmm0, (%eax) ## encoding: [0xf2,0x0f,0x11,0x00]
+; X86-LARGE-NEXT:    retl ## encoding: [0xc3]
   %a = load double, double* %x
   %b = fadd double %a, 8.500000e-01
   store double %b, double* %x

From 61412b762df79328fa29dafdd1f8cc35792693ec Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 29 May 2020 13:44:52 -0400
Subject: [PATCH 560/770] [SLP] auto-generate complete test checks; NFC

---
 .../SLPVectorizer/X86/scheduling.ll           | 52 ++++++++++++++++---
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
index 8395401c5df0b..9f3db14d9071a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -3,17 +3,53 @@
 
 define i32 @foo(i32* nocapture readonly %diff) #0 {
 ; CHECK-LABEL: @foo(
-; CHECK: load <4 x i32>
-; CHECK: load <4 x i32>
-; CHECK: [[S1:%.+]] = add nsw <4 x i32>
-; CHECK: store <4 x i32> [[S1]],
-; CHECK:         [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[S1]], [[RDX_SHUF]]
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[M2:%.*]] = alloca [8 x [8 x i32]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [8 x [8 x i32]]* [[M2]] to i8*
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP1]], 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[TMP1]], 1
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP1]], 5
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP1]], 2
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP1]], 6
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP1]], 3
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP1]], 7
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; CHECK:         [[ADD52:%.*]] = add nsw i32 [[TMP15]],
-; CHECK:          ret i32 [[ADD52]]
+; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 0
+; CHECK-NEXT:    call void @ff([8 x i32]* [[ARRAYDECAY]])
+; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
 ;
 entry:
   %m2 = alloca [8 x [8 x i32]], align 16

From 81443ac1bc710c89565ea1bce0eb566bf2cacd0d Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Thu, 28 May 2020 18:39:27 -0700
Subject: [PATCH 561/770] [WebAssembly] Add placeholders for
 R_WASM_TABLE_INDEX_REL_SLEB relocations

Previously in the object format we punted on this and simply wrote
zeros (and didn't include the function in the elem segment).  With
this change we write a meaningful value which is the segment
relative table index of the associated function.

This matches the value that wasm-ld produces in `-r` mode.  This inconsistency
between the output of the MC object writer and the wasm-ld object
writer could cause warnings to be emitted when reading back in the
output of `wasm-ld -r`.  See:
https://github.com/emscripten-core/emscripten/issues/11217

This only applies to this one relocation type which is only generated
when compiling in PIC mode.

Differential Revision: https://reviews.llvm.org/D80774
---
 lld/wasm/InputFiles.cpp              | 9 +++++++--
 lld/wasm/InputFiles.h                | 1 +
 llvm/lib/MC/WasmObjectWriter.cpp     | 8 ++++++--
 llvm/test/MC/WebAssembly/reloc-pic.s | 8 +++++++-
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp
index b7d90feb58b3d..7390575242fca 100644
--- a/lld/wasm/InputFiles.cpp
+++ b/lld/wasm/InputFiles.cpp
@@ -122,11 +122,14 @@ uint32_t ObjFile::calcNewAddend(const WasmRelocation &reloc) const {
 uint32_t ObjFile::calcExpectedValue(const WasmRelocation &reloc) const {
   switch (reloc.Type) {
   case R_WASM_TABLE_INDEX_I32:
-  case R_WASM_TABLE_INDEX_SLEB:
-  case R_WASM_TABLE_INDEX_REL_SLEB: {
+  case R_WASM_TABLE_INDEX_SLEB: {
     const WasmSymbol &sym = wasmObj->syms()[reloc.Index];
     return tableEntries[sym.Info.ElementIndex];
   }
+  case R_WASM_TABLE_INDEX_REL_SLEB: {
+    const WasmSymbol &sym = wasmObj->syms()[reloc.Index];
+    return tableEntriesRel[sym.Info.ElementIndex];
+  }
   case R_WASM_MEMORY_ADDR_SLEB:
   case R_WASM_MEMORY_ADDR_I32:
   case R_WASM_MEMORY_ADDR_LEB:
@@ -266,6 +269,7 @@ void ObjFile::parse(bool ignoreComdats) {
   // verifying the existing table index relocations
   uint32_t totalFunctions =
       wasmObj->getNumImportedFunctions() + wasmObj->functions().size();
+  tableEntriesRel.resize(totalFunctions);
   tableEntries.resize(totalFunctions);
   for (const WasmElemSegment &seg : wasmObj->elements()) {
     if (seg.Offset.Opcode != WASM_OPCODE_I32_CONST)
@@ -274,6 +278,7 @@ void ObjFile::parse(bool ignoreComdats) {
     for (uint32_t index = 0; index < seg.Functions.size(); index++) {
 
       uint32_t functionIndex = seg.Functions[index];
+      tableEntriesRel[functionIndex] = index;
       tableEntries[functionIndex] = offset + index;
     }
   }
diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h
index bf4a4ec99abf5..661aa8963e540 100644
--- a/lld/wasm/InputFiles.h
+++ b/lld/wasm/InputFiles.h
@@ -118,6 +118,7 @@ class ObjFile : public InputFile {
   std::vector<bool> typeIsUsed;
   // Maps function indices to table indices
   std::vector<uint32_t> tableEntries;
+  std::vector<uint32_t> tableEntriesRel;
   std::vector<bool> keptComdats;
   std::vector<InputSegment *> segments;
   std::vector<InputFunction *> functions;
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 0c09f99a78973..c6a27898d1f32 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -565,7 +565,10 @@ WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) {
     // Provisional value is table address of the resolved symbol itself
     const MCSymbolWasm *Sym = resolveSymbol(*RelEntry.Symbol);
     assert(Sym->isFunction());
-    return TableIndices[Sym];
+    if (RelEntry.Type == wasm::R_WASM_TABLE_INDEX_REL_SLEB)
+      return TableIndices[Sym] - InitialTableOffset;
+    else
+      return TableIndices[Sym];
   }
   case wasm::R_WASM_TYPE_INDEX_LEB:
     // Provisional value is same as the index
@@ -1559,7 +1562,8 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
       // purely to make the object file's provisional values readable, and is
       // ignored by the linker, which re-calculates the relocations itself.
       if (Rel.Type != wasm::R_WASM_TABLE_INDEX_I32 &&
-          Rel.Type != wasm::R_WASM_TABLE_INDEX_SLEB)
+          Rel.Type != wasm::R_WASM_TABLE_INDEX_SLEB &&
+          Rel.Type != wasm::R_WASM_TABLE_INDEX_REL_SLEB)
         return;
       assert(Rel.Symbol->isFunction());
       const MCSymbolWasm &WS = *resolveSymbol(*Rel.Symbol);
diff --git a/llvm/test/MC/WebAssembly/reloc-pic.s b/llvm/test/MC/WebAssembly/reloc-pic.s
index 626f8d993e3c6..4732b7ee73141 100644
--- a/llvm/test/MC/WebAssembly/reloc-pic.s
+++ b/llvm/test/MC/WebAssembly/reloc-pic.s
@@ -68,7 +68,7 @@ hidden_func:
 # CHECK-NEXT:         Table:
 # CHECK-NEXT:           ElemType:        FUNCREF
 # CHECK-NEXT:           Limits:
-# CHECK-NEXT:             Initial:         0x00000000
+# CHECK-NEXT:             Initial:         0x00000001
 # CHECK-NEXT:       - Module:          env
 # CHECK-NEXT:         Field:           default_func
 # CHECK-NEXT:         Kind:            FUNCTION
@@ -85,6 +85,12 @@ hidden_func:
 # CHECK-NEXT:         GlobalMutable:   true
 # CHECK-NEXT:   - Type:            FUNCTION
 # CHECK-NEXT:     FunctionTypes:   [ 0, 0, 0, 0, 0 ]
+# CHECK-NEXT:   - Type:            ELEM
+# CHECK-NEXT:     Segments:
+# CHECK-NEXT:        Offset:
+# CHECK-NEXT:          Opcode:          I32_CONST
+# CHECK-NEXT:          Value:           1
+# CHECK-NEXT:        Functions:       [ 5 ]
 # CHECK-NEXT:   - Type:            DATACOUNT
 # CHECK-NEXT:     Count:           1
 # CHECK-NEXT:   - Type:            CODE

From 3a574a6cb35953e538e577a88f62af8dd01432c7 Mon Sep 17 00:00:00 2001
From: Eduardo Caldas <ecaldas@google.com>
Date: Fri, 29 May 2020 20:03:58 +0200
Subject: [PATCH 562/770] Add support for Overloaded Binary Operators in
 SyntaxTree

Reviewers: gribozavr2

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80812
---
 clang/lib/Tooling/Syntax/BuildTree.cpp      |  18 +++
 clang/unittests/Tooling/Syntax/TreeTest.cpp | 128 ++++++++++++++++++++
 2 files changed, 146 insertions(+)

diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp
index 60c6b3f88509d..2b312cdde1d6b 100644
--- a/clang/lib/Tooling/Syntax/BuildTree.cpp
+++ b/clang/lib/Tooling/Syntax/BuildTree.cpp
@@ -640,6 +640,24 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
     return true;
   }
 
+  bool WalkUpFromCXXOperatorCallExpr(CXXOperatorCallExpr *S) {
+    if (S->isInfixBinaryOp()) {
+      Builder.markExprChild(
+          S->getArg(0),
+          syntax::NodeRole::BinaryOperatorExpression_leftHandSide);
+      Builder.markChildToken(
+          S->getOperatorLoc(),
+          syntax::NodeRole::BinaryOperatorExpression_operatorToken);
+      Builder.markExprChild(
+          S->getArg(1),
+          syntax::NodeRole::BinaryOperatorExpression_rightHandSide);
+      Builder.foldNode(Builder.getExprRange(S),
+                       new (allocator()) syntax::BinaryOperatorExpression, S);
+      return true;
+    }
+    return RecursiveASTVisitor::WalkUpFromCXXOperatorCallExpr(S);
+  }
+
   bool WalkUpFromNamespaceDecl(NamespaceDecl *S) {
     auto Tokens = Builder.getDeclarationRange(S);
     if (Tokens.front().kind() == tok::coloncolon) {
diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp
index 7051074d3b33a..04786257c434f 100644
--- a/clang/unittests/Tooling/Syntax/TreeTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp
@@ -993,6 +993,134 @@ void test(int a, int b) {
 )txt");
 }
 
+TEST_F(SyntaxTreeTest, UserDefinedBinaryOperator) {
+  expectTreeDumpEqual(
+      R"cpp(
+struct X {
+  X& operator=(const X&);
+  friend X operator+(X, const X&);
+  friend bool operator<(const X&, const X&);
+};
+void test(X x, X y) {
+  x = y;
+  x + y;
+  x < y;
+}
+      )cpp",
+      R"txt(
+*: TranslationUnit
+|-SimpleDeclaration
+| |-struct
+| |-X
+| |-{
+| |-SimpleDeclaration
+| | |-X
+| | |-SimpleDeclarator
+| | | |-&
+| | | |-operator
+| | | |-=
+| | | `-ParametersAndQualifiers
+| | |   |-(
+| | |   |-SimpleDeclaration
+| | |   | |-const
+| | |   | |-X
+| | |   | `-SimpleDeclarator
+| | |   |   `-&
+| | |   `-)
+| | `-;
+| |-UnknownDeclaration
+| | `-SimpleDeclaration
+| |   |-friend
+| |   |-X
+| |   |-SimpleDeclarator
+| |   | |-operator
+| |   | |-+
+| |   | `-ParametersAndQualifiers
+| |   |   |-(
+| |   |   |-SimpleDeclaration
+| |   |   | `-X
+| |   |   |-,
+| |   |   |-SimpleDeclaration
+| |   |   | |-const
+| |   |   | |-X
+| |   |   | `-SimpleDeclarator
+| |   |   |   `-&
+| |   |   `-)
+| |   `-;
+| |-UnknownDeclaration
+| | `-SimpleDeclaration
+| |   |-friend
+| |   |-bool
+| |   |-SimpleDeclarator
+| |   | |-operator
+| |   | |-<
+| |   | `-ParametersAndQualifiers
+| |   |   |-(
+| |   |   |-SimpleDeclaration
+| |   |   | |-const
+| |   |   | |-X
+| |   |   | `-SimpleDeclarator
+| |   |   |   `-&
+| |   |   |-,
+| |   |   |-SimpleDeclaration
+| |   |   | |-const
+| |   |   | |-X
+| |   |   | `-SimpleDeclarator
+| |   |   |   `-&
+| |   |   `-)
+| |   `-;
+| |-}
+| `-;
+`-SimpleDeclaration
+  |-void
+  |-SimpleDeclarator
+  | |-test
+  | `-ParametersAndQualifiers
+  |   |-(
+  |   |-SimpleDeclaration
+  |   | |-X
+  |   | `-SimpleDeclarator
+  |   |   `-x
+  |   |-,
+  |   |-SimpleDeclaration
+  |   | |-X
+  |   | `-SimpleDeclarator
+  |   |   `-y
+  |   `-)
+  `-CompoundStatement
+    |-{
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-x
+    | | |-UnknownExpression
+    | | | `-=
+    | | `-UnknownExpression
+    | |   `-y
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-UnknownExpression
+    | | |   `-x
+    | | |-UnknownExpression
+    | | | `-+
+    | | `-UnknownExpression
+    | |   `-y
+    | `-;
+    |-ExpressionStatement
+    | |-BinaryOperatorExpression
+    | | |-UnknownExpression
+    | | | `-x
+    | | |-UnknownExpression
+    | | | `-<
+    | | `-UnknownExpression
+    | |   `-y
+    | `-;
+    `-}
+)txt");
+}
+
 TEST_F(SyntaxTreeTest, MultipleDeclaratorsGrouping) {
   expectTreeDumpEqual(
       R"cpp(

From c710bb44a6b4b367b506ca2ef0f1d2af5a92feef Mon Sep 17 00:00:00 2001
From: Ehud Katz <ehudkatz@gmail.com>
Date: Fri, 29 May 2020 21:07:48 +0300
Subject: [PATCH 563/770] [Local] Prevent `invertCondition` from creating a
 redundant instruction

Prevent `invertCondition` from creating the inversion instruction, in
case the given value is an argument which has already been inverted.
Note that this approach has already been taken in case the given value
is an instruction (and not an argument).

Differential Revision: https://reviews.llvm.org/D80399
---
 llvm/lib/Transforms/Utils/Local.cpp           | 49 +++++------
 .../Transforms/StructurizeCFG/bug36015.ll     | 14 +--
 .../StructurizeCFG/loop-multiple-exits.ll     |  4 +-
 .../one-loop-multiple-backedges.ll            | 14 +--
 .../post-order-traversal-bug.ll               |  4 +-
 .../workarounds/needs-fr-ule.ll               | 86 +++++++++----------
 6 files changed, 83 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 545413c1fe035..f0df08251a019 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3053,31 +3053,26 @@ Value *llvm::invertCondition(Value *Condition) {
   if (match(Condition, m_Not(m_Value(NotCondition))))
     return NotCondition;
 
-  if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
-    // Third: Check all the users for an invert
-    BasicBlock *Parent = Inst->getParent();
-    for (User *U : Condition->users())
-      if (Instruction *I = dyn_cast<Instruction>(U))
-        if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
-          return I;
-
-    // Last option: Create a new instruction
-    auto Inverted = BinaryOperator::CreateNot(Inst, "");
-    if (isa<PHINode>(Inst)) {
-      // FIXME: This fails if the inversion is to be used in a
-      // subsequent PHINode in the same basic block.
-      Inverted->insertBefore(&*Parent->getFirstInsertionPt());
-    } else {
-      Inverted->insertAfter(Inst);
-    }
-    return Inverted;
-  }
-
-  if (Argument *Arg = dyn_cast<Argument>(Condition)) {
-    BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
-    return BinaryOperator::CreateNot(Condition, Arg->getName() + ".inv",
-                                     &*EntryBlock.getFirstInsertionPt());
-  }
-
-  llvm_unreachable("Unhandled condition to invert");
+  BasicBlock *Parent = nullptr;
+  Instruction *Inst = dyn_cast<Instruction>(Condition);
+  if (Inst)
+    Parent = Inst->getParent();
+  else if (Argument *Arg = dyn_cast<Argument>(Condition))
+    Parent = &Arg->getParent()->getEntryBlock();
+  assert(Parent && "Unsupported condition to invert");
+
+  // Third: Check all the users for an invert
+  for (User *U : Condition->users())
+    if (Instruction *I = dyn_cast<Instruction>(U))
+      if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+        return I;
+
+  // Last option: Create a new instruction
+  auto *Inverted =
+      BinaryOperator::CreateNot(Condition, Condition->getName() + ".inv");
+  if (Inst && !isa<PHINode>(Inst))
+    Inverted->insertAfter(Inst);
+  else
+    Inverted->insertBefore(&*Parent->getFirstInsertionPt());
+  return Inverted;
 }
diff --git a/llvm/test/Transforms/StructurizeCFG/bug36015.ll b/llvm/test/Transforms/StructurizeCFG/bug36015.ll
index 24b9c9cdde2d6..507b9ae58504c 100644
--- a/llvm/test/Transforms/StructurizeCFG/bug36015.ll
+++ b/llvm/test/Transforms/StructurizeCFG/bug36015.ll
@@ -18,7 +18,7 @@ loop.inner:
   br i1 %cond.inner, label %if, label %else
 
 ; CHECK: if:
-; CHECK:   %0 = xor i1 %cond.if, true
+; CHECK:   %cond.if.inv = xor i1 %cond.if, true
 ; CHECK:   br label %Flow
 if:
   %ctr.if = add i32 %ctr.loop.inner, 1
@@ -27,12 +27,12 @@ if:
   br i1 %cond.if, label %loop.inner, label %exit
 
 ; CHECK: Flow:
-; CHECK:   %2 = phi i1 [ %0, %if ], [ true, %loop.inner ]
-; CHECK:   %3 = phi i1 [ false, %if ], [ true, %loop.inner ]
-; CHECK:   br i1 %2, label %Flow1, label %loop.inner
+; CHECK:   %1 = phi i1 [ %cond.if.inv, %if ], [ true, %loop.inner ]
+; CHECK:   %2 = phi i1 [ false, %if ], [ true, %loop.inner ]
+; CHECK:   br i1 %1, label %Flow1, label %loop.inner
 
 ; CHECK: Flow1:
-; CHECK:   br i1 %3, label %else, label %Flow2
+; CHECK:   br i1 %2, label %else, label %Flow2
 
 ; CHECK: else:
 ; CHECK:   br label %Flow2
@@ -43,8 +43,8 @@ else:
   br i1 %cond.else, label %loop.outer, label %exit
 
 ; CHECK: Flow2:
-; CHECK:   %6 = phi i1 [ %4, %else ], [ true, %Flow1 ]
-; CHECK:   br i1 %6, label %exit, label %loop.outer
+; CHECK:   %4 = phi i1 [ %cond.else.inv, %else ], [ true, %Flow1 ]
+; CHECK:   br i1 %4, label %exit, label %loop.outer
 
 exit:
   ret void
diff --git a/llvm/test/Transforms/StructurizeCFG/loop-multiple-exits.ll b/llvm/test/Transforms/StructurizeCFG/loop-multiple-exits.ll
index 40f6be9670a3b..320a8e2513751 100644
--- a/llvm/test/Transforms/StructurizeCFG/loop-multiple-exits.ll
+++ b/llvm/test/Transforms/StructurizeCFG/loop-multiple-exits.ll
@@ -26,11 +26,11 @@ for.body:                                         ; preds = %for.cond
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %i.0
   store i32 %i.0, i32 addrspace(1)* %arrayidx, align 4
   %cmp1 = icmp ugt i32 %i.0, %cond_b
-; CHECK: br i1 %{{[0-9a-zA-Z_]+}}, label %for.inc, label %[[FLOW1:[0-9a-zA-Z_]+]]
+; CHECK: br i1 %{{[0-9a-zA-Z_.]+}}, label %for.inc, label %[[FLOW1:[0-9a-zA-Z_]+]]
   br i1 %cmp1, label %for.end, label %for.inc
 
 ; CHECK: [[FLOW:[0-9a-zA-Z]+]]:
-; CHECK: br i1 %{{[0-9a-zA-Z_]+}}, label %for.end, label %for.cond
+; CHECK: br i1 %{{[0-9a-zA-Z_.]+}}, label %for.end, label %for.cond
 
 ; CHECK: for.inc:
 ; CHECK: br label %[[FLOW1]]
diff --git a/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll b/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
index 0af25d61b92c3..d21742fb4e8aa 100644
--- a/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
+++ b/llvm/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
@@ -8,23 +8,23 @@ bb:
   br label %bb3
 
 ; CHECK: bb3:
-; CHECK:   %0 = xor i1 %tmp4, true
-; CHECK:   br i1 %0, label %bb5, label %Flow
+; CHECK:   %tmp4.inv = xor i1 %tmp4, true
+; CHECK:   br i1 %tmp4.inv, label %bb5, label %Flow
 bb3:                                              ; preds = %bb7, %bb
   %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
   %tmp4 = fcmp ult float %arg1, 3.500000e+00
   br i1 %tmp4, label %bb7, label %bb5
 
 ; CHECK: bb5:
-; CHECK:   %1 = xor i1 %tmp6, true
+; CHECK:   %tmp6.inv = xor i1 %tmp6, true
 ; CHECK:   br label %Flow
 bb5:                                              ; preds = %bb3
   %tmp6 = fcmp olt float 0.000000e+00, %arg2
   br i1 %tmp6, label %bb10, label %bb7
 
 ; CHECK: Flow:
-; CHECK:   %2 = phi i1 [ %1, %bb5 ], [ %tmp4, %bb3 ]
-; CHECK:   br i1 %2, label %bb7, label %Flow1
+; CHECK:   %0 = phi i1 [ %tmp6.inv, %bb5 ], [ %tmp4, %bb3 ]
+; CHECK:   br i1 %0, label %bb7, label %Flow1
 
 ; CHECK: bb7:
 ; CHECK:   br label %Flow1
@@ -34,8 +34,8 @@ bb7:                                              ; preds = %bb5, %bb3
   br i1 %tmp9, label %bb3, label %bb10
 
 ; CHECK: Flow1:
-; CHECK:   %6 = phi i1 [ %3, %bb7 ], [ true, %Flow ]
-; CHECK:   br i1 %6, label %bb10, label %bb3
+; CHECK:   %3 = phi i1 [ %tmp9.inv, %bb7 ], [ true, %Flow ]
+; CHECK:   br i1 %3, label %bb10, label %bb3
 
 ; CHECK: bb10:
 bb10:                                             ; preds = %bb7, %bb5
diff --git a/llvm/test/Transforms/StructurizeCFG/post-order-traversal-bug.ll b/llvm/test/Transforms/StructurizeCFG/post-order-traversal-bug.ll
index ba9aa29130611..291e9a58e09bd 100644
--- a/llvm/test/Transforms/StructurizeCFG/post-order-traversal-bug.ll
+++ b/llvm/test/Transforms/StructurizeCFG/post-order-traversal-bug.ll
@@ -15,7 +15,7 @@ entry:
   br label %for.body
 
 ; CHECK: for.body:
-; CHECK: br i1 %{{[0-9]+}}, label %lor.lhs.false, label %Flow
+; CHECK: br i1 %cmp1.inv, label %lor.lhs.false, label %Flow
 for.body:                                         ; preds = %for.body.backedge, %entry
   %indvars.iv = phi i64 [ %indvars.iv.be, %for.body.backedge ], [ 1, %entry ]
   %best_val.027 = phi float [ %best_val.027.be, %for.body.backedge ], [ 5.000000e+01, %entry ]
@@ -59,7 +59,7 @@ for.end:                                          ; preds = %for.body.1, %if.the
 ; CHECK: br i1 %{{[0-9]}}, label %for.body.1, label %Flow2
 
 ; CHECK: for.body.1:
-; CHECK: br i1 %{{[0-9]+}}, label %for.body.6, label %Flow3
+; CHECK: br i1 %cmp1.5.inv, label %for.body.6, label %Flow3
 for.body.1:                                       ; preds = %if.then, %lor.lhs.false
   %best_val.233 = phi float [ %tmp5, %if.then ], [ %best_val.027, %lor.lhs.false ]
   %best_count.231 = phi i32 [ %sub4, %if.then ], [ %best_count.025, %lor.lhs.false ]
diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll
index 1ae1478cff9d8..61dccd2e572d3 100644
--- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll
+++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll
@@ -13,32 +13,32 @@ define void @irreducible_mountain_bug(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3
 ; CHECK-NEXT:    [[PRED11_INV:%.*]] = xor i1 [[PRED11:%.*]], true
 ; CHECK-NEXT:    [[PRED12_INV:%.*]] = xor i1 [[PRED12:%.*]], true
 ; CHECK-NEXT:    [[PRED13_INV:%.*]] = xor i1 [[PRED13:%.*]], true
-; CHECK-NEXT:    br i1 [[PRED0_INV]], label [[IF_THEN:%.*]], label [[FLOW18:%.*]]
-; CHECK:       Flow18:
+; CHECK-NEXT:    br i1 [[PRED0_INV]], label [[IF_THEN:%.*]], label [[FLOW19:%.*]]
+; CHECK:       Flow19:
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[FLOW3:%.*]] ], [ true, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_END:%.*]], label [[FLOW19:%.*]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_END:%.*]], label [[FLOW20:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    br i1 [[PRED1_INV]], label [[IF_ELSE:%.*]], label [[FLOW17:%.*]]
-; CHECK:       Flow17:
+; CHECK-NEXT:    br i1 [[PRED1_INV]], label [[IF_ELSE:%.*]], label [[FLOW18:%.*]]
+; CHECK:       Flow18:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ false, [[IF_ELSE]] ], [ true, [[IF_END]] ]
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN7:%.*]], label [[IF_END16:%.*]]
 ; CHECK:       if.then7:
 ; CHECK-NEXT:    br label [[IF_END16]]
 ; CHECK:       if.else:
-; CHECK-NEXT:    br label [[FLOW17]]
-; CHECK:       Flow19:
+; CHECK-NEXT:    br label [[FLOW18]]
+; CHECK:       Flow20:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       if.end16:
-; CHECK-NEXT:    br i1 [[PRED2_INV]], label [[IF_THEN39:%.*]], label [[FLOW15:%.*]]
-; CHECK:       Flow15:
+; CHECK-NEXT:    br i1 [[PRED2_INV]], label [[IF_THEN39:%.*]], label [[FLOW16:%.*]]
+; CHECK:       Flow16:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ false, [[FLOW5:%.*]] ], [ true, [[IF_END16]] ]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[FLOW16:%.*]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[FLOW17:%.*]]
 ; CHECK:       while.cond.preheader:
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-; CHECK:       Flow16:
-; CHECK-NEXT:    br label [[FLOW19]]
+; CHECK:       Flow17:
+; CHECK-NEXT:    br label [[FLOW20]]
 ; CHECK:       while.cond:
-; CHECK-NEXT:    br i1 [[PRED3_INV]], label [[LOR_RHS:%.*]], label [[FLOW11:%.*]]
+; CHECK-NEXT:    br i1 [[PRED3_INV]], label [[LOR_RHS:%.*]], label [[FLOW12:%.*]]
 ; CHECK:       Flow7:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi i1 [ [[PRED7:%.*]], [[COND_END61:%.*]] ], [ false, [[IRR_GUARD:%.*]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[COND_END61]] ], [ true, [[IRR_GUARD]] ]
@@ -54,22 +54,22 @@ define void @irreducible_mountain_bug(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3
 ; CHECK:       Flow9:
 ; CHECK-NEXT:    [[TMP7:%.*]] = phi i1 [ true, [[FLOW10]] ], [ false, [[FLOW8]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi i1 [ false, [[FLOW10]] ], [ [[TMP5]], [[FLOW8]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ [[TMP18:%.*]], [[FLOW10]] ], [ true, [[FLOW8]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = xor i1 [[TMP7]], true
-; CHECK-NEXT:    [[TMP11:%.*]] = xor i1 [[TMP8]], true
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ [[TMP15:%.*]], [[FLOW10]] ], [ true, [[FLOW8]] ]
+; CHECK-NEXT:    [[DOTINV11:%.*]] = xor i1 [[TMP7]], true
+; CHECK-NEXT:    [[DOTINV:%.*]] = xor i1 [[TMP8]], true
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[LOOP_EXIT_GUARD1:%.*]], label [[IRR_GUARD]]
 ; CHECK:       while.cond47:
 ; CHECK-NEXT:    br label [[FLOW10]]
 ; CHECK:       cond.end61:
 ; CHECK-NEXT:    br label [[FLOW7]]
-; CHECK:       Flow13:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi i1 [ false, [[FLOW14:%.*]] ], [ true, [[LOOP_EXIT_GUARD1]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i1 [ [[TMP17:%.*]], [[FLOW14]] ], [ [[TMP11]], [[LOOP_EXIT_GUARD1]] ]
-; CHECK-NEXT:    br label [[FLOW12:%.*]]
+; CHECK:       Flow14:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ false, [[FLOW15:%.*]] ], [ true, [[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ [[TMP14:%.*]], [[FLOW15]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    br label [[FLOW13:%.*]]
 ; CHECK:       if.then69:
-; CHECK-NEXT:    br label [[FLOW14]]
+; CHECK-NEXT:    br label [[FLOW15]]
 ; CHECK:       lor.rhs:
-; CHECK-NEXT:    br label [[FLOW11]]
+; CHECK-NEXT:    br label [[FLOW12]]
 ; CHECK:       while.end76:
 ; CHECK-NEXT:    br label [[FLOW6:%.*]]
 ; CHECK:       if.then39:
@@ -87,39 +87,39 @@ define void @irreducible_mountain_bug(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3
 ; CHECK:       Flow:
 ; CHECK-NEXT:    br label [[FLOW3]]
 ; CHECK:       Flow3:
-; CHECK-NEXT:    br label [[FLOW18]]
+; CHECK-NEXT:    br label [[FLOW19]]
 ; CHECK:       Flow4:
 ; CHECK-NEXT:    br label [[FLOW5]]
 ; CHECK:       Flow5:
-; CHECK-NEXT:    br label [[FLOW15]]
-; CHECK:       Flow6:
 ; CHECK-NEXT:    br label [[FLOW16]]
+; CHECK:       Flow6:
+; CHECK-NEXT:    br label [[FLOW17]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
-; CHECK:       Flow11:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i1 [ false, [[LOR_RHS]] ], [ true, [[WHILE_COND]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i1 [ [[PRED9:%.*]], [[LOR_RHS]] ], [ [[PRED3]], [[WHILE_COND]] ]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[IRR_GUARD]], label [[FLOW12]]
+; CHECK:       Flow12:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i1 [ false, [[LOR_RHS]] ], [ true, [[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i1 [ [[PRED9:%.*]], [[LOR_RHS]] ], [ [[PRED3]], [[WHILE_COND]] ]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[IRR_GUARD]], label [[FLOW13]]
 ; CHECK:       irr.guard:
-; CHECK-NEXT:    [[GUARD_COND_TRUE49:%.*]] = phi i1 [ [[PRED6:%.*]], [[FLOW9]] ], [ [[TMP14]], [[FLOW11]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = xor i1 [[GUARD_COND_TRUE49]], true
-; CHECK-NEXT:    br i1 [[TMP16]], label [[COND_END61]], label [[FLOW7]]
-; CHECK:       Flow14:
-; CHECK-NEXT:    [[TMP17]] = phi i1 [ [[PRED8:%.*]], [[IF_THEN69:%.*]] ], [ [[TMP11]], [[LOOP_EXIT_GUARD2:%.*]] ]
-; CHECK-NEXT:    br label [[FLOW13:%.*]]
+; CHECK-NEXT:    [[GUARD_COND_TRUE49:%.*]] = phi i1 [ [[PRED6:%.*]], [[FLOW9]] ], [ [[TMP12]], [[FLOW12]] ]
+; CHECK-NEXT:    [[GUARD_COND_TRUE49_INV:%.*]] = xor i1 [[GUARD_COND_TRUE49]], true
+; CHECK-NEXT:    br i1 [[GUARD_COND_TRUE49_INV]], label [[COND_END61]], label [[FLOW7]]
+; CHECK:       Flow15:
+; CHECK-NEXT:    [[TMP14]] = phi i1 [ [[PRED8:%.*]], [[IF_THEN69:%.*]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD2:%.*]] ]
+; CHECK-NEXT:    br label [[FLOW14:%.*]]
 ; CHECK:       loop.exit.guard:
-; CHECK-NEXT:    br i1 [[TMP19:%.*]], label [[WHILE_END76:%.*]], label [[FLOW6]]
+; CHECK-NEXT:    br i1 [[TMP16:%.*]], label [[WHILE_END76:%.*]], label [[FLOW6]]
 ; CHECK:       Flow10:
-; CHECK-NEXT:    [[TMP18]] = phi i1 [ false, [[WHILE_COND47]] ], [ true, [[WHILE_BODY63]] ]
+; CHECK-NEXT:    [[TMP15]] = phi i1 [ false, [[WHILE_COND47]] ], [ true, [[WHILE_BODY63]] ]
 ; CHECK-NEXT:    br label [[FLOW9]]
-; CHECK:       Flow12:
-; CHECK-NEXT:    [[TMP19]] = phi i1 [ [[TMP12]], [[FLOW13]] ], [ true, [[FLOW11]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i1 [ [[TMP13]], [[FLOW13]] ], [ true, [[FLOW11]] ]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[LOOP_EXIT_GUARD:%.*]], label [[WHILE_COND]]
+; CHECK:       Flow13:
+; CHECK-NEXT:    [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW14]] ], [ true, [[FLOW12]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[TMP11]], [[FLOW14]] ], [ true, [[FLOW12]] ]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[LOOP_EXIT_GUARD:%.*]], label [[WHILE_COND]]
 ; CHECK:       loop.exit.guard1:
-; CHECK-NEXT:    br i1 [[TMP11]], label [[LOOP_EXIT_GUARD2]], label [[FLOW13]]
+; CHECK-NEXT:    br i1 [[DOTINV]], label [[LOOP_EXIT_GUARD2]], label [[FLOW14]]
 ; CHECK:       loop.exit.guard2:
-; CHECK-NEXT:    br i1 [[TMP10]], label [[IF_THEN69]], label [[FLOW14]]
+; CHECK-NEXT:    br i1 [[DOTINV11]], label [[IF_THEN69]], label [[FLOW15]]
 ;
 entry:
   br i1 %Pred0, label %if.end, label %if.then

From d2befc66336d4d4c014be11d40e8ed6d3140fd36 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 10:06:26 -0700
Subject: [PATCH 564/770] [SVE] Eliminate calls to default-false
 VectorType::get() from Vectorize

Reviewers: efriedma, c-rhodes, david-arm, fhahn

Reviewed By: david-arm

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80339
---
 .../Vectorize/LoadStoreVectorizer.cpp         | 12 ++---
 .../Vectorize/LoopVectorizationLegality.cpp   |  4 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 51 ++++++++++---------
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 24 ++++-----
 4 files changed, 48 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c02b8f8895006..4885dd4351d5c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1030,10 +1030,10 @@ bool Vectorizer::vectorizeStoreChain(
   VectorType *VecTy;
   VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
   if (VecStoreTy)
-    VecTy = VectorType::get(StoreTy->getScalarType(),
-                            Chain.size() * VecStoreTy->getNumElements());
+    VecTy = FixedVectorType::get(StoreTy->getScalarType(),
+                                 Chain.size() * VecStoreTy->getNumElements());
   else
-    VecTy = VectorType::get(StoreTy, Chain.size());
+    VecTy = FixedVectorType::get(StoreTy, Chain.size());
 
   // If it's more than the max vector size or the target has a better
   // vector factor, break it into two pieces.
@@ -1182,10 +1182,10 @@ bool Vectorizer::vectorizeLoadChain(
   VectorType *VecTy;
   VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
   if (VecLoadTy)
-    VecTy = VectorType::get(LoadTy->getScalarType(),
-                            Chain.size() * VecLoadTy->getNumElements());
+    VecTy = FixedVectorType::get(LoadTy->getScalarType(),
+                                 Chain.size() * VecLoadTy->getNumElements());
   else
-    VecTy = VectorType::get(LoadTy, Chain.size());
+    VecTy = FixedVectorType::get(LoadTy, Chain.size());
 
   // If it's more than the max vector size or the target has a better
   // vector factor, break it into two pieces.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index bf19405cd4ee3..eb8709a9d63f5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -767,7 +767,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // supported on the target.
         if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
           // Arbitrarily try a vector of 2 elements.
-          Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+          auto *VecTy = FixedVectorType::get(T, /*NumElements=*/2);
           assert(VecTy && "did not find vectorized version of stored type");
           if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
             reportVectorizationFailure(
@@ -782,7 +782,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
           // For nontemporal loads, check that a nontemporal vector version is
           // supported on the target (arbitrarily try a vector of 2 elements).
-          Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElements=*/2);
           assert(VecTy && "did not find vectorized version of load type");
           if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
             reportVectorizationFailure(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8d52ddc5b3b5d..5e5f029578f04 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -311,7 +311,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
   // Determine if an array of VF elements of type Ty is "bitcast compatible"
   // with a <VF x Ty> vector.
   if (VF > 1) {
-    auto *VectorTy = VectorType::get(Ty, VF);
+    auto *VectorTy = FixedVectorType::get(Ty, VF);
     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
   }
 
@@ -2074,7 +2074,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
     } else {
       // Initialize packing with insertelements to start from undef.
-      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
+      Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
       VectorLoopValueMap.setVectorValue(V, Part, Undef);
       for (unsigned Lane = 0; Lane < VF; ++Lane)
         packScalarIntoVectorValue(V, {Part, Lane});
@@ -2196,7 +2196,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
   // Prepare for the vector type of the interleaved load/store.
   Type *ScalarTy = getMemInstValueType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
-  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
+  auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
 
   // Prepare for the new pointers.
   SmallVector<Value *, 2> AddrParts;
@@ -2300,7 +2300,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
 
         // If this member has different type, cast the result type.
         if (Member->getType() != ScalarTy) {
-          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+          VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
         }
 
@@ -2314,7 +2314,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
   }
 
   // The sub vector type for current instruction.
-  VectorType *SubVT = VectorType::get(ScalarTy, VF);
+  auto *SubVT = FixedVectorType::get(ScalarTy, VF);
 
   // Vectorize the interleaved store group.
   for (unsigned Part = 0; Part < UF; Part++) {
@@ -2385,7 +2385,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
          "CM decision is not to widen the memory instruction");
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
-  Type *DataTy = VectorType::get(ScalarDataTy, VF);
+  auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
   const Align Alignment = getLoadStoreAlignment(Instr);
 
   // Determine if the pointer operand of the access is either consecutive or
@@ -2688,7 +2688,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
          "Only one type should be a floating point type");
   Type *IntTy =
       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
-  VectorType *VecIntTy = VectorType::get(IntTy, VF);
+  auto *VecIntTy = FixedVectorType::get(IntTy, VF);
   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
 }
@@ -3359,7 +3359,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
       Type *OriginalTy = I->getType();
       Type *ScalarTruncatedTy =
           IntegerType::get(OriginalTy->getContext(), KV.second);
-      Type *TruncatedTy = VectorType::get(
+      auto *TruncatedTy = FixedVectorType::get(
           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
       if (TruncatedTy == OriginalTy)
         continue;
@@ -3413,11 +3413,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
         auto Elements0 =
             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
         auto *O0 = B.CreateZExtOrTrunc(
-            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
+            SI->getOperand(0),
+            FixedVectorType::get(ScalarTruncatedTy, Elements0));
         auto Elements1 =
             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
         auto *O1 = B.CreateZExtOrTrunc(
-            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
+            SI->getOperand(1),
+            FixedVectorType::get(ScalarTruncatedTy, Elements1));
 
         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
@@ -3427,14 +3429,16 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
         auto Elements =
             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
         auto *O0 = B.CreateZExtOrTrunc(
-            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+            IE->getOperand(0),
+            FixedVectorType::get(ScalarTruncatedTy, Elements));
         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
         auto Elements =
             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
         auto *O0 = B.CreateZExtOrTrunc(
-            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+            EE->getOperand(0),
+            FixedVectorType::get(ScalarTruncatedTy, Elements));
         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
       } else {
         // If we don't know what to do, be conservative and don't do anything.
@@ -3598,8 +3602,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
   if (VF > 1) {
     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
     VectorInit = Builder.CreateInsertElement(
-        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
-        Builder.getInt32(VF - 1), "vector.recur.init");
+        UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
+        VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
   }
 
   // We constructed a temporary phi node in the first phase of vectorization.
@@ -3821,7 +3825,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
   // then extend the loop exit value to enable InstCombine to evaluate the
   // entire expression in the smaller type.
   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
-    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+    Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
     Builder.SetInsertPoint(
         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
     VectorParts RdxParts(UF);
@@ -4148,7 +4152,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
     // Create a vector phi with no operands - the vector phi operands will be
     // set at the end of vector code generation.
     Type *VecTy =
-        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+        (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
     OrigPHIsToFix.push_back(P);
@@ -4167,7 +4171,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
     for (unsigned Part = 0; Part < UF; ++Part) {
       // This is phase one of vectorizing PHIs.
       Type *VecTy =
-          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+          (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
       Value *EntryPart = PHINode::Create(
           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
@@ -4327,7 +4331,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
 
     /// Vectorize casts.
     Type *DestTy =
-        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+        (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
 
     for (unsigned Part = 0; Part < UF; ++Part) {
       Value *A = State.get(User.getOperand(0), Part);
@@ -4387,7 +4391,8 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
       // Use vector version of the intrinsic.
       Type *TysForDecl[] = {CI->getType()};
       if (VF > 1)
-        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+        TysForDecl[0] =
+            FixedVectorType::get(CI->getType()->getScalarType(), VF);
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
@@ -5947,7 +5952,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   assert(Group && "Fail to get an interleaved access group.");
 
   unsigned InterleaveFactor = Group->getFactor();
-  VectorType *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+  auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
 
   // Holds the indices of existing members in an interleaved load group.
   // An interleaved store group doesn't need this as it doesn't allow gaps.
@@ -6349,7 +6354,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
     Type *CondTy = SI->getCondition()->getType();
     if (!ScalarCond)
-      CondTy = VectorType::get(CondTy, VF);
+      CondTy = FixedVectorType::get(CondTy, VF);
 
     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
                                   CostKind, I);
@@ -7510,8 +7515,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     if (AlsoPack && State.VF > 1) {
       // If we're constructing lane 0, initialize to start from undef.
       if (State.Instance->Lane == 0) {
-        Value *Undef =
-            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+        Value *Undef = UndefValue::get(
+            FixedVectorType::get(Ingredient->getType(), State.VF));
         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
       }
       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1657b9e901150..4c18fab4ec098 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3165,7 +3165,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
 
   if (!isValidElementType(EltTy))
     return 0;
-  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+  uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
   if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
     return 0;
   return N;
@@ -3265,7 +3265,7 @@ getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
     SmallVector<Type *, 4> VecTys;
     for (Use &Arg : CI->args())
       VecTys.push_back(
-          VectorType::get(Arg->getType(), VecTy->getNumElements()));
+          FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
 
     // If the corresponding vector call is cheaper, return its cost.
     LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
@@ -3425,7 +3425,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       // Calculate the cost of this instruction.
       int ScalarCost = VL.size() * ScalarEltCost;
 
-      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+      auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
       int VecCost = 0;
       // Check if the values are candidates to demote.
       if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
@@ -3445,7 +3445,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
-      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
       int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
       int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
                                             CostKind, VL0);
@@ -3633,8 +3633,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       } else {
         Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
         Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
-        VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
-        VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
+        auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+        auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
         VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
                                         CostKind);
         VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
@@ -3807,7 +3807,7 @@ int BoUpSLP::getSpillCost() const {
     if (NumCalls) {
       SmallVector<Type*, 4> V;
       for (auto *II : LiveValues)
-        V.push_back(VectorType::get(II->getType(), BundleWidth));
+        V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
       Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
     }
 
@@ -4100,7 +4100,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
     else
       VL = UniqueValues;
   }
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
 
   Value *V = Gather(VL, VecTy);
   if (!ReuseShuffleIndicies.empty()) {
@@ -4135,7 +4135,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   Type *ScalarTy = VL0->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
     ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
 
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
 
@@ -4532,7 +4532,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       }
 
       Module *M = F->getParent();
-      Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
+      Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
 
       if (!UseIntrinsic) {
@@ -4660,7 +4660,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
     auto BundleWidth = VectorizableTree[0]->Scalars.size();
     auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-    auto *VecTy = VectorType::get(MinTy, BundleWidth);
+    auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
     VectorizableTree[0]->VectorizedValue = Trunc;
   }
@@ -5988,7 +5988,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     // No actual vectorization should happen, if number of parts is the same as
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
-    auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+    auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
     if (TTI->getNumberOfParts(VecTy) == VF)
       continue;
     for (unsigned I = NextInst; I < MaxInst; ++I) {

From e7102eed20d969c7e75b7d3dc5192290f733797a Mon Sep 17 00:00:00 2001
From: Sourabh Singh Tomar <SourabhSingh.Tomar@amd.com>
Date: Fri, 29 May 2020 23:57:35 +0530
Subject: [PATCH 565/770] [DWARF5] Added support for .debug_macro.dwo section
 in llvm-dwarfdump

This patch extends the parsing and dumping support of llvm-dwarfdump
to the .debug_macro.dwo section.

The following forms are supported:

 - DW_MACRO_define
 - DW_MACRO_undef
 - DW_MACRO_start_file
 - DW_MACRO_end_file
 - DW_MACRO_define_strx
 - DW_MACRO_undef_strx
 - DW_MACRO_define_strp
 - DW_MACRO_undef_strp

Reviewed by: ikudrin, dblaikie

Differential Revision: https://reviews.llvm.org/D78500
---
 .../llvm/DebugInfo/DWARF/DWARFContext.h       | 11 ++-
 .../llvm/DebugInfo/DWARF/DWARFObject.h        |  1 +
 llvm/lib/DebugInfo/DWARF/DWARFContext.cpp     | 30 +++++++-
 .../test/DebugInfo/X86/debug-macro-strp-dwo.s | 36 ++++++++++
 .../test/DebugInfo/X86/debug-macro-strx-dwo.s | 68 +++++++++++++++++++
 5 files changed, 141 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/DebugInfo/X86/debug-macro-strp-dwo.s
 create mode 100644 llvm/test/DebugInfo/X86/debug-macro-strx-dwo.s

diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 3e387343d54f5..b36505760e466 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -72,6 +72,7 @@ class DWARFContext : public DIContext {
   DWARFUnitVector DWOUnits;
   std::unique_ptr<DWARFDebugAbbrev> AbbrevDWO;
   std::unique_ptr<DWARFDebugMacro> MacinfoDWO;
+  std::unique_ptr<DWARFDebugMacro> MacroDWO;
 
   /// The maximum DWARF version of all units.
   unsigned MaxVersion = 0;
@@ -110,8 +111,8 @@ class DWARFContext : public DIContext {
   enum MacroSecType {
     MacinfoSection,
     MacinfoDwoSection,
-    MacroSection
-    // FIXME: Add support for.debug_macro.dwo section.
+    MacroSection,
+    MacroDwoSection
   };
 
 public:
@@ -291,6 +292,9 @@ class DWARFContext : public DIContext {
   /// Get a pointer to the parsed DebugMacro information object.
   const DWARFDebugMacro *getDebugMacro();
 
+  /// Get a pointer to the parsed DebugMacroDWO information object.
+  const DWARFDebugMacro *getDebugMacroDWO();
+
   /// Get a reference to the parsed accelerator table object.
   const DWARFDebugNames &getDebugNames();
 
@@ -319,6 +323,9 @@ class DWARFContext : public DIContext {
   DataExtractor getStringExtractor() const {
     return DataExtractor(DObj->getStrSection(), false, 0);
   }
+  DataExtractor getStringDWOExtractor() const {
+    return DataExtractor(DObj->getStrDWOSection(), false, 0);
+  }
   DataExtractor getLineStringExtractor() const {
     return DataExtractor(DObj->getLineStrSection(), false, 0);
   }
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h
index 89fccf04a01df..60fcd3daf5b1b 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h
@@ -48,6 +48,7 @@ class DWARFObject {
   virtual const DWARFSection &getRangesSection() const { return Dummy; }
   virtual const DWARFSection &getRnglistsSection() const { return Dummy; }
   virtual const DWARFSection &getMacroSection() const { return Dummy; }
+  virtual StringRef getMacroDWOSection() const { return ""; }
   virtual StringRef getMacinfoSection() const { return ""; }
   virtual StringRef getMacinfoDWOSection() const { return ""; }
   virtual const DWARFSection &getPubnamesSection() const { return Dummy; }
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 2989c68436ed3..b6e0d9342cf96 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -264,9 +264,13 @@ std::unique_ptr<DWARFDebugMacro>
 DWARFContext::parseMacroOrMacinfo(MacroSecType SectionType) {
   auto Macro = std::make_unique<DWARFDebugMacro>();
   auto ParseAndDump = [&](DWARFDataExtractor &Data, bool IsMacro) {
-    // FIXME: Add support for debug_macro.dwo section.
-    if (Error Err = IsMacro ? Macro->parseMacro(compile_units(),
-                                                getStringExtractor(), Data)
+    if (Error Err = IsMacro ? Macro->parseMacro(SectionType == MacroSection
+                                                    ? compile_units()
+                                                    : dwo_compile_units(),
+                                                SectionType == MacroSection
+                                                    ? getStringExtractor()
+                                                    : getStringDWOExtractor(),
+                                                Data)
                             : Macro->parseMacinfo(Data)) {
       RecoverableErrorHandler(std::move(Err));
       Macro = nullptr;
@@ -289,6 +293,11 @@ DWARFContext::parseMacroOrMacinfo(MacroSecType SectionType) {
     ParseAndDump(Data, /*IsMacro=*/true);
     break;
   }
+  case MacroDwoSection: {
+    DWARFDataExtractor Data(DObj->getMacroDWOSection(), isLittleEndian(), 0);
+    ParseAndDump(Data, /*IsMacro=*/true);
+    break;
+  }
   }
   return Macro;
 }
@@ -461,6 +470,12 @@ void DWARFContext::dump(
       Macro->dump(OS);
   }
 
+  if (shouldDump(Explicit, ".debug_macro.dwo", DIDT_ID_DebugMacro,
+                 DObj->getMacroDWOSection())) {
+    if (auto MacroDWO = getDebugMacroDWO())
+      MacroDWO->dump(OS);
+  }
+
   if (shouldDump(Explicit, ".debug_macinfo", DIDT_ID_DebugMacro,
                  DObj->getMacinfoSection())) {
     if (auto Macinfo = getDebugMacinfo())
@@ -845,6 +860,12 @@ const DWARFDebugMacro *DWARFContext::getDebugMacro() {
   return Macro.get();
 }
 
+const DWARFDebugMacro *DWARFContext::getDebugMacroDWO() {
+  if (!MacroDWO)
+    MacroDWO = parseMacroOrMacinfo(MacroDwoSection);
+  return MacroDWO.get();
+}
+
 const DWARFDebugMacro *DWARFContext::getDebugMacinfo() {
   if (!Macinfo)
     Macinfo = parseMacroOrMacinfo(MacinfoSection);
@@ -1534,6 +1555,7 @@ class DWARFObjInMemory final : public DWARFObject {
   StringRef StrSection;
   StringRef MacinfoSection;
   StringRef MacinfoDWOSection;
+  StringRef MacroDWOSection;
   StringRef AbbrevDWOSection;
   StringRef StrDWOSection;
   StringRef CUIndexSection;
@@ -1554,6 +1576,7 @@ class DWARFObjInMemory final : public DWARFObject {
         .Case("debug_str", &StrSection)
         .Case("debug_macinfo", &MacinfoSection)
         .Case("debug_macinfo.dwo", &MacinfoDWOSection)
+        .Case("debug_macro.dwo", &MacroDWOSection)
         .Case("debug_abbrev.dwo", &AbbrevDWOSection)
         .Case("debug_str.dwo", &StrDWOSection)
         .Case("debug_cu_index", &CUIndexSection)
@@ -1872,6 +1895,7 @@ class DWARFObjInMemory final : public DWARFObject {
     return RnglistsSection;
   }
   const DWARFSection &getMacroSection() const override { return MacroSection; }
+  StringRef getMacroDWOSection() const override { return MacroDWOSection; }
   StringRef getMacinfoSection() const override { return MacinfoSection; }
   StringRef getMacinfoDWOSection() const override { return MacinfoDWOSection; }
   const DWARFSection &getPubnamesSection() const override { return PubnamesSection; }
diff --git a/llvm/test/DebugInfo/X86/debug-macro-strp-dwo.s b/llvm/test/DebugInfo/X86/debug-macro-strp-dwo.s
new file mode 100644
index 0000000000000..b74f49da558f3
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/debug-macro-strp-dwo.s
@@ -0,0 +1,36 @@
+## This test checks that llvm-dwarfdump can dump debug_macro.dwo
+## section containing DW_MACRO_*_strp forms present in a dwo object.
+
+# RUN: llvm-mc -triple x86_64-unknown-linux -filetype=obj %s -o -| \
+# RUN:   llvm-dwarfdump -debug-macro - | FileCheck -strict-whitespace -match-full-lines %s
+
+#      CHECK:.debug_macro.dwo contents:
+# CHECK-NEXT:0x00000000:
+# CHECK-NEXT:macro header: version = 0x0005, flags = 0x02, debug_line_offset = 0x0000
+# CHECK-NEXT:DW_MACRO_start_file - lineno: 0 filenum: 0
+# CHECK-NEXT:  DW_MACRO_define_strp - lineno: 1 macro: DWARF_VERSION 5
+# CHECK-NEXT:  DW_MACRO_undef_strp - lineno: 4 macro: DWARF_VERSION
+# CHECK-NEXT:DW_MACRO_end_file
+
+	.section	.debug_macro.dwo,"e",@progbits
+.Lcu_macro_begin0:
+	.short	5                      # Macro information version
+	.byte	2                       # Flags: 32 bit, debug_line_offset present
+	.long	0                       # debug_line_offset
+	.byte	3                       # DW_MACRO_start_file
+	.byte	0                       # Line Number
+	.byte	0                       # File Number
+	.byte	5                       # DW_MACRO_define_strp
+	.byte	1                       # Line Number
+	.long	.Linfo_string0-.debug_str.dwo   # Macro String
+	.byte	6                       # DW_MACRO_undef_strp
+	.byte	4                       # Line Number
+	.long	.Linfo_string1-.debug_str.dwo   # Macro String
+	.byte	4                       # DW_MACRO_end_file
+	.byte	0                       # End Of Macro List Mark
+
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"DWARF_VERSION 5"
+.Linfo_string1:
+	.asciz	"DWARF_VERSION"
diff --git a/llvm/test/DebugInfo/X86/debug-macro-strx-dwo.s b/llvm/test/DebugInfo/X86/debug-macro-strx-dwo.s
new file mode 100644
index 0000000000000..242505f25af2b
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/debug-macro-strx-dwo.s
@@ -0,0 +1,68 @@
+## This test checks that llvm-dwarfdump can dump debug_macro.dwo
+## section containing DW_MACRO_*_strx forms present in a dwo object.
+
+# RUN: llvm-mc -triple x86_64-unknown-linux -filetype=obj %s -o -| \
+# RUN:   llvm-dwarfdump -debug-macro - | FileCheck -strict-whitespace -match-full-lines %s
+
+#      CHECK:.debug_macro.dwo contents:
+# CHECK-NEXT:0x00000000:
+# CHECK-NEXT:macro header: version = 0x0005, flags = 0x02, debug_line_offset = 0x0000
+# CHECK-NEXT:DW_MACRO_start_file - lineno: 0 filenum: 0
+# CHECK-NEXT:  DW_MACRO_define_strx - lineno: 1 macro: DWARF_VERSION 5
+# CHECK-NEXT:  DW_MACRO_undef_strx - lineno: 4 macro: DWARF_VERSION
+# CHECK-NEXT:DW_MACRO_end_file
+
+	.section	.debug_macro.dwo,"e",@progbits
+.Lcu_macro_begin0:
+	.short	5                      # Macro information version
+	.byte	2                       # Flags: 32 bit, debug_line_offset present
+	.long	0                       # debug_line_offset
+	.byte	3                       # DW_MACRO_start_file
+	.byte	0                       # Line Number
+	.byte	0                       # File Number
+	.byte	11                      # DW_MACRO_define_strx
+	.byte	1                       # Line Number
+	.byte	0                       # Macro String Index
+	.byte	12                      # DW_MACRO_undef_strx
+	.byte	4                       # Line Number
+	.byte	1                       # Macro String Index
+	.byte	4                       # DW_MACRO_end_file
+	.byte	0                       # End Of Macro List Mark
+
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long   .Lcu_str_off_end0-.Lcu_str_off_start0 # Unit length
+	.short	5                                     # Version
+	.short	0                                     # Padding
+.Lcu_str_off_start0:
+	.long	.Linfo_string0-.debug_str.dwo
+	.long	.Linfo_string1-.debug_str.dwo
+.Lcu_str_off_end0:
+
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"DWARF_VERSION 5"
+.Linfo_string1:
+	.asciz	"DWARF_VERSION"
+
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	5                      # DWARF version number
+	.byte	5                       # DWARF Unit Type
+	.byte	8                       # Address Size (in bytes)
+	.long	0                       # Offset Into Abbrev. Section
+	.quad	1536875774479801980
+	.byte	1                       # Abbrev [1] 0x14:0x1a DW_TAG_compile_unit
+	.long   .Lcu_macro_begin0-.debug_macro.dwo # DW_AT_macros
+	.byte	0                       # End Of Children Mark
+.Ldebug_info_dwo_end0:
+
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                       # Abbreviation Code
+	.byte	17                      # DW_TAG_compile_unit
+	.byte	0                       # DW_CHILDREN_no
+	.byte	121                     # DW_AT_macros
+	.byte	23                      # DW_FORM_sec_offset
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	0                       # EOM(3)

From b47403c0a4c532554cf3d67ed1669fe00530aab3 Mon Sep 17 00:00:00 2001
From: Sourabh Singh Tomar <SourabhSingh.Tomar@amd.com>
Date: Sat, 30 May 2020 00:22:40 +0530
Subject: [PATCH 566/770] [DWARF5] Replace emission of strp with strx forms in
 debug_macro section

DW_MACRO_define_strx forms are now supported in llvm-dwarfdump, and these
forms can be used in both debug_macro[.dwo] sections. An added advantage
of using strx forms over strp forms is that they use an index-based
approach instead of a relocation to the debug_str section.

This patch unifies the emission for the debug_macro section.

Reviewed by: dblaikie, ikudrin

Differential Revision: https://reviews.llvm.org/D78865
---
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 21 +++++++++------------
 llvm/test/DebugInfo/X86/debug-macro-v5.ll  |  6 +++---
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 953154f0b10b6..84bc1a13c984e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2924,26 +2924,23 @@ void DwarfDebug::emitMacro(DIMacro &M) {
 
   if (UseMacro) {
     unsigned Type = M.getMacinfoType() == dwarf::DW_MACINFO_define
-                        ? dwarf::DW_MACRO_define_strp
-                        : dwarf::DW_MACRO_undef_strp;
+                        ? dwarf::DW_MACRO_define_strx
+                        : dwarf::DW_MACRO_undef_strx;
     Asm->OutStreamer->AddComment(dwarf::MacroString(Type));
     Asm->emitULEB128(Type);
     Asm->OutStreamer->AddComment("Line Number");
     Asm->emitULEB128(M.getLine());
     Asm->OutStreamer->AddComment("Macro String");
     if (!Value.empty())
-      Asm->OutStreamer->emitSymbolValue(
-          this->InfoHolder.getStringPool()
-              .getEntry(*Asm, (Name + " " + Value).str())
-              .getSymbol(),
-          4);
+      Asm->emitULEB128(this->InfoHolder.getStringPool()
+                           .getIndexedEntry(*Asm, (Name + " " + Value).str())
+                           .getIndex());
     else
-      // DW_MACRO_undef_strp doesn't have a value, so just emit the macro
+      // DW_MACRO_undef_strx doesn't have a value, so just emit the macro
       // string.
-      Asm->OutStreamer->emitSymbolValue(this->InfoHolder.getStringPool()
-                                            .getEntry(*Asm, (Name).str())
-                                            .getSymbol(),
-                                        4);
+      Asm->emitULEB128(this->InfoHolder.getStringPool()
+                           .getIndexedEntry(*Asm, (Name).str())
+                           .getIndex());
   } else {
     Asm->OutStreamer->AddComment(dwarf::MacinfoString(M.getMacinfoType()));
     Asm->emitULEB128(M.getMacinfoType());
diff --git a/llvm/test/DebugInfo/X86/debug-macro-v5.ll b/llvm/test/DebugInfo/X86/debug-macro-v5.ll
index 3307c10addc54..85df7769d71da 100644
--- a/llvm/test/DebugInfo/X86/debug-macro-v5.ll
+++ b/llvm/test/DebugInfo/X86/debug-macro-v5.ll
@@ -11,12 +11,12 @@
 ; CHECK-NEXT: macro header: version = 0x0005, flags = 0x02, debug_line_offset = 0x0000
 ; CHECK-NEXT: DW_MACRO_start_file - lineno: 0 filenum: 0
 ; CHECK-NEXT:   DW_MACRO_start_file - lineno: 1 filenum: 1
-; CHECK-NEXT:     DW_MACRO_define_strp - lineno: 1 macro: FOO 5
+; CHECK-NEXT:     DW_MACRO_define_strx - lineno: 1 macro: FOO 5
 ; CHECK-NEXT:   DW_MACRO_end_file
 ; CHECK-NEXT:   DW_MACRO_start_file - lineno: 2 filenum: 2
-; CHECK-NEXT:     DW_MACRO_undef_strp - lineno: 14 macro: YEA
+; CHECK-NEXT:     DW_MACRO_undef_strx - lineno: 14 macro: YEA
 ; CHECK-NEXT:   DW_MACRO_end_file
-; CHECK-NEXT:   DW_MACRO_undef_strp - lineno: 14 macro: YEA
+; CHECK-NEXT:   DW_MACRO_undef_strx - lineno: 14 macro: YEA
 ; CHECK-NEXT: DW_MACRO_end_file
 
 ; ModuleID = 'test.c'

From 9534192c3bfd861f8082843c57dfee0a7881d266 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Fri, 29 May 2020 13:09:55 -0400
Subject: [PATCH 567/770] [mlir][Linalg] Make contraction vectorization use
 vector transfers

This revision replaces the load + vector.type_cast by appropriate vector transfer
operations. These play more nicely with other vector abstractions and canonicalization
patterns and lower to load/store with or without masks when appropriate.

Differential Revision: https://reviews.llvm.org/D80809
---
 .../Linalg/Transforms/Vectorization.cpp       | 26 +++++++++++++++----
 .../Dialect/Linalg/transform-patterns.mlir    | 11 +++-----
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 8fa0aa35a8746..763961311d0b7 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -120,14 +120,30 @@ void mlir::linalg::vectorizeLinalgOp(OpBuilder &builder, Operation *op) {
   // Vectorize other ops as vector contraction (currently only matmul).
   LLVM_DEBUG(dbgs() << dbgPref
                     << "Rewrite linalg op as vector.contract: " << *op);
+  auto extractVectorTypeFromScalarView = [](Value v) {
+    MemRefType mt = v.getType().cast<MemRefType>();
+    return VectorType::get(mt.getShape(), mt.getElementType());
+  };
   auto linalgOp = cast<linalg::LinalgOp>(op);
-  Value a = std_load(vector_type_cast(linalgOp.getInput(0)));
-  Value b = std_load(vector_type_cast(linalgOp.getInput(1)));
-  Value memref = vector_type_cast(linalgOp.getOutputBuffer(0));
-  Value c = std_load(memref);
+  Value viewA = linalgOp.getInput(0);
+  Value viewB = linalgOp.getInput(1);
+  Value viewC = linalgOp.getOutputBuffer(0);
+  Value zero = std_constant_index(0);
+  SmallVector<Value, 4> indicesA(linalgOp.getInputShapedType(0).getRank(),
+                                 zero);
+  SmallVector<Value, 4> indicesB(linalgOp.getInputShapedType(1).getRank(),
+                                 zero);
+  SmallVector<Value, 4> indicesC(linalgOp.getOutputShapedType(0).getRank(),
+                                 zero);
+  Value a = vector_transfer_read(extractVectorTypeFromScalarView(viewA), viewA,
+                                 indicesA);
+  Value b = vector_transfer_read(extractVectorTypeFromScalarView(viewB), viewB,
+                                 indicesB);
+  Value c = vector_transfer_read(extractVectorTypeFromScalarView(viewC), viewC,
+                                 indicesC);
   Value res = vector_contract(a, b, c, linalgOp.indexing_maps(),
                               linalgOp.iterator_types());
-  std_store(res, memref);
+  vector_transfer_write(res, viewC, indicesC);
 }
 
 /// Check whether there is any interleaved use of any `values` between `firstOp`
diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir
index 4c46c74fe4909..41fa3fd95d93c 100644
--- a/mlir/test/Dialect/Linalg/transform-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir
@@ -106,14 +106,11 @@ func @vectorization_test(%A: memref<8x16xf32>, %B: memref<16x32xf32>,
   return
 }
 // CHECK-LABEL: func @vectorization_test
-//       CHECK: vector.type_cast %{{.*}} : memref<8x16xf32> to memref<vector<8x16xf32>>
-//       CHECK: load %{{.*}}[] : memref<vector<8x16xf32>>
-//       CHECK: vector.type_cast %{{.*}} : memref<16x32xf32> to memref<vector<16x32xf32>>
-//       CHECK: load %{{.*}}[] : memref<vector<16x32xf32>>
-//       CHECK: vector.type_cast %{{.*}} : memref<8x32xf32> to memref<vector<8x32xf32>>
-//       CHECK: load %{{.*}}[] : memref<vector<8x32xf32>>
+//       CHECK: vector.transfer_read %{{.*}} : memref<8x16xf32>, vector<8x16xf32>
+//       CHECK: vector.transfer_read %{{.*}} : memref<16x32xf32>, vector<16x32xf32>
+//       CHECK: vector.transfer_read %{{.*}} : memref<8x32xf32>, vector<8x32xf32>
 //       CHECK: vector.contract {indexing_maps = [#[[mk]], #[[kn]], #[[mn]]], iterator_types = ["parallel", "parallel", "reduction"]} %{{.*}}, %{{.*}}, %{{.*}} : vector<8x16xf32>, vector<16x32xf32> into vector<8x32xf32>
-//       CHECK: store %{{.*}}, %{{.*}}[] : memref<vector<8x32xf32>>
+//       CHECK: vector.transfer_write %{{.*}}, %{{.*}} : vector<8x32xf32>, memref<8x32xf32>
 
 func @vectorization_test_2(%A: memref<8x16xf32>, %B: memref<16x32xf32>,
                          %C: memref<8x32xf32>) {

From a5202949134c3b8da108e04043a5c4350309ad1b Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Fri, 29 May 2020 12:08:47 -0700
Subject: [PATCH 568/770] [AMDGPU] Regenerated urem/udiv global isel tests. NFC.

---
 llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 4 ++--
 llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 656aa864d4c7a..fa3ffea3bd599 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -775,7 +775,7 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; CGP-NEXT:  BB2_2: ; %Flow1
+; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB2_4
@@ -2896,7 +2896,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; CGP-NEXT:  BB8_2: ; %Flow1
+; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB8_4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 928c592399574..da67412c057c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -770,7 +770,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc
-; CGP-NEXT:  BB2_2: ; %Flow1
+; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[8:9], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; CGP-NEXT:    s_cbranch_execz BB2_4
@@ -2866,7 +2866,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v7, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
-; CGP-NEXT:  BB8_2: ; %Flow1
+; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[8:9], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; CGP-NEXT:    s_cbranch_execz BB8_4

From f881c7967dbeaa5a5f2d80b4216efa072afbf6cb Mon Sep 17 00:00:00 2001
From: Ehud Katz <ehudkatz@gmail.com>
Date: Fri, 29 May 2020 22:15:26 +0300
Subject: [PATCH 569/770] [tests] Fix AMDGPU test

Fix naming issue in test due to change D80399.
---
 .../AMDGPU/multi-divergent-exit-region.ll     | 162 +++++++++---------
 1 file changed, 81 insertions(+), 81 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index d473146d1cdda..144b3f2599bf0 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -9,18 +9,18 @@
 ; StructurizeCFG.
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_ret(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)
-; IR: %2 = extractvalue { i1, i64 } %1, 0
-; IR: %3 = extractvalue { i1, i64 } %1, 1
-; IR: br i1 %2, label %LeafBlock1, label %Flow
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
+; IR: %1 = extractvalue { i1, i64 } %0, 0
+; IR: %2 = extractvalue { i1, i64 } %0, 1
+; IR: br i1 %1, label %LeafBlock1, label %Flow
 
 ; IR: Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)
-; IR: %7 = extractvalue { i1, i64 } %6, 0
-; IR: %8 = extractvalue { i1, i64 } %6, 1
-; IR: br i1 %7, label %LeafBlock, label %Flow1
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
+; IR: %6 = extractvalue { i1, i64 } %5, 0
+; IR: %7 = extractvalue { i1, i64 } %5, 1
+; IR: br i1 %6, label %LeafBlock, label %Flow1
 
 ; IR: LeafBlock:
 ; IR: br label %Flow1
@@ -29,32 +29,32 @@
 ; IR: br label %Flow{{$}}
 
 ; IR:  Flow2:
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
-; IR: %13 = extractvalue { i1, i64 } %12, 0
-; IR: %14 = extractvalue { i1, i64 } %12, 1
-; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: %10 = extractvalue { i1, i64 } %9, 0
+; IR: %11 = extractvalue { i1, i64 } %9, 1
+; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 ; IR: br label %UnifiedReturnBlock
 
 ; IR: Flow1:
-; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
-; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %8)
-; IR: %17 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %16)
-; IR: %18 = extractvalue { i1, i64 } %17, 0
-; IR: %19 = extractvalue { i1, i64 } %17, 1
-; IR: br i1 %18, label %exit1, label %Flow2
+; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
+; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
+; IR: %15 = extractvalue { i1, i64 } %14, 0
+; IR: %16 = extractvalue { i1, i64 } %14, 1
+; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
 ; IR: store volatile i32 17, i32 addrspace(3)* undef
 ; IR:  br label %Flow2
 
 ; IR: UnifiedReturnBlock:
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %14)
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR: ret void
 
 
@@ -141,14 +141,14 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
 
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
 
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
-; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock
 
 
 ; IR: UnifiedUnreachableBlock:
@@ -201,46 +201,46 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: br i1
 
 ; IR: {{^}}Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)
-; IR: br i1 %7, label %LeafBlock, label %Flow1
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %uniform.cond0.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
+; IR: br i1 %6, label %LeafBlock, label %Flow1
 
 ; IR: {{^}}LeafBlock:
 ; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
-; IR: %9 = xor i1 %divergent.cond1, true
+; IR: %divergent.cond1.inv = xor i1 %divergent.cond1, true
 ; IR: br label %Flow1
 
 ; IR: LeafBlock1:
 ; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
-; IR: %10 = xor i1 %uniform.cond0, true
+; IR: %uniform.cond0.inv = xor i1 %uniform.cond0, true
 ; IR: br label %Flow
 
 ; IR: Flow2:
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
-; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 ; IR: br label %UnifiedReturnBlock
 
 ; IR: {{^}}Flow1:
-; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
-; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %8)
-; IR: %17 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %16)
-; IR: %18 = extractvalue { i1, i64 } %17, 0
-; IR: %19 = extractvalue { i1, i64 } %17, 1
-; IR: br i1 %18, label %exit1, label %Flow2
+; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ]
+; IR: %13 = phi i1 [ %divergent.cond1.inv, %LeafBlock ], [ %4, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
+; IR: %15 = extractvalue { i1, i64 } %14, 0
+; IR: %16 = extractvalue { i1, i64 } %14, 1
+; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
 ; IR: store volatile i32 17, i32 addrspace(3)* undef
 ; IR: br label %Flow2
 
 ; IR: UnifiedReturnBlock:
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %14)
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR: ret void
 define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 entry:
@@ -279,17 +279,17 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)
-; IR: br i1 %2, label %LeafBlock1, label %Flow
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
+; IR: br i1 %1, label %LeafBlock1, label %Flow
 
 ; IR: Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
 
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
 
 define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 entry:
@@ -329,12 +329,12 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
 ; IR: Flow2:
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
 
 ; IR: UnifiedReturnBlock:
 ; IR: %UnifiedRetVal = phi float [ 2.000000e+00, %Flow2 ], [ 1.000000e+00, %exit0 ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %14)
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR: ret float %UnifiedRetVal
 define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
 entry:
@@ -402,31 +402,31 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
 
 ; IR: Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
 
 ; IR: Flow2:
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
-; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
 ; IR-NEXT: br label %UnifiedReturnBlock
 
 ; IR: Flow1:
-; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
-; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf.i64(i64 %8)
-; IR: %17 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %16)
-; IR: %18 = extractvalue { i1, i64 } %17, 0
-; IR: %19 = extractvalue { i1, i64 } %17, 1
-; IR: br i1 %18, label %exit1, label %Flow2
+; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
+; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
+; IR: %15 = extractvalue { i1, i64 } %14, 0
+; IR: %16 = extractvalue { i1, i64 } %14, 1
+; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
 ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
@@ -434,7 +434,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR-NEXT: br label %Flow2
 
 ; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %14)
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR-NEXT: ret void
 define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 entry:
@@ -490,7 +490,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR-NEXT: br label %Flow2
 
 ; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
-; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %14)
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR-NEXT: ret void
 define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 entry:
@@ -637,15 +637,15 @@ uniform.ret:
 
 ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
 ; IR: Flow1:                                            ; preds = %uniform.ret1, %uniform.multi.exit.region
-; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
-; IR: br i1 %8, label %uniform.if, label %Flow2
+; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %6, label %uniform.if, label %Flow2
 
 ; IR: Flow:                                             ; preds = %uniform.then, %uniform.if
-; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
-; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
+; IR: %7 = phi i1 [ %uniform.cond2.inv, %uniform.then ], [ %uniform.cond1.inv, %uniform.if ]
+; IR: br i1 %7, label %uniform.endif, label %uniform.ret0
 
 ; IR: UnifiedReturnBlock:                               ; preds = %Flow3, %Flow2
-; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %6)
+; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %5)
 ; IR-NEXT: ret void
 define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
 entry:

From 2d2627d47a1fc1f966d058aadc18099038be5af0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 29 May 2020 14:58:20 -0400
Subject: [PATCH 570/770] AMDGPU: Remove fp-exceptions feature

This was never used, and the only thing it changed was removed in
284472be6da3353d81dfd25b1ac4218e852d1e5f. The floating point mode is
also not a property of the subtarget.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td                      | 6 ------
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp            | 1 -
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h              | 5 -----
 llvm/test/CodeGen/AMDGPU/clamp.ll                     | 6 +++---
 llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll | 9 ++++-----
 5 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 9f38e92c434d5..2dad5176e911a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -500,12 +500,6 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
 
-def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
-  "FPExceptions",
-  "true",
-  "Enable floating point exceptions"
->;
-
 class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
   "max-private-element-size-"#size,
   "MaxPrivateElementSize",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 58fee94f5c6af..92564d1c53abe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -153,7 +153,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
   TargetTriple(TT),
   Has16BitInsts(false),
   HasMadMixInsts(false),
-  FPExceptions(false),
   HasSDWA(false),
   HasVOP3PInsts(false),
   HasMulI24(true),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index cac7c56360d00..189f18b960625 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -66,7 +66,6 @@ class AMDGPUSubtarget {
 protected:
   bool Has16BitInsts;
   bool HasMadMixInsts;
-  bool FPExceptions;
   bool HasSDWA;
   bool HasVOP3PInsts;
   bool HasMulI24;
@@ -148,10 +147,6 @@ class AMDGPUSubtarget {
     return HasMadMixInsts;
   }
 
-  bool hasFPExceptions() const {
-    return FPExceptions;
-  }
-
   bool hasSDWA() const {
     return HasSDWA;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 67ff2d0452ad6..8bfdbff5c4083 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -772,6 +772,6 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp-exceptions" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
-attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "target-features"="+fp-exceptions" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
-attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="+fp-exceptions" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
+attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
+attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index b3fae8b541120..f7f075b095317 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -1,8 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
-; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp-exceptions -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
 
 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}

From e6a404fbe7270d22541b45d48ca29fea5c3510aa Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 29 May 2020 10:26:38 -0700
Subject: [PATCH 571/770] [lldb/CMake] Set both the BUILD and INSTALL RPATH on
 macOS (2/2)

This is also needed for lldb-test.
---
 lldb/tools/lldb-test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/tools/lldb-test/CMakeLists.txt b/lldb/tools/lldb-test/CMakeLists.txt
index 60b4a7ca8f70a..2edbd8e56d6ed 100644
--- a/lldb/tools/lldb-test/CMakeLists.txt
+++ b/lldb/tools/lldb-test/CMakeLists.txt
@@ -26,6 +26,7 @@ add_lldb_tool(lldb-test
 
 if(PYTHON_RPATH)
   set_property(TARGET lldb-test APPEND PROPERTY INSTALL_RPATH "${PYTHON_RPATH}")
+  set_property(TARGET lldb-test APPEND PROPERTY BUILD_RPATH   "${PYTHON_RPATH}")
 endif()
 
 target_include_directories(lldb-test PRIVATE ${LLDB_SOURCE_DIR}/source)

From 6f56a586c505bd587106a9b94388d70efb88155b Mon Sep 17 00:00:00 2001
From: paul_hoad <paul_hoad@amat.com>
Date: Fri, 29 May 2020 20:25:12 +0100
Subject: [PATCH 572/770] [clang-format] Create a python documentation tool to
 generate a summary of the clang-format status for the whole of the LLVM
 project

Summary:
Any change to clang-format is tested with the unit tests. However, sometimes the better approach is to run it over a very large, fully formatted source tree and then inspect the differences. This seems to be a source of many of the regressions found by @krasimir, and by @sylvestre.ledru and @Abpostelnicu, who run it over the Mozilla sources, but often these regressions are only found after changes have been committed.

LLVM itself would be a good dog-fooding candidate for similar tests, except that such a large proportion of the tree is not 100% clang-formatted; as such, you are never aware whether a difference comes from a change to clang-format or just because the tree has not been formatted first.

The following review is for a small python tool which scans the whole of the LLVM source tree and counts the number of files which have one or more clang-format violations.

This revision contains the tool and the output from the initial run of the tool and the generated documentation which looks like the following

Reviewers: krasimir, JakeMerdichAMD, sammccall, curdeius, bollu, alexshap, jdoerfert, DavidTruby, sscalpone

Reviewed By: curdeius

Subscribers: dschuff, aheejin, fedor.sergeev, ilya-biryukov, simoncook, cryptoad, arphaman, jfb, kadircet, mstorsjo, s.egerton, usaxena95, aartbik, phosek, sstefan1, cfe-commits, sylvestre.ledru, Abpostelnicu, krasimir

Tags: #clang, #clang-format

Differential Revision: https://reviews.llvm.org/D80627
---
 clang/docs/ClangFormat.rst                   |    5 +
 clang/docs/ClangFormattedStatus.rst          | 6406 ++++++++++++++++++
 clang/docs/index.rst                         |    1 +
 clang/docs/tools/generate_formatted_state.py |  146 +
 4 files changed, 6558 insertions(+)
 create mode 100644 clang/docs/ClangFormattedStatus.rst
 create mode 100755 clang/docs/tools/generate_formatted_state.py

diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst
index 51ecabe927b51..cc1ef38dbb9df 100644
--- a/clang/docs/ClangFormat.rst
+++ b/clang/docs/ClangFormat.rst
@@ -240,3 +240,8 @@ In an SVN client, you can do:
 
 The option `-U0` will create a diff without context lines (the script would format
 those as well).
+
+Current State of Clang Format for LLVM
+======================================
+
+The following table :doc:`ClangFormattedStatus` shows the current status of clang-formatting for the entire LLVM source tree.
diff --git a/clang/docs/ClangFormattedStatus.rst b/clang/docs/ClangFormattedStatus.rst
new file mode 100644
index 0000000000000..458bce762e799
--- /dev/null
+++ b/clang/docs/ClangFormattedStatus.rst
@@ -0,0 +1,6406 @@
+.. raw:: html
+
+      <style type="text/css">
+        .none { background-color: #FFCC99 }
+        .part { background-color: #FFFF99 }
+        .good { background-color: #2CCCFF }
+        .total { font-weight: bold; }
+      </style>
+
+.. role:: none
+.. role:: part
+.. role:: good
+.. role:: total
+
+======================
+Clang Formatted Status
+======================
+
+:doc:`ClangFormattedStatus` describes the state of LLVM source
+tree in terms of conformance to :doc:`ClangFormat` as of: May 29, 2020 17:04:26 (`dac21fd29cd <https://github.com/llvm/llvm-project/commit/dac21fd29cd>`_).
+
+
+.. list-table:: LLVM Clang-Format Status
+   :widths: 50 25 25 25 25
+   :header-rows: 1
+
+   * - Directory
+     - Total Files
+     - Formatted Files
+     - Unformatted Files
+     - % Complete
+   * - clang/bindings/python/tests/cindex/INPUTS
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - clang/docs/analyzer/checkers
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang/examples/AnnotateFunctions
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/examples/Attribute
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/examples/clang-interpreter
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/examples/PrintFunctionNames
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/include/clang/Analysis
+     - `14`
+     - `4`
+     - `10`
+     - :part:`28%`
+   * - clang/include/clang/Analysis/Analyses
+     - `14`
+     - `2`
+     - `12`
+     - :part:`14%`
+   * - clang/include/clang/Analysis/DomainSpecific
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang/include/clang/Analysis/FlowSensitive
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - clang/include/clang/Analysis/Support
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/include/clang/ARCMigrate
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - clang/include/clang/AST
+     - `113`
+     - `21`
+     - `92`
+     - :part:`18%`
+   * - clang/include/clang/ASTMatchers
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - clang/include/clang/ASTMatchers/Dynamic
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - clang/include/clang/Basic
+     - `76`
+     - `24`
+     - `52`
+     - :part:`31%`
+   * - clang/include/clang/CodeGen
+     - `9`
+     - `0`
+     - `9`
+     - :none:`0%`
+   * - clang/include/clang/CrossTU
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - clang/include/clang/DirectoryWatcher
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/include/clang/Driver
+     - `17`
+     - `4`
+     - `13`
+     - :part:`23%`
+   * - clang/include/clang/Edit
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - clang/include/clang/Format
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/include/clang/Frontend
+     - `28`
+     - `7`
+     - `21`
+     - :part:`25%`
+   * - clang/include/clang/FrontendTool
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/include/clang/Index
+     - `7`
+     - `2`
+     - `5`
+     - :part:`28%`
+   * - clang/include/clang/Lex
+     - `29`
+     - `4`
+     - `25`
+     - :part:`13%`
+   * - clang/include/clang/Parse
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - clang/include/clang/Rewrite/Core
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - clang/include/clang/Rewrite/Frontend
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - clang/include/clang/Sema
+     - `32`
+     - `3`
+     - `29`
+     - :part:`9%`
+   * - clang/include/clang/Serialization
+     - `14`
+     - `2`
+     - `12`
+     - :part:`14%`
+   * - clang/include/clang/StaticAnalyzer/Checkers
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - clang/include/clang/StaticAnalyzer/Core
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - clang/include/clang/StaticAnalyzer/Core/BugReporter
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - clang/include/clang/StaticAnalyzer/Core/PathSensitive
+     - `36`
+     - `9`
+     - `27`
+     - :part:`25%`
+   * - clang/include/clang/StaticAnalyzer/Frontend
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - clang/include/clang/Tooling
+     - `16`
+     - `9`
+     - `7`
+     - :part:`56%`
+   * - clang/include/clang/Tooling/ASTDiff
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - clang/include/clang/Tooling/Core
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - clang/include/clang/Tooling/DependencyScanning
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - clang/include/clang/Tooling/Inclusions
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang/include/clang/Tooling/Refactoring
+     - `14`
+     - `12`
+     - `2`
+     - :part:`85%`
+   * - clang/include/clang/Tooling/Refactoring/Extract
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - clang/include/clang/Tooling/Refactoring/Rename
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - clang/include/clang/Tooling/Syntax
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - clang/include/clang/Tooling/Transformer
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - clang/include/clang-c
+     - `9`
+     - `3`
+     - `6`
+     - :part:`33%`
+   * - clang/INPUTS
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang/lib/Analysis
+     - `25`
+     - `2`
+     - `23`
+     - :part:`8%`
+   * - clang/lib/Analysis/plugins/CheckerDependencyHandling
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/Analysis/plugins/CheckerOptionHandling
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/Analysis/plugins/SampleAnalyzer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/ARCMigrate
+     - `22`
+     - `0`
+     - `22`
+     - :none:`0%`
+   * - clang/lib/AST
+     - `80`
+     - `2`
+     - `78`
+     - :part:`2%`
+   * - clang/lib/AST/Interp
+     - `44`
+     - `19`
+     - `25`
+     - :part:`43%`
+   * - clang/lib/ASTMatchers
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - clang/lib/ASTMatchers/Dynamic
+     - `6`
+     - `1`
+     - `5`
+     - :part:`16%`
+   * - clang/lib/Basic
+     - `32`
+     - `6`
+     - `26`
+     - :part:`18%`
+   * - clang/lib/Basic/Targets
+     - `46`
+     - `21`
+     - `25`
+     - :part:`45%`
+   * - clang/lib/CodeGen
+     - `87`
+     - `9`
+     - `78`
+     - :part:`10%`
+   * - clang/lib/CrossTU
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/DirectoryWatcher
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/DirectoryWatcher/default
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/DirectoryWatcher/linux
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/DirectoryWatcher/mac
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/DirectoryWatcher/windows
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/Driver
+     - `16`
+     - `2`
+     - `14`
+     - :part:`12%`
+   * - clang/lib/Driver/ToolChains
+     - `80`
+     - `24`
+     - `56`
+     - :part:`30%`
+   * - clang/lib/Driver/ToolChains/Arch
+     - `16`
+     - `2`
+     - `14`
+     - :part:`12%`
+   * - clang/lib/Edit
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - clang/lib/Format
+     - `29`
+     - `29`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/Frontend
+     - `33`
+     - `4`
+     - `29`
+     - :part:`12%`
+   * - clang/lib/Frontend/Rewrite
+     - `8`
+     - `0`
+     - `8`
+     - :none:`0%`
+   * - clang/lib/FrontendTool
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/Headers
+     - `126`
+     - `10`
+     - `116`
+     - :part:`7%`
+   * - clang/lib/Headers/openmp_wrappers
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/Headers/ppc_wrappers
+     - `7`
+     - `2`
+     - `5`
+     - :part:`28%`
+   * - clang/lib/Index
+     - `12`
+     - `2`
+     - `10`
+     - :part:`16%`
+   * - clang/lib/Lex
+     - `23`
+     - `1`
+     - `22`
+     - :part:`4%`
+   * - clang/lib/Parse
+     - `15`
+     - `0`
+     - `15`
+     - :none:`0%`
+   * - clang/lib/Rewrite
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - clang/lib/Sema
+     - `54`
+     - `3`
+     - `51`
+     - :part:`5%`
+   * - clang/lib/Serialization
+     - `17`
+     - `1`
+     - `16`
+     - :part:`5%`
+   * - clang/lib/StaticAnalyzer/Checkers
+     - `115`
+     - `13`
+     - `102`
+     - :part:`11%`
+   * - clang/lib/StaticAnalyzer/Checkers/cert
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/StaticAnalyzer/Checkers/MPI-Checker
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - clang/lib/StaticAnalyzer/Checkers/RetainCountChecker
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - clang/lib/StaticAnalyzer/Checkers/UninitializedObject
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - clang/lib/StaticAnalyzer/Checkers/WebKit
+     - `6`
+     - `6`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/StaticAnalyzer/Core
+     - `46`
+     - `8`
+     - `38`
+     - :part:`17%`
+   * - clang/lib/StaticAnalyzer/Frontend
+     - `8`
+     - `3`
+     - `5`
+     - :part:`37%`
+   * - clang/lib/Tooling
+     - `15`
+     - `6`
+     - `9`
+     - :part:`40%`
+   * - clang/lib/Tooling/ASTDiff
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/lib/Tooling/Core
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - clang/lib/Tooling/DependencyScanning
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - clang/lib/Tooling/Inclusions
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - clang/lib/Tooling/Refactoring
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - clang/lib/Tooling/Refactoring/Extract
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - clang/lib/Tooling/Refactoring/Rename
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - clang/lib/Tooling/Syntax
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - clang/lib/Tooling/Transformer
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - clang/tools/arcmt-test
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/c-index-test
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-check
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-diff
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-extdef-mapping
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-format
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/tools/clang-format/fuzzer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-fuzzer
+     - `6`
+     - `4`
+     - `2`
+     - :part:`66%`
+   * - clang/tools/clang-fuzzer/fuzzer-initialize
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang/tools/clang-fuzzer/handle-cxx
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang/tools/clang-fuzzer/handle-llvm
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - clang/tools/clang-fuzzer/proto-to-cxx
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - clang/tools/clang-fuzzer/proto-to-llvm
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - clang/tools/clang-import-test
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-offload-bundler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-offload-wrapper
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/tools/clang-refactor
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - clang/tools/clang-rename
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/tools/clang-scan-deps
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/tools/clang-shlib
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/tools/diagtool
+     - `9`
+     - `0`
+     - `9`
+     - :none:`0%`
+   * - clang/tools/driver
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - clang/tools/libclang
+     - `34`
+     - `6`
+     - `28`
+     - :part:`17%`
+   * - clang/tools/scan-build-py/tests/functional/src/include
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/unittests/Analysis
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - clang/unittests/AST
+     - `29`
+     - `7`
+     - `22`
+     - :part:`24%`
+   * - clang/unittests/ASTMatchers
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - clang/unittests/ASTMatchers/Dynamic
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - clang/unittests/Basic
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - clang/unittests/CodeGen
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - clang/unittests/CrossTU
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/unittests/DirectoryWatcher
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/unittests/Driver
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - clang/unittests/Format
+     - `18`
+     - `18`
+     - `0`
+     - :good:`100%`
+   * - clang/unittests/Frontend
+     - `8`
+     - `4`
+     - `4`
+     - :part:`50%`
+   * - clang/unittests/Index
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/unittests/Lex
+     - `6`
+     - `1`
+     - `5`
+     - :part:`16%`
+   * - clang/unittests/libclang
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang/unittests/libclang/CrashTests
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/unittests/Rename
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - clang/unittests/Rewrite
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - clang/unittests/Sema
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - clang/unittests/Serialization
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang/unittests/StaticAnalyzer
+     - `9`
+     - `4`
+     - `5`
+     - :part:`44%`
+   * - clang/unittests/Tooling
+     - `29`
+     - `7`
+     - `22`
+     - :part:`24%`
+   * - clang/unittests/Tooling/RecursiveASTVisitorTests
+     - `23`
+     - `9`
+     - `14`
+     - :part:`39%`
+   * - clang/unittests/Tooling/Syntax
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - clang/utils/perf-training/cxx
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang/utils/TableGen
+     - `20`
+     - `2`
+     - `18`
+     - :part:`10%`
+   * - clang-tools-extra/clang-apply-replacements/include/clang-apply-replacements/Tooling
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-apply-replacements/lib/Tooling
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-apply-replacements/tool
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-change-namespace
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang-tools-extra/clang-change-namespace/tool
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang-tools-extra/clang-doc
+     - `17`
+     - `16`
+     - `1`
+     - :part:`94%`
+   * - clang-tools-extra/clang-doc/tool
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-include-fixer
+     - `13`
+     - `7`
+     - `6`
+     - :part:`53%`
+   * - clang-tools-extra/clang-include-fixer/find-all-symbols
+     - `17`
+     - `13`
+     - `4`
+     - :part:`76%`
+   * - clang-tools-extra/clang-include-fixer/find-all-symbols/tool
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang-tools-extra/clang-include-fixer/plugin
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-include-fixer/tool
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang-tools-extra/clang-move
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - clang-tools-extra/clang-move/tool
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-query
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - clang-tools-extra/clang-query/tool
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-reorder-fields
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - clang-tools-extra/clang-reorder-fields/tool
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang-tools-extra/clang-tidy
+     - `18`
+     - `12`
+     - `6`
+     - :part:`66%`
+   * - clang-tools-extra/clang-tidy/abseil
+     - `40`
+     - `28`
+     - `12`
+     - :part:`70%`
+   * - clang-tools-extra/clang-tidy/android
+     - `33`
+     - `23`
+     - `10`
+     - :part:`69%`
+   * - clang-tools-extra/clang-tidy/boost
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-tidy/bugprone
+     - `105`
+     - `84`
+     - `21`
+     - :part:`80%`
+   * - clang-tools-extra/clang-tidy/cert
+     - `29`
+     - `27`
+     - `2`
+     - :part:`93%`
+   * - clang-tools-extra/clang-tidy/cppcoreguidelines
+     - `41`
+     - `38`
+     - `3`
+     - :part:`92%`
+   * - clang-tools-extra/clang-tidy/darwin
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - clang-tools-extra/clang-tidy/fuchsia
+     - `15`
+     - `9`
+     - `6`
+     - :part:`60%`
+   * - clang-tools-extra/clang-tidy/google
+     - `35`
+     - `23`
+     - `12`
+     - :part:`65%`
+   * - clang-tools-extra/clang-tidy/hicpp
+     - `9`
+     - `6`
+     - `3`
+     - :part:`66%`
+   * - clang-tools-extra/clang-tidy/linuxkernel
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - clang-tools-extra/clang-tidy/llvm
+     - `11`
+     - `10`
+     - `1`
+     - :part:`90%`
+   * - clang-tools-extra/clang-tidy/llvmlibc
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-tidy/misc
+     - `29`
+     - `25`
+     - `4`
+     - :part:`86%`
+   * - clang-tools-extra/clang-tidy/modernize
+     - `65`
+     - `43`
+     - `22`
+     - :part:`66%`
+   * - clang-tools-extra/clang-tidy/mpi
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - clang-tools-extra/clang-tidy/objc
+     - `15`
+     - `10`
+     - `5`
+     - :part:`66%`
+   * - clang-tools-extra/clang-tidy/openmp
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-tidy/performance
+     - `29`
+     - `24`
+     - `5`
+     - :part:`82%`
+   * - clang-tools-extra/clang-tidy/plugin
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clang-tidy/portability
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - clang-tools-extra/clang-tidy/readability
+     - `75`
+     - `61`
+     - `14`
+     - :part:`81%`
+   * - clang-tools-extra/clang-tidy/tool
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - clang-tools-extra/clang-tidy/utils
+     - `33`
+     - `26`
+     - `7`
+     - :part:`78%`
+   * - clang-tools-extra/clang-tidy/zircon
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd
+     - `73`
+     - `57`
+     - `16`
+     - :part:`78%`
+   * - clang-tools-extra/clangd/benchmarks
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/fuzzer
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/index
+     - `37`
+     - `34`
+     - `3`
+     - :part:`91%`
+   * - clang-tools-extra/clangd/index/dex
+     - `9`
+     - `8`
+     - `1`
+     - :part:`88%`
+   * - clang-tools-extra/clangd/index/dex/dexp
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/index/remote
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/index/remote/marshalling
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/index/remote/server
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/index/remote/unimplemented
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/indexer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/refactor
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/refactor/tweaks
+     - `13`
+     - `10`
+     - `3`
+     - :part:`76%`
+   * - clang-tools-extra/clangd/support
+     - `18`
+     - `18`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/tool
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/unittests
+     - `63`
+     - `51`
+     - `12`
+     - :part:`80%`
+   * - clang-tools-extra/clangd/unittests/remote
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/unittests/support
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/unittests/xpc
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/xpc
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/xpc/framework
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/clangd/xpc/test-client
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/modularize
+     - `9`
+     - `1`
+     - `8`
+     - :part:`11%`
+   * - clang-tools-extra/pp-trace
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - clang-tools-extra/tool-template
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/unittests/clang-apply-replacements
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/unittests/clang-change-namespace
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang-tools-extra/unittests/clang-doc
+     - `9`
+     - `9`
+     - `0`
+     - :good:`100%`
+   * - clang-tools-extra/unittests/clang-include-fixer
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang-tools-extra/unittests/clang-include-fixer/find-all-symbols
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang-tools-extra/unittests/clang-move
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - clang-tools-extra/unittests/clang-query
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - clang-tools-extra/unittests/clang-tidy
+     - `14`
+     - `6`
+     - `8`
+     - :part:`42%`
+   * - clang-tools-extra/unittests/include/common
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/include/fuzzer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/include/sanitizer
+     - `14`
+     - `1`
+     - `13`
+     - :part:`7%`
+   * - compiler-rt/include/xray
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - compiler-rt/lib/asan
+     - `59`
+     - `3`
+     - `56`
+     - :part:`5%`
+   * - compiler-rt/lib/asan/tests
+     - `17`
+     - `1`
+     - `16`
+     - :part:`5%`
+   * - compiler-rt/lib/BlocksRuntime
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - compiler-rt/lib/builtins
+     - `11`
+     - `9`
+     - `2`
+     - :part:`81%`
+   * - compiler-rt/lib/builtins/arm
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/lib/builtins/ppc
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/cfi
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/lib/dfsan
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - compiler-rt/lib/fuzzer
+     - `43`
+     - `5`
+     - `38`
+     - :part:`11%`
+   * - compiler-rt/lib/fuzzer/afl
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/lib/fuzzer/dataflow
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - compiler-rt/lib/fuzzer/tests
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - compiler-rt/lib/gwp_asan
+     - `14`
+     - `13`
+     - `1`
+     - :part:`92%`
+   * - compiler-rt/lib/gwp_asan/optional
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/gwp_asan/platform_specific
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/gwp_asan/tests
+     - `14`
+     - `14`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/gwp_asan/tests/optional
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/hwasan
+     - `25`
+     - `5`
+     - `20`
+     - :part:`20%`
+   * - compiler-rt/lib/interception
+     - `8`
+     - `1`
+     - `7`
+     - :part:`12%`
+   * - compiler-rt/lib/interception/tests
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - compiler-rt/lib/lsan
+     - `20`
+     - `7`
+     - `13`
+     - :part:`35%`
+   * - compiler-rt/lib/msan
+     - `18`
+     - `4`
+     - `14`
+     - :part:`22%`
+   * - compiler-rt/lib/msan/tests
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - compiler-rt/lib/profile
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - compiler-rt/lib/safestack
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - compiler-rt/lib/sanitizer_common
+     - `159`
+     - `23`
+     - `136`
+     - :part:`14%`
+   * - compiler-rt/lib/sanitizer_common/symbolizer
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - compiler-rt/lib/sanitizer_common/tests
+     - `37`
+     - `1`
+     - `36`
+     - :part:`2%`
+   * - compiler-rt/lib/scudo
+     - `20`
+     - `0`
+     - `20`
+     - :none:`0%`
+   * - compiler-rt/lib/scudo/standalone
+     - `46`
+     - `42`
+     - `4`
+     - :part:`91%`
+   * - compiler-rt/lib/scudo/standalone/benchmarks
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/scudo/standalone/fuzz
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/lib/scudo/standalone/include/scudo
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/scudo/standalone/tests
+     - `23`
+     - `23`
+     - `0`
+     - :good:`100%`
+   * - compiler-rt/lib/scudo/standalone/tools
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/lib/stats
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - compiler-rt/lib/tsan/benchmarks
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - compiler-rt/lib/tsan/dd
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - compiler-rt/lib/tsan/go
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/lib/tsan/rtl
+     - `62`
+     - `10`
+     - `52`
+     - :part:`16%`
+   * - compiler-rt/lib/tsan/tests/rtl
+     - `10`
+     - `1`
+     - `9`
+     - :part:`10%`
+   * - compiler-rt/lib/tsan/tests/unit
+     - `10`
+     - `0`
+     - `10`
+     - :none:`0%`
+   * - compiler-rt/lib/ubsan
+     - `27`
+     - `7`
+     - `20`
+     - :part:`25%`
+   * - compiler-rt/lib/ubsan_minimal
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - compiler-rt/lib/xray
+     - `39`
+     - `30`
+     - `9`
+     - :part:`76%`
+   * - compiler-rt/lib/xray/tests/unit
+     - `10`
+     - `8`
+     - `2`
+     - :part:`80%`
+   * - compiler-rt/tools/gwp_asan
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - debuginfo-tests/dexter/feature_tests/commands/penalty
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - debuginfo-tests/dexter/feature_tests/commands/perfect
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - debuginfo-tests/dexter/feature_tests/subtools
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - debuginfo-tests/dexter/feature_tests/subtools/clang-opt-bisect
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - debuginfo-tests/dexter-tests
+     - `8`
+     - `3`
+     - `5`
+     - :part:`37%`
+   * - debuginfo-tests/llgdb-tests
+     - `7`
+     - `0`
+     - `7`
+     - :none:`0%`
+   * - debuginfo-tests/llvm-prettyprinters/gdb
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Common
+     - `19`
+     - `19`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Decimal
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Evaluate
+     - `21`
+     - `21`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Lower
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Optimizer/CodeGen
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Optimizer/Dialect
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Optimizer/Support
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Parser
+     - `17`
+     - `17`
+     - `0`
+     - :good:`100%`
+   * - flang/include/flang/Semantics
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Common
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Decimal
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Evaluate
+     - `30`
+     - `30`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Lower
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Optimizer/Dialect
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Optimizer/Support
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Parser
+     - `34`
+     - `34`
+     - `0`
+     - :good:`100%`
+   * - flang/lib/Semantics
+     - `66`
+     - `65`
+     - `1`
+     - :part:`98%`
+   * - flang/module
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - flang/runtime
+     - `56`
+     - `56`
+     - `0`
+     - :good:`100%`
+   * - flang/tools/f18
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - flang/tools/f18-parse-demo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - flang/tools/tco
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - flang/unittests/Decimal
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - flang/unittests/Evaluate
+     - `15`
+     - `15`
+     - `0`
+     - :good:`100%`
+   * - flang/unittests/Runtime
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - libc/AOR_v20.02/math
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - libc/AOR_v20.02/math/include
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libc/AOR_v20.02/networking
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libc/AOR_v20.02/networking/include
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libc/AOR_v20.02/string
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libc/AOR_v20.02/string/include
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libc/fuzzing/string
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libc/include
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - libc/loader/linux/x86_64
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/src/assert
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - libc/src/errno
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - libc/src/math
+     - `23`
+     - `21`
+     - `2`
+     - :part:`91%`
+   * - libc/src/signal
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - libc/src/signal/linux
+     - `10`
+     - `10`
+     - `0`
+     - :good:`100%`
+   * - libc/src/stdio
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - libc/src/stdlib
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - libc/src/stdlib/linux
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/src/string
+     - `11`
+     - `10`
+     - `1`
+     - :part:`90%`
+   * - libc/src/string/memory_utils
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - libc/src/string/x86
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/src/sys/mman
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - libc/src/sys/mman/linux
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - libc/src/threads
+     - `6`
+     - `6`
+     - `0`
+     - :good:`100%`
+   * - libc/src/threads/linux
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - libc/src/unistd
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/src/unistd/linux
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libc/utils/benchmarks
+     - `14`
+     - `14`
+     - `0`
+     - :good:`100%`
+   * - libc/utils/CPP
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - libc/utils/FPUtil
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - libc/utils/HdrGen
+     - `9`
+     - `9`
+     - `0`
+     - :good:`100%`
+   * - libc/utils/MPFRWrapper
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - libc/utils/testutils
+     - `6`
+     - `6`
+     - `0`
+     - :good:`100%`
+   * - libc/utils/UnitTest
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - libclc/generic/include/clc
+     - `6`
+     - `2`
+     - `4`
+     - :part:`33%`
+   * - libclc/generic/include/clc/async
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/atomic
+     - `11`
+     - `7`
+     - `4`
+     - :part:`63%`
+   * - libclc/generic/include/clc/cl_khr_global_int32_base_atomics
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - libclc/generic/include/clc/cl_khr_global_int32_extended_atomics
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/cl_khr_int64_base_atomics
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - libclc/generic/include/clc/cl_khr_int64_extended_atomics
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/cl_khr_local_int32_base_atomics
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - libclc/generic/include/clc/cl_khr_local_int32_extended_atomics
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/common
+     - `6`
+     - `6`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/explicit_fence
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/float
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libclc/generic/include/clc/geometric
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/image
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - libclc/generic/include/clc/integer
+     - `16`
+     - `13`
+     - `3`
+     - :part:`81%`
+   * - libclc/generic/include/clc/math
+     - `95`
+     - `92`
+     - `3`
+     - :part:`96%`
+   * - libclc/generic/include/clc/misc
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - libclc/generic/include/clc/relational
+     - `18`
+     - `12`
+     - `6`
+     - :part:`66%`
+   * - libclc/generic/include/clc/shared
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - libclc/generic/include/clc/synchronization
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/clc/workitem
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/integer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/include/math
+     - `15`
+     - `15`
+     - `0`
+     - :good:`100%`
+   * - libclc/generic/lib
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libclc/generic/lib/math
+     - `8`
+     - `1`
+     - `7`
+     - :part:`12%`
+   * - libclc/generic/lib/relational
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libclc/utils
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxx/benchmarks
+     - `16`
+     - `1`
+     - `15`
+     - :part:`6%`
+   * - libcxx/fuzzing
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - libcxx/include
+     - `21`
+     - `0`
+     - `21`
+     - :none:`0%`
+   * - libcxx/include/support/android
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxx/include/support/fuchsia
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - libcxx/include/support/ibm
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - libcxx/include/support/musl
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxx/include/support/newlib
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxx/include/support/solaris
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - libcxx/include/support/win32
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - libcxx/include/support/xlocale
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - libcxx/src
+     - `35`
+     - `1`
+     - `34`
+     - :part:`2%`
+   * - libcxx/src/experimental
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxx/src/filesystem
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - libcxx/src/include
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - libcxx/src/support/solaris
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxx/src/support/win32
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - libcxx/utils/google-benchmark/cmake
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - libcxx/utils/google-benchmark/include/benchmark
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxx/utils/google-benchmark/src
+     - `20`
+     - `16`
+     - `4`
+     - :part:`80%`
+   * - libcxxabi/fuzz
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libcxxabi/include
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - libcxxabi/src
+     - `26`
+     - `1`
+     - `25`
+     - :part:`3%`
+   * - libcxxabi/src/demangle
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - libcxxabi/src/include
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - libunwind/include
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - libunwind/include/mach-o
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - libunwind/src
+     - `9`
+     - `0`
+     - `9`
+     - :none:`0%`
+   * - lld/COFF
+     - `33`
+     - `10`
+     - `23`
+     - :part:`30%`
+   * - lld/Common
+     - `10`
+     - `9`
+     - `1`
+     - :part:`90%`
+   * - lld/ELF
+     - `48`
+     - `26`
+     - `22`
+     - :part:`54%`
+   * - lld/ELF/Arch
+     - `14`
+     - `7`
+     - `7`
+     - :part:`50%`
+   * - lld/include/lld/Common
+     - `12`
+     - `6`
+     - `6`
+     - :part:`50%`
+   * - lld/include/lld/Core
+     - `20`
+     - `4`
+     - `16`
+     - :part:`20%`
+   * - lld/include/lld/ReaderWriter
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lld/lib/Core
+     - `8`
+     - `2`
+     - `6`
+     - :part:`25%`
+   * - lld/lib/Driver
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lld/lib/ReaderWriter
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lld/lib/ReaderWriter/MachO
+     - `30`
+     - `1`
+     - `29`
+     - :part:`3%`
+   * - lld/lib/ReaderWriter/YAML
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lld/MachO
+     - `25`
+     - `25`
+     - `0`
+     - :good:`100%`
+   * - lld/MachO/Arch
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lld/MinGW
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lld/tools/lld
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lld/unittests/DriverTests
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lld/unittests/MachOTests
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - lld/wasm
+     - `27`
+     - `14`
+     - `13`
+     - :part:`51%`
+   * - lldb/examples/darwin/heap_find/heap
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/examples/functions
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/examples/interposing/darwin/fd_interposing
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/examples/lookup
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/examples/plugins/commands
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/examples/synthetic/bitfield
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/include/lldb
+     - `12`
+     - `7`
+     - `5`
+     - :part:`58%`
+   * - lldb/include/lldb/API
+     - `71`
+     - `59`
+     - `12`
+     - :part:`83%`
+   * - lldb/include/lldb/Breakpoint
+     - `24`
+     - `10`
+     - `14`
+     - :part:`41%`
+   * - lldb/include/lldb/Core
+     - `57`
+     - `31`
+     - `26`
+     - :part:`54%`
+   * - lldb/include/lldb/DataFormatters
+     - `18`
+     - `9`
+     - `9`
+     - :part:`50%`
+   * - lldb/include/lldb/Expression
+     - `17`
+     - `6`
+     - `11`
+     - :part:`35%`
+   * - lldb/include/lldb/Host
+     - `40`
+     - `20`
+     - `20`
+     - :part:`50%`
+   * - lldb/include/lldb/Host/android
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/include/lldb/Host/common
+     - `8`
+     - `2`
+     - `6`
+     - :part:`25%`
+   * - lldb/include/lldb/Host/freebsd
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/include/lldb/Host/linux
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - lldb/include/lldb/Host/macosx
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/include/lldb/Host/netbsd
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/include/lldb/Host/openbsd
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/include/lldb/Host/posix
+     - `9`
+     - `7`
+     - `2`
+     - :part:`77%`
+   * - lldb/include/lldb/Host/windows
+     - `11`
+     - `5`
+     - `6`
+     - :part:`45%`
+   * - lldb/include/lldb/Initialization
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - lldb/include/lldb/Interpreter
+     - `47`
+     - `38`
+     - `9`
+     - :part:`80%`
+   * - lldb/include/lldb/Symbol
+     - `36`
+     - `16`
+     - `20`
+     - :part:`44%`
+   * - lldb/include/lldb/Target
+     - `66`
+     - `37`
+     - `29`
+     - :part:`56%`
+   * - lldb/include/lldb/Utility
+     - `58`
+     - `36`
+     - `22`
+     - :part:`62%`
+   * - lldb/source
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/API
+     - `75`
+     - `8`
+     - `67`
+     - :part:`10%`
+   * - lldb/source/Breakpoint
+     - `24`
+     - `6`
+     - `18`
+     - :part:`25%`
+   * - lldb/source/Commands
+     - `56`
+     - `48`
+     - `8`
+     - :part:`85%`
+   * - lldb/source/Core
+     - `45`
+     - `24`
+     - `21`
+     - :part:`53%`
+   * - lldb/source/DataFormatters
+     - `16`
+     - `2`
+     - `14`
+     - :part:`12%`
+   * - lldb/source/Expression
+     - `13`
+     - `4`
+     - `9`
+     - :part:`30%`
+   * - lldb/source/Host/android
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Host/common
+     - `32`
+     - `17`
+     - `15`
+     - :part:`53%`
+   * - lldb/source/Host/freebsd
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Host/linux
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - lldb/source/Host/macosx/cfcpp
+     - `14`
+     - `12`
+     - `2`
+     - :part:`85%`
+   * - lldb/source/Host/netbsd
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Host/openbsd
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Host/posix
+     - `9`
+     - `5`
+     - `4`
+     - :part:`55%`
+   * - lldb/source/Host/windows
+     - `12`
+     - `5`
+     - `7`
+     - :part:`41%`
+   * - lldb/source/Initialization
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Interpreter
+     - `46`
+     - `25`
+     - `21`
+     - :part:`54%`
+   * - lldb/source/Plugins/ABI/AArch64
+     - `6`
+     - `2`
+     - `4`
+     - :part:`33%`
+   * - lldb/source/Plugins/ABI/ARC
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/ABI/ARM
+     - `6`
+     - `4`
+     - `2`
+     - :part:`66%`
+   * - lldb/source/Plugins/ABI/Hexagon
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/ABI/Mips
+     - `6`
+     - `2`
+     - `4`
+     - :part:`33%`
+   * - lldb/source/Plugins/ABI/PowerPC
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - lldb/source/Plugins/ABI/SystemZ
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/ABI/X86
+     - `11`
+     - `4`
+     - `7`
+     - :part:`36%`
+   * - lldb/source/Plugins/Architecture/Arm
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Architecture/Mips
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/Architecture/PPC64
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/Disassembler/LLVMC
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/DynamicLoader/Darwin-Kernel
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/DynamicLoader/Hexagon-DYLD
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/DynamicLoader/MacOSX-DYLD
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - lldb/source/Plugins/DynamicLoader/POSIX-DYLD
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - lldb/source/Plugins/DynamicLoader/Static
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/DynamicLoader/wasm-DYLD
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/DynamicLoader/Windows-DYLD
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/ExpressionParser/Clang
+     - `51`
+     - `26`
+     - `25`
+     - :part:`50%`
+   * - lldb/source/Plugins/Instruction/ARM
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - lldb/source/Plugins/Instruction/ARM64
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/Instruction/MIPS
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/Instruction/MIPS64
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Instruction/PPC64
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/InstrumentationRuntime/ASan
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/InstrumentationRuntime/TSan
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/InstrumentationRuntime/UBSan
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/JITLoader/GDB
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Language/ClangCommon
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/Language/CPlusPlus
+     - `29`
+     - `17`
+     - `12`
+     - :part:`58%`
+   * - lldb/source/Plugins/Language/ObjC
+     - `20`
+     - `13`
+     - `7`
+     - :part:`65%`
+   * - lldb/source/Plugins/Language/ObjCPlusPlus
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/LanguageRuntime/CPlusPlus
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/LanguageRuntime/ObjC
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime
+     - `16`
+     - `4`
+     - `12`
+     - :part:`25%`
+   * - lldb/source/Plugins/LanguageRuntime/RenderScript/RenderScriptRuntime
+     - `8`
+     - `3`
+     - `5`
+     - :part:`37%`
+   * - lldb/source/Plugins/MemoryHistory/asan
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/ObjectContainer/BSD-Archive
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/ObjectContainer/Universal-Mach-O
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/ObjectFile/Breakpad
+     - `4`
+     - `3`
+     - `1`
+     - :part:`75%`
+   * - lldb/source/Plugins/ObjectFile/ELF
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - lldb/source/Plugins/ObjectFile/JIT
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/ObjectFile/Mach-O
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/ObjectFile/PECOFF
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - lldb/source/Plugins/ObjectFile/wasm
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/OperatingSystem/Python
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/Platform/Android
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - lldb/source/Plugins/Platform/FreeBSD
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Platform/gdb-server
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/Platform/Linux
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Platform/MacOSX
+     - `24`
+     - `8`
+     - `16`
+     - :part:`33%`
+   * - lldb/source/Plugins/Platform/MacOSX/objcxx
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/Platform/NetBSD
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Platform/OpenBSD
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Platform/POSIX
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/Platform/Windows
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/Process/elf-core
+     - `20`
+     - `18`
+     - `2`
+     - :part:`90%`
+   * - lldb/source/Plugins/Process/FreeBSD
+     - `19`
+     - `11`
+     - `8`
+     - :part:`57%`
+   * - lldb/source/Plugins/Process/gdb-remote
+     - `26`
+     - `16`
+     - `10`
+     - :part:`61%`
+   * - lldb/source/Plugins/Process/Linux
+     - `24`
+     - `12`
+     - `12`
+     - :part:`50%`
+   * - lldb/source/Plugins/Process/mach-core
+     - `4`
+     - `3`
+     - `1`
+     - :part:`75%`
+   * - lldb/source/Plugins/Process/MacOSX-Kernel
+     - `16`
+     - `13`
+     - `3`
+     - :part:`81%`
+   * - lldb/source/Plugins/Process/minidump
+     - `17`
+     - `10`
+     - `7`
+     - :part:`58%`
+   * - lldb/source/Plugins/Process/NetBSD
+     - `8`
+     - `3`
+     - `5`
+     - :part:`37%`
+   * - lldb/source/Plugins/Process/POSIX
+     - `8`
+     - `5`
+     - `3`
+     - :part:`62%`
+   * - lldb/source/Plugins/Process/Utility
+     - `127`
+     - `87`
+     - `40`
+     - :part:`68%`
+   * - lldb/source/Plugins/Process/Windows/Common
+     - `34`
+     - `23`
+     - `11`
+     - :part:`67%`
+   * - lldb/source/Plugins/Process/Windows/Common/arm
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Process/Windows/Common/arm64
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/Process/Windows/Common/x64
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/Process/Windows/Common/x86
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/ScriptInterpreter/Lua
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/ScriptInterpreter/None
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/ScriptInterpreter/Python
+     - `8`
+     - `3`
+     - `5`
+     - :part:`37%`
+   * - lldb/source/Plugins/StructuredData/DarwinLog
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/SymbolFile/Breakpad
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/SymbolFile/DWARF
+     - `65`
+     - `35`
+     - `30`
+     - :part:`53%`
+   * - lldb/source/Plugins/SymbolFile/NativePDB
+     - `20`
+     - `12`
+     - `8`
+     - :part:`60%`
+   * - lldb/source/Plugins/SymbolFile/PDB
+     - `6`
+     - `4`
+     - `2`
+     - :part:`66%`
+   * - lldb/source/Plugins/SymbolFile/Symtab
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/SymbolVendor/ELF
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/SymbolVendor/MacOSX
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/SymbolVendor/wasm
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/source/Plugins/SystemRuntime/MacOSX
+     - `10`
+     - `1`
+     - `9`
+     - :part:`10%`
+   * - lldb/source/Plugins/TypeSystem/Clang
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/source/Plugins/UnwindAssembly/InstEmulation
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/source/Plugins/UnwindAssembly/x86
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - lldb/source/Symbol
+     - `32`
+     - `18`
+     - `14`
+     - :part:`56%`
+   * - lldb/source/Target
+     - `61`
+     - `28`
+     - `33`
+     - :part:`45%`
+   * - lldb/source/Utility
+     - `54`
+     - `41`
+     - `13`
+     - :part:`75%`
+   * - lldb/tools/argdumper
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/tools/darwin-debug
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/tools/debugserver/source
+     - `49`
+     - `38`
+     - `11`
+     - :part:`77%`
+   * - lldb/tools/debugserver/source/MacOSX
+     - `24`
+     - `16`
+     - `8`
+     - :part:`66%`
+   * - lldb/tools/debugserver/source/MacOSX/arm
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/tools/debugserver/source/MacOSX/arm64
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - lldb/tools/debugserver/source/MacOSX/DarwinLog
+     - `20`
+     - `18`
+     - `2`
+     - :part:`90%`
+   * - lldb/tools/debugserver/source/MacOSX/i386
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - lldb/tools/debugserver/source/MacOSX/x86_64
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - lldb/tools/driver
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - lldb/tools/intel-features
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/tools/intel-features/intel-mpx
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/tools/intel-features/intel-pt
+     - `6`
+     - `6`
+     - `0`
+     - :good:`100%`
+   * - lldb/tools/lldb-instr
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/tools/lldb-server
+     - `9`
+     - `4`
+     - `5`
+     - :part:`44%`
+   * - lldb/tools/lldb-test
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - lldb/tools/lldb-vscode
+     - `19`
+     - `12`
+     - `7`
+     - :part:`63%`
+   * - lldb/unittests
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/API
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Breakpoint
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Core
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - lldb/unittests/DataFormatter
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/debugserver
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - lldb/unittests/Disassembler
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/unittests/Editline
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Expression
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - lldb/unittests/Host
+     - `13`
+     - `10`
+     - `3`
+     - :part:`76%`
+   * - lldb/unittests/Host/linux
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Interpreter
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/unittests/Language/CPlusPlus
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/Language/Highlighting
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/ObjectFile/Breakpad
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/ObjectFile/ELF
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/ObjectFile/PECOFF
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/Platform
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Platform/Android
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/Process/gdb-remote
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - lldb/unittests/Process/Linux
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/Process/minidump
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/unittests/Process/minidump/Inputs
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Process/POSIX
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/ScriptInterpreter/Lua
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/ScriptInterpreter/Python
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - lldb/unittests/Signals
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Symbol
+     - `7`
+     - `4`
+     - `3`
+     - :part:`57%`
+   * - lldb/unittests/SymbolFile/DWARF
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - lldb/unittests/SymbolFile/DWARF/Inputs
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/SymbolFile/NativePDB
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/SymbolFile/PDB
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/SymbolFile/PDB/Inputs
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/Target
+     - `7`
+     - `3`
+     - `4`
+     - :part:`42%`
+   * - lldb/unittests/TestingSupport
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - lldb/unittests/TestingSupport/Host
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/TestingSupport/Symbol
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/tools/lldb-server/inferior
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - lldb/unittests/tools/lldb-server/tests
+     - `8`
+     - `1`
+     - `7`
+     - :part:`12%`
+   * - lldb/unittests/UnwindAssembly/ARM64
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/UnwindAssembly/PPC64
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - lldb/unittests/UnwindAssembly/x86
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/unittests/Utility
+     - `44`
+     - `31`
+     - `13`
+     - :part:`70%`
+   * - lldb/utils/lit-cpuid
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - lldb/utils/TableGen
+     - `6`
+     - `6`
+     - `0`
+     - :good:`100%`
+   * - llvm/benchmarks
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/bindings/go/llvm
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - llvm/cmake
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/BrainF
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/examples/Bye
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/ExceptionDemo
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Fibonacci
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/HowToUseJIT
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/HowToUseLLJIT
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/IRTransforms
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/examples/Kaleidoscope/BuildingAJIT/Chapter3
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/BuildingAJIT/Chapter4
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/BuildingAJIT/Chapter5
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter2
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter3
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter4
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter5
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter6
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter7
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter8
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/Chapter9
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/include
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/Kaleidoscope/MCJIT/cached
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/MCJIT/complete
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/MCJIT/initial
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/Kaleidoscope/MCJIT/lazy
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/examples/ModuleMaker
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/OrcV2Examples
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/OrcV2Examples/LLJITDumpObjects
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/OrcV2Examples/LLJITWithCustomObjectLinkingLayer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/OrcV2Examples/LLJITWithGDBRegistrationListener
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/OrcV2Examples/LLJITWithInitializers
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/OrcV2Examples/LLJITWithLazyReexports
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/OrcV2Examples/LLJITWithObjectCache
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/OrcV2Examples/LLJITWithObjectLinkingLayerPlugin
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/examples/ParallelJIT
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/SpeculativeJIT
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/examples/ThinLtoJIT
+     - `9`
+     - `8`
+     - `1`
+     - :part:`88%`
+   * - llvm/include/llvm
+     - `8`
+     - `2`
+     - `6`
+     - :part:`25%`
+   * - llvm/include/llvm/ADT
+     - `84`
+     - `24`
+     - `60`
+     - :part:`28%`
+   * - llvm/include/llvm/Analysis
+     - `107`
+     - `29`
+     - `78`
+     - :part:`27%`
+   * - llvm/include/llvm/Analysis/ML
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/Analysis/Utils
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/include/llvm/AsmParser
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/include/llvm/BinaryFormat
+     - `13`
+     - `9`
+     - `4`
+     - :part:`69%`
+   * - llvm/include/llvm/Bitcode
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - llvm/include/llvm/Bitstream
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/include/llvm/CodeGen
+     - `140`
+     - `32`
+     - `108`
+     - :part:`22%`
+   * - llvm/include/llvm/CodeGen/GlobalISel
+     - `28`
+     - `12`
+     - `16`
+     - :part:`42%`
+   * - llvm/include/llvm/CodeGen/MIRParser
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/include/llvm/CodeGen/PBQP
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - llvm/include/llvm/DebugInfo
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/include/llvm/DebugInfo/CodeView
+     - `57`
+     - `40`
+     - `17`
+     - :part:`70%`
+   * - llvm/include/llvm/DebugInfo/DWARF
+     - `32`
+     - `17`
+     - `15`
+     - :part:`53%`
+   * - llvm/include/llvm/DebugInfo/GSYM
+     - `14`
+     - `2`
+     - `12`
+     - :part:`14%`
+   * - llvm/include/llvm/DebugInfo/MSF
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - llvm/include/llvm/DebugInfo/PDB
+     - `50`
+     - `7`
+     - `43`
+     - :part:`14%`
+   * - llvm/include/llvm/DebugInfo/PDB/DIA
+     - `20`
+     - `9`
+     - `11`
+     - :part:`45%`
+   * - llvm/include/llvm/DebugInfo/PDB/Native
+     - `49`
+     - `31`
+     - `18`
+     - :part:`63%`
+   * - llvm/include/llvm/DebugInfo/Symbolize
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/include/llvm/Demangle
+     - `7`
+     - `3`
+     - `4`
+     - :part:`42%`
+   * - llvm/include/llvm/DWARFLinker
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/ExecutionEngine
+     - `14`
+     - `3`
+     - `11`
+     - :part:`21%`
+   * - llvm/include/llvm/ExecutionEngine/JITLink
+     - `8`
+     - `5`
+     - `3`
+     - :part:`62%`
+   * - llvm/include/llvm/ExecutionEngine/Orc
+     - `32`
+     - `11`
+     - `21`
+     - :part:`34%`
+   * - llvm/include/llvm/ExecutionEngine/Orc/RPC
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/include/llvm/Frontend/OpenMP
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/FuzzMutate
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - llvm/include/llvm/IR
+     - `84`
+     - `15`
+     - `69`
+     - :part:`17%`
+   * - llvm/include/llvm/IRReader
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/include/llvm/LineEditor
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/include/llvm/Linker
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/include/llvm/LTO
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - llvm/include/llvm/LTO/legacy
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - llvm/include/llvm/MC
+     - `69`
+     - `17`
+     - `52`
+     - :part:`24%`
+   * - llvm/include/llvm/MC/MCDisassembler
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - llvm/include/llvm/MC/MCParser
+     - `8`
+     - `3`
+     - `5`
+     - :part:`37%`
+   * - llvm/include/llvm/MCA
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/MCA/HardwareUnits
+     - `6`
+     - `4`
+     - `2`
+     - :part:`66%`
+   * - llvm/include/llvm/MCA/Stages
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - llvm/include/llvm/Object
+     - `30`
+     - `10`
+     - `20`
+     - :part:`33%`
+   * - llvm/include/llvm/ObjectYAML
+     - `15`
+     - `13`
+     - `2`
+     - :part:`86%`
+   * - llvm/include/llvm/Option
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - llvm/include/llvm/Passes
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - llvm/include/llvm/ProfileData
+     - `8`
+     - `4`
+     - `4`
+     - :part:`50%`
+   * - llvm/include/llvm/ProfileData/Coverage
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - llvm/include/llvm/Remarks
+     - `11`
+     - `10`
+     - `1`
+     - :part:`90%`
+   * - llvm/include/llvm/Support
+     - `168`
+     - `49`
+     - `119`
+     - :part:`29%`
+   * - llvm/include/llvm/Support/Solaris/sys
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/Support/Windows
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/include/llvm/TableGen
+     - `7`
+     - `1`
+     - `6`
+     - :part:`14%`
+   * - llvm/include/llvm/Target
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - llvm/include/llvm/Testing/Support
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - llvm/include/llvm/TextAPI/ELF
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/include/llvm/TextAPI/MachO
+     - `9`
+     - `8`
+     - `1`
+     - :part:`88%`
+   * - llvm/include/llvm/ToolDrivers/llvm-dlltool
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/ToolDrivers/llvm-lib
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/include/llvm/Transforms
+     - `8`
+     - `2`
+     - `6`
+     - :part:`25%`
+   * - llvm/include/llvm/Transforms/AggressiveInstCombine
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/include/llvm/Transforms/Coroutines
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/Transforms/InstCombine
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/include/llvm/Transforms/Instrumentation
+     - `14`
+     - `8`
+     - `6`
+     - :part:`57%`
+   * - llvm/include/llvm/Transforms/IPO
+     - `29`
+     - `19`
+     - `10`
+     - :part:`65%`
+   * - llvm/include/llvm/Transforms/Scalar
+     - `61`
+     - `32`
+     - `29`
+     - :part:`52%`
+   * - llvm/include/llvm/Transforms/Utils
+     - `57`
+     - `26`
+     - `31`
+     - :part:`45%`
+   * - llvm/include/llvm/Transforms/Vectorize
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - llvm/include/llvm/WindowsManifest
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/include/llvm/WindowsResource
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - llvm/include/llvm/XRay
+     - `17`
+     - `14`
+     - `3`
+     - :part:`82%`
+   * - llvm/include/llvm-c
+     - `26`
+     - `11`
+     - `15`
+     - :part:`42%`
+   * - llvm/include/llvm-c/Transforms
+     - `8`
+     - `2`
+     - `6`
+     - :part:`25%`
+   * - llvm/lib/Analysis
+     - `104`
+     - `28`
+     - `76`
+     - :part:`26%`
+   * - llvm/lib/Analysis/ML
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/AsmParser
+     - `6`
+     - `2`
+     - `4`
+     - :part:`33%`
+   * - llvm/lib/BinaryFormat
+     - `11`
+     - `7`
+     - `4`
+     - :part:`63%`
+   * - llvm/lib/Bitcode/Reader
+     - `7`
+     - `2`
+     - `5`
+     - :part:`28%`
+   * - llvm/lib/Bitcode/Writer
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - llvm/lib/Bitstream/Reader
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/CodeGen
+     - `198`
+     - `33`
+     - `165`
+     - :part:`16%`
+   * - llvm/lib/CodeGen/AsmPrinter
+     - `42`
+     - `14`
+     - `28`
+     - :part:`33%`
+   * - llvm/lib/CodeGen/GlobalISel
+     - `24`
+     - `8`
+     - `16`
+     - :part:`33%`
+   * - llvm/lib/CodeGen/MIRParser
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - llvm/lib/CodeGen/SelectionDAG
+     - `31`
+     - `2`
+     - `29`
+     - :part:`6%`
+   * - llvm/lib/DebugInfo/CodeView
+     - `40`
+     - `25`
+     - `15`
+     - :part:`62%`
+   * - llvm/lib/DebugInfo/DWARF
+     - `28`
+     - `6`
+     - `22`
+     - :part:`21%`
+   * - llvm/lib/DebugInfo/GSYM
+     - `11`
+     - `1`
+     - `10`
+     - :part:`9%`
+   * - llvm/lib/DebugInfo/MSF
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/DebugInfo/PDB
+     - `40`
+     - `34`
+     - `6`
+     - :part:`85%`
+   * - llvm/lib/DebugInfo/PDB/DIA
+     - `18`
+     - `15`
+     - `3`
+     - :part:`83%`
+   * - llvm/lib/DebugInfo/PDB/Native
+     - `45`
+     - `33`
+     - `12`
+     - :part:`73%`
+   * - llvm/lib/DebugInfo/Symbolize
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - llvm/lib/Demangle
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - llvm/lib/DWARFLinker
+     - `4`
+     - `3`
+     - `1`
+     - :part:`75%`
+   * - llvm/lib/ExecutionEngine
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - llvm/lib/ExecutionEngine/IntelJITEvents
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - llvm/lib/ExecutionEngine/Interpreter
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - llvm/lib/ExecutionEngine/JITLink
+     - `14`
+     - `9`
+     - `5`
+     - :part:`64%`
+   * - llvm/lib/ExecutionEngine/MCJIT
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/lib/ExecutionEngine/OProfileJIT
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/lib/ExecutionEngine/Orc
+     - `28`
+     - `15`
+     - `13`
+     - :part:`53%`
+   * - llvm/lib/ExecutionEngine/OrcError
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/lib/ExecutionEngine/PerfJITEvents
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/ExecutionEngine/RuntimeDyld
+     - `12`
+     - `1`
+     - `11`
+     - :part:`8%`
+   * - llvm/lib/ExecutionEngine/RuntimeDyld/Targets
+     - `10`
+     - `1`
+     - `9`
+     - :part:`10%`
+   * - llvm/lib/Extensions
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Frontend/OpenMP
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/FuzzMutate
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - llvm/lib/IR
+     - `61`
+     - `8`
+     - `53`
+     - :part:`13%`
+   * - llvm/lib/IRReader
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/LineEditor
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Linker
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/lib/LTO
+     - `8`
+     - `1`
+     - `7`
+     - :part:`12%`
+   * - llvm/lib/MC
+     - `62`
+     - `20`
+     - `42`
+     - :part:`32%`
+   * - llvm/lib/MC/MCDisassembler
+     - `6`
+     - `4`
+     - `2`
+     - :part:`66%`
+   * - llvm/lib/MC/MCParser
+     - `12`
+     - `1`
+     - `11`
+     - :part:`8%`
+   * - llvm/lib/MCA
+     - `7`
+     - `3`
+     - `4`
+     - :part:`42%`
+   * - llvm/lib/MCA/HardwareUnits
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - llvm/lib/MCA/Stages
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - llvm/lib/Object
+     - `29`
+     - `10`
+     - `19`
+     - :part:`34%`
+   * - llvm/lib/ObjectYAML
+     - `22`
+     - `11`
+     - `11`
+     - :part:`50%`
+   * - llvm/lib/Option
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - llvm/lib/Passes
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - llvm/lib/ProfileData
+     - `8`
+     - `2`
+     - `6`
+     - :part:`25%`
+   * - llvm/lib/ProfileData/Coverage
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/lib/Remarks
+     - `13`
+     - `10`
+     - `3`
+     - :part:`76%`
+   * - llvm/lib/Support
+     - `130`
+     - `42`
+     - `88`
+     - :part:`32%`
+   * - llvm/lib/Support/Unix
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/TableGen
+     - `11`
+     - `1`
+     - `10`
+     - :part:`9%`
+   * - llvm/lib/Target
+     - `5`
+     - `0`
+     - `5`
+     - :none:`0%`
+   * - llvm/lib/Target/AArch64
+     - `67`
+     - `8`
+     - `59`
+     - :part:`11%`
+   * - llvm/lib/Target/AArch64/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/AArch64/Disassembler
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - llvm/lib/Target/AArch64/MCTargetDesc
+     - `21`
+     - `6`
+     - `15`
+     - :part:`28%`
+   * - llvm/lib/Target/AArch64/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Target/AArch64/Utils
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/lib/Target/AMDGPU
+     - `145`
+     - `11`
+     - `134`
+     - :part:`7%`
+   * - llvm/lib/Target/AMDGPU/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/AMDGPU/Disassembler
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/lib/Target/AMDGPU/MCTargetDesc
+     - `18`
+     - `3`
+     - `15`
+     - :part:`16%`
+   * - llvm/lib/Target/AMDGPU/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Target/AMDGPU/Utils
+     - `9`
+     - `2`
+     - `7`
+     - :part:`22%`
+   * - llvm/lib/Target/ARC
+     - `24`
+     - `19`
+     - `5`
+     - :part:`79%`
+   * - llvm/lib/Target/ARC/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/ARC/MCTargetDesc
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - llvm/lib/Target/ARC/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/ARM
+     - `71`
+     - `7`
+     - `64`
+     - :part:`9%`
+   * - llvm/lib/Target/ARM/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/ARM/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/ARM/MCTargetDesc
+     - `26`
+     - `2`
+     - `24`
+     - :part:`7%`
+   * - llvm/lib/Target/ARM/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/ARM/Utils
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/lib/Target/AVR
+     - `23`
+     - `4`
+     - `19`
+     - :part:`17%`
+   * - llvm/lib/Target/AVR/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/AVR/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/AVR/MCTargetDesc
+     - `20`
+     - `6`
+     - `14`
+     - :part:`30%`
+   * - llvm/lib/Target/AVR/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Target/BPF
+     - `28`
+     - `5`
+     - `23`
+     - :part:`17%`
+   * - llvm/lib/Target/BPF/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/BPF/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/BPF/MCTargetDesc
+     - `8`
+     - `1`
+     - `7`
+     - :part:`12%`
+   * - llvm/lib/Target/BPF/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Target/Hexagon
+     - `77`
+     - `2`
+     - `75`
+     - :part:`2%`
+   * - llvm/lib/Target/Hexagon/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/Hexagon/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/Hexagon/MCTargetDesc
+     - `26`
+     - `6`
+     - `20`
+     - :part:`23%`
+   * - llvm/lib/Target/Hexagon/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Target/Lanai
+     - `28`
+     - `19`
+     - `9`
+     - :part:`67%`
+   * - llvm/lib/Target/Lanai/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/Lanai/Disassembler
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/Lanai/MCTargetDesc
+     - `13`
+     - `12`
+     - `1`
+     - :part:`92%`
+   * - llvm/lib/Target/Lanai/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/Mips
+     - `69`
+     - `12`
+     - `57`
+     - :part:`17%`
+   * - llvm/lib/Target/Mips/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/Mips/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/Mips/MCTargetDesc
+     - `25`
+     - `6`
+     - `19`
+     - :part:`24%`
+   * - llvm/lib/Target/Mips/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/MSP430
+     - `20`
+     - `0`
+     - `20`
+     - :none:`0%`
+   * - llvm/lib/Target/MSP430/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/MSP430/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/MSP430/MCTargetDesc
+     - `11`
+     - `3`
+     - `8`
+     - :part:`27%`
+   * - llvm/lib/Target/MSP430/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/NVPTX
+     - `42`
+     - `7`
+     - `35`
+     - :part:`16%`
+   * - llvm/lib/Target/NVPTX/MCTargetDesc
+     - `9`
+     - `5`
+     - `4`
+     - :part:`55%`
+   * - llvm/lib/Target/NVPTX/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/PowerPC
+     - `53`
+     - `2`
+     - `51`
+     - :part:`3%`
+   * - llvm/lib/Target/PowerPC/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/PowerPC/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/PowerPC/MCTargetDesc
+     - `18`
+     - `2`
+     - `16`
+     - :part:`11%`
+   * - llvm/lib/Target/PowerPC/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/RISCV
+     - `31`
+     - `13`
+     - `18`
+     - :part:`41%`
+   * - llvm/lib/Target/RISCV/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/RISCV/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/RISCV/MCTargetDesc
+     - `17`
+     - `8`
+     - `9`
+     - :part:`47%`
+   * - llvm/lib/Target/RISCV/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/RISCV/Utils
+     - `4`
+     - `3`
+     - `1`
+     - :part:`75%`
+   * - llvm/lib/Target/Sparc
+     - `23`
+     - `2`
+     - `21`
+     - :part:`8%`
+   * - llvm/lib/Target/Sparc/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/Sparc/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/Sparc/MCTargetDesc
+     - `14`
+     - `4`
+     - `10`
+     - :part:`28%`
+   * - llvm/lib/Target/Sparc/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/SystemZ
+     - `40`
+     - `3`
+     - `37`
+     - :part:`7%`
+   * - llvm/lib/Target/SystemZ/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/SystemZ/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/SystemZ/MCTargetDesc
+     - `10`
+     - `4`
+     - `6`
+     - :part:`40%`
+   * - llvm/lib/Target/SystemZ/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/VE
+     - `19`
+     - `15`
+     - `4`
+     - :part:`78%`
+   * - llvm/lib/Target/VE/AsmParser
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/VE/MCTargetDesc
+     - `14`
+     - `13`
+     - `1`
+     - :part:`92%`
+   * - llvm/lib/Target/VE/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Target/WebAssembly
+     - `57`
+     - `41`
+     - `16`
+     - :part:`71%`
+   * - llvm/lib/Target/WebAssembly/AsmParser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/WebAssembly/Disassembler
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/WebAssembly/MCTargetDesc
+     - `12`
+     - `8`
+     - `4`
+     - :part:`66%`
+   * - llvm/lib/Target/WebAssembly/TargetInfo
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Target/X86
+     - `75`
+     - `12`
+     - `63`
+     - :part:`16%`
+   * - llvm/lib/Target/X86/AsmParser
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/lib/Target/X86/Disassembler
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/lib/Target/X86/MCTargetDesc
+     - `25`
+     - `6`
+     - `19`
+     - :part:`24%`
+   * - llvm/lib/Target/X86/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Target/XCore
+     - `27`
+     - `2`
+     - `25`
+     - :part:`7%`
+   * - llvm/lib/Target/XCore/Disassembler
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Target/XCore/MCTargetDesc
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - llvm/lib/Target/XCore/TargetInfo
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/Testing/Support
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/TextAPI/ELF
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/lib/TextAPI/MachO
+     - `11`
+     - `8`
+     - `3`
+     - :part:`72%`
+   * - llvm/lib/ToolDrivers/llvm-dlltool
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/ToolDrivers/llvm-lib
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Transforms/AggressiveInstCombine
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/lib/Transforms/CFGuard
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/Transforms/Coroutines
+     - `8`
+     - `0`
+     - `8`
+     - :none:`0%`
+   * - llvm/lib/Transforms/Hello
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/lib/Transforms/InstCombine
+     - `16`
+     - `1`
+     - `15`
+     - :part:`6%`
+   * - llvm/lib/Transforms/Instrumentation
+     - `21`
+     - `2`
+     - `19`
+     - :part:`9%`
+   * - llvm/lib/Transforms/IPO
+     - `39`
+     - `4`
+     - `35`
+     - :part:`10%`
+   * - llvm/lib/Transforms/ObjCARC
+     - `15`
+     - `3`
+     - `12`
+     - :part:`20%`
+   * - llvm/lib/Transforms/Scalar
+     - `75`
+     - `10`
+     - `65`
+     - :part:`13%`
+   * - llvm/lib/Transforms/Utils
+     - `72`
+     - `13`
+     - `59`
+     - :part:`18%`
+   * - llvm/lib/Transforms/Vectorize
+     - `22`
+     - `14`
+     - `8`
+     - :part:`63%`
+   * - llvm/lib/WindowsManifest
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/lib/XRay
+     - `14`
+     - `12`
+     - `2`
+     - :part:`85%`
+   * - llvm/tools/bugpoint
+     - `12`
+     - `1`
+     - `11`
+     - :part:`8%`
+   * - llvm/tools/bugpoint-passes
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/dsymutil
+     - `18`
+     - `15`
+     - `3`
+     - :part:`83%`
+   * - llvm/tools/gold
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llc
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/lli
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/tools/lli/ChildTarget
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-ar
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-as
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-as-fuzzer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-bcanalyzer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-c-test
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/tools/llvm-cat
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-cfi-verify
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-cfi-verify/lib
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - llvm/tools/llvm-config
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-cov
+     - `23`
+     - `12`
+     - `11`
+     - :part:`52%`
+   * - llvm/tools/llvm-cvtres
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-cxxdump
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - llvm/tools/llvm-cxxfilt
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-cxxmap
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-diff
+     - `7`
+     - `0`
+     - `7`
+     - :none:`0%`
+   * - llvm/tools/llvm-dis
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-dwarfdump
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - llvm/tools/llvm-dwarfdump/fuzzer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-dwp
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - llvm/tools/llvm-elfabi
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - llvm/tools/llvm-exegesis
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-exegesis/lib
+     - `44`
+     - `34`
+     - `10`
+     - :part:`77%`
+   * - llvm/tools/llvm-exegesis/lib/AArch64
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-exegesis/lib/Mips
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-exegesis/lib/PowerPC
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-exegesis/lib/X86
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-extract
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-gsymutil
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-ifs
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-isel-fuzzer
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/tools/llvm-itanium-demangle-fuzzer
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/tools/llvm-jitlink
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - llvm/tools/llvm-jitlistener
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-link
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-lipo
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-lto
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-lto2
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-mc
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - llvm/tools/llvm-mc-assemble-fuzzer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-mc-disassemble-fuzzer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-mca
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - llvm/tools/llvm-mca/Views
+     - `20`
+     - `15`
+     - `5`
+     - :part:`75%`
+   * - llvm/tools/llvm-microsoft-demangle-fuzzer
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-ml
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - llvm/tools/llvm-modextract
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-mt
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-nm
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-objcopy
+     - `6`
+     - `4`
+     - `2`
+     - :part:`66%`
+   * - llvm/tools/llvm-objcopy/COFF
+     - `8`
+     - `7`
+     - `1`
+     - :part:`87%`
+   * - llvm/tools/llvm-objcopy/ELF
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - llvm/tools/llvm-objcopy/MachO
+     - `10`
+     - `10`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-objcopy/wasm
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-objdump
+     - `12`
+     - `8`
+     - `4`
+     - :part:`66%`
+   * - llvm/tools/llvm-opt-fuzzer
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/tools/llvm-opt-report
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-pdbutil
+     - `47`
+     - `16`
+     - `31`
+     - :part:`34%`
+   * - llvm/tools/llvm-profdata
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-rc
+     - `12`
+     - `7`
+     - `5`
+     - :part:`58%`
+   * - llvm/tools/llvm-readobj
+     - `21`
+     - `3`
+     - `18`
+     - :part:`14%`
+   * - llvm/tools/llvm-reduce
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - llvm/tools/llvm-reduce/deltas
+     - `14`
+     - `8`
+     - `6`
+     - :part:`57%`
+   * - llvm/tools/llvm-rtdyld
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-shlib
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-size
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-special-case-list-fuzzer
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-split
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-stress
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-strings
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-symbolizer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/llvm-undname
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/llvm-xray
+     - `19`
+     - `16`
+     - `3`
+     - :part:`84%`
+   * - llvm/tools/llvm-yaml-numeric-parser-fuzzer
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/lto
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/tools/obj2yaml
+     - `11`
+     - `4`
+     - `7`
+     - :part:`36%`
+   * - llvm/tools/opt
+     - `10`
+     - `2`
+     - `8`
+     - :part:`20%`
+   * - llvm/tools/remarks-shlib
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/sancov
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/sanstats
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/verify-uselistorder
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/tools/vfabi-demangle-fuzzer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/tools/yaml2obj
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/ADT
+     - `74`
+     - `29`
+     - `45`
+     - :part:`39%`
+   * - llvm/unittests/Analysis
+     - `33`
+     - `10`
+     - `23`
+     - :part:`30%`
+   * - llvm/unittests/Analysis/ML
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/AsmParser
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/BinaryFormat
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - llvm/unittests/Bitcode
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/unittests/Bitstream
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/unittests/CodeGen
+     - `10`
+     - `2`
+     - `8`
+     - :part:`20%`
+   * - llvm/unittests/CodeGen/GlobalISel
+     - `10`
+     - `1`
+     - `9`
+     - :part:`10%`
+   * - llvm/unittests/DebugInfo/CodeView
+     - `3`
+     - `1`
+     - `2`
+     - :part:`33%`
+   * - llvm/unittests/DebugInfo/DWARF
+     - `13`
+     - `8`
+     - `5`
+     - :part:`61%`
+   * - llvm/unittests/DebugInfo/GSYM
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/DebugInfo/MSF
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - llvm/unittests/DebugInfo/PDB
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - llvm/unittests/DebugInfo/PDB/Inputs
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/Demangle
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - llvm/unittests/ExecutionEngine
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/ExecutionEngine/JITLink
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/ExecutionEngine/MCJIT
+     - `7`
+     - `0`
+     - `7`
+     - :none:`0%`
+   * - llvm/unittests/ExecutionEngine/Orc
+     - `20`
+     - `4`
+     - `16`
+     - :part:`20%`
+   * - llvm/unittests/Frontend
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/unittests/FuzzMutate
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - llvm/unittests/IR
+     - `35`
+     - `7`
+     - `28`
+     - :part:`20%`
+   * - llvm/unittests/LineEditor
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/Linker
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/MC
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - llvm/unittests/MC/AMDGPU
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/MI
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/Object
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - llvm/unittests/ObjectYAML
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - llvm/unittests/Option
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/Passes
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/ProfileData
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/unittests/Remarks
+     - `8`
+     - `5`
+     - `3`
+     - :part:`62%`
+   * - llvm/unittests/Support
+     - `86`
+     - `21`
+     - `65`
+     - :part:`24%`
+   * - llvm/unittests/Support/DynamicLibrary
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - llvm/unittests/TableGen
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/unittests/Target/AArch64
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/unittests/Target/AMDGPU
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/Target/ARM
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/Target/PowerPC
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/Target/WebAssembly
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/Target/X86
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/unittests/TextAPI
+     - `6`
+     - `3`
+     - `3`
+     - :part:`50%`
+   * - llvm/unittests/tools/llvm-cfi-verify
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - llvm/unittests/tools/llvm-exegesis
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - llvm/unittests/tools/llvm-exegesis/AArch64
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/tools/llvm-exegesis/ARM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/tools/llvm-exegesis/Common
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/tools/llvm-exegesis/Mips
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - llvm/unittests/tools/llvm-exegesis/PowerPC
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/unittests/tools/llvm-exegesis/X86
+     - `9`
+     - `8`
+     - `1`
+     - :part:`88%`
+   * - llvm/unittests/Transforms/IPO
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/unittests/Transforms/Scalar
+     - `2`
+     - `0`
+     - `2`
+     - :none:`0%`
+   * - llvm/unittests/Transforms/Utils
+     - `17`
+     - `7`
+     - `10`
+     - :part:`41%`
+   * - llvm/unittests/Transforms/Vectorize
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - llvm/unittests/XRay
+     - `8`
+     - `7`
+     - `1`
+     - :part:`87%`
+   * - llvm/utils/benchmark/cmake
+     - `5`
+     - `3`
+     - `2`
+     - :part:`60%`
+   * - llvm/utils/benchmark/include/benchmark
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/utils/benchmark/src
+     - `19`
+     - `0`
+     - `19`
+     - :none:`0%`
+   * - llvm/utils/FileCheck
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/utils/fpcmp
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/utils/KillTheDoctor
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/utils/not
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - llvm/utils/PerfectShuffle
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/utils/TableGen
+     - `74`
+     - `8`
+     - `66`
+     - :part:`10%`
+   * - llvm/utils/TableGen/GlobalISel
+     - `17`
+     - `8`
+     - `9`
+     - :part:`47%`
+   * - llvm/utils/unittest/googlemock/include/gmock
+     - `11`
+     - `0`
+     - `11`
+     - :none:`0%`
+   * - llvm/utils/unittest/googlemock/include/gmock/internal
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/utils/unittest/googlemock/include/gmock/internal/custom
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - llvm/utils/unittest/googletest/include/gtest
+     - `10`
+     - `0`
+     - `10`
+     - :none:`0%`
+   * - llvm/utils/unittest/googletest/include/gtest/internal
+     - `11`
+     - `0`
+     - `11`
+     - :none:`0%`
+   * - llvm/utils/unittest/googletest/include/gtest/internal/custom
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - llvm/utils/unittest/googletest/src
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/utils/unittest/UnitTestMain
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - llvm/utils/yaml-bench
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/examples/standalone/include/Standalone
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/standalone/lib/Standalone
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/standalone/standalone-opt
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/standalone/standalone-translate
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch1
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch1/include/toy
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch1/parser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/examples/toy/Ch2
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch2/include/toy
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch2/mlir
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch2/parser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/examples/toy/Ch3
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch3/include/toy
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch3/mlir
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch3/parser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/examples/toy/Ch4
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch4/include/toy
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch4/mlir
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch4/parser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/examples/toy/Ch5
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch5/include/toy
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch5/mlir
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - mlir/examples/toy/Ch5/parser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/examples/toy/Ch6
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch6/include/toy
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch6/mlir
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - mlir/examples/toy/Ch6/parser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/examples/toy/Ch7
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch7/include/toy
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - mlir/examples/toy/Ch7/mlir
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - mlir/examples/toy/Ch7/parser
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/include/mlir
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Analysis
+     - `8`
+     - `7`
+     - `1`
+     - :part:`87%`
+   * - mlir/include/mlir/Conversion/AffineToStandard
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/AVX512ToLLVM
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/include/mlir/Conversion/GPUCommon
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/GPUToNVVM
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/include/mlir/Conversion/GPUToROCDL
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/GPUToSPIRV
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - mlir/include/mlir/Conversion/GPUToVulkan
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/include/mlir/Conversion/LinalgToLLVM
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/include/mlir/Conversion/LinalgToSPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/LinalgToStandard
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/SCFToGPU
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/SCFToStandard
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/StandardToLLVM
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - mlir/include/mlir/Conversion/StandardToSPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Conversion/VectorToLLVM
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/include/mlir/Conversion/VectorToSCF
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Affine
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - mlir/include/mlir/Dialect/Affine/EDSC
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Affine/IR
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/AVX512
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/GPU
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - mlir/include/mlir/Dialect/Linalg
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/include/mlir/Dialect/Linalg/Analysis
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Linalg/EDSC
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - mlir/include/mlir/Dialect/Linalg/IR
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - mlir/include/mlir/Dialect/Linalg/Transforms
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Linalg/Utils
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/LLVMIR
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/LLVMIR/Transforms
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/OpenMP
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Quant
+     - `6`
+     - `5`
+     - `1`
+     - :part:`83%`
+   * - mlir/include/mlir/Dialect/SCF
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/SCF/EDSC
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/SDBM
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - mlir/include/mlir/Dialect/Shape/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/SPIRV
+     - `11`
+     - `11`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/StandardOps/EDSC
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/StandardOps/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/StandardOps/Transforms
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Utils
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Vector
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Dialect/Vector/EDSC
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/EDSC
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/ExecutionEngine
+     - `5`
+     - `2`
+     - `3`
+     - :part:`40%`
+   * - mlir/include/mlir/Interfaces
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - mlir/include/mlir/IR
+     - `42`
+     - `9`
+     - `33`
+     - :part:`21%`
+   * - mlir/include/mlir/Pass
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - mlir/include/mlir/Support
+     - `9`
+     - `5`
+     - `4`
+     - :part:`55%`
+   * - mlir/include/mlir/TableGen
+     - `18`
+     - `17`
+     - `1`
+     - :part:`94%`
+   * - mlir/include/mlir/Target
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Target/LLVMIR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/include/mlir/Transforms
+     - `12`
+     - `7`
+     - `5`
+     - :part:`58%`
+   * - mlir/include/mlir-c
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Analysis
+     - `8`
+     - `7`
+     - `1`
+     - :part:`87%`
+   * - mlir/lib/Conversion
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/AffineToStandard
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/lib/Conversion/AVX512ToLLVM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/GPUCommon
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/GPUToNVVM
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/lib/Conversion/GPUToROCDL
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/GPUToSPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/GPUToVulkan
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/LinalgToLLVM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/LinalgToSPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/LinalgToStandard
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/SCFToGPU
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/SCFToStandard
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/StandardToLLVM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/StandardToSPIRV
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/VectorToLLVM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Conversion/VectorToSCF
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Affine/EDSC
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Affine/IR
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Affine/Transforms
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Affine/Utils
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/AVX512/IR
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/lib/Dialect/GPU/IR
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/lib/Dialect/GPU/Transforms
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - mlir/lib/Dialect/Linalg/Analysis
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/lib/Dialect/Linalg/EDSC
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Linalg/IR
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Linalg/Transforms
+     - `10`
+     - `9`
+     - `1`
+     - :part:`90%`
+   * - mlir/lib/Dialect/Linalg/Utils
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/LLVMIR/IR
+     - `4`
+     - `1`
+     - `3`
+     - :part:`25%`
+   * - mlir/lib/Dialect/LLVMIR/Transforms
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/OpenMP/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Quant/IR
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Quant/Transforms
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Quant/Utils
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/SCF
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/SCF/EDSC
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/SCF/Transforms
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/SDBM
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Shape/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/SPIRV
+     - `8`
+     - `5`
+     - `3`
+     - :part:`62%`
+   * - mlir/lib/Dialect/SPIRV/Serialization
+     - `4`
+     - `2`
+     - `2`
+     - :part:`50%`
+   * - mlir/lib/Dialect/SPIRV/Transforms
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/StandardOps/EDSC
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/StandardOps/IR
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/StandardOps/Transforms
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Dialect/Vector
+     - `3`
+     - `2`
+     - `1`
+     - :part:`66%`
+   * - mlir/lib/Dialect/Vector/EDSC
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/EDSC
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/ExecutionEngine
+     - `5`
+     - `5`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Interfaces
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/IR
+     - `32`
+     - `32`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Parser
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - mlir/lib/Pass
+     - `7`
+     - `6`
+     - `1`
+     - :part:`85%`
+   * - mlir/lib/Support
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/TableGen
+     - `16`
+     - `16`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Target/LLVMIR
+     - `8`
+     - `8`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Transforms
+     - `19`
+     - `17`
+     - `2`
+     - :part:`89%`
+   * - mlir/lib/Transforms/Utils
+     - `7`
+     - `7`
+     - `0`
+     - :good:`100%`
+   * - mlir/lib/Translation
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/tools/mlir-cpu-runner
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/tools/mlir-cuda-runner
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/tools/mlir-linalg-ods-gen
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/tools/mlir-opt
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - mlir/tools/mlir-shlib
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/tools/mlir-tblgen
+     - `16`
+     - `14`
+     - `2`
+     - :part:`87%`
+   * - mlir/tools/mlir-translate
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/tools/mlir-vulkan-runner
+     - `4`
+     - `4`
+     - `0`
+     - :good:`100%`
+   * - mlir/unittests/Dialect
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/unittests/Dialect/Quant
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/unittests/Dialect/SPIRV
+     - `2`
+     - `2`
+     - `0`
+     - :good:`100%`
+   * - mlir/unittests/IR
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - mlir/unittests/Pass
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/unittests/SDBM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - mlir/unittests/TableGen
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - openmp/libomptarget/deviceRTLs
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - openmp/libomptarget/deviceRTLs/amdgcn/src
+     - `3`
+     - `3`
+     - `0`
+     - :good:`100%`
+   * - openmp/libomptarget/deviceRTLs/common
+     - `8`
+     - `4`
+     - `4`
+     - :part:`50%`
+   * - openmp/libomptarget/deviceRTLs/nvptx/src
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - openmp/libomptarget/include
+     - `2`
+     - `1`
+     - `1`
+     - :part:`50%`
+   * - openmp/libomptarget/plugins/cuda/src
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - openmp/libomptarget/plugins/generic-elf-64bit/src
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - openmp/libomptarget/plugins/ve/src
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - openmp/libomptarget/src
+     - `8`
+     - `0`
+     - `8`
+     - :none:`0%`
+   * - openmp/runtime/doc/doxygen
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - openmp/runtime/src
+     - `74`
+     - `37`
+     - `37`
+     - :part:`50%`
+   * - openmp/runtime/src/thirdparty/ittnotify
+     - `6`
+     - `0`
+     - `6`
+     - :none:`0%`
+   * - openmp/runtime/src/thirdparty/ittnotify/legacy
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - openmp/tools/archer
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - openmp/tools/archer/tests/ompt
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - parallel-libs/acxxel
+     - `6`
+     - `4`
+     - `2`
+     - :part:`66%`
+   * - parallel-libs/acxxel/examples
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - parallel-libs/acxxel/tests
+     - `5`
+     - `4`
+     - `1`
+     - :part:`80%`
+   * - polly/include/polly
+     - `22`
+     - `22`
+     - `0`
+     - :good:`100%`
+   * - polly/include/polly/CodeGen
+     - `14`
+     - `14`
+     - `0`
+     - :good:`100%`
+   * - polly/include/polly/Support
+     - `11`
+     - `11`
+     - `0`
+     - :good:`100%`
+   * - polly/lib/Analysis
+     - `9`
+     - `9`
+     - `0`
+     - :good:`100%`
+   * - polly/lib/CodeGen
+     - `15`
+     - `15`
+     - `0`
+     - :good:`100%`
+   * - polly/lib/Exchange
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/lib/External/isl
+     - `67`
+     - `1`
+     - `66`
+     - :part:`1%`
+   * - polly/lib/External/isl/imath
+     - `3`
+     - `0`
+     - `3`
+     - :none:`0%`
+   * - polly/lib/External/isl/imath_wrap
+     - `4`
+     - `0`
+     - `4`
+     - :none:`0%`
+   * - polly/lib/External/isl/include/isl
+     - `62`
+     - `8`
+     - `54`
+     - :part:`12%`
+   * - polly/lib/External/isl/interface
+     - `5`
+     - `1`
+     - `4`
+     - :part:`20%`
+   * - polly/lib/External/pet/include
+     - `1`
+     - `0`
+     - `1`
+     - :none:`0%`
+   * - polly/lib/External/ppcg
+     - `17`
+     - `0`
+     - `17`
+     - :none:`0%`
+   * - polly/lib/Plugin
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/lib/Support
+     - `10`
+     - `10`
+     - `0`
+     - :good:`100%`
+   * - polly/lib/Transform
+     - `14`
+     - `14`
+     - `0`
+     - :good:`100%`
+   * - polly/tools/GPURuntime
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/unittests/DeLICM
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/unittests/Flatten
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/unittests/Isl
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/unittests/ScheduleOptimizer
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/unittests/ScopPassManager
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - polly/unittests/Support
+     - `1`
+     - `1`
+     - `0`
+     - :good:`100%`
+   * - pstl/include/pstl/internal
+     - `22`
+     - `18`
+     - `4`
+     - :part:`81%`
+   * - Total
+     - :total:`13035`
+     - :total:`5791`
+     - :total:`7244`
+     - :total:`44%`
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index 493f736f2be4f..c49312861baa8 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -77,6 +77,7 @@ Using Clang Tools
    ClangCheck
    ClangFormat
    ClangFormatStyleOptions
+   ClangFormattedStatus
 
 Design Documents
 ================
diff --git a/clang/docs/tools/generate_formatted_state.py b/clang/docs/tools/generate_formatted_state.py
new file mode 100755
index 0000000000000..1b620a84ac0b4
--- /dev/null
+++ b/clang/docs/tools/generate_formatted_state.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# A tool that creates a document outlining how clang-formatted the
+# LLVM project is.
+
+import sys
+import os
+import subprocess
+from datetime import datetime
+
+
+def get_git_revision_short_hash():
+    raw = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'])
+    return raw.decode(sys.stdout.encoding).strip()
+
+
+def get_style(count, passed):
+    """Return the RST role for a row: all, none, or some files passing."""
+    if passed == count:
+        return ":good:"
+    if passed == 0:
+        return ":none:"
+    return ":part:"
+
+
+TOP_DIR = os.path.join(os.path.dirname(__file__), '../../..')
+CLANG_DIR = os.path.join(os.path.dirname(__file__), '../..')
+DOC_FILE = os.path.join(CLANG_DIR, 'docs/ClangFormattedStatus.rst')
+
+rootdir = TOP_DIR
+
+skipped_dirs = [".git", "test"]
+suffixes = (".cpp", ".h")
+
+rst_prefix = """\
+.. raw:: html
+
+      <style type="text/css">
+        .none {{ background-color: #FFCC99 }}
+        .part {{ background-color: #FFFF99 }}
+        .good {{ background-color: #2CCCFF }}
+        .total {{ font-weight: bold; }}
+      </style>
+
+.. role:: none
+.. role:: part
+.. role:: good
+.. role:: total
+
+======================
+Clang Formatted Status
+======================
+
+:doc:`ClangFormattedStatus` describes the state of LLVM source
+tree in terms of conformance to :doc:`ClangFormat` as of: {today} (`{sha} <https://github.com/llvm/llvm-project/commit/{sha}>`_).
+
+
+.. list-table:: LLVM Clang-Format Status
+   :widths: 50 25 25 25 25
+   :header-rows: 1\n
+   * - Directory
+     - Total Files
+     - Formatted Files
+     - Unformatted Files
+     - % Complete
+"""
+
+table_row = """\
+   * - {path}
+     - {style}`{count}`
+     - {style}`{passes}`
+     - {style}`{fails}`
+     - {style2}`{percent}%`
+"""
+
+with open(DOC_FILE, 'wb') as output:
+    sha = get_git_revision_short_hash()
+    today = datetime.now().strftime("%B %d, %Y %H:%M:%S")
+    output.write(bytes(rst_prefix.format(today=today,
+                                         sha=sha).encode("utf-8")))
+
+    total_files_count = 0
+    total_files_pass = 0
+    total_files_fail = 0
+    # Walk the tree top-down, emitting one table row for every directory
+    # that directly contains at least one file matching `suffixes`.
+    for root, subdirs, files in os.walk(rootdir):
+        # Prune skipped directories in place so os.walk never descends into
+        # them; calling remove() while iterating the list skips siblings.
+        subdirs[:] = [d for d in subdirs if d not in skipped_dirs]
+
+        path = os.path.relpath(root, TOP_DIR)
+        path = path.replace('\\', '/')
+
+        file_count = 0
+        file_pass = 0
+        file_fail = 0
+        for filename in files:
+            file_path = os.path.join(root, filename)
+            ext = os.path.splitext(file_path)[-1].lower()
+            if not ext.endswith(suffixes):
+                continue
+
+            file_count += 1
+
+            args = ["clang-format", "-n", file_path]
+            cmd = subprocess.Popen(args, stderr=subprocess.PIPE)
+            stdout, err = cmd.communicate()
+
+            relpath = os.path.relpath(file_path, TOP_DIR)
+            relpath = relpath.replace('\\', '/')
+            if err.decode(sys.stdout.encoding).find(': warning:') > 0:
+                print(relpath, ":", "FAIL")
+                file_fail += 1
+            else:
+                print(relpath, ":", "PASS")
+                file_pass += 1
+
+        total_files_count += file_count
+        total_files_pass += file_pass
+        total_files_fail += file_fail
+
+        if file_count > 0:
+            percent = (int(100.0 * (float(file_pass)/float(file_count))))
+            style = get_style(file_count, file_pass)
+            output.write(bytes(table_row.format(path=path,
+                                                count=file_count,
+                                                passes=file_pass,
+                                                fails=file_fail,
+                                                percent=str(percent), style="",
+                                                style2=style).encode("utf-8")))
+            output.flush()
+
+            print("----\n")
+            print(path, file_count, file_pass, file_fail, percent)
+            print("----\n")
+
+    # An empty tree has no files to score; report 0% instead of dividing by 0.
+    total_percent = (float(total_files_pass) / float(total_files_count)
+                     if total_files_count else 0.0)
+    percent_str = str(int(100.0 * total_percent))
+    output.write(bytes(table_row.format(path="Total",
+                                        count=total_files_count,
+                                        passes=total_files_pass,
+                                        fails=total_files_fail,
+                                        percent=percent_str, style=":total:",
+                                        style2=":total:").encode("utf-8")))

From 80e107ccd088a2705d0e776799a8815a58061cb3 Mon Sep 17 00:00:00 2001
From: Zequan Wu <zequanwu@google.com>
Date: Fri, 29 May 2020 12:15:07 -0700
Subject: [PATCH 573/770] Add NoMerge MIFlag to avoid MIR branch folding

Let the codegen recognize the nomerge attribute and disable branch folding when the attribute is given.

Differential Revision: https://reviews.llvm.org/D79537
---
 llvm/include/llvm/CodeGen/MachineInstr.h      |  3 ++
 llvm/include/llvm/CodeGen/SelectionDAG.h      | 13 +++++++
 llvm/include/llvm/CodeGen/TargetLowering.h    |  7 ++--
 llvm/lib/CodeGen/BranchFolding.cpp            |  3 ++
 llvm/lib/CodeGen/MIRPrinter.cpp               |  2 ++
 llvm/lib/CodeGen/MachineInstr.cpp             |  2 ++
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |  4 +++
 .../Target/AArch64/AArch64ISelLowering.cpp    |  1 +
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |  1 +
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  4 ++-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |  5 +--
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  1 +
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  1 +
 llvm/test/CodeGen/AArch64/nomerge.ll          | 36 +++++++++++++++++++
 llvm/test/CodeGen/ARM/nomerge.ll              | 36 +++++++++++++++++++
 llvm/test/CodeGen/PowerPC/nomerge.ll          | 35 ++++++++++++++++++
 llvm/test/CodeGen/RISCV/nomerge.ll            | 35 ++++++++++++++++++
 llvm/test/CodeGen/X86/nomerge.ll              | 36 +++++++++++++++++++
 18 files changed, 220 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/nomerge.ll
 create mode 100644 llvm/test/CodeGen/ARM/nomerge.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/nomerge.ll
 create mode 100644 llvm/test/CodeGen/RISCV/nomerge.ll
 create mode 100644 llvm/test/CodeGen/X86/nomerge.ll

diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 1c841155e6434..f0418a8efe398 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -105,6 +105,9 @@ class MachineInstr
                                         // known to be exact.
     NoFPExcept   = 1 << 14,             // Instruction does not raise
                                         // floatint-point exceptions.
+    NoMerge      = 1 << 15,             // Passes that drop source location info
+                                        // (e.g. branch folding) should skip
+                                        // this instruction.
   };
 
 private:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 462d9f91c4f15..590919f89cac7 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -278,6 +278,7 @@ class SelectionDAG {
   struct CallSiteDbgInfo {
     CallSiteInfo CSInfo;
     MDNode *HeapAllocSite = nullptr;
+    bool NoMerge = false;
   };
 
   DenseMap<const SDNode *, CallSiteDbgInfo> SDCallSiteDbgInfo;
@@ -1916,6 +1917,18 @@ class SelectionDAG {
     return It->second.HeapAllocSite;
   }
 
+  void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge) {
+    if (NoMerge) // Only touch the map when the flag is actually set.
+      SDCallSiteDbgInfo[Node].NoMerge = NoMerge;
+  }
+
+  bool getNoMergeSiteInfo(const SDNode *Node) {
+    auto I = SDCallSiteDbgInfo.find(Node);
+    if (I == SDCallSiteDbgInfo.end()) // No entry recorded: not nomerge.
+      return false;
+    return I->second.NoMerge;
+  }
+
   /// Return the current function's default denormal handling kind for the given
   /// floating point type.
   DenormalMode getDenormalMode(EVT VT) const {
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2689838b3e7cc..d2fb5afb9adbf 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3613,6 +3613,7 @@ class TargetLowering : public TargetLoweringBase {
     bool IsConvergent      : 1;
     bool IsPatchPoint      : 1;
     bool IsPreallocated : 1;
+    bool NoMerge           : 1;
 
     // IsTailCall should be modified by implementations of
     // TargetLowering::LowerCall that perform tail call conversions.
@@ -3636,7 +3637,8 @@ class TargetLowering : public TargetLoweringBase {
     CallLoweringInfo(SelectionDAG &DAG)
         : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
           DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false),
-          IsPatchPoint(false), IsPreallocated(false), DAG(DAG) {}
+          IsPatchPoint(false), IsPreallocated(false), NoMerge(false),
+          DAG(DAG) {}
 
     CallLoweringInfo &setDebugLoc(const SDLoc &dl) {
       DL = dl;
@@ -3685,7 +3687,8 @@ class TargetLowering : public TargetLoweringBase {
       IsReturnValueUsed = !Call.use_empty();
       RetSExt = Call.hasRetAttr(Attribute::SExt);
       RetZExt = Call.hasRetAttr(Attribute::ZExt);
-
+      NoMerge = Call.hasFnAttr(Attribute::NoMerge);
+      
       Callee = Target;
 
       CallConv = Call.getCallingConv();
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 852bfb36c7208..df79019a6402d 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -348,6 +348,9 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
         MBBI1->isInlineAsm()) {
       break;
     }
+    if (MBBI1->getFlag(MachineInstr::NoMerge) ||
+        MBBI2->getFlag(MachineInstr::NoMerge))
+      break;
     ++TailLen;
     I1 = MBBI1;
     I2 = MBBI2;
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 36cd39f2c4759..fa23df6288e99 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -778,6 +778,8 @@ void MIPrinter::print(const MachineInstr &MI) {
     OS << "exact ";
   if (MI.getFlag(MachineInstr::NoFPExcept))
     OS << "nofpexcept ";
+  if (MI.getFlag(MachineInstr::NoMerge))
+    OS << "nomerge ";
 
   OS << TII->getName(MI.getOpcode());
   if (I < E)
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 7afa61f2c4dbd..987de0cb3b0e5 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1595,6 +1595,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
     OS << "exact ";
   if (getFlag(MachineInstr::NoFPExcept))
     OS << "nofpexcept ";
+  if (getFlag(MachineInstr::NoMerge))
+    OS << "nomerge ";
 
   // Print the opcode name.
   if (TII)
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 83aaf93c9ba0d..731cd2396b626 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -872,6 +872,10 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
         DAG->getTarget().Options.EmitCallSiteInfo)
       MF.addCallArgsForwardingRegs(MI, DAG->getSDCallSiteInfo(Node));
 
+    if (DAG->getNoMergeSiteInfo(Node)) {
+      MI->setFlag(MachineInstr::MIFlag::NoMerge);
+    }
+
     return MI;
   };
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 81b50346437c7..aece1d0da59ab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4541,6 +4541,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Returns a chain and a flag for retval copy to use.
   Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   InFlag = Chain.getValue(1);
   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 98161c3494445..bb50525b673b7 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2542,6 +2542,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Returns a chain and a flag for retval copy to use.
   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   InFlag = Chain.getValue(1);
   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 53f9ac678c7b7..7e0cbbff2515c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -5586,6 +5586,7 @@ SDValue PPCTargetLowering::FinishCall(
 
   std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
   Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
+  DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);
   Glue = Chain.getValue(1);
 
   // When performing tail call optimization the callee pops its arguments off
@@ -5667,7 +5668,8 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
       // hasNest
       Subtarget.is64BitELFABI() &&
-          any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }));
+          any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
+      CLI.NoMerge);
 
   if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
     return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 29d4e54edc672..77083b4761655 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -979,12 +979,13 @@ namespace llvm {
       const bool IsPatchPoint : 1;
       const bool IsIndirect : 1;
       const bool HasNest : 1;
+      const bool NoMerge : 1;
 
       CallFlags(CallingConv::ID CC, bool IsTailCall, bool IsVarArg,
-                bool IsPatchPoint, bool IsIndirect, bool HasNest)
+                bool IsPatchPoint, bool IsIndirect, bool HasNest, bool NoMerge)
           : CallConv(CC), IsTailCall(IsTailCall), IsVarArg(IsVarArg),
             IsPatchPoint(IsPatchPoint), IsIndirect(IsIndirect),
-            HasNest(HasNest) {}
+            HasNest(HasNest), NoMerge(NoMerge) {}
     };
 
   private:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8035d42e1d420..7b1174491e64e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2353,6 +2353,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
+  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   Glue = Chain.getValue(1);
 
   // Mark the end of the call, which is glued to the call itself.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a1121600346f2..3d2cdccd50a59 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4336,6 +4336,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   }
   InFlag = Chain.getValue(1);
+  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
 
   // Save heapallocsite metadata.
diff --git a/llvm/test/CodeGen/AArch64/nomerge.ll b/llvm/test/CodeGen/AArch64/nomerge.ll
new file mode 100644
index 0000000000000..4ef1027560802
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/nomerge.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=aarch64 -o - | FileCheck %s
+
+define void @foo(i32 %i) {
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.then2:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.end3:
+  tail call void @bar() #0
+  ret void
+}
+
+declare void @bar()
+
+attributes #0 = { nomerge }
+
+; CHECK-LABEL: foo:
+; CHECK: // %bb.0: // %entry
+; CHECK: // %bb.1: // %entry
+; CHECK: // %bb.2: // %if.then
+; CHECK-NEXT: bl bar
+; CHECK: b bar
+; CHECK: .LBB0_3: // %if.then2
+; CHECK-NEXT: bl bar
+; CHECK: .LBB0_4: // %if.end3
+; CHECK: b bar
diff --git a/llvm/test/CodeGen/ARM/nomerge.ll b/llvm/test/CodeGen/ARM/nomerge.ll
new file mode 100644
index 0000000000000..b4e01c0560cf7
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/nomerge.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=arm -o - | FileCheck %s
+
+define void @foo(i32 %i) {
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.then2:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.end3:
+  tail call void @bar() #0
+  ret void
+}
+
+declare void @bar()
+
+attributes #0 = { nomerge }
+
+; CHECK-LABEL: foo:
+; CHECK: @ %bb.0: @ %entry
+; CHECK: @ %bb.1: @ %entry
+; CHECK: @ %bb.2: @ %if.then
+; CHECK-NEXT: bl bar
+; CHECK: b bar
+; CHECK: .LBB0_3: @ %if.then2
+; CHECK-NEXT: bl bar
+; CHECK: .LBB0_4: @ %if.end3
+; CHECK: b bar
diff --git a/llvm/test/CodeGen/PowerPC/nomerge.ll b/llvm/test/CodeGen/PowerPC/nomerge.ll
new file mode 100644
index 0000000000000..4e3db233e3ce0
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/nomerge.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=powerpc -o - | FileCheck %s
+
+define void @foo(i32 %i) {
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.then2:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.end3:
+  tail call void @bar() #0
+  ret void
+}
+
+declare void @bar()
+
+attributes #0 = { nomerge }
+
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK: # %bb.1: # %entry
+; CHECK: # %bb.2: # %if.then
+; CHECK-NEXT: bl bar
+; CHECK: .LBB0_3: # %if.then2
+; CHECK-NEXT: bl bar
+; CHECK: .LBB0_4: # %if.end3
+; CHECK-NEXT: bl bar
diff --git a/llvm/test/CodeGen/RISCV/nomerge.ll b/llvm/test/CodeGen/RISCV/nomerge.ll
new file mode 100644
index 0000000000000..c35c708d0fcbf
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nomerge.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=riscv64 -o - | FileCheck %s
+
+define void @foo(i32 %i) {
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.then2:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.end3:
+  tail call void @bar() #0
+  ret void
+}
+
+declare void @bar()
+
+attributes #0 = { nomerge }
+
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK: # %bb.1: # %entry
+; CHECK: # %bb.2: # %if.then
+; CHECK-NEXT: call bar
+; CHECK: .LBB0_3: # %if.then2
+; CHECK-NEXT: call bar
+; CHECK: .LBB0_4: # %if.end3
+; CHECK: tail bar
diff --git a/llvm/test/CodeGen/X86/nomerge.ll b/llvm/test/CodeGen/X86/nomerge.ll
new file mode 100644
index 0000000000000..8da27f79db908
--- /dev/null
+++ b/llvm/test/CodeGen/X86/nomerge.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64 -o - | FileCheck %s
+
+define void @foo(i32 %i) {
+entry:
+  switch i32 %i, label %if.end3 [
+    i32 5, label %if.then
+    i32 7, label %if.then2
+  ]
+
+if.then:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.then2:
+  tail call void @bar() #0
+  br label %if.end3
+
+if.end3:
+  tail call void @bar() #0
+  ret void
+}
+
+declare void @bar()
+
+attributes #0 = { nomerge }
+
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK: # %bb.1: # %entry
+; CHECK: # %bb.2: # %if.then
+; CHECK-NEXT: callq bar
+; CHECK: jmp bar # TAILCALL
+; CHECK: .LBB0_3: # %if.then2
+; CHECK: callq bar
+; CHECK: .LBB0_4: # %if.end3
+; CHECK: jmp bar # TAILCALL

From 1a5c97f3a4b88438b19ff34a285e559e57b1e9d4 Mon Sep 17 00:00:00 2001
From: Jan Korous <jkorous@apple.com>
Date: Tue, 28 Apr 2020 20:00:49 -0700
Subject: [PATCH 574/770] [ASTMatchers] Matchers related to C++ inheritance

Differential Revision: https://reviews.llvm.org/D79063
---
 ...onPrivateMemberVariablesInClassesCheck.cpp |   5 +-
 clang/docs/LibASTMatchersReference.html       | 188 ++++++++++++++++--
 clang/include/clang/AST/ASTTypeTraits.h       |   7 +
 clang/include/clang/ASTMatchers/ASTMatchers.h | 118 ++++++++---
 .../clang/ASTMatchers/ASTMatchersInternal.h   |  19 ++
 clang/lib/AST/ASTTypeTraits.cpp               |   1 +
 clang/lib/ASTMatchers/ASTMatchersInternal.cpp |  24 +++
 clang/lib/ASTMatchers/Dynamic/Registry.cpp    |   1 +
 .../ASTMatchers/ASTMatchersNarrowingTest.cpp  | 121 +++++++++++
 9 files changed, 432 insertions(+), 52 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp
index 12fb7d8a7ae8c..5b69885097427 100644
--- a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.cpp
@@ -59,8 +59,9 @@ void NonPrivateMemberVariablesInClassesCheck::registerMatchers(
   // If we are ok with public fields, then we only want to complain about
   // protected fields, else we want to complain about all non-private fields.
   // We can ignore public member variables in structs/classes, in unions.
-  auto InterestingField = fieldDecl(
-      IgnorePublicMemberVariables ? isProtected() : unless(isPrivate()));
+  auto InterestingField = IgnorePublicMemberVariables
+                              ? fieldDecl(isProtected())
+                              : fieldDecl(unless(isPrivate()));
 
   // We only want the records that not only contain the mutable data (non-static
   // member variables), but also have some logic (non-static, non-implicit
diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index 9db6795eb5fab..bb5e4984fcdf4 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -2175,6 +2175,75 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isPrivate1')"><a name="isPrivate1Anchor">isPrivate</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isPrivate1"><pre>Matches private C++ declarations and C++ base specifiers that specify private
+inheritance.
+
+Examples:
+  class C {
+  public:    int a;
+  protected: int b;
+  private:   int c; // fieldDecl(isPrivate()) matches 'c'
+  };
+
+  struct Base {};
+  struct Derived1 : private Base {}; // matches 'Base'
+  class Derived2 : Base {}; // matches 'Base'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isProtected1')"><a name="isProtected1Anchor">isProtected</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isProtected1"><pre>Matches protected C++ declarations and C++ base specifiers that specify
+protected inheritance.
+
+Examples:
+  class C {
+  public:    int a;
+  protected: int b; // fieldDecl(isProtected()) matches 'b'
+  private:   int c;
+  };
+
+  class Base {};
+  class Derived : protected Base {}; // matches 'Base'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isPublic1')"><a name="isPublic1Anchor">isPublic</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isPublic1"><pre>Matches public C++ declarations and C++ base specifiers that specify public
+inheritance.
+
+Examples:
+  class C {
+  public:    int a; // fieldDecl(isPublic()) matches 'a'
+  protected: int b;
+  private:   int c;
+  };
+
+  class Base {};
+  class Derived1 : public Base {}; // matches 'Base'
+  struct Derived2 : Base {}; // matches 'Base'
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('isVirtual1')"><a name="isVirtual1Anchor">isVirtual</a></td><td></td></tr>
+<tr><td colspan="4" class="doc" id="isVirtual1"><pre>Matches declarations of virtual methods and C++ base specifiers that specify
+virtual inheritance.
+
+Example:
+  class A {
+   public:
+    virtual void x(); // matches x
+  };
+
+Example:
+  class Base {};
+  class DirectlyDerived : virtual Base {}; // matches Base
+  class IndirectlyDerived : DirectlyDerived, Base {}; // matches Base
+
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBoolLiteralExpr.html">CXXBoolLiteralExpr</a>&gt;</td><td class="name" onclick="toggle('equals5')"><a name="equals5Anchor">equals</a></td><td>bool Value</td></tr>
 <tr><td colspan="4" class="doc" id="equals5"><pre></pre></td></tr>
 
@@ -2562,14 +2631,21 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;</td><td class="name" onclick="toggle('isVirtual0')"><a name="isVirtual0Anchor">isVirtual</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isVirtual0"><pre>Matches if the given method declaration is virtual.
+<tr><td colspan="4" class="doc" id="isVirtual0"><pre>Matches declarations of virtual methods and C++ base specifiers that specify
+virtual inheritance.
 
-Given
+Example:
   class A {
    public:
-    virtual void x();
+    virtual void x(); // matches x
   };
-  matches A::x
+
+Example:
+  class Base {};
+  class DirectlyDerived : virtual Base {}; // matches Base
+  class IndirectlyDerived : DirectlyDerived, Base {}; // matches Base
+
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;
 </pre></td></tr>
 
 
@@ -3012,44 +3088,52 @@ <h2 id="narrowing-matchers">Narrowing Matchers</h2>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPrivate0')"><a name="isPrivate0Anchor">isPrivate</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isPrivate0"><pre>Matches private C++ declarations.
+<tr><td colspan="4" class="doc" id="isPrivate0"><pre>Matches private C++ declarations and C++ base specifiers that specify private
+inheritance.
 
-Given
+Examples:
   class C {
   public:    int a;
   protected: int b;
-  private:   int c;
+  private:   int c; // fieldDecl(isPrivate()) matches 'c'
   };
-fieldDecl(isPrivate())
-  matches 'int c;'
+
+  struct Base {};
+  struct Derived1 : private Base {}; // matches 'Base'
+  class Derived2 : Base {}; // matches 'Base'
 </pre></td></tr>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isProtected0')"><a name="isProtected0Anchor">isProtected</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isProtected0"><pre>Matches protected C++ declarations.
+<tr><td colspan="4" class="doc" id="isProtected0"><pre>Matches protected C++ declarations and C++ base specifiers that specify
+protected inheritance.
 
-Given
+Examples:
   class C {
   public:    int a;
-  protected: int b;
+  protected: int b; // fieldDecl(isProtected()) matches 'b'
   private:   int c;
   };
-fieldDecl(isProtected())
-  matches 'int b;'
+
+  class Base {};
+  class Derived : protected Base {}; // matches 'Base'
 </pre></td></tr>
 
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('isPublic0')"><a name="isPublic0Anchor">isPublic</a></td><td></td></tr>
-<tr><td colspan="4" class="doc" id="isPublic0"><pre>Matches public C++ declarations.
+<tr><td colspan="4" class="doc" id="isPublic0"><pre>Matches public C++ declarations and C++ base specifiers that specify public
+inheritance.
 
-Given
+Examples:
   class C {
-  public:    int a;
+  public:    int a; // fieldDecl(isPublic()) matches 'a'
   protected: int b;
   private:   int c;
   };
-fieldDecl(isPublic())
-  matches 'int a;'
+
+  class Base {};
+  class Derived1 : public Base {}; // matches 'Base'
+  struct Derived2 : Base {}; // matches 'Base'
 </pre></td></tr>
 
 
@@ -5135,6 +5219,33 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;</td><td class="name" onclick="toggle('hasType7')"><a name="hasType7Anchor">hasType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt; InnerMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasType7"><pre>Overloaded to match the declaration of the expression's or value
+declaration's type.
+
+In case of a value declaration (for example a variable declaration),
+this resolves one layer of indirection. For example, in the value
+declaration "X x;", cxxRecordDecl(hasName("X")) matches the declaration of
+X, while varDecl(hasType(cxxRecordDecl(hasName("X")))) matches the
+declaration of x.
+
+Example matches x (matcher = expr(hasType(cxxRecordDecl(hasName("X")))))
+            and z (matcher = varDecl(hasType(cxxRecordDecl(hasName("X")))))
+            and friend class X (matcher = friendDecl(hasType("X"))
+ class X {};
+ void y(X &amp;x) { x; X z; }
+ class Y { friend class X; };
+
+Example matches class Derived
+(matcher = cxxRecordDecl(hasAnyBase(hasType(cxxRecordDecl(hasName("Base"))))))
+class Base {};
+class Derived : Base {};
+
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;,
+Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXConstructExpr.html">CXXConstructExpr</a>&gt;</td><td class="name" onclick="toggle('forEachArgumentWithParam1')"><a name="forEachArgumentWithParam1Anchor">forEachArgumentWithParam</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; ArgMatcher, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ParmVarDecl.html">ParmVarDecl</a>&gt; ParamMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachArgumentWithParam1"><pre>Matches all arguments and their respective ParmVarDecl.
 
@@ -5518,6 +5629,21 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('hasAnyBase0')"><a name="hasAnyBase0Anchor">hasAnyBase</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt; BaseSpecMatcher</td></tr>
+<tr><td colspan="4" class="doc" id="hasAnyBase0"><pre>Matches C++ classes that have a direct or indirect base matching BaseSpecMatcher.
+
+Example matches DirectlyDerived, IndirectlyDerived (BaseSpecMatcher == hasType(cxxRecordDecl(hasName("SpecialBase"))))
+  class Foo {};
+  class Bar : Foo {};
+  class Baz : Bar {};
+  class SpecialBase {};
+  class DirectlyDerived : SpecialBase {};  // directly derived
+  class IndirectlyDerived : DirectlyDerived {};  // indirectly derived
+
+FIXME: Refactor this and isDerivedFrom to reuse implementation.
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXRecordDecl.html">CXXRecordDecl</a>&gt;</td><td class="name" onclick="toggle('hasMethod0')"><a name="hasMethod0Anchor">hasMethod</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXMethodDecl.html">CXXMethodDecl</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasMethod0"><pre>Matches the first method of a class or struct that satisfies InnerMatcher.
 
@@ -6075,7 +6201,13 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
  void y(X &amp;x) { x; X z; }
  class Y { friend class X; };
 
-Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;
+Example matches class Derived
+(matcher = cxxRecordDecl(hasAnyBase(hasType(cxxRecordDecl(hasName("Base"))))))
+class Base {};
+class Derived : Base {};
+
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;,
+Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;
 </pre></td></tr>
 
 
@@ -6289,7 +6421,13 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
  void y(X &amp;x) { x; X z; }
  class Y { friend class X; };
 
-Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;
+Example matches class Derived
+(matcher = cxxRecordDecl(hasAnyBase(hasType(cxxRecordDecl(hasName("Base"))))))
+class Base {};
+class Derived : Base {};
+
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;,
+Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;
 </pre></td></tr>
 
 
@@ -7677,7 +7815,13 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
  void y(X &amp;x) { x; X z; }
  class Y { friend class X; };
 
-Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;
+Example matches class Derived
+(matcher = cxxRecordDecl(hasAnyBase(hasType(cxxRecordDecl(hasName("Base"))))))
+class Base {};
+class Derived : Base {};
+
+Usable as: Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>&gt;, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ValueDecl.html">ValueDecl</a>&gt;,
+Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXBaseSpecifier.html">CXXBaseSpecifier</a>&gt;
 </pre></td></tr>
 
 
diff --git a/clang/include/clang/AST/ASTTypeTraits.h b/clang/include/clang/AST/ASTTypeTraits.h
index cd80e9bc38084..67fa4ab1b6a4f 100644
--- a/clang/include/clang/AST/ASTTypeTraits.h
+++ b/clang/include/clang/AST/ASTTypeTraits.h
@@ -16,6 +16,7 @@
 #define LLVM_CLANG_AST_ASTTYPETRAITS_H
 
 #include "clang/AST/ASTFwd.h"
+#include "clang/AST/DeclCXX.h"
 #include "clang/AST/NestedNameSpecifier.h"
 #include "clang/AST/TemplateBase.h"
 #include "clang/AST/TypeLoc.h"
@@ -136,6 +137,7 @@ class ASTNodeKind {
     NKI_QualType,
     NKI_TypeLoc,
     NKI_LastKindWithoutPointerIdentity = NKI_TypeLoc,
+    NKI_CXXBaseSpecifier,
     NKI_CXXCtorInitializer,
     NKI_NestedNameSpecifier,
     NKI_Decl,
@@ -198,6 +200,7 @@ KIND_TO_KIND_ID(Decl)
 KIND_TO_KIND_ID(Stmt)
 KIND_TO_KIND_ID(Type)
 KIND_TO_KIND_ID(OMPClause)
+KIND_TO_KIND_ID(CXXBaseSpecifier)
 #define DECL(DERIVED, BASE) KIND_TO_KIND_ID(DERIVED##Decl)
 #include "clang/AST/DeclNodes.inc"
 #define STMT(DERIVED, BASE) KIND_TO_KIND_ID(DERIVED)
@@ -510,6 +513,10 @@ template <>
 struct DynTypedNode::BaseConverter<
     TypeLoc, void> : public ValueConverter<TypeLoc> {};
 
+template <>
+struct DynTypedNode::BaseConverter<CXXBaseSpecifier, void>
+    : public PtrConverter<CXXBaseSpecifier> {};
+
 // The only operation we allow on unsupported types is \c get.
 // This allows to conveniently use \c DynTypedNode when having an arbitrary
 // AST node that is not supported, but prevents misuse - a user cannot create
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index a3747faa139c8..6dbcc01442ed6 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -47,6 +47,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTTypeTraits.h"
 #include "clang/AST/Attr.h"
+#include "clang/AST/CXXInheritance.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclFriend.h"
@@ -548,52 +549,72 @@ extern const internal::VariadicDynCastAllOfMatcher<Decl,
 extern const internal::VariadicDynCastAllOfMatcher<Decl, TemplateTypeParmDecl>
     templateTypeParmDecl;
 
-/// Matches public C++ declarations.
+/// Matches public C++ declarations and C++ base specifiers that specify public
+/// inheritance.
 ///
-/// Given
+/// Examples:
 /// \code
 ///   class C {
-///   public:    int a;
+///   public:    int a; // fieldDecl(isPublic()) matches 'a'
 ///   protected: int b;
 ///   private:   int c;
 ///   };
 /// \endcode
-/// fieldDecl(isPublic())
-///   matches 'int a;'
-AST_MATCHER(Decl, isPublic) {
-  return Node.getAccess() == AS_public;
+///
+/// \code
+///   class Base {};
+///   class Derived1 : public Base {}; // matches 'Base'
+///   struct Derived2 : Base {}; // matches 'Base'
+/// \endcode
+AST_POLYMORPHIC_MATCHER(isPublic,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(Decl,
+                                                        CXXBaseSpecifier)) {
+  return getAccessSpecifier(Node) == AS_public;
 }
 
-/// Matches protected C++ declarations.
+/// Matches protected C++ declarations and C++ base specifiers that specify
+/// protected inheritance.
 ///
-/// Given
+/// Examples:
 /// \code
 ///   class C {
 ///   public:    int a;
-///   protected: int b;
+///   protected: int b; // fieldDecl(isProtected()) matches 'b'
 ///   private:   int c;
 ///   };
 /// \endcode
-/// fieldDecl(isProtected())
-///   matches 'int b;'
-AST_MATCHER(Decl, isProtected) {
-  return Node.getAccess() == AS_protected;
+///
+/// \code
+///   class Base {};
+///   class Derived : protected Base {}; // matches 'Base'
+/// \endcode
+AST_POLYMORPHIC_MATCHER(isProtected,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(Decl,
+                                                        CXXBaseSpecifier)) {
+  return getAccessSpecifier(Node) == AS_protected;
 }
 
-/// Matches private C++ declarations.
+/// Matches private C++ declarations and C++ base specifiers that specify
+/// private inheritance.
 ///
-/// Given
+/// Examples:
 /// \code
 ///   class C {
 ///   public:    int a;
 ///   protected: int b;
-///   private:   int c;
+///   private:   int c; // fieldDecl(isPrivate()) matches 'c'
 ///   };
 /// \endcode
-/// fieldDecl(isPrivate())
-///   matches 'int c;'
-AST_MATCHER(Decl, isPrivate) {
-  return Node.getAccess() == AS_private;
+///
+/// \code
+///   struct Base {};
+///   struct Derived1 : private Base {}; // matches 'Base'
+///   class Derived2 : Base {}; // matches 'Base'
+/// \endcode
+AST_POLYMORPHIC_MATCHER(isPrivate,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(Decl,
+                                                        CXXBaseSpecifier)) {
+  return getAccessSpecifier(Node) == AS_private;
 }
 
 /// Matches non-static data members that are bit-fields.
@@ -2839,6 +2860,26 @@ AST_POLYMORPHIC_MATCHER_P_OVERLOAD(
   return Matcher<ObjCInterfaceDecl>(M).matches(*InterfaceDecl, Finder, Builder);
 }
 
+/// Matches C++ classes that have a direct or indirect base matching \p
+/// BaseSpecMatcher.
+///
+/// Example:
+/// matcher hasAnyBase(hasType(cxxRecordDecl(hasName("SpecialBase"))))
+/// \code
+///   class Foo {};
+///   class Bar : Foo {};
+///   class Baz : Bar {};
+///   class SpecialBase {};
+///   class Proxy : SpecialBase {};  // matches Proxy
+///   class IndirectlyDerived : Proxy {};  // matches IndirectlyDerived
+/// \endcode
+///
+// FIXME: Refactor this and isDerivedFrom to reuse implementation.
+AST_MATCHER_P(CXXRecordDecl, hasAnyBase, internal::Matcher<CXXBaseSpecifier>,
+              BaseSpecMatcher) {
+  return internal::matchesAnyBase(Node, BaseSpecMatcher, Finder, Builder);
+}
+
 /// Similar to \c isDerivedFrom(), but also matches classes that directly
 /// match \c Base.
 AST_POLYMORPHIC_MATCHER_P_OVERLOAD(
@@ -3469,9 +3510,19 @@ AST_POLYMORPHIC_MATCHER_P_OVERLOAD(
 ///  class Y { friend class X; };
 /// \endcode
 ///
-/// Usable as: Matcher<Expr>, Matcher<ValueDecl>
+/// Example matches class Derived
+/// (matcher = cxxRecordDecl(hasAnyBase(hasType(cxxRecordDecl(hasName("Base"))))))
+/// \code
+/// class Base {};
+/// class Derived : Base {};
+/// \endcode
+///
+/// Usable as: Matcher<Expr>, Matcher<FriendDecl>, Matcher<ValueDecl>,
+/// Matcher<CXXBaseSpecifier>
 AST_POLYMORPHIC_MATCHER_P_OVERLOAD(
-    hasType, AST_POLYMORPHIC_SUPPORTED_TYPES(Expr, FriendDecl, ValueDecl),
+    hasType,
+    AST_POLYMORPHIC_SUPPORTED_TYPES(Expr, FriendDecl, ValueDecl,
+                                    CXXBaseSpecifier),
     internal::Matcher<Decl>, InnerMatcher, 1) {
   QualType QT = internal::getUnderlyingType(Node);
   if (!QT.isNull())
@@ -5177,17 +5228,28 @@ AST_MATCHER_P(CXXMethodDecl, forEachOverridden,
   return Matched;
 }
 
-/// Matches if the given method declaration is virtual.
+/// Matches declarations of virtual methods and C++ base specifiers that specify
+/// virtual inheritance.
 ///
-/// Given
+/// Example:
 /// \code
 ///   class A {
 ///    public:
-///     virtual void x();
+///     virtual void x(); // matches x
 ///   };
 /// \endcode
-///   matches A::x
-AST_MATCHER(CXXMethodDecl, isVirtual) {
+///
+/// Example:
+/// \code
+///   class Base {};
+///   class DirectlyDerived : virtual Base {}; // matches Base
+///   class IndirectlyDerived : DirectlyDerived, Base {}; // matches Base
+/// \endcode
+///
+/// Usable as: Matcher<CXXMethodDecl>, Matcher<CXXBaseSpecifier>
+AST_POLYMORPHIC_MATCHER(isVirtual,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(CXXMethodDecl,
+                                                        CXXBaseSpecifier)) {
   return Node.isVirtual();
 }
 
diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
index e064b28b84f91..fc41407ba2961 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
@@ -130,6 +130,9 @@ inline QualType getUnderlyingType(const FriendDecl &Node) {
     return TSI->getType();
   return QualType();
 }
+inline QualType getUnderlyingType(const CXXBaseSpecifier &Node) {
+  return Node.getType();
+}
 
 /// Unifies obtaining the FunctionProtoType pointer from both
 /// FunctionProtoType and FunctionDecl nodes..
@@ -142,6 +145,15 @@ inline const FunctionProtoType *getFunctionProtoType(const FunctionDecl &Node) {
   return Node.getType()->getAs<FunctionProtoType>();
 }
 
+/// Unifies obtaining the access specifier from Decl and CXXBaseSpecifier nodes.
+inline clang::AccessSpecifier getAccessSpecifier(const Decl &Node) {
+  return Node.getAccess();
+}
+
+inline clang::AccessSpecifier getAccessSpecifier(const CXXBaseSpecifier &Node) {
+  return Node.getAccessSpecifier();
+}
+
 /// Internal version of BoundNodes. Holds all the bound nodes.
 class BoundNodesMap {
 public:
@@ -1929,6 +1941,13 @@ using HasOverloadOpNameMatcher = PolymorphicMatcherWithParam1<
 HasOverloadOpNameMatcher
 hasAnyOverloadedOperatorNameFunc(ArrayRef<const StringRef *> NameRefs);
 
+/// Returns true if \p Node has a base specifier matching \p BaseSpecMatcher.
+///
+/// A class is not considered to be derived from itself.
+bool matchesAnyBase(const CXXRecordDecl &Node,
+                    const Matcher<CXXBaseSpecifier> &BaseSpecMatcher,
+                    ASTMatchFinder *Finder, BoundNodesTreeBuilder *Builder);
+
 } // namespace internal
 
 } // namespace ast_matchers
diff --git a/clang/lib/AST/ASTTypeTraits.cpp b/clang/lib/AST/ASTTypeTraits.cpp
index 6404edd79679a..37e81e5813eca 100644
--- a/clang/lib/AST/ASTTypeTraits.cpp
+++ b/clang/lib/AST/ASTTypeTraits.cpp
@@ -27,6 +27,7 @@ const ASTNodeKind::KindInfo ASTNodeKind::AllKindInfo[] = {
   { NKI_None, "NestedNameSpecifierLoc" },
   { NKI_None, "QualType" },
   { NKI_None, "TypeLoc" },
+  { NKI_None, "CXXBaseSpecifier" },
   { NKI_None, "CXXCtorInitializer" },
   { NKI_None, "NestedNameSpecifier" },
   { NKI_None, "Decl" },
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index 1ee89ccd3c115..9b69734d075d3 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -68,6 +68,30 @@ bool OptionallyVariadicOperator(const DynTypedNode &DynNode,
                                 BoundNodesTreeBuilder *Builder,
                                 ArrayRef<DynTypedMatcher> InnerMatchers);
 
+bool matchesAnyBase(const CXXRecordDecl &Node,
+                    const Matcher<CXXBaseSpecifier> &BaseSpecMatcher,
+                    ASTMatchFinder *Finder, BoundNodesTreeBuilder *Builder) {
+  if (!Node.hasDefinition())
+    return false;
+
+  CXXBasePaths Paths;
+  Paths.setOrigin(&Node);
+  // Match on a scratch copy so bindings are committed only on success.
+  const auto basePredicate =
+      [Finder, Builder, &BaseSpecMatcher](const CXXBaseSpecifier *BaseSpec,
+                                          CXXBasePath &IgnoredParam) {
+        BoundNodesTreeBuilder Result(*Builder);
+        if (BaseSpecMatcher.matches(*BaseSpec, Finder, &Result)) {
+          *Builder = std::move(Result);
+          return true;
+        }
+        return false;
+      };
+
+  return Node.lookupInBases(basePredicate, Paths,
+                            /*LookupInDependent =*/true);
+}
+
 void BoundNodesTreeBuilder::visitMatches(Visitor *ResultVisitor) {
   if (Bindings.empty())
     Bindings.push_back(BoundNodesMap());
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index 14d9bbb3e52d1..950a56f795516 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -237,6 +237,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(has);
   REGISTER_MATCHER(hasAncestor);
   REGISTER_MATCHER(hasAnyArgument);
+  REGISTER_MATCHER(hasAnyBase);
   REGISTER_MATCHER(hasAnyClause);
   REGISTER_MATCHER(hasAnyConstructorInitializer);
   REGISTER_MATCHER(hasAnyDeclaration);
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index 929188abf6acd..9f8538edb35bd 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -3006,5 +3006,126 @@ void x() {
   EXPECT_TRUE(matchesWithOpenMP(Source6, Matcher));
 }
 
+TEST(HasAnyBase, DirectBase) {
+  EXPECT_TRUE(matches(
+      "struct Base {};"
+      "struct ExpectedMatch : Base {};",
+      cxxRecordDecl(hasName("ExpectedMatch"),
+                    hasAnyBase(hasType(cxxRecordDecl(hasName("Base")))))));
+}
+
+TEST(HasAnyBase, IndirectBase) {
+  EXPECT_TRUE(matches(
+      "struct Base {};"
+      "struct Intermediate : Base {};"
+      "struct ExpectedMatch : Intermediate {};",
+      cxxRecordDecl(hasName("ExpectedMatch"),
+                    hasAnyBase(hasType(cxxRecordDecl(hasName("Base")))))));
+}
+
+TEST(HasAnyBase, NoBase) {
+  EXPECT_TRUE(notMatches("struct Foo {};"
+                         "struct Bar {};",
+                         cxxRecordDecl(hasAnyBase(hasType(cxxRecordDecl())))));
+}
+
+TEST(IsPublicBase, Public) {
+  EXPECT_TRUE(matches("class Base {};"
+                      "class Derived : public Base {};",
+                      cxxRecordDecl(hasAnyBase(isPublic()))));
+}
+
+TEST(IsPublicBase, DefaultAccessSpecifierPublic) {
+  EXPECT_TRUE(matches("class Base {};"
+                      "struct Derived : Base {};",
+                      cxxRecordDecl(hasAnyBase(isPublic()))));
+}
+
+TEST(IsPublicBase, Private) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : private Base {};",
+                         cxxRecordDecl(hasAnyBase(isPublic()))));
+}
+
+TEST(IsPublicBase, DefaultAccessSpecifierPrivate) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : Base {};",
+                         cxxRecordDecl(hasAnyBase(isPublic()))));
+}
+
+TEST(IsPublicBase, Protected) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : protected Base {};",
+                         cxxRecordDecl(hasAnyBase(isPublic()))));
+}
+
+TEST(IsPrivateBase, Private) {
+  EXPECT_TRUE(matches("class Base {};"
+                      "class Derived : private Base {};",
+                      cxxRecordDecl(hasAnyBase(isPrivate()))));
+}
+
+TEST(IsPrivateBase, DefaultAccessSpecifierPrivate) {
+  EXPECT_TRUE(matches("struct Base {};"
+                      "class Derived : Base {};",
+                      cxxRecordDecl(hasAnyBase(isPrivate()))));
+}
+
+TEST(IsPrivateBase, Public) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : public Base {};",
+                         cxxRecordDecl(hasAnyBase(isPrivate()))));
+}
+
+TEST(IsPrivateBase, DefaultAccessSpecifierPublic) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "struct Derived : Base {};",
+                         cxxRecordDecl(hasAnyBase(isPrivate()))));
+}
+
+TEST(IsPrivateBase, Protected) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : protected Base {};",
+                         cxxRecordDecl(hasAnyBase(isPrivate()))));
+}
+
+TEST(IsProtectedBase, Protected) {
+  EXPECT_TRUE(matches("class Base {};"
+                      "class Derived : protected Base {};",
+                      cxxRecordDecl(hasAnyBase(isProtected()))));
+}
+
+TEST(IsProtectedBase, Public) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : public Base {};",
+                         cxxRecordDecl(hasAnyBase(isProtected()))));
+}
+
+TEST(IsProtectedBase, Private) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : private Base {};",
+                         cxxRecordDecl(hasAnyBase(isProtected()))));
+}
+
+TEST(IsVirtual, Directly) {
+  EXPECT_TRUE(matches("class Base {};"
+                      "class Derived : virtual Base {};",
+                      cxxRecordDecl(hasAnyBase(isVirtual()))));
+}
+
+TEST(IsVirtual, Indirectly) {
+  EXPECT_TRUE(
+      matches("class Base {};"
+              "class Intermediate : virtual Base {};"
+              "class Derived : Intermediate {};",
+              cxxRecordDecl(hasName("Derived"), hasAnyBase(isVirtual()))));
+}
+
+TEST(IsVirtual, NoVirtualBase) {
+  EXPECT_TRUE(notMatches("class Base {};"
+                         "class Derived : Base {};",
+                         cxxRecordDecl(hasAnyBase(isVirtual()))));
+}
+
 } // namespace ast_matchers
 } // namespace clang

From 6f6e91d19337315548f550479f94cbc0af93c8fe Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 29 May 2020 20:42:22 +0100
Subject: [PATCH 575/770] [Matrix] Implement + and - operators for MatrixType.

This patch implements the + and - binary operators for values of
MatrixType. It adds support for matrix +/- matrix, scalar +/- matrix and
matrix +/- scalar.

For the matrix, matrix case, the types must initially be structurally
equivalent. For the scalar, matrix variants, the element type of the
matrix must match the scalar type.

Reviewers: rjmccall, anemet, Bigcheese, rsmith, martong

Reviewed By: rjmccall

Differential Revision: https://reviews.llvm.org/D76793
---
 clang/include/clang/AST/Type.h                |   7 +-
 clang/include/clang/Sema/Sema.h               |   5 +
 clang/lib/CodeGen/CGExprScalar.cpp            |  11 ++
 clang/lib/Sema/SemaExpr.cpp                   |  67 +++++++
 clang/lib/Sema/SemaOverload.cpp               |  43 +++++
 clang/test/CodeGen/matrix-type-operators.c    | 174 ++++++++++++++++++
 .../test/CodeGenCXX/matrix-type-operators.cpp | 156 ++++++++++++++++
 clang/test/Sema/matrix-type-operators.c       |  33 ++++
 clang/test/SemaCXX/matrix-type-operators.cpp  |  93 ++++++++++
 llvm/include/llvm/IR/MatrixBuilder.h          |  20 ++
 10 files changed, 608 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/matrix-type-operators.c
 create mode 100644 clang/test/CodeGenCXX/matrix-type-operators.cpp
 create mode 100644 clang/test/Sema/matrix-type-operators.c
 create mode 100644 clang/test/SemaCXX/matrix-type-operators.cpp

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index ed31dea925f39..3cdce1fbfe533 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2050,7 +2050,8 @@ class alignas(8) Type : public ExtQualsTypeCommonBase {
   bool isComplexIntegerType() const;            // GCC _Complex integer type.
   bool isVectorType() const;                    // GCC vector type.
   bool isExtVectorType() const;                 // Extended vector type.
-  bool isConstantMatrixType() const;            // Matrix type.
+  bool isMatrixType() const;                    // Matrix type.
+  bool isConstantMatrixType() const;            // Constant matrix type.
   bool isDependentAddressSpaceType() const;     // value-dependent address space qualifier
   bool isObjCObjectPointerType() const;         // pointer to ObjC object
   bool isObjCRetainableType() const;            // ObjC object or block pointer
@@ -6744,6 +6745,10 @@ inline bool Type::isExtVectorType() const {
   return isa<ExtVectorType>(CanonicalType);
 }
 
+inline bool Type::isMatrixType() const {
+  return isa<MatrixType>(CanonicalType);
+}
+
 inline bool Type::isConstantMatrixType() const {
   return isa<ConstantMatrixType>(CanonicalType);
 }
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 594c6e03aa38f..03977d2c94f9b 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11205,6 +11205,11 @@ class Sema final {
   QualType CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
                                       SourceLocation Loc);
 
+  /// Type checking for matrix binary operators.
+  QualType CheckMatrixElementwiseOperands(ExprResult &LHS, ExprResult &RHS,
+                                          SourceLocation Loc,
+                                          bool IsCompAssign);
+
   bool areLaxCompatibleVectorTypes(QualType srcType, QualType destType);
   bool isLaxVectorConversion(QualType srcType, QualType destType);
 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 346c429f663e5..84620b1f7d81d 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -37,6 +37,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsPowerPC.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/Module.h"
 #include <cstdarg>
 
@@ -3536,6 +3537,11 @@ Value *ScalarExprEmitter::EmitAdd(const BinOpInfo &op) {
     }
   }
 
+  if (op.Ty->isConstantMatrixType()) {
+    llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+    return MB.CreateAdd(op.LHS, op.RHS);
+  }
+
   if (op.Ty->isUnsignedIntegerType() &&
       CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow) &&
       !CanElideOverflowCheck(CGF.getContext(), op))
@@ -3720,6 +3726,11 @@ Value *ScalarExprEmitter::EmitSub(const BinOpInfo &op) {
       }
     }
 
+    if (op.Ty->isConstantMatrixType()) {
+      llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+      return MB.CreateSub(op.LHS, op.RHS);
+    }
+
     if (op.Ty->isUnsignedIntegerType() &&
         CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow) &&
         !CanElideOverflowCheck(CGF.getContext(), op))
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 63f71d81e0474..f4e8c2d09edc4 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -10243,6 +10243,11 @@ QualType Sema::CheckAdditionOperands(ExprResult &LHS, ExprResult &RHS,
     return compType;
   }
 
+  if (LHS.get()->getType()->isConstantMatrixType() ||
+      RHS.get()->getType()->isConstantMatrixType()) {
+    return CheckMatrixElementwiseOperands(LHS, RHS, Loc, CompLHSTy);
+  }
+
   QualType compType = UsualArithmeticConversions(
       LHS, RHS, Loc, CompLHSTy ? ACK_CompAssign : ACK_Arithmetic);
   if (LHS.isInvalid() || RHS.isInvalid())
@@ -10338,6 +10343,11 @@ QualType Sema::CheckSubtractionOperands(ExprResult &LHS, ExprResult &RHS,
     return compType;
   }
 
+  if (LHS.get()->getType()->isConstantMatrixType() ||
+      RHS.get()->getType()->isConstantMatrixType()) {
+    return CheckMatrixElementwiseOperands(LHS, RHS, Loc, CompLHSTy);
+  }
+
   QualType compType = UsualArithmeticConversions(
       LHS, RHS, Loc, CompLHSTy ? ACK_CompAssign : ACK_Arithmetic);
   if (LHS.isInvalid() || RHS.isInvalid())
@@ -11933,6 +11943,63 @@ QualType Sema::CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
   return GetSignedVectorType(LHS.get()->getType());
 }
 
+static bool tryConvertScalarToMatrixElementTy(Sema &S, QualType ElementType,
+                                              ExprResult *Scalar) {
+  InitializedEntity Entity =
+      InitializedEntity::InitializeTemporary(ElementType);
+  InitializationKind Kind = InitializationKind::CreateCopy(
+      Scalar->get()->getBeginLoc(), SourceLocation());
+  Expr *Arg = Scalar->get();
+  InitializationSequence InitSeq(S, Entity, Kind, Arg);
+  *Scalar = InitSeq.Perform(S, Entity, Kind, Arg);
+  return !Scalar->isInvalid();
+}
+
+QualType Sema::CheckMatrixElementwiseOperands(ExprResult &LHS, ExprResult &RHS,
+                                              SourceLocation Loc,
+                                              bool IsCompAssign) {
+  if (!IsCompAssign) {
+    LHS = DefaultFunctionArrayLvalueConversion(LHS.get());
+    if (LHS.isInvalid())
+      return QualType();
+  }
+  RHS = DefaultFunctionArrayLvalueConversion(RHS.get());
+  if (RHS.isInvalid())
+    return QualType();
+
+  // For conversion purposes, we ignore any qualifiers.
+  // For example, "const float" and "float" are equivalent.
+  QualType LHSType = LHS.get()->getType().getUnqualifiedType();
+  QualType RHSType = RHS.get()->getType().getUnqualifiedType();
+
+  const MatrixType *LHSMatType = LHSType->getAs<MatrixType>();
+  const MatrixType *RHSMatType = RHSType->getAs<MatrixType>();
+  assert((LHSMatType || RHSMatType) && "At least one operand must be a matrix");
+
+  if (Context.hasSameType(LHSType, RHSType))
+    return LHSType;
+
+  // Type conversion may change LHS/RHS. Keep copies to the original results, in
+  // case we have to return InvalidOperands.
+  ExprResult OriginalLHS = LHS;
+  ExprResult OriginalRHS = RHS;
+  if (LHSMatType && !RHSMatType) {
+    if (tryConvertScalarToMatrixElementTy(*this, LHSMatType->getElementType(),
+                                          &RHS))
+      return LHSType;
+    return InvalidOperands(Loc, OriginalLHS, OriginalRHS);
+  }
+
+  if (!LHSMatType && RHSMatType) {
+    if (tryConvertScalarToMatrixElementTy(*this, RHSMatType->getElementType(),
+                                          &LHS))
+      return RHSType;
+    return InvalidOperands(Loc, OriginalLHS, OriginalRHS);
+  }
+
+  return InvalidOperands(Loc, LHS, RHS);
+}
+
 inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
                                            SourceLocation Loc,
                                            BinaryOperatorKind Opc) {
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index ad75529debdba..1aef43614d99e 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -7682,6 +7682,10 @@ class BuiltinCandidateTypeSet  {
   /// candidates.
   TypeSet VectorTypes;
 
+  /// The set of matrix types that will be used in the built-in
+  /// candidates.
+  TypeSet MatrixTypes;
+
   /// A flag indicating non-record types are viable candidates
   bool HasNonRecordTypes;
 
@@ -7742,6 +7746,11 @@ class BuiltinCandidateTypeSet  {
   iterator vector_begin() { return VectorTypes.begin(); }
   iterator vector_end() { return VectorTypes.end(); }
 
+  llvm::iterator_range<iterator> matrix_types() { return MatrixTypes; }
+  iterator matrix_begin() { return MatrixTypes.begin(); }
+  iterator matrix_end() { return MatrixTypes.end(); }
+
+  bool containsMatrixType(QualType Ty) const { return MatrixTypes.count(Ty); }
   bool hasNonRecordTypes() { return HasNonRecordTypes; }
   bool hasArithmeticOrEnumeralTypes() { return HasArithmeticOrEnumeralTypes; }
   bool hasNullPtrType() const { return HasNullPtrType; }
@@ -7916,6 +7925,11 @@ BuiltinCandidateTypeSet::AddTypesConvertedFrom(QualType Ty,
     // extension.
     HasArithmeticOrEnumeralTypes = true;
     VectorTypes.insert(Ty);
+  } else if (Ty->isMatrixType()) {
+    // Similar to vector types, we treat matrix types as arithmetic types in
+    // many contexts as an extension.
+    HasArithmeticOrEnumeralTypes = true;
+    MatrixTypes.insert(Ty);
   } else if (Ty->isNullPtrType()) {
     HasNullPtrType = true;
   } else if (AllowUserConversions && TyRec) {
@@ -8144,6 +8158,13 @@ class BuiltinOperatorOverloadBuilder {
 
   }
 
+  /// Helper to add an overload candidate for a binary builtin with types \p L
+  /// and \p R.
+  void AddCandidate(QualType L, QualType R) {
+    QualType LandR[2] = {L, R};
+    S.AddBuiltinCandidate(LandR, Args, CandidateSet);
+  }
+
 public:
   BuiltinOperatorOverloadBuilder(
     Sema &S, ArrayRef<Expr *> Args,
@@ -8562,6 +8583,27 @@ class BuiltinOperatorOverloadBuilder {
     }
   }
 
+  /// Add binary operator overloads for each candidate matrix type M1, M2:
+  ///  * (M1, M1) -> M1
+  ///  * (M1, M1.getElementType()) -> M1
+  ///  * (M2.getElementType(), M2) -> M2
+  ///  * (M2, M2) -> M2 // Only if M2 is not part of CandidateTypes[0].
+  void addMatrixBinaryArithmeticOverloads() {
+    if (!HasArithmeticOrEnumeralCandidateType)
+      return;
+
+    for (QualType M1 : CandidateTypes[0].matrix_types()) {
+      AddCandidate(M1, cast<MatrixType>(M1)->getElementType());
+      AddCandidate(M1, M1);
+    }
+
+    for (QualType M2 : CandidateTypes[1].matrix_types()) {
+      AddCandidate(cast<MatrixType>(M2)->getElementType(), M2);
+      if (!CandidateTypes[0].containsMatrixType(M2))
+        AddCandidate(M2, M2);
+    }
+  }
+
   // C++2a [over.built]p14:
   //
   //   For every integral type T there exists a candidate operator function
@@ -9135,6 +9177,7 @@ void Sema::AddBuiltinOperatorCandidates(OverloadedOperatorKind Op,
     } else {
       OpBuilder.addBinaryPlusOrMinusPointerOverloads(Op);
       OpBuilder.addGenericBinaryArithmeticOverloads();
+      OpBuilder.addMatrixBinaryArithmeticOverloads();
     }
     break;
 
diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c
new file mode 100644
index 0000000000000..a92b7ebc6d1df
--- /dev/null
+++ b/clang/test/CodeGen/matrix-type-operators.c
@@ -0,0 +1,174 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+typedef int ix9x3_t __attribute__((matrix_type(9, 3)));
+typedef unsigned long long ullx4x2_t __attribute__((matrix_type(4, 2)));
+
+// Floating point matrix/scalar additions.
+
+void add_matrix_matrix_double(dx5x5_t a, dx5x5_t b, dx5x5_t c) {
+  // CHECK-LABEL: define void @add_matrix_matrix_double(<25 x double> %a, <25 x double> %b, <25 x double> %c)
+  // CHECK:       [[B:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
+  // CHECK-NEXT:  [[C:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <25 x double> [[B]], [[C]]
+  // CHECK-NEXT:  store <25 x double> [[RES]], <25 x double>* {{.*}}, align 8
+
+  a = b + c;
+}
+
+void add_matrix_matrix_float(fx2x3_t a, fx2x3_t b, fx2x3_t c) {
+  // CHECK-LABEL: define void @add_matrix_matrix_float(<6 x float> %a, <6 x float> %b, <6 x float> %c)
+  // CHECK:       [[B:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4
+  // CHECK-NEXT:  [[C:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[B]], [[C]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], <6 x float>* {{.*}}, align 4
+
+  a = b + c;
+}
+
+void add_matrix_scalar_double_float(dx5x5_t a, float vf) {
+  // CHECK-LABEL: define void @add_matrix_scalar_double_float(<25 x double> %a, float %vf)
+  // CHECK:       [[MATRIX:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
+  // CHECK-NEXT:  [[SCALAR:%.*]] = load float, float* %vf.addr, align 4
+  // CHECK-NEXT:  [[SCALAR_EXT:%.*]] = fpext float [[SCALAR]] to double
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <25 x double> undef, double [[SCALAR_EXT]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <25 x double> [[SCALAR_EMBED]], <25 x double> undef, <25 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <25 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <25 x double> [[RES]], <25 x double>* {{.*}}, align 8
+
+  a = a + vf;
+}
+
+void add_matrix_scalar_double_double(dx5x5_t a, double vd) {
+  // CHECK-LABEL: define void @add_matrix_scalar_double_double(<25 x double> %a, double %vd)
+  // CHECK:       [[MATRIX:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
+  // CHECK-NEXT:  [[SCALAR:%.*]] = load double, double* %vd.addr, align 8
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <25 x double> undef, double [[SCALAR]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <25 x double> [[SCALAR_EMBED]], <25 x double> undef, <25 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <25 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <25 x double> [[RES]], <25 x double>* {{.*}}, align 8
+
+  a = a + vd;
+}
+
+void add_matrix_scalar_float_float(fx2x3_t b, float vf) {
+  // CHECK-LABEL: define void @add_matrix_scalar_float_float(<6 x float> %b, float %vf)
+  // CHECK:       [[MATRIX:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4
+  // CHECK-NEXT:  [[SCALAR:%.*]] = load float, float* %vf.addr, align 4
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> undef, float [[SCALAR]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> undef, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], <6 x float>* {{.*}}, align 4
+
+  b = b + vf;
+}
+
+void add_matrix_scalar_float_double(fx2x3_t b, double vd) {
+  // CHECK-LABEL: define void @add_matrix_scalar_float_double(<6 x float> %b, double %vd)
+  // CHECK:       [[MATRIX:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4
+  // CHECK-NEXT:  [[SCALAR:%.*]] = load double, double* %vd.addr, align 8
+  // CHECK-NEXT:  [[SCALAR_TRUNC:%.*]] = fptrunc double [[SCALAR]] to float
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <6 x float> undef, float [[SCALAR_TRUNC]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <6 x float> [[SCALAR_EMBED]], <6 x float> undef, <6 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <6 x float> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:  store <6 x float> [[RES]], <6 x float>* {{.*}}, align 4
+
+  b = b + vd;
+}
+
+// Integer matrix/scalar additions
+
+void add_matrix_matrix_int(ix9x3_t a, ix9x3_t b, ix9x3_t c) {
+  // CHECK-LABEL: define void @add_matrix_matrix_int(<27 x i32> %a, <27 x i32> %b, <27 x i32> %c)
+  // CHECK:       [[B:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:  [[C:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:  [[RES:%.*]] = add <27 x i32> [[B]], [[C]]
+  // CHECK-NEXT:  store <27 x i32> [[RES]], <27 x i32>* {{.*}}, align 4
+  a = b + c;
+}
+
+void add_matrix_matrix_unsigned_long_long(ullx4x2_t a, ullx4x2_t b, ullx4x2_t c) {
+  // CHECK-LABEL: define void @add_matrix_matrix_unsigned_long_long(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c)
+  // CHECK:       [[B:%.*]] = load <8 x i64>, <8 x i64>* {{.*}}, align 8
+  // CHECK-NEXT:  [[C:%.*]] = load <8 x i64>, <8 x i64>* {{.*}}, align 8
+  // CHECK-NEXT:  [[RES:%.*]] = add <8 x i64> [[B]], [[C]]
+  // CHECK-NEXT:  store <8 x i64> [[RES]], <8 x i64>* {{.*}}, align 8
+
+  a = b + c;
+}
+
+void add_matrix_scalar_int_short(ix9x3_t a, short vs) {
+  // CHECK-LABEL: define void @add_matrix_scalar_int_short(<27 x i32> %a, i16 signext %vs)
+  // CHECK:        [[MATRIX:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4
+  // CHECK-NEXT:   [[SCALAR:%.*]] = load i16, i16* %vs.addr, align 2
+  // CHECK-NEXT:   [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i32
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <27 x i32> undef, i32 [[SCALAR_EXT]], i32 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <27 x i32> [[SCALAR_EMBED]], <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <27 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <27 x i32> [[RES]], <27 x i32>* [[MAT_ADDR]], align 4
+
+  a = a + vs;
+}
+
+void add_matrix_scalar_int_long_int(ix9x3_t a, long int vli) {
+  // CHECK-LABEL: define void @add_matrix_scalar_int_long_int(<27 x i32> %a, i64 %vli)
+  // CHECK:        [[MATRIX:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4
+  // CHECK-NEXT:   [[SCALAR:%.*]] = load i64, i64* %vli.addr, align 8
+  // CHECK-NEXT:   [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <27 x i32> undef, i32 [[SCALAR_TRUNC]], i32 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <27 x i32> [[SCALAR_EMBED]], <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <27 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <27 x i32> [[RES]], <27 x i32>* [[MAT_ADDR]], align 4
+
+  a = a + vli;
+}
+
+void add_matrix_scalar_int_unsigned_long_long(ix9x3_t a, unsigned long long int vulli) {
+  // CHECK-LABEL: define void @add_matrix_scalar_int_unsigned_long_long(<27 x i32> %a, i64 %vulli)
+  // CHECK:        [[MATRIX:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4
+  // CHECK-NEXT:   [[SCALAR:%.*]] = load i64, i64* %vulli.addr, align 8
+  // CHECK-NEXT:   [[SCALAR_TRUNC:%.*]] = trunc i64 [[SCALAR]] to i32
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <27 x i32> undef, i32 [[SCALAR_TRUNC]], i32 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <27 x i32> [[SCALAR_EMBED]], <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <27 x i32> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK-NEXT:   store <27 x i32> [[RES]], <27 x i32>* [[MAT_ADDR]], align 4
+
+  a = a + vulli;
+}
+
+void add_matrix_scalar_long_long_int_short(ullx4x2_t b, short vs) {
+  // CHECK-LABEL: define void @add_matrix_scalar_long_long_int_short(<8 x i64> %b, i16 signext %vs)
+  // CHECK:         [[SCALAR:%.*]] = load i16, i16* %vs.addr, align 2
+  // CHECK-NEXT:    [[SCALAR_EXT:%.*]] = sext i16 [[SCALAR]] to i64
+  // CHECK-NEXT:    [[MATRIX:%.*]] = load <8 x i64>, <8 x i64>* {{.*}}, align 8
+  // CHECK-NEXT:    [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> undef, i64 [[SCALAR_EXT]], i32 0
+  // CHECK-NEXT:    [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> undef, <8 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK-NEXT:    store <8 x i64> [[RES]], <8 x i64>* {{.*}}, align 8
+
+  b = vs + b;
+}
+
+void add_matrix_scalar_long_long_int_int(ullx4x2_t b, long int vli) {
+  // CHECK-LABEL: define void @add_matrix_scalar_long_long_int_int(<8 x i64> %b, i64 %vli)
+  // CHECK:         [[SCALAR:%.*]] = load i64, i64* %vli.addr, align 8
+  // CHECK-NEXT:    [[MATRIX:%.*]] = load <8 x i64>, <8 x i64>* {{.*}}, align 8
+  // CHECK-NEXT:    [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> undef, i64 [[SCALAR]], i32 0
+  // CHECK-NEXT:    [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> undef, <8 x i32> zeroinitializer
+  // CHECK-NEXT:    [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK-NEXT:    store <8 x i64> [[RES]], <8 x i64>* {{.*}}, align 8
+
+  b = vli + b;
+}
+
+void add_matrix_scalar_long_long_int_unsigned_long_long(ullx4x2_t b, unsigned long long int vulli) {
+  // CHECK-LABEL: define void @add_matrix_scalar_long_long_int_unsigned_long_long
+  // CHECK:        [[SCALAR:%.*]] = load i64, i64* %vulli.addr, align 8
+  // CHECK-NEXT:   [[MATRIX:%.*]] = load <8 x i64>, <8 x i64>* %0, align 8
+  // CHECK-NEXT:   [[SCALAR_EMBED:%.*]] = insertelement <8 x i64> undef, i64 [[SCALAR]], i32 0
+  // CHECK-NEXT:   [[SCALAR_EMBED1:%.*]] = shufflevector <8 x i64> [[SCALAR_EMBED]], <8 x i64> undef, <8 x i32> zeroinitializer
+  // CHECK-NEXT:   [[RES:%.*]] = add <8 x i64> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK-NEXT:   store <8 x i64> [[RES]], <8 x i64>* {{.*}}, align 8
+  b = vulli + b;
+}
diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp
new file mode 100644
index 0000000000000..fe2f8e292cb2f
--- /dev/null
+++ b/clang/test/CodeGenCXX/matrix-type-operators.cpp
@@ -0,0 +1,156 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0>
+typename MyMatrix<EltTy0, R0, C0>::matrix_t add(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy0, R0, C0> &B) {
+  return A.value + B.value;
+}
+
+void test_add_template() {
+  // CHECK-LABEL: define void @_Z17test_add_templatev()
+  // CHECK:       %call = call <10 x float> @_Z3addIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(%struct.MyMatrix* nonnull align 4 dereferenceable(40) %Mat1, %struct.MyMatrix* nonnull align 4 dereferenceable(40) %Mat2)
+
+  // CHECK-LABEL: define linkonce_odr <10 x float> @_Z3addIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(
+  // CHECK:       [[MAT1:%.*]] = load <10 x float>, <10 x float>* {{.*}}, align 4
+  // CHECK:       [[MAT2:%.*]] = load <10 x float>, <10 x float>* {{.*}}, align 4
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <10 x float> [[MAT1]], [[MAT2]]
+  // CHECK-NEXT:  ret <10 x float> [[RES]]
+
+  MyMatrix<float, 2, 5> Mat1;
+  MyMatrix<float, 2, 5> Mat2;
+  Mat1.value = add(Mat1, Mat2);
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0>
+typename MyMatrix<EltTy0, R0, C0>::matrix_t subtract(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy0, R0, C0> &B) {
+  return A.value - B.value;
+}
+
+void test_subtract_template() {
+  // CHECK-LABEL: define void @_Z22test_subtract_templatev()
+  // CHECK:       %call = call <10 x float> @_Z8subtractIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(%struct.MyMatrix* nonnull align 4 dereferenceable(40) %Mat1, %struct.MyMatrix* nonnull align 4 dereferenceable(40) %Mat2)
+
+  // CHECK-LABEL: define linkonce_odr <10 x float> @_Z8subtractIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(
+  // CHECK:       [[MAT1:%.*]] = load <10 x float>, <10 x float>* {{.*}}, align 4
+  // CHECK:       [[MAT2:%.*]] = load <10 x float>, <10 x float>* {{.*}}, align 4
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <10 x float> [[MAT1]], [[MAT2]]
+  // CHECK-NEXT:  ret <10 x float> [[RES]]
+
+  MyMatrix<float, 2, 5> Mat1;
+  MyMatrix<float, 2, 5> Mat2;
+  Mat1.value = subtract(Mat1, Mat2);
+}
+
+struct DoubleWrapper1 {
+  int x;
+  operator double() {
+    return x;
+  }
+};
+
+void test_DoubleWrapper1_Sub1(MyMatrix<double, 10, 9> &m) {
+  // CHECK-LABEL: define void @_Z24test_DoubleWrapper1_Sub1R8MyMatrixIdLj10ELj9EE(
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8
+  // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper1cvdEv(%struct.DoubleWrapper1* %w1)
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> undef, double [[SCALAR]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> undef, <90 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <90 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK:       store <90 x double> [[RES]], <90 x double>* {{.*}}, align 8
+
+  DoubleWrapper1 w1;
+  w1.x = 10;
+  m.value = m.value - w1;
+}
+
+void test_DoubleWrapper1_Sub2(MyMatrix<double, 10, 9> &m) {
+  // CHECK-LABEL: define void @_Z24test_DoubleWrapper1_Sub2R8MyMatrixIdLj10ELj9EE(
+  // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper1cvdEv(%struct.DoubleWrapper1* %w1)
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> undef, double [[SCALAR]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> undef, <90 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <90 x double> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK:       store <90 x double> [[RES]], <90 x double>* {{.*}}, align 8
+
+  DoubleWrapper1 w1;
+  w1.x = 10;
+  m.value = w1 - m.value;
+}
+
+struct DoubleWrapper2 {
+  int x;
+  operator double() {
+    return x;
+  }
+};
+
+void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) {
+  // CHECK-LABEL: define void @_Z24test_DoubleWrapper2_Add1R8MyMatrixIdLj10ELj9EE(
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
+  // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* %w2)
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> undef, double [[SCALAR]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> undef, <90 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <90 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK:       store <90 x double> [[RES]], <90 x double>* {{.*}}, align 8
+
+  DoubleWrapper2 w2;
+  w2.x = 20;
+  m.value = m.value + w2;
+}
+
+void test_DoubleWrapper2_Add2(MyMatrix<double, 10, 9> &m) {
+  // CHECK-LABEL: define void @_Z24test_DoubleWrapper2_Add2R8MyMatrixIdLj10ELj9EE(
+  // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* %w2)
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> undef, double [[SCALAR]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> undef, <90 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <90 x double> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK:       store <90 x double> [[RES]], <90 x double>* {{.*}}, align 8
+
+  DoubleWrapper2 w2;
+  w2.x = 20;
+  m.value = w2 + m.value;
+}
+
+struct IntWrapper {
+  char x;
+  operator int() {
+    return x;
+  }
+};
+
+void test_IntWrapper_Add(MyMatrix<double, 10, 9> &m) {
+  // CHECK-LABEL: define void @_Z19test_IntWrapper_AddR8MyMatrixIdLj10ELj9EE(
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8
+  // CHECK:       [[SCALAR:%.*]] = call i32 @_ZN10IntWrappercviEv(%struct.IntWrapper* %w3)
+  // CHECK:       [[SCALAR_FP:%.*]] = sitofp i32 %call to double
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> undef, double [[SCALAR_FP]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> undef, <90 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fadd <90 x double> [[MATRIX]], [[SCALAR_EMBED1]]
+  // CHECK:       store <90 x double> [[RES]], <90 x double>* {{.*}}, align 8
+
+  IntWrapper w3;
+  w3.x = 'c';
+  m.value = m.value + w3;
+}
+
+void test_IntWrapper_Sub(MyMatrix<double, 10, 9> &m) {
+  // CHECK-LABEL: define void @_Z19test_IntWrapper_SubR8MyMatrixIdLj10ELj9EE(
+  // CHECK:       [[SCALAR:%.*]] = call i32 @_ZN10IntWrappercviEv(%struct.IntWrapper* %w3)
+  // CHECK-NEXT:  [[SCALAR_FP:%.*]] = sitofp i32 %call to double
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8
+  // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> undef, double [[SCALAR_FP]], i32 0
+  // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> undef, <90 x i32> zeroinitializer
+  // CHECK-NEXT:  [[RES:%.*]] = fsub <90 x double> [[SCALAR_EMBED1]], [[MATRIX]]
+  // CHECK:       store <90 x double> [[RES]], <90 x double>* {{.*}}, align 8
+
+  IntWrapper w3;
+  w3.x = 'c';
+  m.value = w3 - m.value;
+}
diff --git a/clang/test/Sema/matrix-type-operators.c b/clang/test/Sema/matrix-type-operators.c
new file mode 100644
index 0000000000000..41bcea5da37d0
--- /dev/null
+++ b/clang/test/Sema/matrix-type-operators.c
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10)));
+typedef float sx10x5_t __attribute__((matrix_type(10, 5)));
+typedef float sx10x10_t __attribute__((matrix_type(10, 10)));
+
+void add(sx10x10_t a, sx5x10_t b, sx10x5_t c) {
+  a = b + c;
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t' (aka 'float __attribute__((matrix_type(10, 5)))'))}}
+
+  a = b + b; // expected-error {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = 10 + b;
+  // expected-error@-1 {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = b + &c;
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
+  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*') to incompatible type 'float'}}
+}
+
+void sub(sx10x10_t a, sx5x10_t b, sx10x5_t c) {
+  a = b - c;
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t' (aka 'float __attribute__((matrix_type(10, 5)))'))}}
+
+  a = b - b; // expected-error {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = 10 - b;
+  // expected-error@-1 {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = b - &c;
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
+  // expected-error@-2 {{casting 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*') to incompatible type 'float'}}
+}
diff --git a/clang/test/SemaCXX/matrix-type-operators.cpp b/clang/test/SemaCXX/matrix-type-operators.cpp
new file mode 100644
index 0000000000000..153f89a21369e
--- /dev/null
+++ b/clang/test/SemaCXX/matrix-type-operators.cpp
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -std=c++11 -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10)));
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1, typename EltTy2, unsigned R2, unsigned C2>
+typename MyMatrix<EltTy2, R2, C2>::matrix_t add(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy1, R1, C1> &B) {
+  char *v1 = A.value + B.value;
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-3 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+
+  return A.value + B.value;
+  // expected-error@-1 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+}
+
+void test_add_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  MyMatrix<float, 2, 2> Mat3;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  unsigned v1 = add<unsigned, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat1, Mat1);
+  // expected-error@-1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix<unsigned int, 2U, 2U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-note@-2 {{in instantiation of function template specialization 'add<unsigned int, 2, 2, unsigned int, 2, 2, unsigned int, 2, 2>' requested here}}
+
+  Mat1.value = add<unsigned, 2, 2, unsigned, 3, 3, unsigned, 2, 2>(Mat1, Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'add<unsigned int, 2, 2, unsigned int, 3, 3, unsigned int, 2, 2>' requested here}}
+
+  Mat1.value = add<unsigned, 3, 3, float, 2, 2, unsigned, 2, 2>(Mat2, Mat3);
+  // expected-note@-1 {{in instantiation of function template specialization 'add<unsigned int, 3, 3, float, 2, 2, unsigned int, 2, 2>' requested here}}
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1, typename EltTy2, unsigned R2, unsigned C2>
+typename MyMatrix<EltTy2, R2, C2>::matrix_t subtract(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy1, R1, C1> &B) {
+  char *v1 = A.value - B.value;
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-3 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+
+  return A.value - B.value;
+  // expected-error@-1 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+}
+
+void test_subtract_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  MyMatrix<float, 2, 2> Mat3;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  unsigned v1 = subtract<unsigned, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat1, Mat1);
+  // expected-error@-1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix<unsigned int, 2U, 2U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-note@-2 {{in instantiation of function template specialization 'subtract<unsigned int, 2, 2, unsigned int, 2, 2, unsigned int, 2, 2>' requested here}}
+
+  Mat1.value = subtract<unsigned, 2, 2, unsigned, 3, 3, unsigned, 2, 2>(Mat1, Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'subtract<unsigned int, 2, 2, unsigned int, 3, 3, unsigned int, 2, 2>' requested here}}
+
+  Mat1.value = subtract<unsigned, 3, 3, float, 2, 2, unsigned, 2, 2>(Mat2, Mat3);
+  // expected-note@-1 {{in instantiation of function template specialization 'subtract<unsigned int, 3, 3, float, 2, 2, unsigned int, 2, 2>' requested here}}
+}
+
+struct UserT {};
+
+struct StructWithC {
+  operator UserT() {
+    // expected-note@-1 4 {{candidate function}}
+    return {};
+  }
+};
+
+void test_DoubleWrapper(MyMatrix<double, 10, 9> &m, StructWithC &c) {
+  m.value = m.value + c;
+  // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<double, 10, 9>::matrix_t' (aka 'double __attribute__((matrix_type(10, 9)))') and 'StructWithC')}}
+
+  m.value = c + m.value;
+  // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error@-2 {{invalid operands to binary expression ('StructWithC' and 'MyMatrix<double, 10, 9>::matrix_t' (aka 'double __attribute__((matrix_type(10, 9)))'))}}
+
+  m.value = m.value - c;
+  // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<double, 10, 9>::matrix_t' (aka 'double __attribute__((matrix_type(10, 9)))') and 'StructWithC')}}
+
+  m.value = c - m.value;
+  // expected-error@-1 {{no viable conversion from 'StructWithC' to 'double'}}
+  // expected-error@-2 {{invalid operands to binary expression ('StructWithC' and 'MyMatrix<double, 10, 9>::matrix_t' (aka 'double __attribute__((matrix_type(10, 9)))'))}}
+}
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index ab92f86ebb4ee..84148841485b5 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -127,6 +127,16 @@ template <class IRBuilderTy> class MatrixBuilder {
   /// Add matrixes \p LHS and \p RHS. Support both integer and floating point
   /// matrixes.
   Value *CreateAdd(Value *LHS, Value *RHS) {
+    assert(LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy());
+    if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy())
+      RHS = B.CreateVectorSplat(
+          cast<VectorType>(LHS->getType())->getNumElements(), RHS,
+          "scalar.splat");
+    else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy())
+      LHS = B.CreateVectorSplat(
+          cast<VectorType>(RHS->getType())->getNumElements(), LHS,
+          "scalar.splat");
+
     return cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->isFloatingPointTy()
@@ -137,6 +147,16 @@ template <class IRBuilderTy> class MatrixBuilder {
   /// Subtract matrixes \p LHS and \p RHS. Support both integer and floating
   /// point matrixes.
   Value *CreateSub(Value *LHS, Value *RHS) {
+    assert(LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy());
+    if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy())
+      RHS = B.CreateVectorSplat(
+          cast<VectorType>(LHS->getType())->getNumElements(), RHS,
+          "scalar.splat");
+    else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy())
+      LHS = B.CreateVectorSplat(
+          cast<VectorType>(RHS->getType())->getNumElements(), LHS,
+          "scalar.splat");
+
     return cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->isFloatingPointTy()

From f66a43c11a7899d5c578b80d7f154abcea3b8d8e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Thu, 30 Apr 2020 16:53:13 -0700
Subject: [PATCH 576/770] Process gep (phi ptr1, ptr2) in SROA

Differential Revision: https://reviews.llvm.org/D79218
---
 llvm/lib/Transforms/Scalar/SROA.cpp  |  51 ++++
 llvm/test/Transforms/SROA/phi-gep.ll | 364 +++++++++++++++++++++++++++
 2 files changed, 415 insertions(+)
 create mode 100644 llvm/test/Transforms/SROA/phi-gep.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 1d486a3e74fd1..0440805d484a1 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3488,11 +3488,62 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
     return true;
   }
 
+  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
+  bool foldGEPPhi(GetElementPtrInst &GEPI) {
+    if (!GEPI.hasAllConstantIndices())
+      return false;
+
+    PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
+    if (llvm::any_of(PHI->incoming_values(), [](Value *In)
+          { Instruction *I = dyn_cast<Instruction>(In);
+            return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
+                   !I->getParent()->isLegalToHoistInto();
+          }))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):"
+                      << "\n    original: " << *PHI
+                      << "\n              " << GEPI
+                      << "\n          to: ");
+
+    SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
+    bool IsInBounds = GEPI.isInBounds();
+    IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
+    PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
+                                          PHI->getNumIncomingValues(),
+                                          PHI->getName() + ".sroa.phi");
+    for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
+      Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
+
+      IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
+      Value *NewVal = IsInBounds
+          ? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
+          : B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
+      NewPN->addIncoming(NewVal, PHI->getIncomingBlock(I));
+      if (auto *NewGEPI = dyn_cast<Instruction>(NewVal)) {
+        visit(NewGEPI);
+      }
+    }
+
+    GEPI.replaceAllUsesWith(NewPN);
+    GEPI.eraseFromParent();
+
+    LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
+                 dbgs() << "\n              " << *In;
+               dbgs() << "\n              " << *NewPN << '\n');
+
+    return true;
+  }
+
   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
     if (isa<SelectInst>(GEPI.getPointerOperand()) &&
         foldGEPSelect(GEPI))
       return true;
 
+    if (isa<PHINode>(GEPI.getPointerOperand()) &&
+        foldGEPPhi(GEPI))
+      return true;
+
     enqueueUsers(GEPI);
     return false;
   }
diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
new file mode 100644
index 0000000000000..72d07ed84a656
--- /dev/null
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -0,0 +1,364 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -sroa < %s | FileCheck %s
+
+%pair = type { i32, i32 }
+
+define i32 @test_sroa_phi_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 2, [[IF_THEN]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_non_inbound(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_non_inbound(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 2, [[IF_THEN]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
+  %gep = getelementptr %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_undef(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ undef, [[IF_THEN]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ undef, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+@g = global %pair zeroinitializer, align 4
+
+define i32 @test_sroa_phi_gep_global(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_global(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
+; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ @g, [[IF_THEN]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ @g, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_arg_phi_inspt(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_arg_phi_inspt(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_INSPT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_INSPT]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 2, [[FOR]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_inspt = phi i32 [ 0, %entry ], [ %i, %for ]
+  %i = add i32 %phi_inspt, 1
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %for ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_phi_inspt(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_phi_inspt(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca [[PAIR]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[B]], i32 0, i32 1
+; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    store i32 2, i32* [[GEP_B]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_IN:%.*]] = phi %pair* [ null, [[ENTRY:%.*]] ], [ [[B]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_INSPT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_INSPT]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY]] ], [ [[PHI_IN]], [[FOR]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_in = phi %pair * [ null, %entry ], [ %b, %for ]
+  %phi_inspt = phi i32 [ 0, %entry ], [ %i, %for ]
+  %i = add i32 %phi_inspt, 1
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %phi_in, %for ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_gep_phi_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_gep_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32* [ [[A_SROA_0]], [[ENTRY]] ], [ [[GEP_FOR:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
+; CHECK-NEXT:    [[GEP_FOR]] = getelementptr inbounds i32, i32* [[PHI]], i32 0
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_END:%.*]] = phi i32* [ [[A_SROA_0]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
+  %phi = phi i32* [ %gep_a, %entry], [ %gep_for, %for ]
+  %i = add i32 %phi_i, 1
+  %gep_for = getelementptr inbounds i32, i32* %phi, i32 0
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi_end = phi i32* [ %gep_a, %entry], [ %phi, %for ]
+  %load = load i32, i32* %phi_end, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_gep_cast_phi_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_gep_cast_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST2:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_2:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_SROA_PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_2_SROA_GEP:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
+; CHECK-NEXT:    [[GEP_FOR_1:%.*]] = bitcast float* [[PHI_SROA_PHI]] to i32*
+; CHECK-NEXT:    [[GEP_FOR_2]] = bitcast i32* [[GEP_FOR_1]] to float*
+; CHECK-NEXT:    [[GEP_FOR_2_SROA_GEP]] = getelementptr inbounds float, float* [[GEP_FOR_2]], i32 0
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_END:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST2]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_END_1:%.*]] = bitcast float* [[PHI_END]] to i32*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END_1]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_a.1 = bitcast i32* %gep_a to float*
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
+  %phi = phi float* [ %gep_a.1, %entry], [ %gep_for.2, %for ]
+  %i = add i32 %phi_i, 1
+  %gep_for = getelementptr inbounds float, float* %phi, i32 0
+  %gep_for.1 = bitcast float* %gep_for to i32*
+  %gep_for.2 = bitcast i32* %gep_for.1 to float*
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi_end = phi float* [ %gep_a.1, %entry], [ %phi, %for ]
+  %phi_end.1 = bitcast float* %phi_end to i32*
+  %load = load i32, i32* %phi_end.1, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-LABEL: @test_sroa_invoke_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[CALL:%.*]], label [[END:%.*]]
+; CHECK:       call:
+; CHECK-NEXT:    [[B:%.*]] = invoke %pair* @foo()
+; CHECK-NEXT:    to label [[END]] unwind label [[INVOKE_CATCH:%.*]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[CALL]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+; CHECK:       invoke_catch:
+; CHECK-NEXT:    [[RES:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    catch i8* null
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %a = alloca %pair, align 4
+  br i1 %cond, label %call, label %end
+
+call:
+  %b = invoke %pair* @foo()
+  to label %end unwind label %invoke_catch
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %call ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+
+invoke_catch:
+  %res = landingpad { i8*, i32 }
+  catch i8* null
+  ret i32 0
+}
+
+define i32 @test_sroa_phi_gep_nonconst_idx(i1 %cond, i32 %idx) {
+; CHECK-LABEL: @test_sroa_phi_gep_nonconst_idx(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca [[PAIR]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[B]], i32 0, i32 1
+; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    store i32 2, i32* [[GEP_B]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 [[IDX:%.*]], i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 %idx, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+declare %pair* @foo()
+
+declare i32 @__gxx_personality_v0(...)

From 6a4714030e467293e839433a6754685ed3bdc108 Mon Sep 17 00:00:00 2001
From: Tobias Bosch <tbosch@google.com>
Date: Thu, 28 May 2020 15:40:43 -0700
Subject: [PATCH 577/770] [DebugInfo][DAG] Don't reuse debug location on COPY
 if width changes.

Summary:
Reusing the debug location caused incorrect debug information for parameters:
previously, after a COPY of a parameter that changes the width,
we would emit a DBG_VALUE that continued to be associated with that
parameter, even though it now used a different width.
This made the LiveDebugValues pass assume the parameter value
was clobbered, so it stopped tracking the parameter entry
value, leading to incorrect debug information.

Fixes https://bugs.llvm.org/show_bug.cgi?id=39715

Subscribers: aprantl, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80819
---
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp |  4 +-
 llvm/test/DebugInfo/X86/dbg-value-funcarg3.ll | 58 +++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/DebugInfo/X86/dbg-value-funcarg3.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index d2fac644d9024..733762339d612 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -623,7 +623,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
         // Otherwise this is another use or second copy use.
         CopyUseMI = nullptr; break;
       }
-      if (CopyUseMI) {
+      if (CopyUseMI &&
+          TRI.getRegSizeInBits(LDI->second, MRI) ==
+              TRI.getRegSizeInBits(CopyUseMI->getOperand(0).getReg(), MRI)) {
         // Use MI's debug location, which describes where Variable was
         // declared, rather than whatever is attached to CopyUseMI.
         MachineInstr *NewMI =
diff --git a/llvm/test/DebugInfo/X86/dbg-value-funcarg3.ll b/llvm/test/DebugInfo/X86/dbg-value-funcarg3.ll
new file mode 100644
index 0000000000000..234fd98eef492
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/dbg-value-funcarg3.ll
@@ -0,0 +1,58 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -start-after=codegenprepare -stop-before=finalize-isel -o - %s | FileCheck %s
+
+; Input to this test looked like this and was compiled using: clang -g -O1 -mllvm -stop-after=codegenprepare -S
+;
+;    int fn1(long t1) {
+;      return t1;
+;    }
+;
+
+; Catch metadata references for involved variables.
+;
+; CHECK-DAG: ![[T1:.*]] = !DILocalVariable(name: "t1"
+
+
+define dso_local i32 @fn1(i64 %t1) local_unnamed_addr #0 !dbg !7 {
+; We expect that the same width COPY reuses the debug location,
+; but the width narrowing COPY does not.
+; 
+; CHECK-LABEL: name:            fn1
+; CHECK: DBG_VALUE $rdi, $noreg, ![[T1]], !DIExpression(),
+; CHECK-NEXT: %0:gr64 = COPY $rdi
+; CHECK-NEXT: DBG_VALUE %0, $noreg, ![[T1]], !DIExpression(),
+; CHECK-NEXT: %1:gr32 = COPY %0.sub_32bit
+; CHECK-NEXT: COPY
+; CHECK-NEXT: RET
+entry:
+  call void @llvm.dbg.value(metadata i64 %t1, metadata !13, metadata !DIExpression()), !dbg !14
+  %0 = trunc i64 %t1 to i32, !dbg !15
+  ret i32 %0, !dbg !16
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { norecurse nounwind readnone uwtable }
+attributes #1 = { nounwind readnone speculatable willreturn }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 11.0.0 (git@github.com:tbosch/llvm-project.git 0b11aed869bf09ba60
+d7ed17334cf0b76e6a5922)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.cc", directory: "")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 11.0.0 (git@github.com:tbosch/llvm-project.git 0b11aed869bf09ba60d7ed17334cf0b76e6a5922)"}
+!7 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !11}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+!12 = !{!13}
+!13 = !DILocalVariable(name: "t1", arg: 1, scope: !7, file: !1, line: 1, type: !11)
+!14 = !DILocation(line: 0, scope: !7)
+!15 = !DILocation(line: 2, column: 10, scope: !7)
+!16 = !DILocation(line: 2, column: 3, scope: !7)

From 248410937899d99f6f258d7299ec6fae0f98d900 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 29 May 2020 16:28:22 -0400
Subject: [PATCH 578/770] AMDGPU/GlobalISel: Add boilerplate for inline asm
 lowering

Test is mostly derived from the AArch64 one, with minor adjustments.
---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |   6 +
 .../GlobalISel/irtranslator-inline-asm.ll     | 237 ++++++++++++++++++
 3 files changed, 244 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 92564d1c53abe..3e24c4a8dd32c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -269,6 +269,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
   InstSelector.reset(new AMDGPUInstructionSelector(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 189f18b960625..4db2a0fb5687b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -25,6 +25,7 @@
 #include "SIInstrInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -272,6 +273,7 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
 private:
   /// GlobalISel related APIs.
   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
   std::unique_ptr<InstructionSelector> InstSelector;
   std::unique_ptr<LegalizerInfo> Legalizer;
   std::unique_ptr<RegisterBankInfo> RegBankInfo;
@@ -421,6 +423,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
     return CallLoweringInfo.get();
   }
 
+  const InlineAsmLowering *getInlineAsmLowering() const override {
+    return InlineAsmLoweringInfo.get();
+  }
+
   InstructionSelector *getInstructionSelector() const override {
     return InstSelector.get();
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
new file mode 100644
index 0000000000000..15320c9155efa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -global-isel -stop-after=irtranslator -verify-machineinstrs -o - %s | FileCheck %s
+
+define amdgpu_kernel void @asm_convergent() convergent{
+  ; CHECK-LABEL: name: asm_convergent
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   INLINEASM &s_barrier, 33 /* sideeffect isconvergent attdialect */, !0
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "s_barrier", ""() convergent, !srcloc !0
+  ret void
+}
+
+define amdgpu_kernel void @asm_simple_memory_clobber() {
+  ; CHECK-LABEL: name: asm_simple_memory_clobber
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, !0
+  ; CHECK:   INLINEASM &"", 1 /* sideeffect attdialect */, !0
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "", "~{memory}"(), !srcloc !0
+  call void asm sideeffect "", ""(), !srcloc !0
+  ret void
+}
+
+define amdgpu_kernel void @asm_simple_vgpr_clobber() {
+  ; CHECK-LABEL: name: asm_simple_vgpr_clobber
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   INLINEASM &"v_mov_b32 v0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0, !0
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"(), !srcloc !0
+  ret void
+}
+
+define amdgpu_kernel void @asm_simple_sgpr_clobber() {
+  ; CHECK-LABEL: name: asm_simple_sgpr_clobber
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   INLINEASM &"s_mov_b32 s0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $sgpr0, !0
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "s_mov_b32 s0, 7", "~{s0}"(), !srcloc !0
+  ret void
+}
+
+define amdgpu_kernel void @asm_simple_agpr_clobber() {
+  ; CHECK-LABEL: name: asm_simple_agpr_clobber
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   INLINEASM &"; def a0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $agpr0, !0
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "; def a0", "~{a0}"(), !srcloc !0
+  ret void
+}
+
+define i32 @asm_vgpr_early_clobber() {
+  ; CHECK-LABEL: name: asm_vgpr_early_clobber
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]]
+  ; CHECK:   $vgpr0 = COPY [[ADD]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
+  call { i32, i32 } asm sideeffect "v_mov_b32 $0, 7; v_mov_b32 $1, 7", "=&v,=&v"(), !srcloc !0
+  %asmresult = extractvalue { i32, i32 } %1, 0
+  %asmresult1 = extractvalue { i32, i32 } %1, 1
+  %add = add i32 %asmresult, %asmresult1
+  ret i32 %add
+}
+
+define i32 @test_specific_vgpr_output() nounwind {
+  ; CHECK-LABEL: name: test_specific_vgpr_output
+  ; CHECK: bb.1.entry:
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   INLINEASM &"v_mov_b32 v1, 7", 0 /* attdialect */, 10 /* regdef */, implicit-def $vgpr1
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
+entry:
+  %0 = tail call i32 asm "v_mov_b32 v1, 7", "={v1}"() nounwind
+  ret i32 %0
+}
+
+define i32 @test_single_vgpr_output() nounwind {
+  ; CHECK-LABEL: name: test_single_vgpr_output
+  ; CHECK: bb.1.entry:
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK:   $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
+entry:
+  %0 = tail call i32 asm "v_mov_b32 $0, 7", "=v"() nounwind
+  ret i32 %0
+}
+
+define i32 @test_single_sgpr_output_s32() nounwind {
+  ; CHECK-LABEL: name: test_single_sgpr_output_s32
+  ; CHECK: bb.1.entry:
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK:   $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
+entry:
+  %0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
+  ret i32 %0
+}
+
+; Check support for returning several floats
+define float @test_multiple_register_outputs_same() #0 {
+  ; CHECK-LABEL: name: test_multiple_register_outputs_same
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK:   [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]]
+  ; CHECK:   $vgpr0 = COPY [[FADD]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
+  %1 = call { float, float } asm "v_mov_b32 $0, 0; v_mov_b32 $1, 1", "=v,=v"()
+  %asmresult = extractvalue { float, float } %1, 0
+  %asmresult1 = extractvalue { float, float } %1, 1
+  %add = fadd float %asmresult, %asmresult1
+  ret float %add
+}
+
+; Check support for returning outputs of mixed types (float and double)
+define double @test_multiple_register_outputs_mixed() #0 {
+  ; CHECK-LABEL: name: test_multiple_register_outputs_mixed
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s64) = COPY %2
+  ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
+  ; CHECK:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0, implicit $vgpr1
+  %1 = call { float, double } asm "v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", "=v,=v"()
+  %asmresult = extractvalue { float, double } %1, 1
+  ret double %asmresult
+}
+
+
+define float @test_vector_output() nounwind {
+  ; CHECK-LABEL: name: test_vector_output
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK:   INLINEASM &"v_add_f64 $0, 0, 0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $vgpr14_vgpr15
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr14_vgpr15
+  ; CHECK:   [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY1]](<2 x s32>), [[C]](s32)
+  ; CHECK:   $vgpr0 = COPY [[EVEC]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0
+  %1 = tail call <2 x float> asm sideeffect "v_add_f64 $0, 0, 0", "={v[14:15]}"() nounwind
+  %2 = extractelement <2 x float> %1, i32 0
+  ret float %2
+}
+
+define amdgpu_kernel void @test_input_vgpr_imm() {
+  ; CHECK-LABEL: name: test_input_vgpr_imm
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[C]](s32)
+  ; CHECK:   INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 9 /* reguse */, [[COPY]]
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42)
+  ret void
+}
+
+define amdgpu_kernel void @test_input_sgpr_imm() {
+  ; CHECK-LABEL: name: test_input_sgpr_imm
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK:   [[COPY:%[0-9]+]]:sreg_32 = COPY [[C]](s32)
+  ; CHECK:   INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 9 /* reguse */, [[COPY]]
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42)
+  ret void
+}
+
+define amdgpu_kernel void @test_input_imm() {
+  ; CHECK-LABEL: name: test_input_imm
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   INLINEASM &"s_mov_b32 s0, $0", 9 /* sideeffect mayload attdialect */, 13 /* imm */, 42
+  ; CHECK:   INLINEASM &"s_mov_b64 s[0:1], $0", 9 /* sideeffect mayload attdialect */, 13 /* imm */, 42
+  ; CHECK:   S_ENDPGM 0
+  call void asm sideeffect "s_mov_b32 s0, $0", "i"(i32 42)
+  call void asm sideeffect "s_mov_b64 s[0:1], $0", "i"(i64 42)
+  ret void
+}
+
+define float @test_input_vgpr(i32 %src) nounwind {
+  ; CHECK-LABEL: name: test_input_vgpr
+  ; CHECK: bb.1.entry:
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
+  ; CHECK:   INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 9 /* reguse */, [[COPY2]]
+  ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK:   $vgpr0 = COPY [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
+entry:
+  %0 = tail call float asm "v_add_f32 $0, 1.0, $1", "=v,v"(i32 %src) nounwind
+  ret float %0
+}
+
+define i32 @test_memory_constraint(i32 addrspace(3)* %a) nounwind {
+  ; CHECK-LABEL: name: test_memory_constraint
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3)
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY %2
+  ; CHECK:   $vgpr0 = COPY [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]], implicit $vgpr0
+  %1 = tail call i32 asm "ds_read_b32 $0, $1", "=v,*m"(i32 addrspace(3)* %a)
+  ret i32 %1
+}
+
+!0 = !{i32 70}

From af852d6f3638fead78af9048503a67cf7b308eb6 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Fri, 29 May 2020 13:49:38 -0700
Subject: [PATCH 579/770] Revert "Process gep (phi ptr1, ptr2) in SROA"

This reverts commit f66a43c11a7899d5c578b80d7f154abcea3b8d8e.
---
 llvm/lib/Transforms/Scalar/SROA.cpp  |  51 ----
 llvm/test/Transforms/SROA/phi-gep.ll | 364 ---------------------------
 2 files changed, 415 deletions(-)
 delete mode 100644 llvm/test/Transforms/SROA/phi-gep.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 0440805d484a1..1d486a3e74fd1 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3488,62 +3488,11 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
     return true;
   }
 
-  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
-  bool foldGEPPhi(GetElementPtrInst &GEPI) {
-    if (!GEPI.hasAllConstantIndices())
-      return false;
-
-    PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
-    if (llvm::any_of(PHI->incoming_values(), [](Value *In)
-          { Instruction *I = dyn_cast<Instruction>(In);
-            return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
-                   !I->getParent()->isLegalToHoistInto();
-          }))
-      return false;
-
-    LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):"
-                      << "\n    original: " << *PHI
-                      << "\n              " << GEPI
-                      << "\n          to: ");
-
-    SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
-    bool IsInBounds = GEPI.isInBounds();
-    IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
-    PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
-                                          PHI->getNumIncomingValues(),
-                                          PHI->getName() + ".sroa.phi");
-    for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
-      Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
-
-      IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
-      Value *NewVal = IsInBounds
-          ? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
-          : B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
-      NewPN->addIncoming(NewVal, PHI->getIncomingBlock(I));
-      if (auto *I = dyn_cast<Instruction>(NewVal)) {
-        visit(I);
-      }
-    }
-
-    GEPI.replaceAllUsesWith(NewPN);
-    GEPI.eraseFromParent();
-
-    LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
-                 dbgs() << "\n              " << *In;
-               dbgs() << "\n              " << *NewPN << '\n');
-
-    return true;
-  }
-
   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
     if (isa<SelectInst>(GEPI.getPointerOperand()) &&
         foldGEPSelect(GEPI))
       return true;
 
-    if (isa<PHINode>(GEPI.getPointerOperand()) &&
-        foldGEPPhi(GEPI))
-      return true;
-
     enqueueUsers(GEPI);
     return false;
   }
diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
deleted file mode 100644
index 72d07ed84a656..0000000000000
--- a/llvm/test/Transforms/SROA/phi-gep.ll
+++ /dev/null
@@ -1,364 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -sroa < %s | FileCheck %s
-
-%pair = type { i32, i32 }
-
-define i32 @test_sroa_phi_gep(i1 %cond) {
-; CHECK-LABEL: @test_sroa_phi_gep(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 2, [[IF_THEN]] ]
-; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %b = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
-  store i32 1, i32* %gep_a, align 4
-  store i32 2, i32* %gep_b, align 4
-  br i1 %cond, label %if.then, label %end
-
-if.then:
-  br label %end
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
-  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-}
-
-define i32 @test_sroa_phi_gep_non_inbound(i1 %cond) {
-; CHECK-LABEL: @test_sroa_phi_gep_non_inbound(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 2, [[IF_THEN]] ]
-; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %b = alloca %pair, align 4
-  %gep_a = getelementptr %pair, %pair* %a, i32 0, i32 1
-  %gep_b = getelementptr %pair, %pair* %b, i32 0, i32 1
-  store i32 1, i32* %gep_a, align 4
-  store i32 2, i32* %gep_b, align 4
-  br i1 %cond, label %if.then, label %end
-
-if.then:
-  br label %end
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
-  %gep = getelementptr %pair, %pair* %phi, i32 0, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-}
-
-define i32 @test_sroa_phi_gep_undef(i1 %cond) {
-; CHECK-LABEL: @test_sroa_phi_gep_undef(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ undef, [[IF_THEN]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-;
-entry:
-  %a = alloca %pair, align 4
-  br i1 %cond, label %if.then, label %end
-
-if.then:
-  br label %end
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ undef, %if.then ]
-  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-}
-
-@g = global %pair zeroinitializer, align 4
-
-define i32 @test_sroa_phi_gep_global(i1 %cond) {
-; CHECK-LABEL: @test_sroa_phi_gep_global(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
-; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ @g, [[IF_THEN]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  store i32 1, i32* %gep_a, align 4
-  br i1 %cond, label %if.then, label %end
-
-if.then:
-  br label %end
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ @g, %if.then ]
-  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-}
-
-define i32 @test_sroa_phi_gep_arg_phi_inspt(i1 %cond) {
-; CHECK-LABEL: @test_sroa_phi_gep_arg_phi_inspt(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
-; CHECK:       for:
-; CHECK-NEXT:    [[PHI_INSPT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[I]] = add i32 [[PHI_INSPT]], 1
-; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
-; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 2, [[FOR]] ]
-; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %b = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
-  store i32 1, i32* %gep_a, align 4
-  store i32 2, i32* %gep_b, align 4
-  br i1 %cond, label %for, label %end
-
-for:
-  %phi_inspt = phi i32 [ 0, %entry ], [ %i, %for ]
-  %i = add i32 %phi_inspt, 1
-  %loop.cond = icmp ult i32 %i, 10
-  br i1 %loop.cond, label %for, label %end
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ %b, %for ]
-  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-}
-
-define i32 @test_sroa_phi_gep_phi_inspt(i1 %cond) {
-; CHECK-LABEL: @test_sroa_phi_gep_phi_inspt(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
-; CHECK-NEXT:    [[B:%.*]] = alloca [[PAIR]], align 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
-; CHECK-NEXT:    store i32 2, i32* [[GEP_B]], align 4
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
-; CHECK:       for:
-; CHECK-NEXT:    [[PHI_IN:%.*]] = phi %pair* [ null, [[ENTRY:%.*]] ], [ [[B]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI_INSPT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[I:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[I]] = add i32 [[PHI_INSPT]], 1
-; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
-; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY]] ], [ [[PHI_IN]], [[FOR]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %b = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
-  store i32 1, i32* %gep_a, align 4
-  store i32 2, i32* %gep_b, align 4
-  br i1 %cond, label %for, label %end
-
-for:
-  %phi_in = phi %pair * [ null, %entry ], [ %b, %for ]
-  %phi_inspt = phi i32 [ 0, %entry ], [ %i, %for ]
-  %i = add i32 %phi_inspt, 1
-  %loop.cond = icmp ult i32 %i, 10
-  br i1 %loop.cond, label %for, label %end
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ %phi_in, %for ]
-  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-}
-
-define i32 @test_sroa_gep_phi_gep(i1 %cond) {
-; CHECK-LABEL: @test_sroa_gep_phi_gep(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
-; CHECK:       for:
-; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32* [ [[A_SROA_0]], [[ENTRY]] ], [ [[GEP_FOR:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
-; CHECK-NEXT:    [[GEP_FOR]] = getelementptr inbounds i32, i32* [[PHI]], i32 0
-; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
-; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI_END:%.*]] = phi i32* [ [[A_SROA_0]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  br i1 %cond, label %for, label %end
-
-for:
-  %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
-  %phi = phi i32* [ %gep_a, %entry], [ %gep_for, %for ]
-  %i = add i32 %phi_i, 1
-  %gep_for = getelementptr inbounds i32, i32* %phi, i32 0
-  %loop.cond = icmp ult i32 %i, 10
-  br i1 %loop.cond, label %for, label %end
-
-end:
-  %phi_end = phi i32* [ %gep_a, %entry], [ %phi, %for ]
-  %load = load i32, i32* %phi_end, align 4
-  ret i32 %load
-}
-
-define i32 @test_sroa_gep_cast_phi_gep(i1 %cond) {
-; CHECK-LABEL: @test_sroa_gep_cast_phi_gep(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST2:%.*]] = bitcast i32* [[A_SROA_0]] to float*
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
-; CHECK:       for:
-; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_2:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI_SROA_PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_2_SROA_GEP:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
-; CHECK-NEXT:    [[GEP_FOR_1:%.*]] = bitcast float* [[PHI_SROA_PHI]] to i32*
-; CHECK-NEXT:    [[GEP_FOR_2]] = bitcast i32* [[GEP_FOR_1]] to float*
-; CHECK-NEXT:    [[GEP_FOR_2_SROA_GEP]] = getelementptr inbounds float, float* [[GEP_FOR_2]], i32 0
-; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
-; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI_END:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST2]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI_END_1:%.*]] = bitcast float* [[PHI_END]] to i32*
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END_1]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  %gep_a.1 = bitcast i32* %gep_a to float*
-  br i1 %cond, label %for, label %end
-
-for:
-  %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
-  %phi = phi float* [ %gep_a.1, %entry], [ %gep_for.2, %for ]
-  %i = add i32 %phi_i, 1
-  %gep_for = getelementptr inbounds float, float* %phi, i32 0
-  %gep_for.1 = bitcast float* %gep_for to i32*
-  %gep_for.2 = bitcast i32* %gep_for.1 to float*
-  %loop.cond = icmp ult i32 %i, 10
-  br i1 %loop.cond, label %for, label %end
-
-end:
-  %phi_end = phi float* [ %gep_a.1, %entry], [ %phi, %for ]
-  %phi_end.1 = bitcast float* %phi_end to i32*
-  %load = load i32, i32* %phi_end.1, align 4
-  ret i32 %load
-}
-
-define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality i32 (...)* @__gxx_personality_v0 {
-; CHECK-LABEL: @test_sroa_invoke_phi_gep(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[CALL:%.*]], label [[END:%.*]]
-; CHECK:       call:
-; CHECK-NEXT:    [[B:%.*]] = invoke %pair* @foo()
-; CHECK-NEXT:    to label [[END]] unwind label [[INVOKE_CATCH:%.*]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[CALL]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-; CHECK:       invoke_catch:
-; CHECK-NEXT:    [[RES:%.*]] = landingpad { i8*, i32 }
-; CHECK-NEXT:    catch i8* null
-; CHECK-NEXT:    ret i32 0
-;
-entry:
-  %a = alloca %pair, align 4
-  br i1 %cond, label %call, label %end
-
-call:
-  %b = invoke %pair* @foo()
-  to label %end unwind label %invoke_catch
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ %b, %call ]
-  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-
-invoke_catch:
-  %res = landingpad { i8*, i32 }
-  catch i8* null
-  ret i32 0
-}
-
-define i32 @test_sroa_phi_gep_nonconst_idx(i1 %cond, i32 %idx) {
-; CHECK-LABEL: @test_sroa_phi_gep_nonconst_idx(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
-; CHECK-NEXT:    [[B:%.*]] = alloca [[PAIR]], align 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
-; CHECK-NEXT:    store i32 2, i32* [[GEP_B]], align 4
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 [[IDX:%.*]], i32 1
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %b = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
-  store i32 1, i32* %gep_a, align 4
-  store i32 2, i32* %gep_b, align 4
-  br i1 %cond, label %if.then, label %end
-
-if.then:
-  br label %end
-
-end:
-  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
-  %gep = getelementptr inbounds %pair, %pair* %phi, i32 %idx, i32 1
-  %load = load i32, i32* %gep, align 4
-  ret i32 %load
-}
-
-declare %pair* @foo()
-
-declare i32 @__gxx_personality_v0(...)

From 8d8f8b353175b50dfdb1e2d5f3c0ada0a4ad25ed Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 29 May 2020 13:56:44 -0700
Subject: [PATCH 580/770] [lldb/Test] Don't leak forked processes on Darwin

We are leaking forked processes on macOS because the cleanup function
was checking the existence of /proc/pid which does not exist on macOS.
I've changed the code to be platform agnostic.
---
 lldb/packages/Python/lldbsuite/test/lldbtest.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 0dee4f217c801..04ba7ea02d095 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -884,8 +884,10 @@ def cleanupSubprocesses(self):
         del self.subprocesses[:]
         # Ensure any forked processes are cleaned up
         for pid in self.forkedProcessPids:
-            if os.path.exists("/proc/" + str(pid)):
+            try:
                 os.kill(pid, signal.SIGTERM)
+            except OSError:
+                pass
 
     def spawnSubprocess(self, executable, args=[], install_remote=True):
         """ Creates a subprocess.Popen object with the specified executable and arguments,

From 2d7fdab8e39fa3108f05cb3ff4af1a057f501c87 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 29 May 2020 11:16:23 -0700
Subject: [PATCH 581/770] [CMake] Change target 'check' from 'check-llvm' to
 'check-all'

Reviewed By: echristo, mehdi_amini

Differential Revision: https://reviews.llvm.org/D80823
---
 llvm/test/CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index c765943a9b36e..7e6c66e249fce 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -202,8 +202,7 @@ add_lit_testsuites(LLVM ${CMAKE_CURRENT_SOURCE_DIR}
   DEPENDS ${LLVM_TEST_DEPENDS}
   )
 
-# Setup a legacy alias for 'check-llvm'. This will likely change to be an
-# alias for 'check-all' at some point in the future.
+# Setup an alias for 'check-all'.
 add_custom_target(check)
-add_dependencies(check check-llvm)
+add_dependencies(check check-all)
 set_target_properties(check PROPERTIES FOLDER "Tests")

From 881c5eef98a6c3fa59907ba2eefa6e8d086394a6 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 29 May 2020 14:22:03 -0700
Subject: [PATCH 582/770] [ELF] Add -z rel and -z rela

LLD supports both REL and RELA for static relocations, but emits either
of REL and RELA for dynamic relocations. The relocation entry format is
specified by each psABI.

musl ld.so supports both REL and RELA. For such ld.so implementations,
REL (.rel.dyn .rel.plt) has size benefits even if the psABI chooses RELA:
sizeof(Elf64_Rel)=16 < sizeof(Elf64_Rela)=24.

* COPY, GLOB_DAT and J[U]MP_SLOT always have 0 addend. A ld.so
  implementation does not need to read the implicit addend.
  REL is strictly better.
* A RELATIVE has a non-zero addend. Such relocations can be packed
  compactly with the RELR relocation entry format, which is out of scope
  of this patch.
* For other dynamic relocation types (e.g. symbolic relocation R_X86_64_64),
  a ld.so implementation needs to read the implicit addend. REL may have
  minor performance impact, because reading implicit addends forces
  random access reads instead of being able to blast out a bunch of
  writes while chasing the relocation array.

This patch adds -z rel and -z rela to change the relocation entry format
for dynamic relocations. I have tested that a -z rel produced x86-64
executable works with musl ld.so

-z rela may be useful for debugging purposes on processors whose psABIs
specify REL as the canonical format: addends can be easily read by a tool.

Reviewed By: grimar, mcgrathr

Differential Revision: https://reviews.llvm.org/D80496
---
 lld/ELF/Driver.cpp               | 45 +++++++++++++++--------
 lld/docs/ld.lld.1                |  6 +++
 lld/test/ELF/i386-zrel-zrela.s   | 63 ++++++++++++++++++++++++++++++++
 lld/test/ELF/x86-64-zrel-zrela.s | 58 +++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 15 deletions(-)
 create mode 100644 lld/test/ELF/i386-zrel-zrela.s
 create mode 100644 lld/test/ELF/x86-64-zrel-zrela.s

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index a38d6542f9884..0019bb8cfdb9b 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -422,11 +422,11 @@ static bool isKnownZFlag(StringRef s) {
          s == "nodelete" || s == "nodlopen" || s == "noexecstack" ||
          s == "nognustack" || s == "nokeep-text-section-prefix" ||
          s == "norelro" || s == "noseparate-code" || s == "notext" ||
-         s == "now" || s == "origin" || s == "pac-plt" || s == "relro" ||
-         s == "retpolineplt" || s == "rodynamic" || s == "shstk" ||
-         s == "text" || s == "undefs" || s == "wxneeded" ||
-         s.startswith("common-page-size=") || s.startswith("max-page-size=") ||
-         s.startswith("stack-size=");
+         s == "now" || s == "origin" || s == "pac-plt" || s == "rel" ||
+         s == "rela" || s == "relro" || s == "retpolineplt" ||
+         s == "rodynamic" || s == "shstk" || s == "text" || s == "undefs" ||
+         s == "wxneeded" || s.startswith("common-page-size=") ||
+         s.startswith("max-page-size=") || s.startswith("stack-size=");
 }
 
 // Report an error for an unknown -z option.
@@ -842,6 +842,22 @@ static std::vector<StringRef> getSymbolOrderingFile(MemoryBufferRef mb) {
   return names.takeVector();
 }
 
+static bool getIsRela(opt::InputArgList &args) {
+  // If -z rel or -z rela is specified, use the last option.
+  for (auto *arg : args.filtered_reverse(OPT_z)) {
+    StringRef s(arg->getValue());
+    if (s == "rel")
+      return false;
+    if (s == "rela")
+      return true;
+  }
+
+  // Otherwise use the psABI defined relocation entry format.
+  uint16_t m = config->emachine;
+  return m == EM_AARCH64 || m == EM_AMDGPU || m == EM_HEXAGON || m == EM_PPC ||
+         m == EM_PPC64 || m == EM_RISCV || m == EM_X86_64;
+}
+
 static void parseClangOption(StringRef opt, const Twine &msg) {
   std::string err;
   raw_string_ostream os(err);
@@ -1204,20 +1220,19 @@ static void setConfigs(opt::InputArgList &args) {
 
   // ELF defines two different ways to store relocation addends as shown below:
   //
-  //  Rel:  Addends are stored to the location where relocations are applied.
+  //  Rel: Addends are stored to the location where relocations are applied. It
+  //  cannot pack the full range of addend values for all relocation types, but
+  //  this only affects relocation types that we don't support emitting as
+  //  dynamic relocations (see getDynRel).
   //  Rela: Addends are stored as part of relocation entry.
   //
   // In other words, Rela makes it easy to read addends at the price of extra
-  // 4 or 8 byte for each relocation entry. We don't know why ELF defined two
-  // different mechanisms in the first place, but this is how the spec is
-  // defined.
+  // 4 or 8 bytes for each relocation entry.
   //
-  // You cannot choose which one, Rel or Rela, you want to use. Instead each
-  // ABI defines which one you need to use. The following expression expresses
-  // that.
-  config->isRela = m == EM_AARCH64 || m == EM_AMDGPU || m == EM_HEXAGON ||
-                   m == EM_PPC || m == EM_PPC64 || m == EM_RISCV ||
-                   m == EM_X86_64;
+  // We pick the format for dynamic relocations according to the psABI for each
+  // processor, but a contrary choice can be made if the dynamic loader
+  // supports it.
+  config->isRela = getIsRela(args);
 
   // If the output uses REL relocations we must store the dynamic relocation
   // addends to the output sections. We also store addends for RELA relocations
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 298da173d58e3..781bff1e970c0 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -720,6 +720,12 @@ processing.
 .It Cm pac-plt
 AArch64 only, use pointer authentication in PLT.
 .Pp
+.It Cm rel
+Use REL format for dynamic relocations.
+.Pp
+.It Cm rela
+Use RELA format for dynamic relocations.
+.Pp
 .It Cm retpolineplt
 Emit retpoline format PLT entries as a mitigation for CVE-2017-5715.
 .Pp
diff --git a/lld/test/ELF/i386-zrel-zrela.s b/lld/test/ELF/i386-zrel-zrela.s
new file mode 100644
index 0000000000000..61b9e4122f0f5
--- /dev/null
+++ b/lld/test/ELF/i386-zrel-zrela.s
@@ -0,0 +1,63 @@
+# REQUIRES: x86
+## The i386 psABI uses Elf32_Rel relocation entries. We produce
+## Elf32_Rel dynamic relocations by default, but can use Elf32_Rela with -z rela.
+
+# RUN: llvm-mc -filetype=obj -triple=i386 %s -o %t.o
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-readobj -d -r -x .data %t.so | FileCheck --check-prefix=REL %s
+# RUN: ld.lld -shared -z rel %t.o -o %t1.so
+# RUN: llvm-readobj -d -r -x .data %t1.so | FileCheck --check-prefix=REL %s
+
+# REL:      REL      {{.*}}
+# REL-NEXT: RELSZ    32 (bytes)
+# REL-NEXT: RELENT   8 (bytes)
+# REL-NEXT: RELCOUNT 1
+# REL-NEXT: JMPREL   {{.*}}
+# REL-NEXT: PLTRELSZ 8 (bytes)
+# REL-NEXT: PLTGOT   {{.*}}
+# REL-NEXT: PLTREL   REL{{$}}
+# REL:      .rel.dyn {
+# REL-NEXT:   R_386_RELATIVE - 0x0
+# REL-NEXT:   R_386_GLOB_DAT func 0x0
+# REL-NEXT:   R_386_TLS_TPOFF tls 0x0
+# REL-NEXT:   R_386_32 _start 0x0
+# REL-NEXT: }
+# REL-NEXT: .rel.plt {
+# REL-NEXT:   R_386_JUMP_SLOT func 0x0
+# REL-NEXT: }
+
+# REL:      Hex dump of section '.data':
+# REL-NEXT: 0x000042cc cc420000 2a000000
+
+# RUN: ld.lld -shared -z rel -z rela %t.o -o %t2.so
+# RUN: llvm-readobj -d -r %t2.so | FileCheck --check-prefix=RELA %s
+
+# RELA:      RELA      {{.*}}
+# RELA-NEXT: RELASZ    48 (bytes)
+# RELA-NEXT: RELAENT   12 (bytes)
+# RELA-NEXT: RELACOUNT 1
+# RELA-NEXT: JMPREL    {{.*}}
+# RELA-NEXT: PLTRELSZ  12 (bytes)
+# RELA-NEXT: PLTGOT    {{.*}}
+# RELA-NEXT: PLTREL    RELA
+# RELA:      .rela.dyn {
+# RELA-NEXT:   R_386_RELATIVE - 0x42EC
+# RELA-NEXT:   R_386_GLOB_DAT func 0x0
+# RELA-NEXT:   R_386_TLS_TPOFF tls 0x2A
+# RELA-NEXT:   R_386_32 _start 0x2A
+# RELA-NEXT: }
+# RELA-NEXT: .rela.plt {
+# RELA-NEXT:   R_386_JUMP_SLOT func 0x0
+# RELA-NEXT: }
+
+.globl _start
+_start:
+  call func@PLT
+  movl func@GOT(%eax), %eax
+
+.section .text1,"awx"
+  movl %gs:tls@NTPOFF+42, %eax
+
+.data
+  .long .data
+  .long _start+42
diff --git a/lld/test/ELF/x86-64-zrel-zrela.s b/lld/test/ELF/x86-64-zrel-zrela.s
new file mode 100644
index 0000000000000..62b154fe7e985
--- /dev/null
+++ b/lld/test/ELF/x86-64-zrel-zrela.s
@@ -0,0 +1,58 @@
+# REQUIRES: x86
+## The x86-64 psABI uses Elf64_Rela relocation entries. We produce
+## Elf64_Rela dynamic relocations by default, but can use Elf64_Rel with -z rel.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-readobj -d -r %t.so | FileCheck --check-prefix=RELA %s
+# RUN: ld.lld -shared -z rela %t.o -o %t1.so
+# RUN: llvm-readobj -d -r %t1.so | FileCheck --check-prefix=RELA %s
+
+# RELA:      RELA      {{.*}}
+# RELA-NEXT: RELASZ    72 (bytes)
+# RELA-NEXT: RELAENT   24 (bytes)
+# RELA-NEXT: RELACOUNT 1
+# RELA-NEXT: JMPREL    {{.*}}
+# RELA-NEXT: PLTRELSZ  24 (bytes)
+# RELA-NEXT: PLTGOT    {{.*}}
+# RELA-NEXT: PLTREL    RELA
+# RELA:      .rela.dyn {
+# RELA-NEXT:   R_X86_64_RELATIVE - 0x3428
+# RELA-NEXT:   R_X86_64_GLOB_DAT func 0x0
+# RELA-NEXT:   R_X86_64_64 _start 0x2A
+# RELA-NEXT: }
+# RELA-NEXT: .rela.plt {
+# RELA-NEXT:   R_X86_64_JUMP_SLOT func 0x0
+# RELA-NEXT: }
+
+# RUN: ld.lld -shared -z rela -z rel %t.o -o %t2.so
+# RUN: llvm-readobj -d -r -x .data %t2.so | FileCheck --check-prefix=REL %s
+
+# REL:      REL      {{.*}}
+# REL-NEXT: RELSZ    48 (bytes)
+# REL-NEXT: RELENT   16 (bytes)
+# REL-NEXT: RELCOUNT 1
+# REL-NEXT: JMPREL   {{.*}}
+# REL-NEXT: PLTRELSZ 16 (bytes)
+# REL-NEXT: PLTGOT   {{.*}}
+# REL-NEXT: PLTREL   REL{{$}}
+# REL:      .rel.dyn {
+# REL-NEXT:   R_X86_64_RELATIVE - 0x0
+# REL-NEXT:   R_X86_64_GLOB_DAT func 0x0
+# REL-NEXT:   R_X86_64_64 _start 0x0
+# REL-NEXT: }
+# REL-NEXT: .rel.plt {
+# REL-NEXT:   R_X86_64_JUMP_SLOT func 0x0
+# REL-NEXT: }
+
+# REL:      Hex dump of section '.data':
+# REL-NEXT: 0x00003408 08340000 00000000 2a000000 00000000
+
+.globl _start
+_start:
+  call func@PLT
+  movq func@GOTPCREL(%rip), %rax
+
+.data
+  .quad .data
+  .quad _start+42

From e75325cfc397c562964dd39b47198d73c9e9602a Mon Sep 17 00:00:00 2001
From: Diego Caballero <diego.caballero@intel.com>
Date: Fri, 29 May 2020 14:31:03 -0700
Subject: [PATCH 583/770] [mlir][Affine] Minor clean-up of D79829

Addressing D79829 post-commit comments. Minor changes.

Reviewed By: rriddle

Differential Revision: https://reviews.llvm.org/D80814
---
 .../Affine/IR/AffineMemoryOpInterfaces.h      |  6 ++--
 .../Affine/IR/AffineMemoryOpInterfaces.td     | 28 ++++++++++---------
 .../mlir/Dialect/Affine/IR/CMakeLists.txt     |  7 +----
 mlir/lib/Analysis/AffineAnalysis.cpp          |  9 +++---
 .../Affine/IR/AffineMemoryOpInterfaces.cpp    |  2 +-
 5 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h b/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h
index f42fc256befa0..1f4fdb6126fed 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_INTERFACES_AFFINEMEMORYOPINTERFACES_H_
-#define MLIR_INTERFACES_AFFINEMEMORYOPINTERFACES_H_
+#ifndef MLIR_DIALECT_AFFINE_IR_AFFINEMEMORYOPDIALECT_H_
+#define MLIR_DIALECT_AFFINE_IR_AFFINEMEMORYOPDIALECT_H_
 
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/OpDefinition.h"
@@ -21,4 +21,4 @@ namespace mlir {
 #include "mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h.inc"
 } // namespace mlir
 
-#endif // MLIR_INTERFACES_AFFINEMEMORYOPINTERFACES_H_
+#endif // MLIR_DIALECT_AFFINE_IR_AFFINEMEMORYOPDIALECT_H_
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td b/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td
index 8738000d8d5fc..dd174da447312 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_AFFINEMEMORYOPINTERFACES
-#define MLIR_AFFINEMEMORYOPINTERFACES
+#ifndef AFFINEMEMORYOPINTERFACES
+#define AFFINEMEMORYOPINTERFACES
 
 include "mlir/IR/OpBase.td"
 
@@ -23,7 +23,7 @@ def AffineReadOpInterface : OpInterface<"AffineReadOpInterface"> {
 
   let methods = [
     InterfaceMethod<
-      /*desc=*/[{ Returns the memref operand to read from. }],
+      /*desc=*/"Returns the memref operand to read from.",
       /*retTy=*/"Value",
       /*methodName=*/"getMemRef",
       /*args=*/(ins),
@@ -34,7 +34,7 @@ def AffineReadOpInterface : OpInterface<"AffineReadOpInterface"> {
       }]
     >,
     InterfaceMethod<
-      /*desc=*/[{ Returns the type of the memref operand. }],
+      /*desc=*/"Returns the type of the memref operand.",
       /*retTy=*/"MemRefType",
       /*methodName=*/"getMemRefType",
       /*args=*/(ins),
@@ -45,7 +45,7 @@ def AffineReadOpInterface : OpInterface<"AffineReadOpInterface"> {
       }]
     >,
     InterfaceMethod<
-      /*desc=*/[{ Returns affine map operands. }],
+      /*desc=*/"Returns affine map operands.",
       /*retTy=*/"Operation::operand_range",
       /*methodName=*/"getMapOperands",
       /*args=*/(ins),
@@ -56,8 +56,9 @@ def AffineReadOpInterface : OpInterface<"AffineReadOpInterface"> {
       }]
     >,
     InterfaceMethod<
-      /*desc=*/[{ Returns the affine map used to index the memref for this
-                  operation. }],
+      /*desc=*/[{
+        Returns the affine map used to index the memref for this operation.
+      }],
       /*retTy=*/"AffineMap",
       /*methodName=*/"getAffineMap",
       /*args=*/(ins),
@@ -78,7 +79,7 @@ def AffineWriteOpInterface : OpInterface<"AffineWriteOpInterface"> {
 
   let methods = [
     InterfaceMethod<
-      /*desc=*/[{ Returns the memref operand to write to. }],
+      /*desc=*/"Returns the memref operand to write to.",
       /*retTy=*/"Value",
       /*methodName=*/"getMemRef",
       /*args=*/(ins),
@@ -89,7 +90,7 @@ def AffineWriteOpInterface : OpInterface<"AffineWriteOpInterface"> {
       }]
     >,
     InterfaceMethod<
-      /*desc=*/[{ Returns the type of the memref operand. }],
+      /*desc=*/"Returns the type of the memref operand.",
       /*retTy=*/"MemRefType",
       /*methodName=*/"getMemRefType",
       /*args=*/(ins),
@@ -100,7 +101,7 @@ def AffineWriteOpInterface : OpInterface<"AffineWriteOpInterface"> {
       }]
     >,
     InterfaceMethod<
-      /*desc=*/[{ Returns affine map operands. }],
+      /*desc=*/"Returns affine map operands.",
       /*retTy=*/"Operation::operand_range",
       /*methodName=*/"getMapOperands",
       /*args=*/(ins),
@@ -111,8 +112,9 @@ def AffineWriteOpInterface : OpInterface<"AffineWriteOpInterface"> {
       }]
     >,
     InterfaceMethod<
-      /*desc=*/[{ Returns the affine map used to index the memref for this
-                  operation. }],
+      /*desc=*/[{
+        Returns the affine map used to index the memref for this operation.
+      }],
       /*retTy=*/"AffineMap",
       /*methodName=*/"getAffineMap",
       /*args=*/(ins),
@@ -125,4 +127,4 @@ def AffineWriteOpInterface : OpInterface<"AffineWriteOpInterface"> {
   ];
 }
 
-#endif // MLIR_AFFINEMEMORYOPINTERFACES
+#endif // AFFINEMEMORYOPINTERFACES
diff --git a/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt
index 77806274f14c4..f0bb4caeec388 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt
@@ -1,10 +1,5 @@
 add_mlir_dialect(AffineOps affine)
 add_mlir_doc(AffineOps -gen-op-doc AffineOps Dialects/)
 
-set(LLVM_TARGET_DEFINITIONS AffineMemoryOpInterfaces.td)
-mlir_tablegen(AffineMemoryOpInterfaces.h.inc -gen-op-interface-decls)
-mlir_tablegen(AffineMemoryOpInterfaces.cpp.inc -gen-op-interface-defs)
-add_public_tablegen_target(MLIRAffineMemoryOpInterfacesIncGen)
-add_dependencies(mlir-generic-headers MLIRAffineMemoryOpInterfacesIncGen)
-
+add_mlir_interface(AffineMemoryOpInterfaces)
 add_dependencies(MLIRAffineOpsIncGen MLIRAffineMemoryOpInterfacesIncGen)
diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp
index 8c4828805882c..ca662c71fe436 100644
--- a/mlir/lib/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Analysis/AffineAnalysis.cpp
@@ -660,12 +660,11 @@ static void computeDirectionVector(
 void MemRefAccess::getAccessMap(AffineValueMap *accessMap) const {
   // Get affine map from AffineLoad/Store.
   AffineMap map;
-  if (auto loadOp = dyn_cast<AffineReadOpInterface>(opInst)) {
+  if (auto loadOp = dyn_cast<AffineReadOpInterface>(opInst))
     map = loadOp.getAffineMap();
-  } else {
-    auto storeOp = cast<AffineWriteOpInterface>(opInst);
-    map = storeOp.getAffineMap();
-  }
+  else
+    map = cast<AffineWriteOpInterface>(opInst).getAffineMap();
+
   SmallVector<Value, 8> operands(indices.begin(), indices.end());
   fullyComposeAffineMapAndOperands(&map, &operands);
   map = simplifyAffineMap(map);
diff --git a/mlir/lib/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp b/mlir/lib/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp
index 6f5861efa9568..33c1b3376d0a4 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp
@@ -1,4 +1,4 @@
-//===- AffineMemoryOpInterfaces.cpp - Loop-like operations in MLIR --------===//
+//===- AffineMemoryOpInterfaces.cpp ---------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From c8f1aca316c2ee02347752079b86ba2322a6cf72 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 14:34:56 -0700
Subject: [PATCH 584/770] [SVE] Eliminate calls to default-false
 VectorType::get() from Utils

Reviewers: efriedma, c-rhodes, sdesmalen, xbolva00

Reviewed By: c-rhodes

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80337
---
 llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c32db981ee7c2..6ad8bc6e09426 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2107,7 +2107,7 @@ static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
     // x86_64 can't use {float, float} since that would be returned in both
     // xmm0 and xmm1, which isn't what a real struct would do.
     ResTy = T.getArch() == Triple::x86_64
-                ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+                ? static_cast<Type *>(FixedVectorType::get(ArgTy, 2))
                 : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
   } else {
     Name = "__sincospi_stret";

From e4d2037a5ccb76e446dc13803b486e2f9b0c7f29 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 15:14:48 -0700
Subject: [PATCH 585/770] [SVE] Eliminate calls to default-false
 VectorType::get() from Instrumentation

Reviewers: efriedma, fpetrogalli, kmclaughlin

Reviewed By: fpetrogalli

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80335
---
 llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 2 +-
 llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp   | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index a67107cc22710..795ef13919c1c 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1409,7 +1409,7 @@ void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, Align Alignment,
   const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
   uint64_t Offset = 0;
   if (Size >= ShadowVecSize) {
-    VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize);
+    auto *ShadowVecTy = FixedVectorType::get(DFS.ShadowTy, ShadowVecSize);
     Value *ShadowVec = UndefValue::get(ShadowVecTy);
     for (unsigned i = 0; i != ShadowVecSize; ++i) {
       ShadowVec = IRB.CreateInsertElement(
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 53661b5e1bbfe..48a910c4ef217 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1374,8 +1374,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     const DataLayout &DL = F.getParent()->getDataLayout();
     if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
       uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
-      return VectorType::get(IntegerType::get(*MS.C, EltSize),
-                             VT->getNumElements());
+      return FixedVectorType::get(IntegerType::get(*MS.C, EltSize),
+                                  VT->getNumElements());
     }
     if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
       return ArrayType::get(getShadowTy(AT->getElementType()),
@@ -2756,8 +2756,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     const unsigned X86_MMXSizeInBits = 64;
     assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
            "Illegal MMX vector element size");
-    return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
-                           X86_MMXSizeInBits / EltSizeInBits);
+    return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+                                X86_MMXSizeInBits / EltSizeInBits);
   }
 
   // Returns a signed counterpart for an (un)signed-saturate-and-pack

From fbac9ce226d7a27e418fdeac72a0b3c9f2c48742 Mon Sep 17 00:00:00 2001
From: Eric Schweitz <eschweitz@nvidia.com>
Date: Fri, 22 May 2020 10:18:26 -0700
Subject: [PATCH 586/770] [flang] Batch together the changes to the PFT
 intermediate data structure for upstreaming to llvm-project.

These files have had many changes since they were originally upstreamed.
Some of the changes are cosmetic.  Most of the functional changes were
done to support the lowering of control-flow syntax from the front-end
parse trees to the FIR dialect.

This patch is meant to be a reviewable size. The functionality it
provides will be used by code yet to be upstreamed in lowering.

review comments:

[review D80449][NFC] make PFT ParentVariant a ReferenceVariant

ReferenceVariant had to be slightly updated to also support
non constant references (which is required for ParentType).

[review D80449] extend Variable implementation beyond a comment
---
 flang/include/flang/Lower/PFTBuilder.h |  598 ++++++-----
 flang/include/flang/Lower/Utils.h      |   31 +
 flang/include/flang/Semantics/symbol.h |   27 +
 flang/lib/Lower/PFTBuilder.cpp         | 1296 ++++++++++++++++--------
 flang/test/Lower/pre-fir-tree01.f90    |    6 +-
 flang/test/Lower/pre-fir-tree02.f90    |   48 +-
 flang/test/Lower/pre-fir-tree03.f90    |   12 +-
 flang/test/Lower/pre-fir-tree04.f90    |   10 +-
 flang/tools/f18/f18.cpp                |    3 +-
 9 files changed, 1310 insertions(+), 721 deletions(-)
 create mode 100644 flang/include/flang/Lower/Utils.h

diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h
index 733027cc425d8..852700b8c0b15 100644
--- a/flang/include/flang/Lower/PFTBuilder.h
+++ b/flang/include/flang/Lower/PFTBuilder.h
@@ -1,35 +1,35 @@
-//===-- include/flang/Lower/PFTBuilder.h ------------------------*- C++ -*-===//
+//===-- Lower/PFTBuilder.h -- PFT builder -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// PFT (Pre-FIR Tree) interface.
+//
+//===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_LOWER_PFT_BUILDER_H_
-#define FORTRAN_LOWER_PFT_BUILDER_H_
+#ifndef FORTRAN_LOWER_PFTBUILDER_H
+#define FORTRAN_LOWER_PFTBUILDER_H
 
+#include "flang/Common/reference.h"
 #include "flang/Common/template.h"
 #include "flang/Parser/parse-tree.h"
-#include <memory>
-
-/// Build a light-weight tree over the parse-tree to help with lowering to FIR.
-/// It is named Pre-FIR Tree (PFT) to underline it has no other usage than
-/// helping lowering to FIR.
-/// The PFT will capture pointers back into the parse tree, so the parse tree
-/// data structure may <em>not</em> be changed between the construction of the
-/// PFT and all of its uses.
-///
-/// The PFT captures a structured view of the program.  The program is a list of
-/// units.  Function like units will contain lists of evaluations.  Evaluations
-/// are either statements or constructs, where a construct contains a list of
-/// evaluations. The resulting PFT structure can then be used to create FIR.
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/raw_ostream.h"
 
-namespace llvm {
-class raw_ostream;
+namespace mlir {
+class Block;
 }
 
-namespace Fortran::lower {
+namespace Fortran {
+namespace semantics {
+class SemanticsContext;
+class Scope;
+} // namespace semantics
+namespace lower {
 namespace pft {
 
 struct Evaluation;
@@ -40,40 +40,56 @@ struct FunctionLikeUnit;
 // TODO: A collection of Evaluations can obviously be any of the container
 // types; leaving this as a std::list _for now_ because we reserve the right to
 // insert PFT nodes in any order in O(1) time.
-using EvaluationCollection = std::list<Evaluation>;
-
-struct ParentType {
-  template <typename A>
-  ParentType(A &parent) : p{&parent} {}
-  const std::variant<Program *, ModuleLikeUnit *, FunctionLikeUnit *,
-                     Evaluation *>
-      p;
-};
+using EvaluationList = std::list<Evaluation>;
+using LabelEvalMap = llvm::DenseMap<Fortran::parser::Label, Evaluation *>;
+
+/// Provide a variant like container that can hold references. It can hold
+/// constant or mutable references. It is used in the other classes to provide
+/// union of const references to parse-tree nodes.
+template <bool isConst, typename... A>
+class ReferenceVariantBase {
+public:
+  template <typename B>
+  using BaseType = std::conditional_t<isConst, const B, B>;
+  template <typename B>
+  using Ref = common::Reference<BaseType<B>>;
+
+  ReferenceVariantBase() = delete;
+  template <typename B>
+  ReferenceVariantBase(B &b) : u{Ref<B>{b}} {}
+
+  template <typename B>
+  constexpr BaseType<B> &get() const {
+    return std::get<Ref<B>>(u).get();
+  }
+  template <typename B>
+  constexpr BaseType<B> *getIf() const {
+    auto *ptr = std::get_if<Ref<B>>(&u);
+    return ptr ? &ptr->get() : nullptr;
+  }
+  template <typename B>
+  constexpr bool isA() const {
+    return std::holds_alternative<Ref<B>>(u);
+  }
+  template <typename VISITOR>
+  constexpr auto visit(VISITOR &&visitor) const {
+    return std::visit(
+        common::visitors{[&visitor](auto ref) { return visitor(ref.get()); }},
+        u);
+  }
 
-/// Flags to describe the impact of parse-trees nodes on the program
-/// control flow. These annotations to parse-tree nodes are later used to
-/// build the control flow graph when lowering to FIR.
-enum class CFGAnnotation {
-  None,            // Node does not impact control flow.
-  Goto,            // Node acts like a goto on the control flow.
-  CondGoto,        // Node acts like a conditional goto on the control flow.
-  IndGoto,         // Node acts like an indirect goto on the control flow.
-  IoSwitch,        // Node is an IO statement with ERR, END, or EOR specifier.
-  Switch,          // Node acts like a switch on the control flow.
-  Iterative,       // Node creates iterations in the control flow.
-  FirStructuredOp, // Node is a structured loop.
-  Return,          // Node triggers a return from the current procedure.
-  Terminate        // Node terminates the program.
+private:
+  std::variant<Ref<A>...> u;
 };
+template <typename... A>
+using ReferenceVariant = ReferenceVariantBase<true, A...>;
+template <typename... A>
+using MutableReferenceVariant = ReferenceVariantBase<false, A...>;
 
-/// Compiler-generated jump
-///
-/// This is used to convert implicit control-flow edges to explicit form in the
-/// decorated PFT
-struct CGJump {
-  CGJump(Evaluation &to) : target{to} {}
-  Evaluation &target;
-};
+/// ParentVariant is used to provide a reference to the unit a parse-tree node
+/// belongs to. It is a variant of non-nullable pointers.
+using ParentVariant = MutableReferenceVariant<Program, ModuleLikeUnit,
+                                              FunctionLikeUnit, Evaluation>;
 
 /// Classify the parse-tree nodes from ExecutablePartConstruct
 
@@ -95,15 +111,6 @@ using ActionStmts = std::tuple<
 using OtherStmts = std::tuple<parser::FormatStmt, parser::EntryStmt,
                               parser::DataStmt, parser::NamelistStmt>;
 
-using Constructs =
-    std::tuple<parser::AssociateConstruct, parser::BlockConstruct,
-               parser::CaseConstruct, parser::ChangeTeamConstruct,
-               parser::CriticalConstruct, parser::DoConstruct,
-               parser::IfConstruct, parser::SelectRankConstruct,
-               parser::SelectTypeConstruct, parser::WhereConstruct,
-               parser::ForallConstruct, parser::CompilerDirective,
-               parser::OpenMPConstruct, parser::OmpEndLoopDirective>;
-
 using ConstructStmts = std::tuple<
     parser::AssociateStmt, parser::EndAssociateStmt, parser::BlockStmt,
     parser::EndBlockStmt, parser::SelectCaseStmt, parser::CaseStmt,
@@ -115,257 +122,342 @@ using ConstructStmts = std::tuple<
     parser::MaskedElsewhereStmt, parser::ElsewhereStmt, parser::EndWhereStmt,
     parser::ForallConstructStmt, parser::EndForallStmt>;
 
+using Constructs =
+    std::tuple<parser::AssociateConstruct, parser::BlockConstruct,
+               parser::CaseConstruct, parser::ChangeTeamConstruct,
+               parser::CriticalConstruct, parser::DoConstruct,
+               parser::IfConstruct, parser::SelectRankConstruct,
+               parser::SelectTypeConstruct, parser::WhereConstruct,
+               parser::ForallConstruct>;
+
+using Directives =
+    std::tuple<parser::CompilerDirective, parser::OpenMPConstruct,
+               parser::OmpEndLoopDirective>;
+
+template <typename A>
+static constexpr bool isActionStmt{common::HasMember<A, ActionStmts>};
+
+template <typename A>
+static constexpr bool isOtherStmt{common::HasMember<A, OtherStmts>};
+
 template <typename A>
-constexpr static bool isActionStmt{common::HasMember<A, ActionStmts>};
+static constexpr bool isConstructStmt{common::HasMember<A, ConstructStmts>};
 
 template <typename A>
-constexpr static bool isConstruct{common::HasMember<A, Constructs>};
+static constexpr bool isConstruct{common::HasMember<A, Constructs>};
 
 template <typename A>
-constexpr static bool isConstructStmt{common::HasMember<A, ConstructStmts>};
+static constexpr bool isDirective{common::HasMember<A, Directives>};
 
 template <typename A>
-constexpr static bool isOtherStmt{common::HasMember<A, OtherStmts>};
+static constexpr bool isIntermediateConstructStmt{common::HasMember<
+    A, std::tuple<parser::CaseStmt, parser::ElseIfStmt, parser::ElseStmt,
+                  parser::SelectRankCaseStmt, parser::TypeGuardStmt>>};
 
 template <typename A>
-constexpr static bool isGenerated{std::is_same_v<A, CGJump>};
+static constexpr bool isNopConstructStmt{common::HasMember<
+    A, std::tuple<parser::EndAssociateStmt, parser::CaseStmt,
+                  parser::EndSelectStmt, parser::ElseIfStmt, parser::ElseStmt,
+                  parser::EndIfStmt, parser::SelectRankCaseStmt,
+                  parser::TypeGuardStmt>>};
 
 template <typename A>
-constexpr static bool isFunctionLike{common::HasMember<
+static constexpr bool isFunctionLike{common::HasMember<
     A, std::tuple<parser::MainProgram, parser::FunctionSubprogram,
                   parser::SubroutineSubprogram,
                   parser::SeparateModuleSubprogram>>};
 
-/// Function-like units can contains lists of evaluations.  These can be
-/// (simple) statements or constructs, where a construct contains its own
-/// evaluations.
-struct Evaluation {
-  using EvalTuple = common::CombineTuples<ActionStmts, OtherStmts, Constructs,
-                                          ConstructStmts>;
+using LabelSet = llvm::SmallSet<parser::Label, 5>;
+using SymbolRef = common::Reference<const semantics::Symbol>;
+using SymbolLabelMap = llvm::DenseMap<SymbolRef, LabelSet>;
 
-  /// Hide non-nullable pointers to the parse-tree node.
-  template <typename A>
-  using MakeRefType = const A *const;
-  using EvalVariant =
-      common::CombineVariants<common::MapTemplate<MakeRefType, EvalTuple>,
-                              std::variant<CGJump>>;
-  template <typename A>
-  constexpr auto visit(A visitor) const {
-    return std::visit(common::visitors{
-                          [&](const auto *p) { return visitor(*p); },
-                          [&](auto &r) { return visitor(r); },
-                      },
-                      u);
-  }
-  template <typename A>
-  constexpr const A *getIf() const {
-    if constexpr (!std::is_same_v<A, CGJump>) {
-      if (auto *ptr{std::get_if<MakeRefType<A>>(&u)}) {
-        return *ptr;
-      }
-    } else {
-      return std::get_if<CGJump>(&u);
-    }
-    return nullptr;
-  }
-  template <typename A>
-  constexpr bool isA() const {
-    if constexpr (!std::is_same_v<A, CGJump>) {
-      return std::holds_alternative<MakeRefType<A>>(u);
-    }
-    return std::holds_alternative<CGJump>(u);
-  }
+template <typename A>
+struct MakeReferenceVariantHelper {};
+template <typename... A>
+struct MakeReferenceVariantHelper<std::variant<A...>> {
+  using type = ReferenceVariant<A...>;
+};
+template <typename... A>
+struct MakeReferenceVariantHelper<std::tuple<A...>> {
+  using type = ReferenceVariant<A...>;
+};
+template <typename A>
+using MakeReferenceVariant = typename MakeReferenceVariantHelper<A>::type;
 
-  Evaluation() = delete;
-  Evaluation(const Evaluation &) = delete;
-  Evaluation(Evaluation &&) = default;
+using EvaluationTuple =
+    common::CombineTuples<ActionStmts, OtherStmts, ConstructStmts, Constructs,
+                          Directives>;
+/// Hide non-nullable pointers to the parse-tree node.
+/// Build type ReferenceVariant<A, B, ...>
+/// from EvaluationTuple type (std::tuple<A, B, ...>).
+using EvaluationVariant = MakeReferenceVariant<EvaluationTuple>;
+
+/// Function-like units contain lists of evaluations.  These can be simple
+/// statements or constructs, where a construct contains its own evaluations.
+struct Evaluation : EvaluationVariant {
 
   /// General ctor
   template <typename A>
-  Evaluation(const A &a, const ParentType &p, const parser::CharBlock &pos,
-             const std::optional<parser::Label> &lab)
-      : u{&a}, parent{p}, pos{pos}, lab{lab} {}
-
-  /// Compiler-generated jump
-  Evaluation(const CGJump &jump, const ParentType &p)
-      : u{jump}, parent{p}, cfg{CFGAnnotation::Goto} {}
+  Evaluation(const A &a, const ParentVariant &parentVariant,
+             const parser::CharBlock &position,
+             const std::optional<parser::Label> &label)
+      : EvaluationVariant{a},
+        parentVariant{parentVariant}, position{position}, label{label} {}
 
   /// Construct ctor
   template <typename A>
-  Evaluation(const A &a, const ParentType &parent) : u{&a}, parent{parent} {
-    static_assert(pft::isConstruct<A>, "must be a construct");
+  Evaluation(const A &a, const ParentVariant &parentVariant)
+      : EvaluationVariant{a}, parentVariant{parentVariant} {
+    static_assert(pft::isConstruct<A> || pft::isDirective<A>,
+                  "must be a construct or directive");
   }
 
-  constexpr bool isActionOrGenerated() const {
+  /// Evaluation classification predicates.
+  constexpr bool isActionStmt() const {
     return visit(common::visitors{
-        [](auto &r) {
-          using T = std::decay_t<decltype(r)>;
-          return isActionStmt<T> || isGenerated<T>;
-        },
-    });
+        [](auto &r) { return pft::isActionStmt<std::decay_t<decltype(r)>>; }});
   }
-
-  constexpr bool isStmt() const {
+  constexpr bool isOtherStmt() const {
     return visit(common::visitors{
-        [](auto &r) {
-          using T = std::decay_t<decltype(r)>;
-          static constexpr bool isStmt{isActionStmt<T> || isOtherStmt<T> ||
-                                       isConstructStmt<T>};
-          static_assert(!(isStmt && pft::isConstruct<T>),
-                        "statement classification is inconsistent");
-          return isStmt;
-        },
-    });
+        [](auto &r) { return pft::isOtherStmt<std::decay_t<decltype(r)>>; }});
   }
-  constexpr bool isConstruct() const { return !isStmt(); }
-
-  /// Set the type of originating control flow type for this evaluation.
-  void setCFG(CFGAnnotation a, Evaluation *cstr) {
-    cfg = a;
-    setBranches(cstr);
+  constexpr bool isConstructStmt() const {
+    return visit(common::visitors{[](auto &r) {
+      return pft::isConstructStmt<std::decay_t<decltype(r)>>;
+    }});
   }
-
-  /// Is this evaluation a control-flow origin? (The PFT must be annotated)
-  bool isControlOrigin() const { return cfg != CFGAnnotation::None; }
-
-  /// Is this evaluation a control-flow target? (The PFT must be annotated)
-  bool isControlTarget() const { return isTarget; }
-
-  /// Set the containsBranches flag iff this evaluation (a construct) contains
-  /// control flow
-  void setBranches() { containsBranches = true; }
-
-  EvaluationCollection *getConstructEvals() {
-    auto *evals{subs.get()};
-    if (isStmt() && !evals) {
-      return nullptr;
-    }
-    if (isConstruct() && evals) {
-      return evals;
-    }
-    llvm_unreachable("evaluation subs is inconsistent");
-    return nullptr;
+  constexpr bool isConstruct() const {
+    return visit(common::visitors{
+        [](auto &r) { return pft::isConstruct<std::decay_t<decltype(r)>>; }});
   }
-
-  /// Set that the construct `cstr` (if not a nullptr) has branches.
-  static void setBranches(Evaluation *cstr) {
-    if (cstr)
-      cstr->setBranches();
+  constexpr bool isDirective() const {
+    return visit(common::visitors{
+        [](auto &r) { return pft::isDirective<std::decay_t<decltype(r)>>; }});
+  }
+  /// Return the predicate:  "This is a non-initial, non-terminal construct
+  /// statement."  For an IfConstruct, this is ElseIfStmt and ElseStmt.
+  constexpr bool isIntermediateConstructStmt() const {
+    return visit(common::visitors{[](auto &r) {
+      return pft::isIntermediateConstructStmt<std::decay_t<decltype(r)>>;
+    }});
+  }
+  constexpr bool isNopConstructStmt() const {
+    return visit(common::visitors{[](auto &r) {
+      return pft::isNopConstructStmt<std::decay_t<decltype(r)>>;
+    }});
   }
 
-  EvalVariant u;
-  ParentType parent;
-  parser::CharBlock pos;
-  std::optional<parser::Label> lab;
-  std::unique_ptr<EvaluationCollection> subs; // construct sub-statements
-  CFGAnnotation cfg{CFGAnnotation::None};
-  bool isTarget{false};         // this evaluation is a control target
-  bool containsBranches{false}; // construct contains branches
+  /// Return the FunctionLikeUnit to which this evaluation belongs,
+  /// or nullptr if it does not belong to such a unit.
+  FunctionLikeUnit *getOwningProcedure() const;
+
+  bool lowerAsStructured() const;
+  bool lowerAsUnstructured() const;
+
+  // FIR generation looks primarily at PFT statement (leaf) nodes.  So members
+  // such as lexicalSuccessor and the various block fields are only applicable
+  // to statement nodes.  One exception is that an internal construct node is
+  // a convenient place for a constructExit link that applies to exits from any
+  // statement within the construct.  The controlSuccessor member is used for
+  // nonlexical successors, such as linking to a GOTO target.  For multiway
+  // branches, controlSuccessor is set to one of the targets (might as well be
+  // the first target).  Successor and exit links always target statements.
+  //
+  // An unstructured construct is one that contains some form of goto.  This
+  // is indicated by the isUnstructured member flag, which may be set on a
+  // statement and propagated to enclosing constructs.  This distinction allows
+  // a structured IF or DO statement to be materialized with custom structured
+  // FIR operations.  An unstructured statement is materialized as mlir
+  // operation sequences that include explicit branches.
+  //
+  // There are two mlir::Block members.  The block member is set for statements
+  // that begin a new block.  If a statement may have more than one associated
+  // block, this member must be the block that would be the target of a branch
+  // to the statement.  The prime example of a statement that may have multiple
+  // associated blocks is NonLabelDoStmt, which may have a loop preheader block
+  // for loop initialization code, and always has a header block that is the
+  // target of the loop back edge.  If the NonLabelDoStmt is a concurrent loop,
+  // there may be an arbitrary number of nested preheader, header, and mask
+  // blocks.  Any such additional blocks in the localBlocks member are local
+  // to a construct and cannot be the target of an unstructured branch.  For
+  // NonLabelDoStmt, the block member designates the preheader block, which may
+  // be absent if loop initialization code may be appended to a predecessor
+  // block.  The primary loop header block is localBlocks[0], with additional
+  // DO CONCURRENT blocks at localBlocks[1], etc.
+  //
+  // The printIndex member is only set for statements.  It is used for dumps
+  // and does not affect FIR generation.  It may also be helpful for debugging.
+
+  ParentVariant parentVariant;
+  parser::CharBlock position{};
+  std::optional<parser::Label> label{};
+  std::unique_ptr<EvaluationList> evaluationList; // nested evaluations
+  Evaluation *parentConstruct{nullptr};  // set for nodes below the top level
+  Evaluation *lexicalSuccessor{nullptr}; // set for ActionStmt, ConstructStmt
+  Evaluation *controlSuccessor{nullptr}; // set for some statements
+  Evaluation *constructExit{nullptr};    // set for constructs
+  bool isNewBlock{false};                // evaluation begins a new basic block
+  bool isUnstructured{false};        // evaluation has unstructured control flow
+  bool skip{false};                  // evaluation has been processed in advance
+  class mlir::Block *block{nullptr}; // isNewBlock block
+  llvm::SmallVector<mlir::Block *, 1> localBlocks{}; // construct local blocks
+  int printIndex{0}; // (ActionStmt, ConstructStmt) evaluation index for dumps
 };
 
+using ProgramVariant =
+    ReferenceVariant<parser::MainProgram, parser::FunctionSubprogram,
+                     parser::SubroutineSubprogram, parser::Module,
+                     parser::Submodule, parser::SeparateModuleSubprogram,
+                     parser::BlockData>;
 /// A program is a list of program units.
-/// These units can be function like, module like, or block data
-struct ProgramUnit {
+/// These units can be function like, module like, or block data.
+struct ProgramUnit : ProgramVariant {
   template <typename A>
-  ProgramUnit(const A &ptr, const ParentType &parent)
-      : p{&ptr}, parent{parent} {}
+  ProgramUnit(const A &p, const ParentVariant &parentVariant)
+      : ProgramVariant{p}, parentVariant{parentVariant} {}
   ProgramUnit(ProgramUnit &&) = default;
   ProgramUnit(const ProgramUnit &) = delete;
 
-  const std::variant<
-      const parser::MainProgram *, const parser::FunctionSubprogram *,
-      const parser::SubroutineSubprogram *, const parser::Module *,
-      const parser::Submodule *, const parser::SeparateModuleSubprogram *,
-      const parser::BlockData *>
-      p;
-  ParentType parent;
+  ParentVariant parentVariant;
+};
+
+/// A variable captures an object to be created per the declaration part of a
+/// function like unit.
+///
+/// Properties can be applied by lowering. For example, a local array that is
+/// known to be very large may be transformed into a heap allocated entity by
+/// lowering. That decision would be tracked in its Variable instance.
+struct Variable {
+  explicit Variable(const Fortran::semantics::Symbol &sym, bool global = false,
+                    int depth = 0)
+      : sym{&sym}, depth{depth}, global{global} {}
+
+  const Fortran::semantics::Symbol &getSymbol() const { return *sym; }
+  
+  bool isGlobal() const { return global; }
+  bool isHeapAlloc() const { return heapAlloc; }
+  bool isPointer() const { return pointer; }
+  bool isTarget() const { return target; }
+  int getDepth() const { return depth; }
+  
+  void setHeapAlloc(bool to = true) { heapAlloc = to; }
+  void setPointer(bool to = true) { pointer = to; }
+  void setTarget(bool to = true) { target = to; }
+
+private:
+  const Fortran::semantics::Symbol *sym;
+  int depth;
+  bool global;
+  bool heapAlloc{false}; // variable needs deallocation on exit
+  bool pointer{false};
+  bool target{false};
 };
 
-/// Function-like units have similar structure. They all can contain executable
-/// statements as well as other function-like units (internal procedures and
-/// function statements).
+/// Function-like units may contain evaluations (executable statements) and
+/// nested function-like units (internal procedures and function statements).
 struct FunctionLikeUnit : public ProgramUnit {
   // wrapper statements for function-like syntactic structures
   using FunctionStatement =
-      std::variant<const parser::Statement<parser::ProgramStmt> *,
-                   const parser::Statement<parser::EndProgramStmt> *,
-                   const parser::Statement<parser::FunctionStmt> *,
-                   const parser::Statement<parser::EndFunctionStmt> *,
-                   const parser::Statement<parser::SubroutineStmt> *,
-                   const parser::Statement<parser::EndSubroutineStmt> *,
-                   const parser::Statement<parser::MpSubprogramStmt> *,
-                   const parser::Statement<parser::EndMpSubprogramStmt> *>;
-
-  FunctionLikeUnit(const parser::MainProgram &f, const ParentType &parent);
-  FunctionLikeUnit(const parser::FunctionSubprogram &f,
-                   const ParentType &parent);
-  FunctionLikeUnit(const parser::SubroutineSubprogram &f,
-                   const ParentType &parent);
-  FunctionLikeUnit(const parser::SeparateModuleSubprogram &f,
-                   const ParentType &parent);
+      ReferenceVariant<parser::Statement<parser::ProgramStmt>,
+                       parser::Statement<parser::EndProgramStmt>,
+                       parser::Statement<parser::FunctionStmt>,
+                       parser::Statement<parser::EndFunctionStmt>,
+                       parser::Statement<parser::SubroutineStmt>,
+                       parser::Statement<parser::EndSubroutineStmt>,
+                       parser::Statement<parser::MpSubprogramStmt>,
+                       parser::Statement<parser::EndMpSubprogramStmt>>;
+
+  FunctionLikeUnit(
+      const parser::MainProgram &f, const ParentVariant &parentVariant,
+      const Fortran::semantics::SemanticsContext &semanticsContext);
+  FunctionLikeUnit(
+      const parser::FunctionSubprogram &f, const ParentVariant &parentVariant,
+      const Fortran::semantics::SemanticsContext &semanticsContext);
+  FunctionLikeUnit(
+      const parser::SubroutineSubprogram &f, const ParentVariant &parentVariant,
+      const Fortran::semantics::SemanticsContext &semanticsContext);
+  FunctionLikeUnit(
+      const parser::SeparateModuleSubprogram &f,
+      const ParentVariant &parentVariant,
+      const Fortran::semantics::SemanticsContext &semanticsContext);
   FunctionLikeUnit(FunctionLikeUnit &&) = default;
   FunctionLikeUnit(const FunctionLikeUnit &) = delete;
 
-  bool isMainProgram() {
-    return std::holds_alternative<
-        const parser::Statement<parser::EndProgramStmt> *>(endStmt);
+  void processSymbolTable(const Fortran::semantics::Scope &);
+
+  std::vector<Variable> getOrderedSymbolTable() { return varList[0]; }
+
+  bool isMainProgram() const {
+    return endStmt.isA<parser::Statement<parser::EndProgramStmt>>();
   }
-  const parser::FunctionStmt *getFunction() {
-    return getA<parser::FunctionStmt>();
+
+  /// Get the starting source location for this function-like unit.
+  parser::CharBlock getStartingSourceLoc() {
+    if (beginStmt)
+      return stmtSourceLoc(*beginStmt);
+    if (!evaluationList.empty())
+      return evaluationList.front().position;
+    return stmtSourceLoc(endStmt);
   }
-  const parser::SubroutineStmt *getSubroutine() {
-    return getA<parser::SubroutineStmt>();
+
+  /// Returns reference to the subprogram symbol of this FunctionLikeUnit.
+  /// Dies if the FunctionLikeUnit is not a subprogram.
+  const semantics::Symbol &getSubprogramSymbol() const {
+    assert(symbol && "not inside a procedure");
+    return *symbol;
   }
-  const parser::MpSubprogramStmt *getMPSubp() {
-    return getA<parser::MpSubprogramStmt>();
+
+  /// Helper to get location from FunctionLikeUnit begin/end statements.
+  static parser::CharBlock stmtSourceLoc(const FunctionStatement &stmt) {
+    return stmt.visit(common::visitors{[](const auto &x) { return x.source; }});
   }
 
   /// Anonymous programs do not have a begin statement
   std::optional<FunctionStatement> beginStmt;
   FunctionStatement endStmt;
-  EvaluationCollection evals;        // statements
-  std::list<FunctionLikeUnit> funcs; // internal procedures
-
-private:
-  template <typename A>
-  const A *getA() {
-    if (beginStmt) {
-      if (auto p =
-              std::get_if<const parser::Statement<A> *>(&beginStmt.value()))
-        return &(*p)->statement;
-    }
-    return nullptr;
-  }
+  EvaluationList evaluationList;
+  LabelEvalMap labelEvaluationMap;
+  SymbolLabelMap assignSymbolLabelMap;
+  std::list<FunctionLikeUnit> nestedFunctions;
+  /// Symbol associated with this FunctionLikeUnit.
+  /// Null if the FunctionLikeUnit is an anonymous program.
+  /// The symbol has MainProgramDetails for named programs, otherwise it has
+  /// SubprogramDetails.
+  const semantics::Symbol *symbol{nullptr};
+  /// Terminal basic block (if any)
+  mlir::Block *finalBlock{};
+  std::vector<std::vector<Variable>> varList;
 };
 
-/// Module-like units have similar structure. They all can contain a list of
-/// function-like units.
+/// Module-like units contain a list of function-like units.
 struct ModuleLikeUnit : public ProgramUnit {
   // wrapper statements for module-like syntactic structures
   using ModuleStatement =
-      std::variant<const parser::Statement<parser::ModuleStmt> *,
-                   const parser::Statement<parser::EndModuleStmt> *,
-                   const parser::Statement<parser::SubmoduleStmt> *,
-                   const parser::Statement<parser::EndSubmoduleStmt> *>;
-
-  ModuleLikeUnit(const parser::Module &m, const ParentType &parent);
-  ModuleLikeUnit(const parser::Submodule &m, const ParentType &parent);
+      ReferenceVariant<parser::Statement<parser::ModuleStmt>,
+                       parser::Statement<parser::EndModuleStmt>,
+                       parser::Statement<parser::SubmoduleStmt>,
+                       parser::Statement<parser::EndSubmoduleStmt>>;
+
+  ModuleLikeUnit(const parser::Module &m, const ParentVariant &parentVariant);
+  ModuleLikeUnit(const parser::Submodule &m,
+                 const ParentVariant &parentVariant);
   ~ModuleLikeUnit() = default;
   ModuleLikeUnit(ModuleLikeUnit &&) = default;
   ModuleLikeUnit(const ModuleLikeUnit &) = delete;
 
   ModuleStatement beginStmt;
   ModuleStatement endStmt;
-  std::list<FunctionLikeUnit> funcs;
+  std::list<FunctionLikeUnit> nestedFunctions;
 };
 
 struct BlockDataUnit : public ProgramUnit {
-  BlockDataUnit(const parser::BlockData &bd, const ParentType &parent);
+  BlockDataUnit(const parser::BlockData &bd,
+                const ParentVariant &parentVariant);
   BlockDataUnit(BlockDataUnit &&) = default;
   BlockDataUnit(const BlockDataUnit &) = delete;
 };
 
-/// A Program is the top-level PFT
+/// A Program is the top-level root of the PFT.
 struct Program {
   using Units = std::variant<FunctionLikeUnit, ModuleLikeUnit, BlockDataUnit>;
 
@@ -375,23 +467,31 @@ struct Program {
 
   std::list<Units> &getUnits() { return units; }
 
+  /// LLVM dump method on a Program.
+  void dump();
+
 private:
   std::list<Units> units;
 };
 
 } // namespace pft
 
-/// Create an PFT from the parse tree
-std::unique_ptr<pft::Program> createPFT(const parser::Program &root);
-
-/// Decorate the PFT with control flow annotations
+/// Create a PFT (Pre-FIR Tree) from the parse tree.
 ///
-/// The PFT must be decorated with control-flow annotations to prepare it for
-/// use in generating a CFG-like structure.
-void annotateControl(pft::Program &);
-
-void dumpPFT(llvm::raw_ostream &o, pft::Program &);
-
-} // namespace Fortran::lower
-
-#endif // FORTRAN_LOWER_PFT_BUILDER_H_
+/// A PFT is a lightweight tree over the parse tree that is used to create FIR.
+/// The PFT captures pointers back into the parse tree, so the parse tree must
+/// not be changed between the construction of the PFT and its last use.  The
+/// PFT captures a structured view of a program.  A program is a list of units.
+/// A function-like unit contains a list of evaluations.  An evaluation is
+/// either a statement, or a construct with a nested list of evaluations.
+std::unique_ptr<pft::Program>
+createPFT(const parser::Program &root,
+          const Fortran::semantics::SemanticsContext &semanticsContext);
+
+/// Dumper for displaying a PFT.
+void dumpPFT(llvm::raw_ostream &outputStream, pft::Program &pft);
+
+} // namespace lower
+} // namespace Fortran
+
+#endif // FORTRAN_LOWER_PFTBUILDER_H
diff --git a/flang/include/flang/Lower/Utils.h b/flang/include/flang/Lower/Utils.h
new file mode 100644
index 0000000000000..d7c7b565dbc6a
--- /dev/null
+++ b/flang/include/flang/Lower/Utils.h
@@ -0,0 +1,31 @@
+//===-- Lower/Utils.h -- utilities ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_LOWER_UTILS_H
+#define FORTRAN_LOWER_UTILS_H
+
+#include "flang/Common/indirection.h"
+#include "flang/Parser/char-block.h"
+#include "llvm/ADT/StringRef.h"
+
+/// Convert an F18 CharBlock to an LLVM StringRef
+inline llvm::StringRef toStringRef(const Fortran::parser::CharBlock &cb) {
+  return {cb.begin(), cb.size()};
+}
+
+/// Template helper to remove Fortran::common::Indirection wrappers.
+template <typename A>
+const A &removeIndirection(const A &a) {
+  return a;
+}
+template <typename A>
+const A &removeIndirection(const Fortran::common::Indirection<A> &a) {
+  return a.value();
+}
+
+#endif // FORTRAN_LOWER_UTILS_H
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 34e4ea95eb4af..6ffa84ca184c5 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -13,6 +13,7 @@
 #include "flang/Common/Fortran.h"
 #include "flang/Common/enum-set.h"
 #include "flang/Common/reference.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include <array>
 #include <list>
 #include <optional>
@@ -760,4 +761,30 @@ inline bool operator<(MutableSymbolRef x, MutableSymbolRef y) {
 using SymbolSet = std::set<SymbolRef>;
 
 } // namespace Fortran::semantics
+
+// Define required info so that SymbolRef can be used inside llvm::DenseMap.
+namespace llvm {
+template <> struct DenseMapInfo<Fortran::semantics::SymbolRef> {
+  static inline Fortran::semantics::SymbolRef getEmptyKey() {
+    auto ptr = DenseMapInfo<const Fortran::semantics::Symbol *>::getEmptyKey();
+    return *reinterpret_cast<Fortran::semantics::SymbolRef *>(&ptr);
+  }
+
+  static inline Fortran::semantics::SymbolRef getTombstoneKey() {
+    auto ptr =
+        DenseMapInfo<const Fortran::semantics::Symbol *>::getTombstoneKey();
+    return *reinterpret_cast<Fortran::semantics::SymbolRef *>(&ptr);
+  }
+
+  static unsigned getHashValue(const Fortran::semantics::SymbolRef &sym) {
+    return DenseMapInfo<const Fortran::semantics::Symbol *>::getHashValue(
+        &sym.get());
+  }
+
+  static bool isEqual(const Fortran::semantics::SymbolRef &LHS,
+      const Fortran::semantics::SymbolRef &RHS) {
+    return LHS == RHS;
+  }
+};
+} // namespace llvm
 #endif // FORTRAN_SEMANTICS_SYMBOL_H_
diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp
index 5941b570b2164..e23370ec9512d 100644
--- a/flang/lib/Lower/PFTBuilder.cpp
+++ b/flang/lib/Lower/PFTBuilder.cpp
@@ -1,4 +1,4 @@
-//===-- lib/Lower/PFTBuilder.cc -------------------------------------------===//
+//===-- PFTBuilder.cpp ----------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,36 +7,31 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Lower/PFTBuilder.h"
+#include "flang/Lower/Utils.h"
 #include "flang/Parser/dump-parse-tree.h"
 #include "flang/Parser/parse-tree-visitor.h"
-#include "llvm/ADT/DenseMap.h"
-#include <algorithm>
-#include <cassert>
-#include <utility>
+#include "flang/Semantics/semantics.h"
+#include "flang/Semantics/tools.h"
+#include "llvm/Support/CommandLine.h"
 
-namespace Fortran::lower {
-namespace {
+static llvm::cl::opt<bool> clDisableStructuredFir(
+    "no-structured-fir", llvm::cl::desc("disable generation of structured FIR"),
+    llvm::cl::init(false), llvm::cl::Hidden);
+
+using namespace Fortran;
 
-/// Helpers to unveil parser node inside parser::Statement<>,
-/// parser::UnlabeledStatement, and common::Indirection<>
+namespace {
+/// Helpers to unveil parser node inside Fortran::parser::Statement<>,
+/// Fortran::parser::UnlabeledStatement, and Fortran::common::Indirection<>
 template <typename A>
 struct RemoveIndirectionHelper {
   using Type = A;
-  static constexpr const Type &unwrap(const A &a) { return a; }
 };
 template <typename A>
 struct RemoveIndirectionHelper<common::Indirection<A>> {
   using Type = A;
-  static constexpr const Type &unwrap(const common::Indirection<A> &a) {
-    return a.value();
-  }
 };
 
-template <typename A>
-const auto &removeIndirection(const A &a) {
-  return RemoveIndirectionHelper<A>::unwrap(a);
-}
-
 template <typename A>
 struct UnwrapStmt {
   static constexpr bool isStmt{false};
@@ -46,64 +41,70 @@ struct UnwrapStmt<parser::Statement<A>> {
   static constexpr bool isStmt{true};
   using Type = typename RemoveIndirectionHelper<A>::Type;
   constexpr UnwrapStmt(const parser::Statement<A> &a)
-      : unwrapped{removeIndirection(a.statement)}, pos{a.source}, lab{a.label} {
-  }
+      : unwrapped{removeIndirection(a.statement)}, position{a.source},
+        label{a.label} {}
   const Type &unwrapped;
-  parser::CharBlock pos;
-  std::optional<parser::Label> lab;
+  parser::CharBlock position;
+  std::optional<parser::Label> label;
 };
 template <typename A>
 struct UnwrapStmt<parser::UnlabeledStatement<A>> {
   static constexpr bool isStmt{true};
   using Type = typename RemoveIndirectionHelper<A>::Type;
   constexpr UnwrapStmt(const parser::UnlabeledStatement<A> &a)
-      : unwrapped{removeIndirection(a.statement)}, pos{a.source} {}
+      : unwrapped{removeIndirection(a.statement)}, position{a.source} {}
   const Type &unwrapped;
-  parser::CharBlock pos;
-  std::optional<parser::Label> lab;
+  parser::CharBlock position;
+  std::optional<parser::Label> label;
 };
 
 /// The instantiation of a parse tree visitor (Pre and Post) is extremely
-/// expensive in terms of compile and link time, so one goal here is to limit
-/// the bridge to one such instantiation.
+/// expensive in terms of compile and link time.  So one goal here is to
+/// limit the bridge to one such instantiation.
 class PFTBuilder {
 public:
-  PFTBuilder() : pgm{new pft::Program}, parents{*pgm.get()} {}
+  PFTBuilder(const semantics::SemanticsContext &semanticsContext)
+      : pgm{std::make_unique<lower::pft::Program>()},
+        parentVariantStack{*pgm.get()}, semanticsContext{semanticsContext} {}
 
   /// Get the result
-  std::unique_ptr<pft::Program> result() { return std::move(pgm); }
+  std::unique_ptr<lower::pft::Program> result() { return std::move(pgm); }
 
   template <typename A>
   constexpr bool Pre(const A &a) {
-    bool visit{true};
-    if constexpr (pft::isFunctionLike<A>) {
-      return enterFunc(a);
-    } else if constexpr (pft::isConstruct<A>) {
-      return enterConstruct(a);
+    if constexpr (lower::pft::isFunctionLike<A>) {
+      return enterFunction(a, semanticsContext);
+    } else if constexpr (lower::pft::isConstruct<A> ||
+                         lower::pft::isDirective<A>) {
+      return enterConstructOrDirective(a);
     } else if constexpr (UnwrapStmt<A>::isStmt) {
       using T = typename UnwrapStmt<A>::Type;
       // Node "a" being visited has one of the following types:
       // Statement<T>, Statement<Indirection<T>, UnlabeledStatement<T>,
       // or UnlabeledStatement<Indirection<T>>
       auto stmt{UnwrapStmt<A>(a)};
-      if constexpr (pft::isConstructStmt<T> || pft::isOtherStmt<T>) {
-        addEval(pft::Evaluation{stmt.unwrapped, parents.back(), stmt.pos,
-                                stmt.lab});
-        visit = false;
+      if constexpr (lower::pft::isConstructStmt<T> ||
+                    lower::pft::isOtherStmt<T>) {
+        addEvaluation(lower::pft::Evaluation{stmt.unwrapped,
+                                             parentVariantStack.back(),
+                                             stmt.position, stmt.label});
+        return false;
       } else if constexpr (std::is_same_v<T, parser::ActionStmt>) {
-        addEval(makeEvalAction(stmt.unwrapped, stmt.pos, stmt.lab));
-        visit = false;
+        addEvaluation(
+            makeEvaluationAction(stmt.unwrapped, stmt.position, stmt.label));
+        return true;
       }
     }
-    return visit;
+    return true;
   }
 
   template <typename A>
   constexpr void Post(const A &) {
-    if constexpr (pft::isFunctionLike<A>) {
-      exitFunc();
-    } else if constexpr (pft::isConstruct<A>) {
-      exitConstruct();
+    if constexpr (lower::pft::isFunctionLike<A>) {
+      exitFunction();
+    } else if constexpr (lower::pft::isConstruct<A> ||
+                         lower::pft::isDirective<A>) {
+      exitConstructOrDirective();
     }
   }
 
@@ -116,25 +117,26 @@ class PFTBuilder {
 
   // Block data
   bool Pre(const parser::BlockData &node) {
-    addUnit(pft::BlockDataUnit{node, parents.back()});
+    addUnit(lower::pft::BlockDataUnit{node, parentVariantStack.back()});
     return false;
   }
 
   // Get rid of production wrapper
   bool Pre(const parser::UnlabeledStatement<parser::ForallAssignmentStmt>
                &statement) {
-    addEval(std::visit(
+    addEvaluation(std::visit(
         [&](const auto &x) {
-          return pft::Evaluation{x, parents.back(), statement.source, {}};
+          return lower::pft::Evaluation{
+              x, parentVariantStack.back(), statement.source, {}};
         },
         statement.statement.u));
     return false;
   }
   bool Pre(const parser::Statement<parser::ForallAssignmentStmt> &statement) {
-    addEval(std::visit(
+    addEvaluation(std::visit(
         [&](const auto &x) {
-          return pft::Evaluation{x, parents.back(), statement.source,
-                                 statement.label};
+          return lower::pft::Evaluation{x, parentVariantStack.back(),
+                                        statement.source, statement.label};
         },
         statement.statement.u));
     return false;
@@ -145,8 +147,9 @@ class PFTBuilder {
             [&](const parser::Statement<parser::AssignmentStmt> &stmt) {
               // Not caught as other AssignmentStmt because it is not
               // wrapped in a parser::ActionStmt.
-              addEval(pft::Evaluation{stmt.statement, parents.back(),
-                                      stmt.source, stmt.label});
+              addEvaluation(lower::pft::Evaluation{stmt.statement,
+                                                   parentVariantStack.back(),
+                                                   stmt.source, stmt.label});
               return false;
             },
             [&](const auto &) { return true; },
@@ -155,79 +158,80 @@ class PFTBuilder {
   }
 
 private:
-  // ActionStmt has a couple of non-conforming cases, which get handled
-  // explicitly here.  The other cases use an Indirection, which we discard in
-  // the PFT.
-  pft::Evaluation makeEvalAction(const parser::ActionStmt &statement,
-                                 parser::CharBlock pos,
-                                 std::optional<parser::Label> lab) {
-    return std::visit(
-        common::visitors{
-            [&](const auto &x) {
-              return pft::Evaluation{removeIndirection(x), parents.back(), pos,
-                                     lab};
-            },
-        },
-        statement.u);
-  }
-
-  // When we enter a function-like structure, we want to build a new unit and
-  // set the builder's cursors to point to it.
+  /// Initialize a new module-like unit and make it the builder's focus.
   template <typename A>
-  bool enterFunc(const A &func) {
-    auto &unit = addFunc(pft::FunctionLikeUnit{func, parents.back()});
-    funclist = &unit.funcs;
-    pushEval(&unit.evals);
-    parents.emplace_back(unit);
+  bool enterModule(const A &func) {
+    auto &unit =
+        addUnit(lower::pft::ModuleLikeUnit{func, parentVariantStack.back()});
+    functionList = &unit.nestedFunctions;
+    parentVariantStack.emplace_back(unit);
     return true;
   }
-  /// Make funclist to point to current parent function list if it exists.
-  void setFunctListToParentFuncs() {
-    if (!parents.empty()) {
-      std::visit(common::visitors{
-                     [&](pft::FunctionLikeUnit *p) { funclist = &p->funcs; },
-                     [&](pft::ModuleLikeUnit *p) { funclist = &p->funcs; },
-                     [&](auto *) { funclist = nullptr; },
-                 },
-                 parents.back().p);
-    }
-  }
 
-  void exitFunc() {
-    popEval();
-    parents.pop_back();
-    setFunctListToParentFuncs();
+  void exitModule() {
+    parentVariantStack.pop_back();
+    resetFunctionList();
   }
 
-  // When we enter a construct structure, we want to build a new construct and
-  // set the builder's evaluation cursor to point to it.
+  /// Initialize a new function-like unit and make it the builder's focus.
   template <typename A>
-  bool enterConstruct(const A &construct) {
-    auto &con = addEval(pft::Evaluation{construct, parents.back()});
-    con.subs.reset(new pft::EvaluationCollection);
-    pushEval(con.subs.get());
-    parents.emplace_back(con);
+  bool enterFunction(const A &func,
+                     const semantics::SemanticsContext &semanticsContext) {
+    auto &unit = addFunction(lower::pft::FunctionLikeUnit{
+        func, parentVariantStack.back(), semanticsContext});
+    labelEvaluationMap = &unit.labelEvaluationMap;
+    assignSymbolLabelMap = &unit.assignSymbolLabelMap;
+    functionList = &unit.nestedFunctions;
+    pushEvaluationList(&unit.evaluationList);
+    parentVariantStack.emplace_back(unit);
     return true;
   }
 
-  void exitConstruct() {
-    popEval();
-    parents.pop_back();
+  void exitFunction() {
+    // Guarantee that there is a branch target after the last user statement.
+    static const parser::ContinueStmt endTarget{};
+    addEvaluation(
+        lower::pft::Evaluation{endTarget, parentVariantStack.back(), {}, {}});
+    lastLexicalEvaluation = nullptr;
+    analyzeBranches(nullptr, *evaluationListStack.back()); // add branch links
+    popEvaluationList();
+    labelEvaluationMap = nullptr;
+    assignSymbolLabelMap = nullptr;
+    parentVariantStack.pop_back();
+    resetFunctionList();
   }
 
-  // When we enter a module structure, we want to build a new module and
-  // set the builder's function cursor to point to it.
+  /// Initialize a new construct or directive and make it the builder's focus.
   template <typename A>
-  bool enterModule(const A &func) {
-    auto &unit = addUnit(pft::ModuleLikeUnit{func, parents.back()});
-    funclist = &unit.funcs;
-    parents.emplace_back(unit);
+  bool enterConstructOrDirective(const A &construct) {
+    auto &eval = addEvaluation(
+        lower::pft::Evaluation{construct, parentVariantStack.back()});
+    eval.evaluationList.reset(new lower::pft::EvaluationList);
+    pushEvaluationList(eval.evaluationList.get());
+    parentVariantStack.emplace_back(eval);
+    constructAndDirectiveStack.emplace_back(&eval);
     return true;
   }
 
-  void exitModule() {
-    parents.pop_back();
-    setFunctListToParentFuncs();
+  void exitConstructOrDirective() {
+    popEvaluationList();
+    parentVariantStack.pop_back();
+    constructAndDirectiveStack.pop_back();
+  }
+
+  /// Reset functionList to the enclosing function's or module's function list.
+  void resetFunctionList() {
+    if (!parentVariantStack.empty()) {
+      parentVariantStack.back().visit(common::visitors{
+          [&](lower::pft::FunctionLikeUnit &p) {
+            functionList = &p.nestedFunctions;
+          },
+          [&](lower::pft::ModuleLikeUnit &p) {
+            functionList = &p.nestedFunctions;
+          },
+          [&](auto &) { functionList = nullptr; },
+      });
+    }
   }
 
   template <typename A>
@@ -237,330 +241,608 @@ class PFTBuilder {
   }
 
   template <typename A>
-  A &addFunc(A &&func) {
-    if (funclist) {
-      funclist->emplace_back(std::move(func));
-      return funclist->back();
+  A &addFunction(A &&func) {
+    if (functionList) {
+      functionList->emplace_back(std::move(func));
+      return functionList->back();
     }
     return addUnit(std::move(func));
   }
 
-  /// move the Evaluation to the end of the current list
-  pft::Evaluation &addEval(pft::Evaluation &&eval) {
-    assert(funclist && "not in a function");
-    assert(evallist.size() > 0);
-    evallist.back()->emplace_back(std::move(eval));
-    return evallist.back()->back();
+  // ActionStmt has a couple of non-conforming cases, explicitly handled here.
+  // The other cases use an Indirection, which is discarded in the PFT.
+  lower::pft::Evaluation
+  makeEvaluationAction(const parser::ActionStmt &statement,
+                       parser::CharBlock position,
+                       std::optional<parser::Label> label) {
+    return std::visit(
+        common::visitors{
+            [&](const auto &x) {
+              return lower::pft::Evaluation{removeIndirection(x),
+                                            parentVariantStack.back(), position,
+                                            label};
+            },
+        },
+        statement.u);
+  }
+
+  /// Append an Evaluation to the end of the current list.
+  lower::pft::Evaluation &addEvaluation(lower::pft::Evaluation &&eval) {
+    assert(functionList && "not in a function");
+    assert(evaluationListStack.size() > 0);
+    if (constructAndDirectiveStack.size() > 0) {
+      eval.parentConstruct = constructAndDirectiveStack.back();
+    }
+    evaluationListStack.back()->emplace_back(std::move(eval));
+    lower::pft::Evaluation *p = &evaluationListStack.back()->back();
+    if (p->isActionStmt() || p->isConstructStmt()) {
+      if (lastLexicalEvaluation) {
+        lastLexicalEvaluation->lexicalSuccessor = p;
+        p->printIndex = lastLexicalEvaluation->printIndex + 1;
+      } else {
+        p->printIndex = 1;
+      }
+      lastLexicalEvaluation = p;
+    }
+    if (p->label.has_value()) {
+      labelEvaluationMap->try_emplace(*p->label, p);
+    }
+    return evaluationListStack.back()->back();
   }
 
   /// push a new list on the stack of Evaluation lists
-  void pushEval(pft::EvaluationCollection *eval) {
-    assert(funclist && "not in a function");
+  void pushEvaluationList(lower::pft::EvaluationList *eval) {
+    assert(functionList && "not in a function");
     assert(eval && eval->empty() && "evaluation list isn't correct");
-    evallist.emplace_back(eval);
+    evaluationListStack.emplace_back(eval);
   }
 
   /// pop the current list and return to the last Evaluation list
-  void popEval() {
-    assert(funclist && "not in a function");
-    evallist.pop_back();
-  }
-
-  std::unique_ptr<pft::Program> pgm;
-  /// funclist points to FunctionLikeUnit::funcs list (resp.
-  /// ModuleLikeUnit::funcs) when building a FunctionLikeUnit (resp.
-  /// ModuleLikeUnit) to store internal procedures (resp. module procedures).
-  /// Otherwise (e.g. when building the top level Program), it is null.
-  std::list<pft::FunctionLikeUnit> *funclist{nullptr};
-  /// evallist is a stack of pointer to FunctionLikeUnit::evals (or
-  /// Evaluation::subs) that are being build.
-  std::vector<pft::EvaluationCollection *> evallist;
-  std::vector<pft::ParentType> parents;
-};
+  void popEvaluationList() {
+    assert(functionList && "not in a function");
+    evaluationListStack.pop_back();
+  }
+
+  /// Mark I/O statement ERR, EOR, and END specifier branch targets.
+  template <typename A>
+  void analyzeIoBranches(lower::pft::Evaluation &eval, const A &stmt) {
+    auto processIfLabel{[&](const auto &specs) {
+      using LabelNodes =
+          std::tuple<parser::ErrLabel, parser::EorLabel, parser::EndLabel>;
+      for (const auto &spec : specs) {
+        const auto *label = std::visit(
+            [](const auto &label) -> const parser::Label * {
+              using B = std::decay_t<decltype(label)>;
+              if constexpr (common::HasMember<B, LabelNodes>) {
+                return &label.v;
+              }
+              return nullptr;
+            },
+            spec.u);
+
+        if (label)
+          markBranchTarget(eval, *label);
+      }
+    }};
+
+    using OtherIOStmts =
+        std::tuple<parser::BackspaceStmt, parser::CloseStmt,
+                   parser::EndfileStmt, parser::FlushStmt, parser::OpenStmt,
+                   parser::RewindStmt, parser::WaitStmt>;
 
-template <typename Label, typename A>
-constexpr bool hasLabel(const A &stmt) {
-  auto isLabel{
-      [](const auto &v) { return std::holds_alternative<Label>(v.u); }};
-  if constexpr (std::is_same_v<A, parser::ReadStmt> ||
-                std::is_same_v<A, parser::WriteStmt>) {
-    return std::any_of(std::begin(stmt.controls), std::end(stmt.controls),
-                       isLabel);
-  }
-  if constexpr (std::is_same_v<A, parser::WaitStmt>) {
-    return std::any_of(std::begin(stmt.v), std::end(stmt.v), isLabel);
-  }
-  if constexpr (std::is_same_v<Label, parser::ErrLabel>) {
-    if constexpr (common::HasMember<
-                      A, std::tuple<parser::OpenStmt, parser::CloseStmt,
-                                    parser::BackspaceStmt, parser::EndfileStmt,
-                                    parser::RewindStmt, parser::FlushStmt>>)
-      return std::any_of(std::begin(stmt.v), std::end(stmt.v), isLabel);
-    if constexpr (std::is_same_v<A, parser::InquireStmt>) {
-      const auto &specifiers{std::get<std::list<parser::InquireSpec>>(stmt.u)};
-      return std::any_of(std::begin(specifiers), std::end(specifiers), isLabel);
+    if constexpr (std::is_same_v<A, parser::ReadStmt> ||
+                  std::is_same_v<A, parser::WriteStmt>) {
+      processIfLabel(stmt.controls);
+    } else if constexpr (std::is_same_v<A, parser::InquireStmt>) {
+      processIfLabel(std::get<std::list<parser::InquireSpec>>(stmt.u));
+    } else if constexpr (common::HasMember<A, OtherIOStmts>) {
+      processIfLabel(stmt.v);
+    } else {
+      // NOTE(review): always true here (ReadStmt handled above), so never fires
+      static_assert(!std::is_same_v<A, parser::ReadStmt>,
+                    "Unexpected IO statement");
     }
   }
-  return false;
-}
 
-bool hasAltReturns(const parser::CallStmt &callStmt) {
-  const auto &args{std::get<std::list<parser::ActualArgSpec>>(callStmt.v.t)};
-  for (const auto &arg : args) {
-    const auto &actual{std::get<parser::ActualArg>(arg.t)};
-    if (std::holds_alternative<parser::AltReturnSpec>(actual.u))
-      return true;
+  /// Set the exit of a construct, possibly from multiple enclosing constructs.
+  void setConstructExit(lower::pft::Evaluation &eval) {
+    eval.constructExit = eval.evaluationList->back().lexicalSuccessor;
+    if (eval.constructExit && eval.constructExit->isNopConstructStmt()) {
+      eval.constructExit = eval.constructExit->parentConstruct->constructExit;
+    }
+    assert(eval.constructExit && "missing construct exit");
   }
-  return false;
-}
 
-/// Determine if `callStmt` has alternate returns and if so set `e` to be the
-/// origin of a switch-like control flow
-///
-/// \param cstr points to the current construct. It may be null at the top-level
-/// of a FunctionLikeUnit.
-void altRet(pft::Evaluation &evaluation, const parser::CallStmt &callStmt,
-            pft::Evaluation *cstr) {
-  if (hasAltReturns(callStmt))
-    evaluation.setCFG(pft::CFGAnnotation::Switch, cstr);
-}
+  void markBranchTarget(lower::pft::Evaluation &sourceEvaluation,
+                        lower::pft::Evaluation &targetEvaluation) {
+    sourceEvaluation.isUnstructured = true;
+    if (!sourceEvaluation.controlSuccessor) {
+      sourceEvaluation.controlSuccessor = &targetEvaluation;
+    }
+    targetEvaluation.isNewBlock = true;
+  }
+  void markBranchTarget(lower::pft::Evaluation &sourceEvaluation,
+                        parser::Label label) {
+    assert(label && "missing branch target label");
+    lower::pft::Evaluation *targetEvaluation{
+        labelEvaluationMap->find(label)->second};
+    assert(targetEvaluation && "missing branch target evaluation");
+    markBranchTarget(sourceEvaluation, *targetEvaluation);
+  }
 
-/// \param cstr points to the current construct. It may be null at the top-level
-/// of a FunctionLikeUnit.
-void annotateEvalListCFG(pft::EvaluationCollection &evaluationCollection,
-                         pft::Evaluation *cstr) {
-  bool nextIsTarget = false;
-  for (auto &eval : evaluationCollection) {
-    eval.isTarget = nextIsTarget;
-    nextIsTarget = false;
-    if (auto *subs{eval.getConstructEvals()}) {
-      annotateEvalListCFG(*subs, &eval);
-      // assume that the entry and exit are both possible branch targets
-      nextIsTarget = true;
+  /// Return the first non-nop successor of an evaluation, possibly exiting
+  /// from one or more enclosing constructs.
+  lower::pft::Evaluation *exitSuccessor(lower::pft::Evaluation &eval) {
+    lower::pft::Evaluation *successor{eval.lexicalSuccessor};
+    if (successor && successor->isNopConstructStmt()) {
+      successor = successor->parentConstruct->constructExit;
     }
+    assert(successor && "missing exit successor");
+    return successor;
+  }
 
-    if (eval.isActionOrGenerated() && eval.lab.has_value())
-      eval.isTarget = true;
-    eval.visit(common::visitors{
-        [&](const parser::CallStmt &statement) {
-          altRet(eval, statement, cstr);
-        },
-        [&](const parser::CycleStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Goto, cstr);
-        },
-        [&](const parser::ExitStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Goto, cstr);
-        },
-        [&](const parser::FailImageStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Terminate, cstr);
-        },
-        [&](const parser::GotoStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Goto, cstr);
-        },
-        [&](const parser::IfStmt &) {
-          eval.setCFG(pft::CFGAnnotation::CondGoto, cstr);
-        },
-        [&](const parser::ReturnStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Return, cstr);
-        },
-        [&](const parser::StopStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Terminate, cstr);
-        },
-        [&](const parser::ArithmeticIfStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Switch, cstr);
-        },
-        [&](const parser::AssignedGotoStmt &) {
-          eval.setCFG(pft::CFGAnnotation::IndGoto, cstr);
-        },
-        [&](const parser::ComputedGotoStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Switch, cstr);
-        },
-        [&](const parser::WhereStmt &) {
-          // fir.loop + fir.where around the next stmt
-          eval.isTarget = true;
-          eval.setCFG(pft::CFGAnnotation::Iterative, cstr);
-        },
-        [&](const parser::ForallStmt &) {
-          // fir.loop around the next stmt
-          eval.isTarget = true;
-          eval.setCFG(pft::CFGAnnotation::Iterative, cstr);
-        },
-        [&](pft::CGJump &) { eval.setCFG(pft::CFGAnnotation::Goto, cstr); },
-        [&](const parser::SelectCaseStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Switch, cstr);
-        },
-        [&](const parser::NonLabelDoStmt &) {
-          eval.isTarget = true;
-          eval.setCFG(pft::CFGAnnotation::Iterative, cstr);
-        },
-        [&](const parser::EndDoStmt &) {
-          eval.isTarget = true;
-          eval.setCFG(pft::CFGAnnotation::Goto, cstr);
-        },
-        [&](const parser::IfThenStmt &) {
-          eval.setCFG(pft::CFGAnnotation::CondGoto, cstr);
-        },
-        [&](const parser::ElseIfStmt &) {
-          eval.setCFG(pft::CFGAnnotation::CondGoto, cstr);
-        },
-        [&](const parser::SelectRankStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Switch, cstr);
-        },
-        [&](const parser::SelectTypeStmt &) {
-          eval.setCFG(pft::CFGAnnotation::Switch, cstr);
-        },
-        [&](const parser::WhereConstruct &) {
-          // mark the WHERE as if it were a DO loop
-          eval.isTarget = true;
-          eval.setCFG(pft::CFGAnnotation::Iterative, cstr);
-        },
-        [&](const parser::WhereConstructStmt &) {
-          eval.setCFG(pft::CFGAnnotation::CondGoto, cstr);
-        },
-        [&](const parser::MaskedElsewhereStmt &) {
-          eval.isTarget = true;
-          eval.setCFG(pft::CFGAnnotation::CondGoto, cstr);
-        },
-        [&](const parser::ForallConstructStmt &) {
-          eval.isTarget = true;
-          eval.setCFG(pft::CFGAnnotation::Iterative, cstr);
-        },
+  /// Mark the exit successor of an Evaluation as a new block.
+  void markSuccessorAsNewBlock(lower::pft::Evaluation &eval) {
+    exitSuccessor(eval)->isNewBlock = true;
+  }
 
-        [&](const auto &stmt) {
-          // Handle statements with similar impact on control flow
-          using IoStmts = std::tuple<parser::BackspaceStmt, parser::CloseStmt,
-                                     parser::EndfileStmt, parser::FlushStmt,
-                                     parser::InquireStmt, parser::OpenStmt,
-                                     parser::ReadStmt, parser::RewindStmt,
-                                     parser::WaitStmt, parser::WriteStmt>;
-
-          using TargetStmts =
-              std::tuple<parser::EndAssociateStmt, parser::EndBlockStmt,
-                         parser::CaseStmt, parser::EndSelectStmt,
-                         parser::EndChangeTeamStmt, parser::EndCriticalStmt,
-                         parser::ElseStmt, parser::EndIfStmt,
-                         parser::SelectRankCaseStmt, parser::TypeGuardStmt,
-                         parser::ElsewhereStmt, parser::EndWhereStmt,
-                         parser::EndForallStmt>;
-
-          using DoNothingConstructStmts =
-              std::tuple<parser::BlockStmt, parser::AssociateStmt,
-                         parser::CriticalStmt, parser::ChangeTeamStmt>;
-
-          using A = std::decay_t<decltype(stmt)>;
-          if constexpr (common::HasMember<A, IoStmts>) {
-            if (hasLabel<parser::ErrLabel>(stmt) ||
-                hasLabel<parser::EorLabel>(stmt) ||
-                hasLabel<parser::EndLabel>(stmt))
-              eval.setCFG(pft::CFGAnnotation::IoSwitch, cstr);
-          } else if constexpr (common::HasMember<A, TargetStmts>) {
-            eval.isTarget = true;
-          } else if constexpr (common::HasMember<A, DoNothingConstructStmts>) {
-            // Explicitly do nothing for these construct statements
-          } else {
-            static_assert(!pft::isConstructStmt<A>,
-                          "All ConstructStmts impact on the control flow "
-                          "should be explicitly handled");
-          }
-          /* else do nothing */
-        },
-    });
+  template <typename A>
+  inline std::string getConstructName(const A &stmt) {
+    using MaybeConstructNameWrapper =
+        std::tuple<parser::BlockStmt, parser::CycleStmt, parser::ElseStmt,
+                   parser::ElsewhereStmt, parser::EndAssociateStmt,
+                   parser::EndBlockStmt, parser::EndCriticalStmt,
+                   parser::EndDoStmt, parser::EndForallStmt, parser::EndIfStmt,
+                   parser::EndSelectStmt, parser::EndWhereStmt,
+                   parser::ExitStmt>;
+    if constexpr (common::HasMember<A, MaybeConstructNameWrapper>) {
+      if (stmt.v)
+        return stmt.v->ToString();
+    }
+
+    using MaybeConstructNameInTuple = std::tuple<
+        parser::AssociateStmt, parser::CaseStmt, parser::ChangeTeamStmt,
+        parser::CriticalStmt, parser::ElseIfStmt, parser::EndChangeTeamStmt,
+        parser::ForallConstructStmt, parser::IfThenStmt, parser::LabelDoStmt,
+        parser::MaskedElsewhereStmt, parser::NonLabelDoStmt,
+        parser::SelectCaseStmt, parser::SelectRankCaseStmt,
+        parser::TypeGuardStmt, parser::WhereConstructStmt>;
+
+    if constexpr (common::HasMember<A, MaybeConstructNameInTuple>) {
+      if (auto name{std::get<std::optional<parser::Name>>(stmt.t)})
+        return name->ToString();
+    }
+
+    // These statements have several std::optional<parser::Name>
+    if constexpr (std::is_same_v<A, parser::SelectRankStmt> ||
+                  std::is_same_v<A, parser::SelectTypeStmt>) {
+      if (auto name{std::get<0>(stmt.t)}) {
+        return name->ToString();
+      }
+    }
+    return {};
   }
-}
 
-/// Annotate the PFT with CFG source decorations (see CFGAnnotation) and mark
-/// potential branch targets
-inline void annotateFuncCFG(pft::FunctionLikeUnit &functionLikeUnit) {
-  annotateEvalListCFG(functionLikeUnit.evals, nullptr);
-  for (auto &internalFunc : functionLikeUnit.funcs)
-    annotateFuncCFG(internalFunc);
-}
+  /// \p parentConstruct can be null if this statement is at the highest
+  /// level of a program.
+  template <typename A>
+  void insertConstructName(const A &stmt,
+                           lower::pft::Evaluation *parentConstruct) {
+    std::string name{getConstructName(stmt)};
+    if (!name.empty()) {
+      constructNameMap[name] = parentConstruct;
+    }
+  }
+
+  /// Insert branch links for a list of Evaluations.
+  /// \p parentConstruct can be null if the evaluationList contains the
+  /// top-level statements of a program.
+  void analyzeBranches(lower::pft::Evaluation *parentConstruct,
+                       std::list<lower::pft::Evaluation> &evaluationList) {
+    lower::pft::Evaluation *lastConstructStmtEvaluation{nullptr};
+    lower::pft::Evaluation *lastIfStmtEvaluation{nullptr};
+    for (auto &eval : evaluationList) {
+      eval.visit(common::visitors{
+          // Action statements
+          [&](const parser::CallStmt &s) {
+            // Look for alternate return specifiers.
+            const auto &args{std::get<std::list<parser::ActualArgSpec>>(s.v.t)};
+            for (const auto &arg : args) {
+              const auto &actual{std::get<parser::ActualArg>(arg.t)};
+              if (const auto *altReturn{
+                      std::get_if<parser::AltReturnSpec>(&actual.u)}) {
+                markBranchTarget(eval, altReturn->v);
+              }
+            }
+          },
+          [&](const parser::CycleStmt &s) {
+            std::string name{getConstructName(s)};
+            lower::pft::Evaluation *construct{name.empty()
+                                                  ? doConstructStack.back()
+                                                  : constructNameMap[name]};
+            assert(construct && "missing CYCLE construct");
+            markBranchTarget(eval, construct->evaluationList->back());
+          },
+          [&](const parser::ExitStmt &s) {
+            std::string name{getConstructName(s)};
+            lower::pft::Evaluation *construct{name.empty()
+                                                  ? doConstructStack.back()
+                                                  : constructNameMap[name]};
+            assert(construct && "missing EXIT construct");
+            markBranchTarget(eval, *construct->constructExit);
+          },
+          [&](const parser::GotoStmt &s) { markBranchTarget(eval, s.v); },
+          [&](const parser::IfStmt &) { lastIfStmtEvaluation = &eval; },
+          [&](const parser::ReturnStmt &) {
+            eval.isUnstructured = true;
+            if (eval.lexicalSuccessor->lexicalSuccessor)
+              markSuccessorAsNewBlock(eval);
+          },
+          [&](const parser::StopStmt &) {
+            eval.isUnstructured = true;
+            if (eval.lexicalSuccessor->lexicalSuccessor)
+              markSuccessorAsNewBlock(eval);
+          },
+          [&](const parser::ComputedGotoStmt &s) {
+            for (auto &label : std::get<std::list<parser::Label>>(s.t)) {
+              markBranchTarget(eval, label);
+            }
+          },
+          [&](const parser::ArithmeticIfStmt &s) {
+            markBranchTarget(eval, std::get<1>(s.t));
+            markBranchTarget(eval, std::get<2>(s.t));
+            markBranchTarget(eval, std::get<3>(s.t));
+            if (semantics::ExprHasTypeCategory(
+                    *semantics::GetExpr(std::get<parser::Expr>(s.t)),
+                    common::TypeCategory::Real)) {
+              // Real expression evaluation uses an additional local block.
+              eval.localBlocks.emplace_back(nullptr);
+            }
+          },
+          [&](const parser::AssignStmt &s) { // legacy label assignment
+            auto &label = std::get<parser::Label>(s.t);
+            const auto *sym = std::get<parser::Name>(s.t).symbol;
+            assert(sym && "missing AssignStmt symbol");
+            lower::pft::Evaluation *target{
+                labelEvaluationMap->find(label)->second};
+            assert(target && "missing branch target evaluation");
+            if (!target->isA<parser::FormatStmt>()) {
+              target->isNewBlock = true;
+            }
+            auto iter = assignSymbolLabelMap->find(*sym);
+            if (iter == assignSymbolLabelMap->end()) {
+              lower::pft::LabelSet labelSet{};
+              labelSet.insert(label);
+              assignSymbolLabelMap->try_emplace(*sym, labelSet);
+            } else {
+              iter->second.insert(label);
+            }
+          },
+          [&](const parser::AssignedGotoStmt &) {
+            // Although this statement is a branch, it doesn't have any
+            // explicit control successors.  So the code at the end of the
+            // loop won't mark the exit successor.  Do that here.
+            markSuccessorAsNewBlock(eval);
+          },
+
+          // Construct statements
+          [&](const parser::AssociateStmt &s) {
+            insertConstructName(s, parentConstruct);
+          },
+          [&](const parser::BlockStmt &s) {
+            insertConstructName(s, parentConstruct);
+          },
+          [&](const parser::SelectCaseStmt &s) {
+            insertConstructName(s, parentConstruct);
+            lastConstructStmtEvaluation = &eval;
+          },
+          [&](const parser::CaseStmt &) {
+            eval.isNewBlock = true;
+            lastConstructStmtEvaluation->controlSuccessor = &eval;
+            lastConstructStmtEvaluation = &eval;
+          },
+          [&](const parser::EndSelectStmt &) {
+            eval.lexicalSuccessor->isNewBlock = true;
+            lastConstructStmtEvaluation = nullptr;
+          },
+          [&](const parser::ChangeTeamStmt &s) {
+            insertConstructName(s, parentConstruct);
+          },
+          [&](const parser::CriticalStmt &s) {
+            insertConstructName(s, parentConstruct);
+          },
+          [&](const parser::NonLabelDoStmt &s) {
+            insertConstructName(s, parentConstruct);
+            doConstructStack.push_back(parentConstruct);
+            auto &control{std::get<std::optional<parser::LoopControl>>(s.t)};
+            // eval.block is the loop preheader block, which will be set
+            // elsewhere if the NonLabelDoStmt is itself a target.
+            // eval.localBlocks[0] is the loop header block.
+            eval.localBlocks.emplace_back(nullptr);
+            if (!control.has_value()) {
+              eval.isUnstructured = true; // infinite loop
+              return;
+            }
+            eval.lexicalSuccessor->isNewBlock = true;
+            eval.controlSuccessor = &evaluationList.back();
+            if (std::holds_alternative<parser::ScalarLogicalExpr>(control->u)) {
+              eval.isUnstructured = true; // while loop
+            }
+            // Defer additional processing for an unstructured concurrent loop
+            // to the EndDoStmt, when the loop is known to be unstructured.
+          },
+          [&](const parser::EndDoStmt &) {
+            lower::pft::Evaluation &doEval{evaluationList.front()};
+            eval.controlSuccessor = &doEval;
+            doConstructStack.pop_back();
+            if (parentConstruct->lowerAsStructured()) {
+              return;
+            }
+            // Now that the loop is known to be unstructured, finish concurrent
+            // loop processing, using NonLabelDoStmt information.
+            parentConstruct->constructExit->isNewBlock = true;
+            const auto &doStmt{doEval.getIf<parser::NonLabelDoStmt>()};
+            assert(doStmt && "missing NonLabelDoStmt");
+            auto &control{
+                std::get<std::optional<parser::LoopControl>>(doStmt->t)};
+            if (!control.has_value()) {
+              return; // infinite loop
+            }
+            const auto *concurrent{
+                std::get_if<parser::LoopControl::Concurrent>(&control->u)};
+            if (!concurrent) {
+              return;
+            }
+            // Unstructured concurrent loop.  NonLabelDoStmt code accounts
+            // for one concurrent loop dimension.  Reserve preheader,
+            // header, and latch blocks for the remaining dimensions, and
+            // one block for a mask expression.
+            const auto &header{
+                std::get<parser::ConcurrentHeader>(concurrent->t)};
+            auto dims{std::get<std::list<parser::ConcurrentControl>>(header.t)
+                          .size()};
+            for (; dims > 1; --dims) {
+              doEval.localBlocks.emplace_back(nullptr); // preheader
+              doEval.localBlocks.emplace_back(nullptr); // header
+              eval.localBlocks.emplace_back(nullptr);   // latch
+            }
+            if (std::get<std::optional<parser::ScalarLogicalExpr>>(header.t)) {
+              doEval.localBlocks.emplace_back(nullptr); // mask
+            }
+          },
+          [&](const parser::IfThenStmt &s) {
+            insertConstructName(s, parentConstruct);
+            eval.lexicalSuccessor->isNewBlock = true;
+            lastConstructStmtEvaluation = &eval;
+          },
+          [&](const parser::ElseIfStmt &) {
+            eval.isNewBlock = true;
+            eval.lexicalSuccessor->isNewBlock = true;
+            lastConstructStmtEvaluation->controlSuccessor = &eval;
+            lastConstructStmtEvaluation = &eval;
+          },
+          [&](const parser::ElseStmt &) {
+            eval.isNewBlock = true;
+            lastConstructStmtEvaluation->controlSuccessor = &eval;
+            lastConstructStmtEvaluation = nullptr;
+          },
+          [&](const parser::EndIfStmt &) {
+            if (parentConstruct->lowerAsUnstructured()) {
+              parentConstruct->constructExit->isNewBlock = true;
+            }
+            if (lastConstructStmtEvaluation) {
+              lastConstructStmtEvaluation->controlSuccessor =
+                  parentConstruct->constructExit;
+              lastConstructStmtEvaluation = nullptr;
+            }
+          },
+          [&](const parser::SelectRankStmt &s) {
+            insertConstructName(s, parentConstruct);
+          },
+          [&](const parser::SelectRankCaseStmt &) { eval.isNewBlock = true; },
+          [&](const parser::SelectTypeStmt &s) {
+            insertConstructName(s, parentConstruct);
+          },
+          [&](const parser::TypeGuardStmt &) { eval.isNewBlock = true; },
+
+          // Constructs - set (unstructured) construct exit targets
+          [&](const parser::AssociateConstruct &) { setConstructExit(eval); },
+          [&](const parser::BlockConstruct &) {
+            // EndBlockStmt may have code.
+            eval.constructExit = &eval.evaluationList->back();
+          },
+          [&](const parser::CaseConstruct &) {
+            setConstructExit(eval);
+            eval.isUnstructured = true;
+          },
+          [&](const parser::ChangeTeamConstruct &) {
+            // EndChangeTeamStmt may have code.
+            eval.constructExit = &eval.evaluationList->back();
+          },
+          [&](const parser::CriticalConstruct &) {
+            // EndCriticalStmt may have code.
+            eval.constructExit = &eval.evaluationList->back();
+          },
+          [&](const parser::DoConstruct &) { setConstructExit(eval); },
+          [&](const parser::IfConstruct &) { setConstructExit(eval); },
+          [&](const parser::SelectRankConstruct &) {
+            setConstructExit(eval);
+            eval.isUnstructured = true;
+          },
+          [&](const parser::SelectTypeConstruct &) {
+            setConstructExit(eval);
+            eval.isUnstructured = true;
+          },
+
+          [&](const auto &stmt) {
+            using A = std::decay_t<decltype(stmt)>;
+            using IoStmts = std::tuple<parser::BackspaceStmt, parser::CloseStmt,
+                                       parser::EndfileStmt, parser::FlushStmt,
+                                       parser::InquireStmt, parser::OpenStmt,
+                                       parser::ReadStmt, parser::RewindStmt,
+                                       parser::WaitStmt, parser::WriteStmt>;
+            if constexpr (common::HasMember<A, IoStmts>) {
+              analyzeIoBranches(eval, stmt);
+            }
+
+            /* do nothing */
+          },
+      });
+
+      // Analyze construct evaluations.
+      if (eval.evaluationList) {
+        analyzeBranches(&eval, *eval.evaluationList);
+      }
+
+      // Insert branch links for an unstructured IF statement.
+      if (lastIfStmtEvaluation && lastIfStmtEvaluation != &eval) {
+        // eval is the action substatement of an IfStmt.
+        if (eval.lowerAsUnstructured()) {
+          eval.isNewBlock = true;
+          markSuccessorAsNewBlock(eval);
+          lastIfStmtEvaluation->isUnstructured = true;
+        }
+        lastIfStmtEvaluation->controlSuccessor = exitSuccessor(eval);
+        lastIfStmtEvaluation = nullptr;
+      }
+
+      // Set the successor of the last statement in an IF or SELECT block.
+      if (!eval.controlSuccessor && eval.lexicalSuccessor &&
+          eval.lexicalSuccessor->isIntermediateConstructStmt()) {
+        eval.controlSuccessor = parentConstruct->constructExit;
+        eval.lexicalSuccessor->isNewBlock = true;
+      }
+
+      // Propagate isUnstructured flag to enclosing construct.
+      if (parentConstruct && eval.isUnstructured) {
+        parentConstruct->isUnstructured = true;
+      }
+
+      // The lexical successor of a branch starts a new block.
+      if (eval.controlSuccessor && eval.isActionStmt() &&
+          eval.lowerAsUnstructured()) {
+        markSuccessorAsNewBlock(eval);
+      }
+    }
+  }
+
+  std::unique_ptr<lower::pft::Program> pgm;
+  std::vector<lower::pft::ParentVariant> parentVariantStack;
+  const semantics::SemanticsContext &semanticsContext;
+
+  /// functionList points to the internal or module procedure function list
+  /// of a FunctionLikeUnit or a ModuleLikeUnit.  It may be null.
+  std::list<lower::pft::FunctionLikeUnit> *functionList{nullptr};
+  std::vector<lower::pft::Evaluation *> constructAndDirectiveStack{};
+  std::vector<lower::pft::Evaluation *> doConstructStack{};
+  /// evaluationListStack is the current nested construct evaluationList state.
+  std::vector<lower::pft::EvaluationList *> evaluationListStack{};
+  llvm::DenseMap<parser::Label, lower::pft::Evaluation *> *labelEvaluationMap{
+      nullptr};
+  lower::pft::SymbolLabelMap *assignSymbolLabelMap{nullptr};
+  std::map<std::string, lower::pft::Evaluation *> constructNameMap{};
+  lower::pft::Evaluation *lastLexicalEvaluation{nullptr};
+};
 
 class PFTDumper {
 public:
-  void dumpPFT(llvm::raw_ostream &outputStream, pft::Program &pft) {
+  void dumpPFT(llvm::raw_ostream &outputStream, lower::pft::Program &pft) {
     for (auto &unit : pft.getUnits()) {
       std::visit(common::visitors{
-                     [&](pft::BlockDataUnit &unit) {
+                     [&](lower::pft::BlockDataUnit &unit) {
                        outputStream << getNodeIndex(unit) << " ";
                        outputStream << "BlockData: ";
                        outputStream << "\nEndBlockData\n\n";
                      },
-                     [&](pft::FunctionLikeUnit &func) {
+                     [&](lower::pft::FunctionLikeUnit &func) {
                        dumpFunctionLikeUnit(outputStream, func);
                      },
-                     [&](pft::ModuleLikeUnit &unit) {
+                     [&](lower::pft::ModuleLikeUnit &unit) {
                        dumpModuleLikeUnit(outputStream, unit);
                      },
                  },
                  unit);
     }
-    resetIndexes();
   }
 
-  llvm::StringRef evalName(pft::Evaluation &eval) {
+  llvm::StringRef evaluationName(lower::pft::Evaluation &eval) {
     return eval.visit(common::visitors{
-        [](const pft::CGJump) { return "CGJump"; },
         [](const auto &parseTreeNode) {
           return parser::ParseTreeDumper::GetNodeName(parseTreeNode);
         },
     });
   }
 
-  void dumpEvalList(llvm::raw_ostream &outputStream,
-                    pft::EvaluationCollection &evaluationCollection,
-                    int indent = 1) {
+  void dumpEvaluationList(llvm::raw_ostream &outputStream,
+                          lower::pft::EvaluationList &evaluationList,
+                          int indent = 1) {
     static const std::string white{"                                      ++"};
     std::string indentString{white.substr(0, indent * 2)};
-    for (pft::Evaluation &eval : evaluationCollection) {
-      outputStream << indentString << getNodeIndex(eval) << " ";
-      llvm::StringRef name{evalName(eval)};
-      if (auto *subs{eval.getConstructEvals()}) {
-        outputStream << "<<" << name << ">>";
-        outputStream << "\n";
-        dumpEvalList(outputStream, *subs, indent + 1);
-        outputStream << indentString << "<<End" << name << ">>\n";
-      } else {
-        outputStream << name;
-        outputStream << ": " << eval.pos.ToString() + "\n";
+    for (lower::pft::Evaluation &eval : evaluationList) {
+      llvm::StringRef name{evaluationName(eval)};
+      std::string bang{eval.isUnstructured ? "!" : ""};
+      if (eval.isConstruct() || eval.isDirective()) {
+        outputStream << indentString << "<<" << name << bang << ">>";
+        if (eval.constructExit) {
+          outputStream << " -> " << eval.constructExit->printIndex;
+        }
+        outputStream << '\n';
+        dumpEvaluationList(outputStream, *eval.evaluationList, indent + 1);
+        outputStream << indentString << "<<End " << name << bang << ">>\n";
+        continue;
+      }
+      outputStream << indentString;
+      if (eval.printIndex) {
+        outputStream << eval.printIndex << ' ';
+      }
+      if (eval.isNewBlock) {
+        outputStream << '^';
+      }
+      if (eval.localBlocks.size()) {
+        outputStream << '*';
       }
+      outputStream << name << bang;
+      if (eval.isActionStmt() || eval.isConstructStmt()) {
+        if (eval.controlSuccessor) {
+          outputStream << " -> " << eval.controlSuccessor->printIndex;
+        }
+      }
+      if (eval.position.size()) {
+        outputStream << ": " << eval.position.ToString();
+      }
+      outputStream << '\n';
     }
   }
 
   void dumpFunctionLikeUnit(llvm::raw_ostream &outputStream,
-                            pft::FunctionLikeUnit &functionLikeUnit) {
+                            lower::pft::FunctionLikeUnit &functionLikeUnit) {
     outputStream << getNodeIndex(functionLikeUnit) << " ";
     llvm::StringRef unitKind{};
     std::string name{};
     std::string header{};
     if (functionLikeUnit.beginStmt) {
-      std::visit(
-          common::visitors{
-              [&](const parser::Statement<parser::ProgramStmt> *statement) {
-                unitKind = "Program";
-                name = statement->statement.v.ToString();
-              },
-              [&](const parser::Statement<parser::FunctionStmt> *statement) {
-                unitKind = "Function";
-                name =
-                    std::get<parser::Name>(statement->statement.t).ToString();
-                header = statement->source.ToString();
-              },
-              [&](const parser::Statement<parser::SubroutineStmt> *statement) {
-                unitKind = "Subroutine";
-                name =
-                    std::get<parser::Name>(statement->statement.t).ToString();
-                header = statement->source.ToString();
-              },
-              [&](const parser::Statement<parser::MpSubprogramStmt>
-                      *statement) {
-                unitKind = "MpSubprogram";
-                name = statement->statement.v.ToString();
-                header = statement->source.ToString();
-              },
-              [&](auto *) {},
-          },
-          *functionLikeUnit.beginStmt);
+      functionLikeUnit.beginStmt->visit(common::visitors{
+          [&](const parser::Statement<parser::ProgramStmt> &statement) {
+            unitKind = "Program";
+            name = statement.statement.v.ToString();
+          },
+          [&](const parser::Statement<parser::FunctionStmt> &statement) {
+            unitKind = "Function";
+            name = std::get<parser::Name>(statement.statement.t).ToString();
+            header = statement.source.ToString();
+          },
+          [&](const parser::Statement<parser::SubroutineStmt> &statement) {
+            unitKind = "Subroutine";
+            name = std::get<parser::Name>(statement.statement.t).ToString();
+            header = statement.source.ToString();
+          },
+          [&](const parser::Statement<parser::MpSubprogramStmt> &statement) {
+            unitKind = "MpSubprogram";
+            name = statement.statement.v.ToString();
+            header = statement.source.ToString();
+          },
+          [&](const auto &) {},
+      });
     } else {
       unitKind = "Program";
       name = "<anonymous>";
@@ -569,10 +851,10 @@ class PFTDumper {
     if (header.size())
       outputStream << ": " << header;
     outputStream << '\n';
-    dumpEvalList(outputStream, functionLikeUnit.evals);
-    if (!functionLikeUnit.funcs.empty()) {
+    dumpEvaluationList(outputStream, functionLikeUnit.evaluationList);
+    if (!functionLikeUnit.nestedFunctions.empty()) {
       outputStream << "\nContains\n";
-      for (auto &func : functionLikeUnit.funcs)
+      for (auto &func : functionLikeUnit.nestedFunctions)
         dumpFunctionLikeUnit(outputStream, func);
       outputStream << "EndContains\n";
     }
@@ -580,11 +862,11 @@ class PFTDumper {
   }
 
   void dumpModuleLikeUnit(llvm::raw_ostream &outputStream,
-                          pft::ModuleLikeUnit &moduleLikeUnit) {
+                          lower::pft::ModuleLikeUnit &moduleLikeUnit) {
     outputStream << getNodeIndex(moduleLikeUnit) << " ";
     outputStream << "ModuleLike: ";
     outputStream << "\nContains\n";
-    for (auto &func : moduleLikeUnit.funcs)
+    for (auto &func : moduleLikeUnit.nestedFunctions)
       dumpFunctionLikeUnit(outputStream, func);
     outputStream << "EndContains\nEndModuleLike\n\n";
   }
@@ -599,99 +881,249 @@ class PFTDumper {
     nodeIndexes.try_emplace(addr, nextIndex);
     return nextIndex++;
   }
-  std::size_t getNodeIndex(const pft::Program &) { return 0; }
-
-  void resetIndexes() {
-    nodeIndexes.clear();
-    nextIndex = 1;
-  }
+  std::size_t getNodeIndex(const lower::pft::Program &) { return 0; }
 
 private:
   llvm::DenseMap<const void *, std::size_t> nodeIndexes;
   std::size_t nextIndex{1}; // 0 is the root
 };
 
+} // namespace
+
 template <typename A, typename T>
-pft::FunctionLikeUnit::FunctionStatement getFunctionStmt(const T &func) {
-  return pft::FunctionLikeUnit::FunctionStatement{
-      &std::get<parser::Statement<A>>(func.t)};
+static lower::pft::FunctionLikeUnit::FunctionStatement
+getFunctionStmt(const T &func) {
+  return std::get<parser::Statement<A>>(func.t);
 }
 template <typename A, typename T>
-pft::ModuleLikeUnit::ModuleStatement getModuleStmt(const T &mod) {
-  return pft::ModuleLikeUnit::ModuleStatement{
-      &std::get<parser::Statement<A>>(mod.t)};
+static lower::pft::ModuleLikeUnit::ModuleStatement getModuleStmt(const T &mod) {
+  return std::get<parser::Statement<A>>(mod.t);
+}
+
+static const semantics::Symbol *getSymbol(
+    std::optional<lower::pft::FunctionLikeUnit::FunctionStatement> &beginStmt) {
+  using SymbolPtr = const semantics::Symbol *;
+  if (!beginStmt.has_value())
+    return nullptr; // unit has no begin statement
+  SymbolPtr resolved = beginStmt->visit(common::visitors{
+      [](const parser::Statement<parser::ProgramStmt> &s) -> SymbolPtr {
+        return s.statement.v.symbol;
+      },
+      [](const parser::Statement<parser::FunctionStmt> &s) -> SymbolPtr {
+        return std::get<parser::Name>(s.statement.t).symbol;
+      },
+      [](const parser::Statement<parser::SubroutineStmt> &s) -> SymbolPtr {
+        return std::get<parser::Name>(s.statement.t).symbol;
+      },
+      [](const parser::Statement<parser::MpSubprogramStmt> &s) -> SymbolPtr {
+        return s.statement.v.symbol;
+      },
+      [](const auto &) -> SymbolPtr {
+        llvm_unreachable("unknown FunctionLike beginStmt");
+        return nullptr;
+      }});
+  assert(resolved && "parser::Name must have resolved symbol");
+  return resolved;
+}
+
+bool Fortran::lower::pft::Evaluation::lowerAsStructured() const {
+  return !lowerAsUnstructured();
 }
 
+bool Fortran::lower::pft::Evaluation::lowerAsUnstructured() const {
+  return isUnstructured || clDisableStructuredFir;
+}
+
+lower::pft::FunctionLikeUnit *
+Fortran::lower::pft::Evaluation::getOwningProcedure() const {
+  // Climb enclosing evaluations to the FunctionLikeUnit; null if there is none.
+  return parentVariant.visit(common::visitors{
+      [](lower::pft::FunctionLikeUnit &unit) { return &unit; },
+      [](lower::pft::Evaluation &outer) { return outer.getOwningProcedure(); },
+      [](auto &) -> lower::pft::FunctionLikeUnit * { return nullptr; }});
+}
+
+namespace {
+/// This helper class is for sorting the symbols in the symbol table. We want
+/// the symbols in an order such that a symbol will be visited after those it
+/// depends upon. Otherwise this sort is stable and preserves the order of the
+/// symbol table, which is sorted by name.
+struct SymbolDependenceDepth {
+  explicit SymbolDependenceDepth(
+      std::vector<std::vector<lower::pft::Variable>> &vars)
+      : vars{vars} {}
+
+  // Recursively visit each symbol to determine the height of its dependence on
+  // other symbols. Heights are memoized so a symbol reached again through
+  // another dependent reports its real height and stays ordered before it.
+  int analyze(const semantics::Symbol &sym) {
+    auto entry = depths.try_emplace(&sym, 0); // in-progress symbols report 0
+    if (!entry.second)
+      return entry.first->second;
+    if (semantics::IsProcedure(sym)) {
+      // TODO: add declaration?
+      return 0;
+    }
+    if (sym.has<semantics::UseDetails>() ||
+        sym.has<semantics::HostAssocDetails>() ||
+        sym.has<semantics::NamelistDetails>() ||
+        sym.has<semantics::MiscDetails>()) {
+      // FIXME: do we want to do anything with any of these?
+      return 0;
+    }
+
+    // Symbol must be something lowering will have to allocate.
+    bool global = semantics::IsSaved(sym);
+    int depth = 0;
+    const auto *symTy = sym.GetType();
+    assert(symTy && "symbol must have a type");
+    // check CHARACTER's length
+    if (symTy->category() == semantics::DeclTypeSpec::Character)
+      if (auto e = symTy->characterTypeSpec().length().GetExplicit())
+        for (const auto &s : evaluate::CollectSymbols(*e))
+          depth = std::max(analyze(s) + 1, depth);
+    if (const auto *details = sym.detailsIf<semantics::ObjectEntityDetails>()) {
+      auto doExplicit = [&](const auto &bound) {
+        if (bound.isExplicit()) {
+          semantics::SomeExpr e{*bound.GetExplicit()};
+          for (const auto &s : evaluate::CollectSymbols(e))
+            depth = std::max(analyze(s) + 1, depth);
+        }
+      };
+      // handle any symbols in array bound declarations
+      for (const auto &subs : details->shape()) {
+        doExplicit(subs.lbound());
+        doExplicit(subs.ubound());
+      }
+      // handle any symbols in coarray bound declarations
+      for (const auto &subs : details->coshape()) {
+        doExplicit(subs.lbound());
+        doExplicit(subs.ubound());
+      }
+      // handle any symbols in initialization expressions
+      if (auto e = details->init()) {
+        // A PARAMETER may not be marked as implicitly SAVE, so set the flag.
+        global = true;
+        for (const auto &s : evaluate::CollectSymbols(*e))
+          depth = std::max(analyze(s) + 1, depth);
+      }
+    }
+    // Store the height via a fresh lookup: recursion may have grown the map.
+    depths[&sym] = depth;
+    adjustSize(depth + 1);
+    vars[depth].emplace_back(sym, global, depth);
+    if (Fortran::semantics::IsAllocatable(sym))
+      vars[depth].back().setHeapAlloc();
+    if (Fortran::semantics::IsPointer(sym))
+      vars[depth].back().setPointer();
+    if (sym.attrs().test(Fortran::semantics::Attr::TARGET))
+      vars[depth].back().setTarget();
+    return depth;
+  }
+
+  // Save the final list of symbols as a single vector and free the rest.
+  void finalize() {
+    for (int i = 1, end = vars.size(); i < end; ++i)
+      vars[0].insert(vars[0].end(), vars[i].begin(), vars[i].end());
+    vars.resize(1);
+  }
+
+private:
+  // Make sure the table is of appropriate size.
+  void adjustSize(std::size_t size) {
+    if (vars.size() < size)
+      vars.resize(size);
+  }
+  llvm::DenseMap<const semantics::Symbol *, int> depths;
+  std::vector<std::vector<lower::pft::Variable>> &vars;
+};
 } // namespace
 
-pft::FunctionLikeUnit::FunctionLikeUnit(const parser::MainProgram &func,
-                                        const pft::ParentType &parent)
-    : ProgramUnit{func, parent} {
-  auto &ps{
+void Fortran::lower::pft::FunctionLikeUnit::processSymbolTable(
+    const semantics::Scope &scope) {
+  SymbolDependenceDepth sorter{varList}; // buckets symbols into varList
+  for (const auto &[name, symbolRef] : scope)
+    sorter.analyze(symbolRef.get());
+  sorter.finalize();
+}
+
+Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit(
+    const parser::MainProgram &func, const lower::pft::ParentVariant &parent,
+    const semantics::SemanticsContext &semanticsContext)
+    : ProgramUnit{func, parent}, endStmt{
+                                     getFunctionStmt<parser::EndProgramStmt>(
+                                         func)} {
+  const auto &ps{
       std::get<std::optional<parser::Statement<parser::ProgramStmt>>>(func.t)};
   if (ps.has_value()) {
-    const parser::Statement<parser::ProgramStmt> &statement{ps.value()};
-    beginStmt = &statement;
+    beginStmt = ps.value();
+    symbol = getSymbol(beginStmt);
+    processSymbolTable(*symbol->scope());
+  } else {
+    processSymbolTable(semanticsContext.FindScope(
+        std::get<parser::Statement<parser::EndProgramStmt>>(func.t).source));
   }
-  endStmt = getFunctionStmt<parser::EndProgramStmt>(func);
 }
 
-pft::FunctionLikeUnit::FunctionLikeUnit(const parser::FunctionSubprogram &func,
-                                        const pft::ParentType &parent)
+Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit(
+    const parser::FunctionSubprogram &func,
+    const lower::pft::ParentVariant &parent,
+    const semantics::SemanticsContext &)
     : ProgramUnit{func, parent},
       beginStmt{getFunctionStmt<parser::FunctionStmt>(func)},
-      endStmt{getFunctionStmt<parser::EndFunctionStmt>(func)} {}
+      endStmt{getFunctionStmt<parser::EndFunctionStmt>(func)}, symbol{getSymbol(
+                                                                   beginStmt)} {
+  processSymbolTable(*symbol->scope());
+}
 
-pft::FunctionLikeUnit::FunctionLikeUnit(
-    const parser::SubroutineSubprogram &func, const pft::ParentType &parent)
+Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit(
+    const parser::SubroutineSubprogram &func,
+    const lower::pft::ParentVariant &parent,
+    const semantics::SemanticsContext &)
     : ProgramUnit{func, parent},
       beginStmt{getFunctionStmt<parser::SubroutineStmt>(func)},
-      endStmt{getFunctionStmt<parser::EndSubroutineStmt>(func)} {}
+      endStmt{getFunctionStmt<parser::EndSubroutineStmt>(func)},
+      symbol{getSymbol(beginStmt)} {
+  processSymbolTable(*symbol->scope());
+}
 
-pft::FunctionLikeUnit::FunctionLikeUnit(
-    const parser::SeparateModuleSubprogram &func, const pft::ParentType &parent)
+Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit(
+    const parser::SeparateModuleSubprogram &func,
+    const lower::pft::ParentVariant &parent,
+    const semantics::SemanticsContext &)
     : ProgramUnit{func, parent},
       beginStmt{getFunctionStmt<parser::MpSubprogramStmt>(func)},
-      endStmt{getFunctionStmt<parser::EndMpSubprogramStmt>(func)} {}
+      endStmt{getFunctionStmt<parser::EndMpSubprogramStmt>(func)},
+      symbol{getSymbol(beginStmt)} {
+  processSymbolTable(*symbol->scope());
+}
 
-pft::ModuleLikeUnit::ModuleLikeUnit(const parser::Module &m,
-                                    const pft::ParentType &parent)
+Fortran::lower::pft::ModuleLikeUnit::ModuleLikeUnit(
+    const parser::Module &m, const lower::pft::ParentVariant &parent)
     : ProgramUnit{m, parent}, beginStmt{getModuleStmt<parser::ModuleStmt>(m)},
       endStmt{getModuleStmt<parser::EndModuleStmt>(m)} {}
 
-pft::ModuleLikeUnit::ModuleLikeUnit(const parser::Submodule &m,
-                                    const pft::ParentType &parent)
+Fortran::lower::pft::ModuleLikeUnit::ModuleLikeUnit(
+    const parser::Submodule &m, const lower::pft::ParentVariant &parent)
     : ProgramUnit{m, parent}, beginStmt{getModuleStmt<parser::SubmoduleStmt>(
                                   m)},
       endStmt{getModuleStmt<parser::EndSubmoduleStmt>(m)} {}
 
-pft::BlockDataUnit::BlockDataUnit(const parser::BlockData &bd,
-                                  const pft::ParentType &parent)
+Fortran::lower::pft::BlockDataUnit::BlockDataUnit(
+    const parser::BlockData &bd, const lower::pft::ParentVariant &parent)
     : ProgramUnit{bd, parent} {}
 
-std::unique_ptr<pft::Program> createPFT(const parser::Program &root) {
-  PFTBuilder walker;
+std::unique_ptr<lower::pft::Program>
+Fortran::lower::createPFT(const parser::Program &root,
+                          const semantics::SemanticsContext &semanticsContext) {
+  PFTBuilder walker(semanticsContext);
   Walk(root, walker);
   return walker.result();
 }
 
-void annotateControl(pft::Program &pft) {
-  for (auto &unit : pft.getUnits()) {
-    std::visit(common::visitors{
-                   [](pft::BlockDataUnit &) {},
-                   [](pft::FunctionLikeUnit &func) { annotateFuncCFG(func); },
-                   [](pft::ModuleLikeUnit &unit) {
-                     for (auto &func : unit.funcs)
-                       annotateFuncCFG(func);
-                   },
-               },
-               unit);
-  }
-}
-
-/// Dump a PFT.
-void dumpPFT(llvm::raw_ostream &outputStream, pft::Program &pft) {
+void Fortran::lower::dumpPFT(llvm::raw_ostream &outputStream,
+                             lower::pft::Program &pft) {
   PFTDumper{}.dumpPFT(outputStream, pft);
 }
 
-} // namespace Fortran::lower
+void Fortran::lower::pft::Program::dump() { dumpPFT(llvm::errs(), *this); }
diff --git a/flang/test/Lower/pre-fir-tree01.f90 b/flang/test/Lower/pre-fir-tree01.f90
index 97f15eea052e4..6b27add4659f4 100644
--- a/flang/test/Lower/pre-fir-tree01.f90
+++ b/flang/test/Lower/pre-fir-tree01.f90
@@ -16,10 +16,10 @@ subroutine foo()
       print *, "hello", i, j
     ! CHECK: EndDoStmt
     end do
-    ! CHECK: <<EndDoConstruct>>
+    ! CHECK: <<End DoConstruct>>
   ! CHECK: EndDoStmt
   end do
-  ! CHECK: <<EndDoConstruct>>
+  ! CHECK: <<End DoConstruct>>
 end subroutine
 ! CHECK: EndSubroutine foo
 
@@ -102,7 +102,7 @@ function subfoo2()
       write (*, 11) "test: ", xdim, pressure
     ! CHECK: EndIfStmt
     end if
-    ! CHECK: <<EndIfConstruct>>
+    ! CHECK: <<End IfConstruct>>
   end procedure
 end submodule
 ! CHECK: EndModuleLike
diff --git a/flang/test/Lower/pre-fir-tree02.f90 b/flang/test/Lower/pre-fir-tree02.f90
index ec9077a550a24..0fc219ff9a880 100644
--- a/flang/test/Lower/pre-fir-tree02.f90
+++ b/flang/test/Lower/pre-fir-tree02.f90
@@ -27,10 +27,10 @@ subroutine incr(i)
       print *, "hello", i, j
     ! CHECK: EndDoStmt
     end do
-    ! CHECK: <<EndDoConstruct>>
+    ! CHECK: <<End DoConstruct>>
   ! CHECK: EndDoStmt
   end do
-  ! CHECK: <<EndDoConstruct>>
+  ! CHECK: <<End DoConstruct>>
 
   ! CHECK: <<AssociateConstruct>>
   ! CHECK: AssociateStmt
@@ -39,9 +39,9 @@ subroutine incr(i)
     allocate(x(k))
   ! CHECK: EndAssociateStmt
   end associate
-  ! CHECK: <<EndAssociateConstruct>>
+  ! CHECK: <<End AssociateConstruct>>
 
-  ! CHECK: <<BlockConstruct>>
+  ! CHECK: <<BlockConstruct!>>
   ! CHECK: BlockStmt
   block
     integer :: k, l
@@ -52,7 +52,7 @@ subroutine incr(i)
     k = size(p)
     ! CHECK: AssignmentStmt
     l = 1
-    ! CHECK: <<CaseConstruct>>
+    ! CHECK: <<CaseConstruct!>>
     ! CHECK: SelectCaseStmt
     select case (k)
       ! CHECK: CaseStmt
@@ -76,13 +76,13 @@ subroutine incr(i)
           print *, "-"
         ! CHECK: EndIfStmt
         end if
-        ! CHECK: <<EndIfConstruct>>
+        ! CHECK: <<End IfConstruct>>
         ! CHECK: CaseStmt
       case (2:10)
       ! CHECK: CaseStmt
       case default
         ! Note: label-do-loop are canonicalized into do constructs
-        ! CHECK: <<DoConstruct>>
+        ! CHECK: <<DoConstruct!>>
         ! CHECK: NonLabelDoStmt
         do 22 while(l<=k)
           ! CHECK: IfStmt
@@ -90,15 +90,15 @@ subroutine incr(i)
           ! CHECK: CallStmt
 22        call incr(l)
         ! CHECK: EndDoStmt
-       ! CHECK: <<EndDoConstruct>>
+       ! CHECK: <<End DoConstruct!>>
       ! CHECK: CaseStmt
       case (100:)
     ! CHECK: EndSelectStmt
     end select
-  ! CHECK: <<EndCaseConstruct>>
+  ! CHECK: <<End CaseConstruct!>>
   ! CHECK: EndBlockStmt
   end block
-  ! CHECK: <<EndBlockConstruct>>
+  ! CHECK: <<End BlockConstruct!>>
 
   ! CHECK-NOT: WhereConstruct
   ! CHECK: WhereStmt
@@ -118,14 +118,14 @@ subroutine incr(i)
       ! CHECK: AssignmentStmt
       y = y/2.
     end where
-    ! CHECK: <<EndWhereConstruct>>
+    ! CHECK: <<End WhereConstruct>>
   ! CHECK: ElsewhereStmt
   elsewhere
     ! CHECK: AssignmentStmt
     x = x + 1.
   ! CHECK: EndWhereStmt
   end where
-  ! CHECK: <<EndWhereConstruct>>
+  ! CHECK: <<End WhereConstruct>>
 
   ! CHECK-NOT: ForAllConstruct
   ! CHECK: ForallStmt
@@ -138,7 +138,7 @@ subroutine incr(i)
     x(i) = x(i) + y(10*i)
   ! CHECK: EndForallStmt
   end forall
-  ! CHECK: <<EndForallConstruct>>
+  ! CHECK: <<End ForallConstruct>>
 
   ! CHECK: DeallocateStmt
   deallocate(x)
@@ -157,7 +157,7 @@ module test
   function foo(x)
     real x(..)
     integer :: foo
-    ! CHECK: <<SelectRankConstruct>>
+    ! CHECK: <<SelectRankConstruct!>>
     ! CHECK: SelectRankStmt
     select rank(x)
       ! CHECK: SelectRankCaseStmt
@@ -178,13 +178,13 @@ function foo(x)
         foo = 2
     ! CHECK: EndSelectStmt
     end select
-    ! CHECK: <<EndSelectRankConstruct>>
+    ! CHECK: <<End SelectRankConstruct!>>
   end function
 
   ! CHECK: Function bar
   function bar(x)
     class(*) :: x
-    ! CHECK: <<SelectTypeConstruct>>
+    ! CHECK: <<SelectTypeConstruct!>>
     ! CHECK: SelectTypeStmt
     select type(x)
       ! CHECK: TypeGuardStmt
@@ -203,7 +203,7 @@ function bar(x)
         bar = -1
     ! CHECK: EndSelectStmt
     end select
-    ! CHECK: <<EndSelectTypeConstruct>>
+    ! CHECK: <<End SelectTypeConstruct!>>
   end function
 
   ! CHECK: Subroutine sub
@@ -219,7 +219,7 @@ subroutine sub(a)
 
 ! CHECK: Subroutine altreturn
 subroutine altreturn(i, j, *, *)
-  ! CHECK: <<IfConstruct>>
+  ! CHECK: <<IfConstruct!>>
   if (i>j) then
     ! CHECK: ReturnStmt
     return 1
@@ -227,7 +227,7 @@ subroutine altreturn(i, j, *, *)
     ! CHECK: ReturnStmt
     return 2
   end if
-  ! CHECK: <<EndIfConstruct>>
+  ! CHECK: <<End IfConstruct!>>
 end subroutine
 
 
@@ -246,7 +246,7 @@ subroutine iostmts(filename, a, b, c)
     ! CHECK: OpenStmt
     open(10, FILE=filename)
   end if
-  ! CHECK: <<EndIfConstruct>>
+  ! CHECK: <<End IfConstruct>>
   ! CHECK: ReadStmt
   read(10, *) length
   ! CHECK: RewindStmt
@@ -297,18 +297,18 @@ subroutine sub2()
 5 j = j + 1
 6 i = i + j/2
 
-  ! CHECK: <<DoConstruct>>
+  ! CHECK: <<DoConstruct!>>
   do1: do k=1,10
-    ! CHECK: <<DoConstruct>>
+    ! CHECK: <<DoConstruct!>>
     do2: do l=5,20
       ! CHECK: CycleStmt
       cycle do1
       ! CHECK: ExitStmt
       exit do2
     end do do2
-    ! CHECK: <<EndDoConstruct>>
+    ! CHECK: <<End DoConstruct!>>
   end do do1
-  ! CHECK: <<EndDoConstruct>>
+  ! CHECK: <<End DoConstruct!>>
 
   ! CHECK: PauseStmt
   pause 7
diff --git a/flang/test/Lower/pre-fir-tree03.f90 b/flang/test/Lower/pre-fir-tree03.f90
index 2eedfe7610ce2..1c8651b64f830 100644
--- a/flang/test/Lower/pre-fir-tree03.f90
+++ b/flang/test/Lower/pre-fir-tree03.f90
@@ -20,10 +20,10 @@ program test_omp
       print *, "in omp do"
     ! CHECK: EndDoStmt
     end do
-    ! CHECK: <<EndDoConstruct>>
+    ! CHECK: <<End DoConstruct>>
     ! CHECK: OmpEndLoopDirective
     !$omp end do
-    ! CHECK: <<EndOpenMPConstruct>>
+    ! CHECK: <<End OpenMPConstruct>>
 
     ! CHECK: PrintStmt
     print *, "not in omp do"
@@ -37,13 +37,13 @@ program test_omp
       print *, "in omp do"
     ! CHECK: EndDoStmt
     end do
-    ! CHECK: <<EndDoConstruct>>
-    ! CHECK: <<EndOpenMPConstruct>>
+    ! CHECK: <<End DoConstruct>>
+    ! CHECK: <<End OpenMPConstruct>>
     ! CHECK-NOT: OmpEndLoopDirective
     ! CHECK: PrintStmt
     print *, "no in omp do"
   !$omp end parallel
-    ! CHECK: <<EndOpenMPConstruct>>
+    ! CHECK: <<End OpenMPConstruct>>
 
   ! CHECK: PrintStmt
   print *, "sequential again"
@@ -53,7 +53,7 @@ program test_omp
     ! CHECK: PrintStmt
     print *, "in task"
   !$omp end task
-  ! CHECK: <<EndOpenMPConstruct>>
+  ! CHECK: <<End OpenMPConstruct>>
 
   ! CHECK: PrintStmt
   print *, "sequential again"
diff --git a/flang/test/Lower/pre-fir-tree04.f90 b/flang/test/Lower/pre-fir-tree04.f90
index 8e39d72557507..34212fbb1ff01 100644
--- a/flang/test/Lower/pre-fir-tree04.f90
+++ b/flang/test/Lower/pre-fir-tree04.f90
@@ -16,7 +16,7 @@ Subroutine test_coarray
     ! CHECK: AssignmentStmt
     x = x[4, 1]
   end team
-  ! CHECK: <<EndChangeTeamConstruct>>
+  ! CHECK: <<End ChangeTeamConstruct>>
   ! CHECK: FormTeamStmt
   form team(1, t)
 
@@ -28,14 +28,14 @@ Subroutine test_coarray
     ! CHECK: EventWaitStmt
     event wait (done)
   end if
-  ! CHECK: <<EndIfConstruct>>
+  ! CHECK: <<End IfConstruct>>
 
   ! CHECK: <<CriticalConstruct>>
   critical
     ! CHECK: AssignmentStmt
     counter[1] = counter[1] + 1
   end critical
-  ! CHECK: <<EndCriticalConstruct>>
+  ! CHECK: <<End CriticalConstruct>>
 
   ! CHECK: LockStmt
   lock(alock)
@@ -59,12 +59,12 @@ Subroutine test_coarray
     ! CHECK: SyncImagesStmt
     sync images(1)
   end if
-  ! CHECK: <<EndIfConstruct>>
+  ! CHECK: <<End IfConstruct>>
 
   ! CHECK: <<IfConstruct>>
   if (y<0.) then
     ! CHECK: FailImageStmt
    fail image
   end if
-  ! CHECK: <<EndIfConstruct>>
+  ! CHECK: <<End IfConstruct>>
 end
diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp
index 5538c9fc3e9ae..26682eaa64897 100644
--- a/flang/tools/f18/f18.cpp
+++ b/flang/tools/f18/f18.cpp
@@ -315,8 +315,7 @@ std::string CompileFortran(std::string path, Fortran::parser::Options options,
     return {};
   }
   if (driver.dumpPreFirTree) {
-    if (auto ast{Fortran::lower::createPFT(parseTree)}) {
-      Fortran::lower::annotateControl(*ast);
+    if (auto ast{Fortran::lower::createPFT(parseTree, semanticsContext)}) {
       Fortran::lower::dumpPFT(llvm::outs(), *ast);
     } else {
       llvm::errs() << "Pre FIR Tree is NULL.\n";

From 8f8029b4587e120e4cceceebf6d350f881f5d827 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 15:24:15 -0700
Subject: [PATCH 587/770] [SVE] Eliminate calls to default-false
 VectorType::get() from InstCombine

Reviewers: efriedma, david-arm, fpetrogalli, spatel

Reviewed By: david-arm

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80334
---
 .../InstCombine/InstCombineCalls.cpp          |  9 ++++----
 .../InstCombine/InstCombineCasts.cpp          | 21 ++++++++++++-------
 .../InstCombine/InstCombineCompares.cpp       |  6 +++---
 .../InstCombineSimplifyDemanded.cpp           |  3 ++-
 .../InstCombine/InstCombineVectorOps.cpp      |  6 +++---
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a3d5215fad4f8..e628a2277a214 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -839,7 +839,7 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
       Index /= 8;
 
       Type *IntTy8 = Type::getInt8Ty(II.getContext());
-      VectorType *ShufTy = VectorType::get(IntTy8, 16);
+      auto *ShufTy = FixedVectorType::get(IntTy8, 16);
 
       SmallVector<int, 16> ShuffleMask;
       for (int i = 0; i != (int)Length; ++i)
@@ -916,7 +916,7 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
     Index /= 8;
 
     Type *IntTy8 = Type::getInt8Ty(II.getContext());
-    VectorType *ShufTy = VectorType::get(IntTy8, 16);
+    auto *ShufTy = FixedVectorType::get(IntTy8, 16);
 
     SmallVector<int, 16> ShuffleMask;
     for (int i = 0; i != (int)Index; ++i)
@@ -2849,8 +2849,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
         // We don't need a select if we know the mask bit is a 1.
         if (!C || !C->getValue()[0]) {
           // Cast the mask to an i1 vector and then extract the lowest element.
-          auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
-                             cast<IntegerType>(Mask->getType())->getBitWidth());
+          auto *MaskTy = FixedVectorType::get(
+              Builder.getInt1Ty(),
+              cast<IntegerType>(Mask->getType())->getBitWidth());
           Mask = Builder.CreateBitCast(Mask, MaskTy);
           Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
           // Extract the lowest element from the passthru operand.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index a2b75848ea028..cc008d3337ad4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -483,7 +483,7 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
   // bitcast it to a vector type that we can extract from.
   unsigned NumVecElts = VecWidth / DestWidth;
   if (VecType->getElementType() != DestType) {
-    VecType = VectorType::get(DestType, NumVecElts);
+    VecType = FixedVectorType::get(DestType, NumVecElts);
     VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
   }
 
@@ -870,7 +870,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
       assert(BitCastNumElts <= std::numeric_limits<uint32_t>::max() &&
              "overflow 32-bits");
 
-      Type *BitCastTo = VectorType::get(DestTy, BitCastNumElts);
+      auto *BitCastTo = FixedVectorType::get(DestTy, BitCastNumElts);
       Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
       return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
     }
@@ -1536,7 +1536,7 @@ static Type *shrinkFPConstantVector(Value *V) {
   }
 
   // Make a vector type from the minimal type.
-  return VectorType::get(MinType, NumElts);
+  return FixedVectorType::get(MinType, NumElts);
 }
 
 /// Find the minimum FP type we can safely truncate to.
@@ -1921,8 +1921,11 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
     return commonPointerCastTransforms(CI);
 
   Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS);
-  if (auto *VTy = dyn_cast<VectorType>(Ty)) // Handle vectors of pointers.
-    PtrTy = VectorType::get(PtrTy, VTy->getNumElements());
+  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+    // Handle vectors of pointers.
+    // FIXME: what should happen for scalable vectors?
+    PtrTy = FixedVectorType::get(PtrTy, VTy->getNumElements());
+  }
 
   Value *P = Builder.CreatePtrToInt(CI.getOperand(0), PtrTy);
   return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
@@ -1961,7 +1964,8 @@ static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal,
         DestTy->getElementType()->getPrimitiveSizeInBits())
       return nullptr;
 
-    SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
+    SrcTy =
+        FixedVectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
     InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
   }
 
@@ -2187,7 +2191,7 @@ static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
     return nullptr;
 
   unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements();
-  auto *NewVecType = VectorType::get(DestType, NumElts);
+  auto *NewVecType = FixedVectorType::get(DestType, NumElts);
   auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
                                          NewVecType, "bc");
   return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
@@ -2658,7 +2662,8 @@ Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
     Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
     if (VectorType *VT = dyn_cast<VectorType>(CI.getType())) {
       // Handle vectors of pointers.
-      MidTy = VectorType::get(MidTy, VT->getNumElements());
+      // FIXME: what should happen for scalable vectors?
+      MidTy = FixedVectorType::get(MidTy, VT->getNumElements());
     }
 
     Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8f50358d1d3d5..48375a1a323f4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1862,7 +1862,7 @@ Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
     if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
       Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
       if (auto *AndVTy = dyn_cast<VectorType>(And->getType()))
-        NTy = VectorType::get(NTy, AndVTy->getNumElements());
+        NTy = FixedVectorType::get(NTy, AndVTy->getNumElements());
       Value *Trunc = Builder.CreateTrunc(X, NTy);
       auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
                                                             : CmpInst::ICMP_SLT;
@@ -2152,7 +2152,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
       DL.isLegalInteger(TypeBits - Amt)) {
     Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
     if (auto *ShVTy = dyn_cast<VectorType>(ShType))
-      TruncTy = VectorType::get(TruncTy, ShVTy->getNumElements());
+      TruncTy = FixedVectorType::get(TruncTy, ShVTy->getNumElements());
     Constant *NewC =
         ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
     return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
@@ -2785,7 +2785,7 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
 
           Type *NewType = Builder.getIntNTy(XType->getScalarSizeInBits());
           if (auto *XVTy = dyn_cast<VectorType>(XType))
-            NewType = VectorType::get(NewType, XVTy->getNumElements());
+            NewType = FixedVectorType::get(NewType, XVTy->getNumElements());
           Value *NewBitcast = Builder.CreateBitCast(X, NewType);
           if (TrueIfSigned)
             return new ICmpInst(ICmpInst::ICMP_SLT, NewBitcast,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index b9ee985402c86..2a36a7651c8e0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1144,7 +1144,8 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
 
   Module *M = II->getParent()->getParent()->getParent();
   Type *EltTy = IIVTy->getElementType();
-  Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
+  Type *NewTy =
+      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
 
   OverloadTys[0] = NewTy;
   Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 430f2f4de3ac2..ff70347569abc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1336,10 +1336,10 @@ static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
   Type *EltTy = V->getType()->getScalarType();
   Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
   if (isa<UndefValue>(V))
-    return UndefValue::get(VectorType::get(EltTy, Mask.size()));
+    return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
 
   if (isa<ConstantAggregateZero>(V))
-    return ConstantAggregateZero::get(VectorType::get(EltTy, Mask.size()));
+    return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
 
   if (Constant *C = dyn_cast<Constant>(V))
     return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
@@ -2131,7 +2131,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
         continue;
       if (!VectorType::isValidElementType(TgtTy))
         continue;
-      VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
+      auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
       if (!BegIsAligned) {
         // Shuffle the input so [0,NumElements) contains the output, and
         // [NumElems,SrcNumElems) is undef.

From 91beb5176b4d5a7cc09c419b9d75cb19f67d0bf9 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Fri, 29 May 2020 18:07:39 -0400
Subject: [PATCH 588/770] [mlir] NFC - Add debug information for Linalg
 transformations.

Address post-commit review of https://reviews.llvm.org/D79518
---
 .../Dialect/Linalg/Transforms/Transforms.h    |  2 +-
 mlir/include/mlir/IR/PatternMatch.h           |  2 +-
 .../Dialect/Linalg/Transforms/Transforms.cpp  | 32 ++++++++++++++-----
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 2e0673795f305..2e6a859260795 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -521,7 +521,7 @@ struct LinalgCopyVTWForwardingPattern
 LogicalResult applyStagedPatterns(
     Operation *op, ArrayRef<OwningRewritePatternList> stage1Patterns,
     const OwningRewritePatternList &stage2Patterns,
-    llvm::function_ref<LogicalResult(Operation *)> stage3Lambda = nullptr);
+    function_ref<LogicalResult(Operation *)> stage3Lambda = nullptr);
 } // namespace linalg
 } // namespace mlir
 
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 6b124e0ecdfaf..8178f71ec43d9 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -394,7 +394,7 @@ class OwningRewritePatternList {
   /// type `T`.
   template <typename T>
   OwningRewritePatternList(T &&t) {
-    patterns.emplace_back(std::make_unique<T>(t));
+    patterns.emplace_back(std::make_unique<T>(std::forward<T>(t)));
   }
 
   PatternListT::iterator begin() { return patterns.begin(); }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 527d162298bf4..76e118e482f00 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -37,6 +37,8 @@ using namespace mlir::linalg;
 
 using llvm::dbgs;
 
+#define DEBUG_TYPE "linalg-transforms"
+
 //===----------------------------------------------------------------------===//
 // Transformations exposed as rewrite patterns.
 //===----------------------------------------------------------------------===//
@@ -45,13 +47,13 @@ const StringLiteral mlir::linalg::LinalgTransforms::kLinalgTransformMarker =
     "__internal_linalg_transform__";
 
 mlir::linalg::LinalgMarker::LinalgMarker(ArrayRef<StringRef> matchDisjunction,
-                                         llvm::Optional<StringRef> replacement)
+                                         Optional<StringRef> replacement)
     : matchDisjunction(matchDisjunction.begin(), matchDisjunction.end()),
       replacement(replacement) {}
 
 mlir::linalg::LinalgMarker::LinalgMarker(ArrayRef<StringRef> matchDisjunction,
                                          StringRef replacement)
-    : LinalgMarker(matchDisjunction, llvm::Optional<StringRef>{replacement}) {}
+    : LinalgMarker(matchDisjunction, Optional<StringRef>{replacement}) {}
 
 LogicalResult
 mlir::linalg::LinalgMarker::checkAndNotify(PatternRewriter &rewriter,
@@ -72,7 +74,7 @@ mlir::linalg::LinalgMarker::checkAndNotify(PatternRewriter &rewriter,
     // 3. Has no marker but was expecting a marker.
     return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
       diag << " does not have any marker from list: ";
-      llvm::interleaveComma(matchDisjunction, diag);
+      interleaveComma(matchDisjunction, diag);
     });
   }
 
@@ -84,7 +86,7 @@ mlir::linalg::LinalgMarker::checkAndNotify(PatternRewriter &rewriter,
   // 5. Fail to match.
   return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
     diag << " does not have any marker from list: ";
-    llvm::interleaveComma(matchDisjunction, diag);
+    interleaveComma(matchDisjunction, diag);
   });
 }
 
@@ -105,7 +107,7 @@ mlir::linalg::LinalgTilingOptions::setTileSizes(ArrayRef<int64_t> ts) {
     OpBuilder::InsertionGuard guard(b);
     b.setInsertionPointToStart(
         &op->getParentOfType<FuncOp>().getBody().front());
-    return llvm::to_vector<4>(llvm::map_range(tileSizes, [&](int64_t s) {
+    return llvm::to_vector<4>(map_range(tileSizes, [&](int64_t s) {
       Value v = b.create<ConstantIndexOp>(op->getLoc(), s);
       return v;
     }));
@@ -217,19 +219,33 @@ LogicalResult mlir::linalg::LinalgBaseVectorizationPattern::matchAndRewrite(
 LogicalResult mlir::linalg::applyStagedPatterns(
     Operation *op, ArrayRef<OwningRewritePatternList> stage1Patterns,
     const OwningRewritePatternList &stage2Patterns,
-    llvm::function_ref<LogicalResult(Operation *)> stage3Lambda) {
+    function_ref<LogicalResult(Operation *)> stage3Lambda) {
+  unsigned iteration = 0;
+  (void)iteration;
+  StringRef dbgPref = "\n[" DEBUG_TYPE "]: ";
+  (void)dbgPref;
   for (const auto &patterns : stage1Patterns) {
     if (!applyPatternsAndFoldGreedily(op, patterns)) {
-      llvm::dbgs() << "Underlying first stage rewrite did not converge";
+      dbgs() << "Underlying first stage rewrite did not converge";
       return failure();
     }
+    LLVM_DEBUG(dbgs()
+               << dbgPref << "After 1st stage, iter: " << ++iteration << "\n"
+               << *op);
     if (!applyPatternsAndFoldGreedily(op, stage2Patterns)) {
-      llvm::dbgs() << "Underlying second stage rewrite did not converge";
+      LLVM_DEBUG(dbgs()
+                 << dbgPref << "Underlying 2nd stage rewrite did not converge");
       return failure();
     }
+    LLVM_DEBUG(dbgs()
+               << dbgPref << "After 2nd stage, iter : " << iteration << "\n"
+               << *op);
     if (stage3Lambda) {
       if (failed(stage3Lambda(op)))
         return failure();
+      LLVM_DEBUG(dbgs()
+                 << dbgPref << "After 3rd stage, iter : " << iteration << "\n"
+                 << *op);
     }
   }
   return success();

From a45688a72c63b6359df7c23f2a2895f271d414e9 Mon Sep 17 00:00:00 2001
From: Valery N Dmitriev <valery.n.dmitriev@intel.com>
Date: Thu, 28 May 2020 18:24:03 -0700
Subject: [PATCH 589/770] [SLP] Apply external to vectorizable tree users cost
 adjustment for relevant aggregate build instructions only (UserCost). Users
 are detected with findBuildAggregate routine and the trick is that following
 SLP vectorization may end up vectorizing entire list with smaller chunks.
 Cost adjustment then is applied for individual chunks and these adjustments
 obviously have to be smaller than the entire aggregate build cost.

Differential Revision: https://reviews.llvm.org/D80773
---
 .../llvm/Transforms/Vectorize/SLPVectorizer.h |  10 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 122 +++++++++++-------
 .../SLPVectorizer/AArch64/transpose.ll        |  45 +++----
 .../SLPVectorizer/X86/alternate-fp.ll         |  46 +++++--
 .../SLPVectorizer/X86/alternate-int.ll        |  43 +++++-
 .../test/Transforms/SLPVectorizer/X86/hadd.ll |  23 +++-
 .../test/Transforms/SLPVectorizer/X86/hsub.ll |  23 +++-
 .../SLPVectorizer/X86/load-merge.ll           |  14 +-
 .../SLPVectorizer/X86/vec_list_bias.ll        |  35 ++---
 9 files changed, 221 insertions(+), 140 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 8a01114745f3a..77236dec75dc2 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -93,11 +93,15 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
 
   /// Try to vectorize a list of operands.
-  /// \param UserCost Cost of the user operations of \p VL if they may affect
-  /// the cost of the vectorization.
+  /// When \p InsertUses is provided and all of its entries are non-null,
+  /// the users of \p VL are known to be InsertElement instructions, each
+  /// associated with the \p VL entry at the same index. Their cost is then
+  /// used to adjust the cost of the vectorization, assuming the instcombine
+  /// pass later optimizes away the ExtractElement-InsertElement sequences.
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
-                          int UserCost = 0, bool AllowReorder = false);
+                          bool AllowReorder = false,
+                          ArrayRef<Value *> InsertUses = None);
 
   /// Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4c18fab4ec098..218b7fc83e429 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5929,12 +5929,13 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   if (!A || !B)
     return false;
-  Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
+  Value *VL[] = {A, B};
+  return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
 }
 
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           int UserCost, bool AllowReorder) {
+                                           bool AllowReorder,
+                                           ArrayRef<Value *> InsertUses) {
   if (VL.size() < 2)
     return false;
 
@@ -5983,6 +5984,13 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   bool CandidateFound = false;
   int MinCost = SLPCostThreshold;
 
+  bool CompensateUseCost =
+      !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+        return V && isa<InsertElementInst>(V);
+      });
+  assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+         "Each scalar expected to have an associated InsertElement user.");
+
   unsigned NextInst = 0, MaxInst = VL.size();
   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
     // No actual vectorization should happen, if number of parts is the same as
@@ -6030,8 +6038,48 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
         continue;
 
       R.computeMinimumValueSizes();
-      int Cost = R.getTreeCost() - UserCost;
+      int Cost = R.getTreeCost();
       CandidateFound = true;
+      if (CompensateUseCost) {
+        // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
+        // rather than sum of single inserts as the latter may overestimate
+        // cost. This work should imply improving cost estimation for extracts
+        // that are added for external (to the vectorization tree) users,
+        // i.e. that part should also switch to the same interface.
+        // For example, the following case is projected code after SLP:
+        //  %4 = extractelement <4 x i64> %3, i32 0
+        //  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+        //  %5 = extractelement <4 x i64> %3, i32 1
+        //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+        //  %6 = extractelement <4 x i64> %3, i32 2
+        //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+        //  %7 = extractelement <4 x i64> %3, i32 3
+        //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+        //
+        // Extracts here are added by SLP in order to feed users (the inserts)
+        // of original scalars and contribute to "ExtractCost" at cost
+        // evaluation. The inserts in turn form a sequence to build an
+        // aggregate that is detected by the findBuildAggregate routine.
+        // SLP makes an assumption that such sequence will be optimized away
+        // later (instcombine) so it tries to compensate ExtractCost with
+        // cost of insert sequence.
+        // Current per element cost calculation approach is not quite accurate
+        // and tends to create bias toward favoring vectorization.
+        // Switching to the TTI interface might help a bit.
+        // Alternative solution could be pattern-match to detect a no-op or
+        // shuffle.
+        unsigned UserCost = 0;
+        for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+          auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+          if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+            UserCost += TTI->getVectorInstrCost(
+                Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+        }
+        LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+                          << ".\n");
+        Cost -= UserCost;
+      }
+
       MinCost = std::min(MinCost, Cost);
 
       if (Cost < -SLPCostThreshold) {
@@ -7047,48 +7095,16 @@ class HorizontalReduction {
 /// \return true if it matches.
 static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
                                SmallVectorImpl<Value *> &BuildVectorOpds,
-                               int &UserCost) {
+                               SmallVectorImpl<Value *> &InsertElts) {
   assert((isa<InsertElementInst>(LastInsertInst) ||
           isa<InsertValueInst>(LastInsertInst)) &&
          "Expected insertelement or insertvalue instruction!");
-  UserCost = 0;
   do {
-    // TODO: Use TTI's getScalarizationOverhead for sequence of inserts rather
-    // than sum of single inserts as the latter may overestimate cost.
-    // This work should imply improving cost estimation for extracts that
-    // added in for external (for vectorization tree) users.
-    // For example, in following case all extracts added in order to feed
-    // into external users (inserts), which in turn form sequence to build
-    // an aggregate that we do match here:
-    //  %4 = extractelement <4 x i64> %3, i32 0
-    //  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
-    //  %5 = extractelement <4 x i64> %3, i32 1
-    //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
-    //  %6 = extractelement <4 x i64> %3, i32 2
-    //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
-    //  %7 = extractelement <4 x i64> %3, i32 3
-    //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
-    //
-    // Cost of this entire sequence is currently estimated as sum of single
-    // extracts (as this aggregate build sequence is an external to
-    // vectorization tree user) minus cost of the aggregate build.
-    // As this whole sequence will be optimized away we want the cost to be
-    // zero. But it is not quite possible using given approach (at least for
-    // X86) because inserts can be more expensive than extracts for longer
-    // vector lengths so the difference turns out not zero in such a case.
-    // Ideally we want to match this entire sequence and treat it as a no-op
-    // (i.e. do not count into final cost at all).
-    // Currently the difference tends to be negative thus adding a bias
-    // toward favoring vectorization. If we switch into using TTI interface
-    // the bias tendency will remain but will be lower.
     Value *InsertedOperand;
-    if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+    auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
+    if (IE) {
       InsertedOperand = IE->getOperand(1);
       LastInsertInst = IE->getOperand(0);
-      if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
-        UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
-                                            IE->getType(), CI->getZExtValue());
-      }
     } else {
       auto *IV = cast<InsertValueInst>(LastInsertInst);
       InsertedOperand = IV->getInsertedValueOperand();
@@ -7096,16 +7112,17 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
     }
     if (isa<InsertElementInst>(InsertedOperand) ||
         isa<InsertValueInst>(InsertedOperand)) {
-      int TmpUserCost;
       SmallVector<Value *, 8> TmpBuildVectorOpds;
+      SmallVector<Value *, 8> TmpInsertElts;
       if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
-                              TmpUserCost))
+                              TmpInsertElts))
         return false;
       BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
                              TmpBuildVectorOpds.rend());
-      UserCost += TmpUserCost;
+      InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
     } else {
       BuildVectorOpds.push_back(InsertedOperand);
+      InsertElts.push_back(IE);
     }
     if (isa<UndefValue>(LastInsertInst))
       break;
@@ -7115,6 +7132,7 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
       return false;
   } while (true);
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+  std::reverse(InsertElts.begin(), InsertElts.end());
   return true;
 }
 
@@ -7279,26 +7297,29 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
 
 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                  BasicBlock *BB, BoUpSLP &R) {
-  int UserCost = 0;
   const DataLayout &DL = BB->getModule()->getDataLayout();
   if (!R.canMapToVector(IVI->getType(), DL))
     return false;
 
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
+  SmallVector<Value *, 16> BuildVectorInsts;
+  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+      BuildVectorOpds.size() < 2)
     return false;
 
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in vector register, we need to
   // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+                            BuildVectorInsts);
 }
 
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
-  int UserCost;
+  SmallVector<Value *, 16> BuildVectorInsts;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) ||
+  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+      BuildVectorOpds.size() < 2 ||
       (llvm::all_of(BuildVectorOpds,
                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
        isShuffle(BuildVectorOpds)))
@@ -7306,7 +7327,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
 
   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+                            BuildVectorInsts);
 }
 
 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -7384,8 +7406,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       // is done when there are exactly two elements since tryToVectorizeList
       // asserts that there are only two values when AllowReorder is true.
       bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            /*UserCost=*/0, AllowReorder)) {
+      if (NumElts > 1 &&
+          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 707d2a216a1a5..6d2c2f410ade1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -153,15 +153,12 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
 ; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
+; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
 ; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
 ; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
@@ -187,21 +184,25 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
index de7c59286ac38..e1cad3e14014f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -56,20 +56,38 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-NEXT:    ret <8 x float> [[R7]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> <i32 4, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 9, i32 10, i32 undef>
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3
+; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4
+; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], [[B0]]
+; SLM-NEXT:    [[AB1:%.*]] = fdiv float [[A1]], [[B1]]
+; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], [[B2]]
+; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], [[B3]]
+; SLM-NEXT:    [[AB4:%.*]] = fmul float [[A4]], [[B4]]
+; SLM-NEXT:    [[AB5:%.*]] = fdiv float [[A5]], [[B5]]
+; SLM-NEXT:    [[AB6:%.*]] = fdiv float [[A6]], [[B6]]
+; SLM-NEXT:    [[AB7:%.*]] = fmul float [[A7]], [[B7]]
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x float> [[R7]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v8f32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 085b7137f2b2c..0324f35330c18 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -101,11 +101,44 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @ashr_shl_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; SSE-LABEL: @ashr_shl_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_shl_v8i32(
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; AVX1-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
+; AVX1-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_shl_v8i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index ae79a7eb0e808..de686fe0e18cf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -179,13 +179,22 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
 ; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index c1c2ac544b156..c71e0da973369 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -179,13 +179,22 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
 ; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 524a1c51389ed..50eeead886fea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -111,15 +111,13 @@ define <4 x float> @PR16739_byval(<4 x float>* nocapture readonly dereferenceabl
 ; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
 ; CHECK-NEXT:    [[T3:%.*]] = bitcast float* [[T2]] to i64*
 ; CHECK-NEXT:    [[T4:%.*]] = load i64, i64* [[T3]], align 8
+; CHECK-NEXT:    [[T5:%.*]] = trunc i64 [[T1]] to i32
+; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
+; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[T1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T8]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
-; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP6]], i32 1
+; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
+; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
 ; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
 ; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
 ; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
index 9ceea2b81ac9f..532bf693c6316 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -6,8 +6,6 @@
 ; Vectorization triggered by cost bias caused by subtracting
 ; the cost of entire "aggregate build" sequence while
 ; building vectorizable tree from only a portion of it.
-; FIXME: this is unprofitable to vectorize.
-
 
 define void @test(i32* nocapture %t2) {
 ; CHECK-LABEL: @test(
@@ -29,9 +27,11 @@ define void @test(i32* nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
+; CHECK-NEXT:    [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
 ; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -41,29 +41,16 @@ define void @test(i32* nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 6270, i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T47]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
-; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
-; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
-; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[TMP15]], i32 3
-; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP12]], i32 4
-; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
+; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT:    [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
+; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0
+; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
+; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
+; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
+; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP15]], i32 7
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4

From 56eb7556e75ca022bfa9b4c6b60a9571b41e2447 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 15:32:36 -0700
Subject: [PATCH 590/770] [SVE] Eliminate calls to default-false
 VectorType::get() from AArch64

Reviewers: efriedma, c-rhodes, david-arm, mcrosier, t.p.northover

Reviewed By: efriedma

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80327
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 19 ++++++++++---------
 .../Target/AArch64/AArch64StackTagging.cpp    |  4 ++--
 .../AArch64/AArch64TargetTransformInfo.cpp    |  6 +++---
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index aece1d0da59ab..dfa4b493c2216 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9571,7 +9571,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   // load integer vectors first and then convert to pointer vectors.
   Type *EltTy = VecTy->getElementType();
   if (EltTy->isPointerTy())
-    VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getNumElements());
+    VecTy =
+        FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy->getNumElements());
 
   IRBuilder<> Builder(LI);
 
@@ -9581,8 +9582,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   if (NumLoads > 1) {
     // If we're going to generate more than one load, reset the sub-vector type
     // to something legal.
-    VecTy = VectorType::get(VecTy->getElementType(),
-                            VecTy->getNumElements() / NumLoads);
+    VecTy = FixedVectorType::get(VecTy->getElementType(),
+                                 VecTy->getNumElements() / NumLoads);
 
     // We will compute the pointer operand of each load from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
@@ -9626,8 +9627,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
       // Convert the integer vector to pointer vector if the element is pointer.
       if (EltTy->isPointerTy())
         SubVec = Builder.CreateIntToPtr(
-            SubVec, VectorType::get(SVI->getType()->getElementType(),
-                                    VecTy->getNumElements()));
+            SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
+                                         VecTy->getNumElements()));
       SubVecs[SVI].push_back(SubVec);
     }
   }
@@ -9683,7 +9684,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 
   unsigned LaneLen = VecTy->getNumElements() / Factor;
   Type *EltTy = VecTy->getElementType();
-  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
 
@@ -9706,11 +9707,11 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
     unsigned NumOpElts = cast<VectorType>(Op0->getType())->getNumElements();
 
     // Convert to the corresponding integer vector.
-    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+    auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
 
-    SubVecTy = VectorType::get(IntTy, LaneLen);
+    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
   }
 
   // The base address of the store.
@@ -9720,7 +9721,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
     // If we're going to generate more than one store, reset the lane length
     // and sub-vector type to something legal.
     LaneLen /= NumStores;
-    SubVecTy = VectorType::get(SubVecTy->getElementType(), LaneLen);
+    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
 
     // We will compute the pointer operand of each store from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 42f6bfb1940e2..3339efda7d7c3 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -256,8 +256,8 @@ class InitializerBuilder {
       Type *EltTy = VecTy->getElementType();
       if (EltTy->isPointerTy()) {
         uint32_t EltSize = DL->getTypeSizeInBits(EltTy);
-        Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize),
-                                      VecTy->getNumElements());
+        auto *NewTy = FixedVectorType::get(IntegerType::get(Ctx, EltSize),
+                                           VecTy->getNumElements());
         V = IRB.CreatePointerCast(V, NewTy);
       }
     }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f0961646c31ff..ebe126e2b8632 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -211,8 +211,8 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
   // A helper that returns a vector type from the given type. The number of
   // elements in type Ty determine the vector width.
   auto toVectorTy = [&](Type *ArgTy) {
-    return VectorType::get(ArgTy->getScalarType(),
-                           cast<VectorType>(DstTy)->getNumElements());
+    return FixedVectorType::get(ArgTy->getScalarType(),
+                                cast<VectorType>(DstTy)->getNumElements());
   };
 
   // Exit early if DstTy is not a vector type whose elements are at least
@@ -254,7 +254,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
 
   // Legalize the source type and ensure it can be used in a widening
   // operation.
-  Type *SrcTy = toVectorTy(Extend->getSrcTy());
+  auto *SrcTy = toVectorTy(Extend->getSrcTy());
   auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())

From e6cf402e8364a7a7e483a60aa1fca1213f4fb475 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 15:41:06 -0700
Subject: [PATCH 591/770] [SVE] Eliminate calls to default-false
 VectorType::get() from AggressiveInstCombine

Reviewers: efriedma, aymanmus, c-rhodes, david-arm

Reviewed By: david-arm

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80332
---
 .../Transforms/AggressiveInstCombine/TruncInstCombine.cpp   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index e41b2856c8d87..1f0989d230f88 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -281,8 +281,10 @@ Type *TruncInstCombine::getBestTruncatedType() {
 /// version of \p Ty, otherwise return \p Ty.
 static Type *getReducedType(Value *V, Type *Ty) {
   assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
-  if (auto *VTy = dyn_cast<VectorType>(V->getType()))
-    return VectorType::get(Ty, VTy->getNumElements());
+  if (auto *VTy = dyn_cast<VectorType>(V->getType())) {
+    // FIXME: should this handle scalable vectors?
+    return FixedVectorType::get(Ty, VTy->getNumElements());
+  }
   return Ty;
 }
 

From 7fc225f8c4596d4e51d64e814b21cc49c5e8566b Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Fri, 29 May 2020 17:29:35 -0400
Subject: [PATCH 592/770] [mlir] Fix Windows build

Summary:
MSVC does not seem to like certain forward declarations.

https://reviews.llvm.org/D80728 introduces an error in
seemingly unrelated .cpp files that include the .h
(but do not otherwise use the class that depends on the forward declaration).

Instead of forward declaration, include the full vector ops definition.

Subscribers: mehdi_amini, rriddle, jpienaar, shauheen, antiagainst, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, stephenneuendorffer, Joonsoo, grosul1, frgossen, Kayjukh, jurahul, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80841
---
 mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 2e6a859260795..fb8fc4cbe949d 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -10,17 +10,11 @@
 #define DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H_
 
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/IR/PatternMatch.h"
 #include "llvm/ADT/SmallBitVector.h"
 
 namespace mlir {
-namespace vector {
-
-class TransferReadOp;
-class TransferWriteOp;
-
-} // namespace vector
-
 namespace linalg {
 
 struct LinalgTilingOptions;

From 4034d0ce207d0c545799ceae5aaf2e35d400407c Mon Sep 17 00:00:00 2001
From: Adrian Herrera <adrian.herrera02@gmail.com>
Date: Fri, 29 May 2020 15:59:58 -0700
Subject: [PATCH 593/770] [libFuzzer] Fixed description of fuzzer merge control
 file.

Summary:
The description of the fuzzer merge control file appears to be incorrect/out of date.
No "DONE" line appears in the control file. Rather, FT and COV are the markers that appear
following the STARTED line.

Reviewers: metzman, kcc

Reviewed By: kcc

Subscribers: #sanitizers

Tags: #sanitizers

Differential Revision: https://reviews.llvm.org/D80788
---
 compiler-rt/lib/fuzzer/FuzzerMerge.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/fuzzer/FuzzerMerge.h b/compiler-rt/lib/fuzzer/FuzzerMerge.h
index c14dd589e62dd..e0c6bc539bdb8 100644
--- a/compiler-rt/lib/fuzzer/FuzzerMerge.h
+++ b/compiler-rt/lib/fuzzer/FuzzerMerge.h
@@ -13,7 +13,7 @@
 //   The process should tolerate the crashes, OOMs, leaks, etc.
 //
 // Algorithm:
-//   The outter process collects the set of files and writes their names
+//   The outer process collects the set of files and writes their names
 //   into a temporary "control" file, then repeatedly launches the inner
 //   process until all inputs are processed.
 //   The outer process does not actually execute the target code.
@@ -22,13 +22,14 @@
 //   and b) the last processed input. Then it starts processing the inputs one
 //   by one. Before processing every input it writes one line to control file:
 //   STARTED INPUT_ID INPUT_SIZE
-//   After processing an input it write another line:
-//   DONE INPUT_ID Feature1 Feature2 Feature3 ...
+//   After processing an input it writes the following lines:
+//   FT INPUT_ID Feature1 Feature2 Feature3 ...
+//   COV INPUT_ID Coverage1 Coverage2 Coverage3 ...
 //   If a crash happens while processing an input the last line in the control
 //   file will be "STARTED INPUT_ID" and so the next process will know
 //   where to resume.
 //
-//   Once all inputs are processed by the innner process(es) the outer process
+//   Once all inputs are processed by the inner process(es) the outer process
 //   reads the control files and does the merge based entirely on the contents
 //   of control file.
 //   It uses a single pass greedy algorithm choosing first the smallest inputs

From 5a99ec10f5df7fa351e81b9bc90bf38e670653ae Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 15:52:33 -0700
Subject: [PATCH 594/770] [SVE] Eliminate calls to default-false
 VectorType::get() from X86

Reviewers: efriedma, sdesmalen, c-rhodes, craig.topper

Reviewed By: craig.topper

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80331
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  2 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 12 ++++---
 llvm/lib/Target/X86/X86InterleavedAccess.cpp  |  5 +--
 llvm/lib/Target/X86/X86PartialReduction.cpp   |  3 +-
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 35 ++++++++++---------
 5 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3d2cdccd50a59..0b114b34186d1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28632,7 +28632,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
 
   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
-                      : (Type *)VectorType::get(ArgTy, 4);
+                      : (Type *)FixedVectorType::get(ArgTy, 4);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 2632cb8a745ac..c8939e348a70c 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5999,14 +5999,18 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
       Ty = Type::getFP128Ty(MF.getFunction().getContext());
     else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
-      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16);
+      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+                                16);
     else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
              Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
-      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
+      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+                                8);
     else if (Opc == X86::MMX_SET0)
-      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2);
+      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+                                2);
     else
-      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);
+      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+                                4);
 
     bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
                       Opc == X86::AVX512_512_SETALLONES ||
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 72a37a9ddeb9e..00ac238f284bd 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -201,7 +201,7 @@ void X86InterleavedAccessGroup::decompose(
   // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
   unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
   if (VecLength == 768 || VecLength == 1536) {
-    VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
+    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
     VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
     NumLoads = NumSubVectors * (VecLength / 384);
@@ -768,7 +768,8 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
   // Lower the interleaved stores:
   //   1. Decompose the interleaved wide shuffle into individual shuffle
   //   vectors.
-  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
+  decompose(Shuffles[0], Factor,
+            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
             DecomposedVectors);
 
   //   2. Transpose the interleaved-vectors into vectors of contiguous
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index 4b3ba20444094..16108bd1928f6 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -372,7 +372,8 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
   }
 
   // Intrinsics produce vXi64 and need to be casted to vXi32.
-  Type *I32Ty = VectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+  auto *I32Ty =
+      FixedVectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
 
   assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
   unsigned NumSplits = NumElts / IntrinsicNumElts;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 6bfcadeaf8b67..5199bfc829eff 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3164,8 +3164,8 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   if (LT.first != 1 && MTy.isVector() &&
       MTy.getVectorNumElements() < ValVTy->getNumElements()) {
     // Type needs to be split. We need LT.first - 1 arithmetic ops.
-    VectorType *SingleOpTy =
-        VectorType::get(ValVTy->getElementType(), MTy.getVectorNumElements());
+    auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+                                            MTy.getVectorNumElements());
     ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
     ArithmeticCost *= LT.first - 1;
   }
@@ -3234,8 +3234,8 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
     if (LT.first != 1 && MTy.isVector() &&
         MTy.getVectorNumElements() < ValVTy->getNumElements()) {
       // Type needs to be split. We need LT.first - 1 arithmetic ops.
-      Type *SingleOpTy =
-          VectorType::get(ValVTy->getElementType(), MTy.getVectorNumElements());
+      auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+                                              MTy.getVectorNumElements());
       ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
       ArithmeticCost *= LT.first - 1;
     }
@@ -3310,7 +3310,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
           getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
-      auto *ShiftTy = VectorType::get(
+      auto *ShiftTy = FixedVectorType::get(
           Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
       ReductionCost += getArithmeticInstrCost(
           Instruction::LShr, ShiftTy, CostKind,
@@ -3617,8 +3617,8 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
     }
 
     // Add the arithmetic op for this level.
-    auto *SubCondTy = VectorType::get(CondTy->getElementType(),
-                                      Ty->getNumElements());
+    auto *SubCondTy =
+        FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
     MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
   }
 
@@ -3866,14 +3866,15 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
                            ? getIndexSizeInBits(Ptr, DL)
                            : DL.getPointerSizeInBits();
 
-  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
-                                                    IndexSize), VF);
+  auto *IndexVTy = FixedVectorType::get(
+      IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
   std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
   std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
   int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
   if (SplitFactor > 1) {
     // Handle splitting of vector of pointers
-    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+    auto *SplitSrcTy =
+        FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
     return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
                                          AddressSpace);
   }
@@ -4265,14 +4266,14 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
 
   // Get the cost of one memory operation.
-  Type *SingleMemOpTy =
-      VectorType::get(cast<VectorType>(VecTy)->getElementType(),
-                      LegalVT.getVectorNumElements());
+  auto *SingleMemOpTy =
+      FixedVectorType::get(cast<VectorType>(VecTy)->getElementType(),
+                           LegalVT.getVectorNumElements());
   unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
                                        MaybeAlign(Alignment), AddressSpace,
                                        CostKind);
 
-  VectorType *VT = VectorType::get(ScalarTy, VF);
+  auto *VT = FixedVectorType::get(ScalarTy, VF);
   EVT ETy = TLI->getValueType(DL, VT);
   if (!ETy.isSimple())
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -4408,9 +4409,9 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
 
     unsigned NumOfLoadsInInterleaveGrp =
         Indices.size() ? Indices.size() : Factor;
-    Type *ResultTy =
-        VectorType::get(cast<VectorType>(VecTy)->getElementType(),
-                        cast<VectorType>(VecTy)->getNumElements() / Factor);
+    auto *ResultTy = FixedVectorType::get(
+        cast<VectorType>(VecTy)->getElementType(),
+        cast<VectorType>(VecTy)->getNumElements() / Factor);
     unsigned NumOfResults =
         getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
         NumOfLoadsInInterleaveGrp;

From 03559c684a9bfe4de142fa4a7d2ef1edf08a8ad3 Mon Sep 17 00:00:00 2001
From: Volodymyr Sapsai <vsapsai@apple.com>
Date: Thu, 28 May 2020 17:06:33 -0700
Subject: [PATCH 595/770] [diagtool] Install diagtool when
 LLVM_INSTALL_TOOLCHAIN_ONLY is ON.

Not sure about other platforms but `install-xcode-toolchain` was already
including diagtool in the toolchain. This change makes it possible to
install diagtool during Apple's 2-stage build.

Instead of dropping the `if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)` conditional,
I've switched to `add_clang_tool`, which handles install targets. Also, a
few other clang tools like clang-format and clang-scan-deps are using this
macro, so it is good to be consistent.

rdar://problem/15386909

Reviewed By: JDevlieghere

Differential Revision: https://reviews.llvm.org/D80770
---
 clang/tools/diagtool/CMakeLists.txt | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/clang/tools/diagtool/CMakeLists.txt b/clang/tools/diagtool/CMakeLists.txt
index a95444be40ee5..b49619c075c73 100644
--- a/clang/tools/diagtool/CMakeLists.txt
+++ b/clang/tools/diagtool/CMakeLists.txt
@@ -2,7 +2,7 @@ set(LLVM_LINK_COMPONENTS
   Support
   )
 
-add_clang_executable(diagtool
+add_clang_tool(diagtool
   diagtool_main.cpp
   DiagTool.cpp
   DiagnosticNames.cpp
@@ -17,15 +17,3 @@ clang_target_link_libraries(diagtool
   clangBasic
   clangFrontend
   )
-
-if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
-  install(TARGETS diagtool
-    COMPONENT diagtool
-    RUNTIME DESTINATION bin)
-
-  if (NOT LLVM_ENABLE_IDE)
-    add_llvm_install_targets(install-diagtool
-      DEPENDS diagtool
-      COMPONENT diagtool)
-  endif()
-endif()

From 7265ff928a974a844b6301c139cbb0f957532da9 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne@apple.com>
Date: Fri, 29 May 2020 16:32:55 -0700
Subject: [PATCH 596/770] [libc++] Fix issues with the triviality of std::array

The Standard is currently unimplementable. We have to pick between:

1. Not implementing constexpr support properly in std::array<T, 0>
2. Making std::array<T, 0> non-trivial even when T is trivial
3. Returning nullptr from std::array<T, 0>::begin()

Libc++ initially picked (1). In 77b9abfc8e89, we started implementing constexpr properly, but lost the guarantee of triviality. Since avoiding both (1) and (2) seems really important, (3) appears to be the only viable option for libc++ after all. This is also what other implementations are doing.

This patch moves libc++ from (1) to (3).

It also:
- Improves the test coverage for the various ways of initializing std::array
- Adds tests for the triviality of std::array
- Adds tests for the aggregate-ness of std::array

Reviewed By: #libc, miscco, EricWF, zoecarver

Differential Revision: https://reviews.llvm.org/D80821
---
 libcxx/include/array                          |  23 +--
 .../sequences/array/triviality.pass.cpp       |  54 +++++
 .../sequences/array/aggregate.pass.cpp        |  39 ++--
 .../array/array.cons/default.pass.cpp         |  61 ------
 .../array/array.cons/implicit_copy.pass.cpp   | 104 ++++++----
 .../array/array.cons/initialization.pass.cpp  | 188 ++++++++++++++++++
 .../array.cons/initializer_list.pass.cpp      |  63 ------
 .../sequences/array/array.data/data.pass.cpp  |   7 +-
 .../array/array.data/data_const.pass.cpp      |   5 +-
 .../sequences/array/iterators.pass.cpp        |   6 -
 10 files changed, 331 insertions(+), 219 deletions(-)
 create mode 100644 libcxx/test/libcxx/containers/sequences/array/triviality.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp
 create mode 100644 libcxx/test/std/containers/sequences/array/array.cons/initialization.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp

diff --git a/libcxx/include/array b/libcxx/include/array
index 215d4e89f0ea7..68743773f1a34 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -242,31 +242,16 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0>
     typedef std::reverse_iterator<iterator>       reverse_iterator;
     typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
 
-#ifndef _LIBCPP_CXX03_LANG
-    union __wrapper {
-        _LIBCPP_CONSTEXPR __wrapper() : __b() { }
-        ~__wrapper() = default;
-
-        bool __b;
-        _Tp __t;
-    } __w;
-
-    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
-    value_type* data() _NOEXCEPT {return &__w.__t;}
-    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
-    const value_type* data() const _NOEXCEPT {return &__w.__t;}
-#else // C++03
     typedef typename conditional<is_const<_Tp>::value, const char,
                                 char>::type _CharType;
 
     struct  _ArrayInStructT { _Tp __data_[1]; };
     _ALIGNAS_TYPE(_ArrayInStructT) _CharType __elems_[sizeof(_ArrayInStructT)];
 
-    _LIBCPP_INLINE_VISIBILITY
-    value_type* data() _NOEXCEPT {return reinterpret_cast<value_type*>(__elems_);}
-    _LIBCPP_INLINE_VISIBILITY
-    const value_type* data() const _NOEXCEPT {return reinterpret_cast<const value_type*>(__elems_);}
-#endif
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
+    value_type* data() _NOEXCEPT {return nullptr;}
+    _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX14
+    const value_type* data() const _NOEXCEPT {return nullptr;}
 
     // No explicit construct/copy/destroy for aggregate type
     _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
diff --git a/libcxx/test/libcxx/containers/sequences/array/triviality.pass.cpp b/libcxx/test/libcxx/containers/sequences/array/triviality.pass.cpp
new file mode 100644
index 0000000000000..d124e84771657
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/array/triviality.pass.cpp
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Make sure std::array<T, N> is trivially copyable whenever T is trivially copyable.
+// This is not technically mandated by the Standard, but libc++ has been providing
+// this property.
+
+#include <array>
+#include <type_traits>
+
+
+struct Empty { };
+
+struct TrivialCopy {
+    int i;
+    double j;
+};
+
+struct NonTrivialCopy {
+    NonTrivialCopy(NonTrivialCopy const&) { }
+    NonTrivialCopy& operator=(NonTrivialCopy const&) { return *this; }
+};
+
+template <typename T>
+void check_trivially_copyable()
+{
+    static_assert(std::is_trivially_copyable<std::array<T, 0> >::value, "");
+    static_assert(std::is_trivially_copyable<std::array<T, 1> >::value, "");
+    static_assert(std::is_trivially_copyable<std::array<T, 2> >::value, "");
+    static_assert(std::is_trivially_copyable<std::array<T, 3> >::value, "");
+}
+
+int main(int, char**)
+{
+    check_trivially_copyable<int>();
+    check_trivially_copyable<long>();
+    check_trivially_copyable<double>();
+    check_trivially_copyable<long double>();
+    check_trivially_copyable<Empty>();
+    check_trivially_copyable<TrivialCopy>();
+
+    // Check that std::array<T, 0> is still trivially copyable when T is not
+    static_assert( std::is_trivially_copyable<std::array<NonTrivialCopy, 0> >::value, "");
+    static_assert(!std::is_trivially_copyable<std::array<NonTrivialCopy, 1> >::value, "");
+    static_assert(!std::is_trivially_copyable<std::array<NonTrivialCopy, 2> >::value, "");
+    static_assert(!std::is_trivially_copyable<std::array<NonTrivialCopy, 3> >::value, "");
+
+    return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/array/aggregate.pass.cpp b/libcxx/test/std/containers/sequences/array/aggregate.pass.cpp
index dd4064bb2fe84..9db25a9231bca 100644
--- a/libcxx/test/std/containers/sequences/array/aggregate.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/aggregate.pass.cpp
@@ -7,44 +7,41 @@
 //===----------------------------------------------------------------------===//
 
 // Make sure std::array is an aggregate type.
+// We can only check this in C++17 and above, because we don't have the
+// trait before that.
+// UNSUPPORTED: c++03, c++11, c++14
 
 #include <array>
 #include <type_traits>
 
 template <typename T>
-void tests()
+void check_aggregate()
 {
-    // Test aggregate initialization
-    {
-        std::array<T, 0> a0 = {}; (void)a0;
-        std::array<T, 1> a1 = {T()}; (void)a1;
-        std::array<T, 2> a2 = {T(), T()}; (void)a2;
-        std::array<T, 3> a3 = {T(), T(), T()}; (void)a3;
-    }
-
-    // Test the is_aggregate trait.
-#if TEST_STD_VER >= 17 // The trait is only available in C++17 and above
     static_assert(std::is_aggregate<std::array<T, 0> >::value, "");
     static_assert(std::is_aggregate<std::array<T, 1> >::value, "");
     static_assert(std::is_aggregate<std::array<T, 2> >::value, "");
     static_assert(std::is_aggregate<std::array<T, 3> >::value, "");
     static_assert(std::is_aggregate<std::array<T, 4> >::value, "");
-#endif
 }
 
 struct Empty { };
-struct NonEmpty { int i; int j; };
+struct Trivial { int i; int j; };
+struct NonTrivial {
+    int i; int j;
+    NonTrivial(NonTrivial const&) { }
+};
 
 int main(int, char**)
 {
-    tests<char>();
-    tests<int>();
-    tests<long>();
-    tests<float>();
-    tests<double>();
-    tests<long double>();
-    tests<NonEmpty>();
-    tests<Empty>();
+    check_aggregate<char>();
+    check_aggregate<int>();
+    check_aggregate<long>();
+    check_aggregate<float>();
+    check_aggregate<double>();
+    check_aggregate<long double>();
+    check_aggregate<Empty>();
+    check_aggregate<Trivial>();
+    check_aggregate<NonTrivial>();
 
     return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp
deleted file mode 100644
index e73a9671f478a..0000000000000
--- a/libcxx/test/std/containers/sequences/array/array.cons/default.pass.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <array>
-
-// array();
-
-#include <array>
-#include <cassert>
-
-// std::array is explicitly allowed to be initialized with A a = { init-list };.
-// Disable the missing braces warning for this reason.
-#include "test_macros.h"
-#include "disable_missing_braces_warning.h"
-
-struct NoDefault {
-    TEST_CONSTEXPR NoDefault(int) { }
-};
-
-struct Default {
-    TEST_CONSTEXPR Default() { }
-};
-
-TEST_CONSTEXPR_CXX14 bool tests()
-{
-    {
-        std::array<Default, 3> array;
-        assert(array.size() == 3);
-    }
-
-    {
-        std::array<Default, 0> array;
-        assert(array.size() == 0);
-    }
-
-    {
-        typedef std::array<NoDefault, 0> C;
-        C c;
-        assert(c.size() == 0);
-        C c1 = {};
-        assert(c1.size() == 0);
-        C c2 = {{}};
-        assert(c2.size() == 0);
-    }
-
-    return true;
-}
-
-int main(int, char**)
-{
-    tests();
-#if TEST_STD_VER >= 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp
index cb9a182980315..814d1df2d52f2 100644
--- a/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.cons/implicit_copy.pass.cpp
@@ -32,62 +32,82 @@ struct NoDefault {
     TEST_CONSTEXPR NoDefault(int) { }
 };
 
+struct NonTrivialCopy {
+    TEST_CONSTEXPR NonTrivialCopy() { }
+    TEST_CONSTEXPR NonTrivialCopy(NonTrivialCopy const&) { }
+    TEST_CONSTEXPR_CXX14 NonTrivialCopy& operator=(NonTrivialCopy const&) { return *this; }
+};
+
 TEST_CONSTEXPR_CXX14 bool tests()
 {
     {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        C c = {1.1, 2.2, 3.3};
-        C c2 = c;
-        c2 = c;
-        static_assert(std::is_copy_constructible<C>::value, "");
-        static_assert(std::is_copy_assignable<C>::value, "");
+        typedef std::array<double, 3> Array;
+        Array array = {1.1, 2.2, 3.3};
+        Array copy = array;
+        copy = array;
+        static_assert(std::is_copy_constructible<Array>::value, "");
+        static_assert(std::is_copy_assignable<Array>::value, "");
     }
     {
-        typedef double T;
-        typedef std::array<const T, 3> C;
-        C c = {1.1, 2.2, 3.3};
-        C c2 = c;
-        ((void)c2);
-        static_assert(std::is_copy_constructible<C>::value, "");
-        TEST_NOT_COPY_ASSIGNABLE(C);
+        typedef std::array<double const, 3> Array;
+        Array array = {1.1, 2.2, 3.3};
+        Array copy = array; (void)copy;
+        static_assert(std::is_copy_constructible<Array>::value, "");
+        TEST_NOT_COPY_ASSIGNABLE(Array);
     }
     {
-        typedef double T;
-        typedef std::array<T, 0> C;
-        C c = {};
-        C c2 = c;
-        c2 = c;
-        static_assert(std::is_copy_constructible<C>::value, "");
-        static_assert(std::is_copy_assignable<C>::value, "");
+        typedef std::array<double, 0> Array;
+        Array array = {};
+        Array copy = array;
+        copy = array;
+        static_assert(std::is_copy_constructible<Array>::value, "");
+        static_assert(std::is_copy_assignable<Array>::value, "");
     }
     {
         // const arrays of size 0 should disable the implicit copy assignment operator.
-        typedef double T;
-        typedef std::array<const T, 0> C;
-        C c = {{}};
-        C c2 = c;
-        ((void)c2);
-        static_assert(std::is_copy_constructible<C>::value, "");
-        TEST_NOT_COPY_ASSIGNABLE(C);
+        typedef std::array<double const, 0> Array;
+        Array array = {};
+        Array copy = array; (void)copy;
+        static_assert(std::is_copy_constructible<Array>::value, "");
+        TEST_NOT_COPY_ASSIGNABLE(Array);
+    }
+    {
+        typedef std::array<NoDefault, 0> Array;
+        Array array = {};
+        Array copy = array;
+        copy = array;
+        static_assert(std::is_copy_constructible<Array>::value, "");
+        static_assert(std::is_copy_assignable<Array>::value, "");
+    }
+    {
+        typedef std::array<NoDefault const, 0> Array;
+        Array array = {};
+        Array copy = array; (void)copy;
+        static_assert(std::is_copy_constructible<Array>::value, "");
+        TEST_NOT_COPY_ASSIGNABLE(Array);
+    }
+
+    // Make sure we can implicitly copy a std::array of a non-trivially copyable type
+    {
+        typedef std::array<NonTrivialCopy, 0> Array;
+        Array array = {};
+        Array copy = array;
+        copy = array;
+        static_assert(std::is_copy_constructible<Array>::value, "");
     }
     {
-        typedef NoDefault T;
-        typedef std::array<T, 0> C;
-        C c = {};
-        C c2 = c;
-        c2 = c;
-        static_assert(std::is_copy_constructible<C>::value, "");
-        static_assert(std::is_copy_assignable<C>::value, "");
+        typedef std::array<NonTrivialCopy, 1> Array;
+        Array array = {};
+        Array copy = array;
+        copy = array;
+        static_assert(std::is_copy_constructible<Array>::value, "");
     }
     {
-        typedef NoDefault T;
-        typedef std::array<const T, 0> C;
-        C c = {{}};
-        C c2 = c;
-        ((void)c2);
-        static_assert(std::is_copy_constructible<C>::value, "");
-        TEST_NOT_COPY_ASSIGNABLE(C);
+        typedef std::array<NonTrivialCopy, 2> Array;
+        Array array = {};
+        Array copy = array;
+        copy = array;
+        static_assert(std::is_copy_constructible<Array>::value, "");
     }
 
     return true;
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/initialization.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/initialization.pass.cpp
new file mode 100644
index 0000000000000..9153106b384fc
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/array/array.cons/initialization.pass.cpp
@@ -0,0 +1,188 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Test all the ways of initializing a std::array.
+
+#include <array>
+#include <cassert>
+#include <type_traits>
+#include "test_macros.h"
+
+
+struct NoDefault {
+    TEST_CONSTEXPR NoDefault(int) { }
+};
+
+// Test default initialization
+// This one isn't constexpr because leaving fundamental types uninitialized
+// isn't valid in a constexpr context.
+struct test_default_initialization {
+    template <typename T>
+    void operator()() const
+    {
+        std::array<T, 0> a0; (void)a0;
+        std::array<T, 1> a1; (void)a1;
+        std::array<T, 2> a2; (void)a2;
+        std::array<T, 3> a3; (void)a3;
+
+        std::array<NoDefault, 0> nodefault; (void)nodefault;
+    }
+};
+
+struct test_nondefault_initialization {
+    template <typename T>
+    TEST_CONSTEXPR_CXX14 void operator()() const
+    {
+        // Check direct-list-initialization syntax (introduced in C++11)
+    #if TEST_STD_VER >= 11
+        {
+            {
+                std::array<T, 0> a0_0{}; (void)a0_0;
+            }
+            {
+                std::array<T, 1> a1_0{}; (void)a1_0;
+                std::array<T, 1> a1_1{T()}; (void)a1_1;
+            }
+            {
+                std::array<T, 2> a2_0{}; (void)a2_0;
+                std::array<T, 2> a2_1{T()}; (void)a2_1;
+                std::array<T, 2> a2_2{T(), T()}; (void)a2_2;
+            }
+            {
+                std::array<T, 3> a3_0{}; (void)a3_0;
+                std::array<T, 3> a3_1{T()}; (void)a3_1;
+                std::array<T, 3> a3_2{T(), T()}; (void)a3_2;
+                std::array<T, 3> a3_3{T(), T(), T()}; (void)a3_3;
+            }
+
+            std::array<NoDefault, 0> nodefault{}; (void)nodefault;
+        }
+    #endif
+
+        // Check copy-list-initialization syntax
+        {
+            {
+                std::array<T, 0> a0_0 = {}; (void)a0_0;
+            }
+            {
+                std::array<T, 1> a1_0 = {}; (void)a1_0;
+                std::array<T, 1> a1_1 = {T()}; (void)a1_1;
+            }
+            {
+                std::array<T, 2> a2_0 = {}; (void)a2_0;
+                std::array<T, 2> a2_1 = {T()}; (void)a2_1;
+                std::array<T, 2> a2_2 = {T(), T()}; (void)a2_2;
+            }
+            {
+                std::array<T, 3> a3_0 = {}; (void)a3_0;
+                std::array<T, 3> a3_1 = {T()}; (void)a3_1;
+                std::array<T, 3> a3_2 = {T(), T()}; (void)a3_2;
+                std::array<T, 3> a3_3 = {T(), T(), T()}; (void)a3_3;
+            }
+
+            std::array<NoDefault, 0> nodefault = {}; (void)nodefault;
+        }
+
+        // Test aggregate initialization
+        {
+            {
+                std::array<T, 0> a0_0 = {{}}; (void)a0_0;
+            }
+            {
+                std::array<T, 1> a1_0 = {{}}; (void)a1_0;
+                std::array<T, 1> a1_1 = {{T()}}; (void)a1_1;
+            }
+            {
+                std::array<T, 2> a2_0 = {{}}; (void)a2_0;
+                std::array<T, 2> a2_1 = {{T()}}; (void)a2_1;
+                std::array<T, 2> a2_2 = {{T(), T()}}; (void)a2_2;
+            }
+            {
+                std::array<T, 3> a3_0 = {{}}; (void)a3_0;
+                std::array<T, 3> a3_1 = {{T()}}; (void)a3_1;
+                std::array<T, 3> a3_2 = {{T(), T()}}; (void)a3_2;
+                std::array<T, 3> a3_3 = {{T(), T(), T()}}; (void)a3_3;
+            }
+
+            // See http://wg21.link/LWG2157
+            std::array<NoDefault, 0> nodefault = {{}}; (void)nodefault;
+        }
+    }
+};
+
+// Test construction from an initializer-list
+TEST_CONSTEXPR_CXX14 bool test_initializer_list()
+{
+    {
+        std::array<double, 3> const a3_0 = {};
+        assert(a3_0[0] == double());
+        assert(a3_0[1] == double());
+        assert(a3_0[2] == double());
+    }
+    {
+        std::array<double, 3> const a3_1 = {1};
+        assert(a3_1[0] == double(1));
+        assert(a3_1[1] == double());
+        assert(a3_1[2] == double());
+    }
+    {
+        std::array<double, 3> const a3_2 = {1, 2.2};
+        assert(a3_2[0] == double(1));
+        assert(a3_2[1] == 2.2);
+        assert(a3_2[2] == double());
+    }
+    {
+        std::array<double, 3> const a3_3 = {1, 2, 3.5};
+        assert(a3_3[0] == double(1));
+        assert(a3_3[1] == double(2));
+        assert(a3_3[2] == 3.5);
+    }
+
+    return true;
+}
+
+struct Empty { };
+struct Trivial { int i; int j; };
+struct NonTrivial {
+    TEST_CONSTEXPR NonTrivial() { }
+    TEST_CONSTEXPR NonTrivial(NonTrivial const&) { }
+};
+struct NonEmptyNonTrivial {
+    int i; int j;
+    TEST_CONSTEXPR NonEmptyNonTrivial() : i(22), j(33) { }
+    TEST_CONSTEXPR NonEmptyNonTrivial(NonEmptyNonTrivial const&) : i(22), j(33) { }
+};
+
+template <typename F>
+TEST_CONSTEXPR_CXX14 bool with_all_types()
+{
+    F().template operator()<char>();
+    F().template operator()<int>();
+    F().template operator()<long>();
+    F().template operator()<float>();
+    F().template operator()<double>();
+    F().template operator()<long double>();
+    F().template operator()<Empty>();
+    F().template operator()<Trivial>();
+    F().template operator()<NonTrivial>();
+    F().template operator()<NonEmptyNonTrivial>();
+    return true;
+}
+
+int main(int, char**)
+{
+    with_all_types<test_nondefault_initialization>();
+    with_all_types<test_default_initialization>(); // not constexpr
+    test_initializer_list();
+#if TEST_STD_VER >= 14
+    static_assert(with_all_types<test_nondefault_initialization>(), "");
+    static_assert(test_initializer_list(), "");
+#endif
+
+    return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp
deleted file mode 100644
index 49437546a8f73..0000000000000
--- a/libcxx/test/std/containers/sequences/array/array.cons/initializer_list.pass.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <array>
-
-// Construct with initializer list
-
-#include <array>
-#include <cassert>
-
-// std::array is explicitly allowed to be initialized with A a = { init-list };.
-// Disable the missing braces warning for this reason.
-#include "test_macros.h"
-#include "disable_missing_braces_warning.h"
-
-TEST_CONSTEXPR_CXX14 bool tests()
-{
-    {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        C const c = {1, 2, 3.5};
-        assert(c.size() == 3);
-        assert(c[0] == 1);
-        assert(c[1] == 2);
-        assert(c[2] == 3.5);
-    }
-    {
-        typedef double T;
-        typedef std::array<T, 0> C;
-        C const c = {};
-        assert(c.size() == 0);
-    }
-
-    {
-        typedef double T;
-        typedef std::array<T, 3> C;
-        C const c = {1};
-        assert(c.size() == 3.0);
-        assert(c[0] == 1);
-    }
-    {
-        typedef int T;
-        typedef std::array<T, 1> C;
-        C const c = {};
-        assert(c.size() == 1);
-    }
-
-    return true;
-}
-
-int main(int, char**)
-{
-    tests();
-#if TEST_STD_VER >= 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp b/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp
index a41409f8df1c0..3493659f49a68 100644
--- a/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.data/data.pass.cpp
@@ -49,14 +49,14 @@ TEST_CONSTEXPR_CXX17 bool tests()
         typedef std::array<T, 0> C;
         C c = {};
         T* p = c.data();
-        LIBCPP_ASSERT(p != nullptr);
+        (void)p;
     }
     {
         typedef double T;
         typedef std::array<const T, 0> C;
         C c = {{}};
         const T* p = c.data();
-        LIBCPP_ASSERT(p != nullptr);
+        (void)p;
         static_assert((std::is_same<decltype(c.data()), const T*>::value), "");
     }
     {
@@ -64,7 +64,7 @@ TEST_CONSTEXPR_CXX17 bool tests()
         typedef std::array<T, 0> C;
         C c = {};
         T* p = c.data();
-        LIBCPP_ASSERT(p != nullptr);
+        (void)p;
     }
     {
         std::array<int, 5> c = {0, 1, 2, 3, 4};
@@ -92,7 +92,6 @@ int main(int, char**)
         typedef std::array<T, 0> C;
         const C c = {};
         const T* p = c.data();
-        LIBCPP_ASSERT(p != nullptr);
         std::uintptr_t pint = reinterpret_cast<std::uintptr_t>(p);
         assert(pint % TEST_ALIGNOF(T) == 0);
     }
diff --git a/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp b/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp
index 0f79237b48a6e..cce6e63d30da1 100644
--- a/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.data/data_const.pass.cpp
@@ -49,14 +49,14 @@ TEST_CONSTEXPR_CXX17 bool tests()
         typedef std::array<T, 0> C;
         const C c = {};
         const T* p = c.data();
-        LIBCPP_ASSERT(p != nullptr);
+        (void)p;
     }
     {
         typedef NoDefault T;
         typedef std::array<T, 0> C;
         const C c = {};
         const T* p = c.data();
-        LIBCPP_ASSERT(p != nullptr);
+        (void)p;
     }
     {
         std::array<int, 5> const c = {0, 1, 2, 3, 4};
@@ -84,7 +84,6 @@ int main(int, char**)
         typedef std::array<T, 0> C;
         const C c = {};
         const T* p = c.data();
-        LIBCPP_ASSERT(p != nullptr);
         std::uintptr_t pint = reinterpret_cast<std::uintptr_t>(p);
         assert(pint % TEST_ALIGNOF(T) == 0);
     }
diff --git a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
index 39d8a1a5dfa4c..60b01dfc1abe0 100644
--- a/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/iterators.pass.cpp
@@ -52,8 +52,6 @@ TEST_CONSTEXPR_CXX17 bool tests()
         typename C::iterator i = array.begin();
         typename C::const_iterator j = array.cbegin();
         assert(i == j);
-        LIBCPP_ASSERT(i != nullptr);
-        LIBCPP_ASSERT(j != nullptr);
     }
 
     {
@@ -63,8 +61,6 @@ TEST_CONSTEXPR_CXX17 bool tests()
         typename C::const_iterator j = array.cbegin();
         assert(i == array.end());
         assert(j == array.cend());
-        LIBCPP_ASSERT(i != nullptr);
-        LIBCPP_ASSERT(j != nullptr);
     }
     {
         typedef std::array<int, 1> C;
@@ -101,8 +97,6 @@ TEST_CONSTEXPR_CXX17 bool tests()
         typename C::iterator ib = array.begin();
         typename C::iterator ie = array.end();
         assert(ib == ie);
-        LIBCPP_ASSERT(ib != nullptr);
-        LIBCPP_ASSERT(ie != nullptr);
     }
 
 #if TEST_STD_VER >= 14

From c652c306a6aa3b356cebae78caf4b33b63afb866 Mon Sep 17 00:00:00 2001
From: Thomas Raoux <thomasraoux@google.com>
Date: Fri, 29 May 2020 16:34:56 -0700
Subject: [PATCH 597/770] [mlir][spirv] Clean up coop matrix assembly
 declaration.

Address code review feedback and use declarative assembly format.

Differential Revision: https://reviews.llvm.org/D80687
---
 .../SPIRV/SPIRVCooperativeMatrixOps.td        |  8 ++-
 mlir/lib/Dialect/SPIRV/SPIRVOps.cpp           | 58 ++-----------------
 .../Serialization/cooperative-matrix.mlir     |  2 +-
 .../Dialect/SPIRV/cooperative-matrix.mlir     |  2 +-
 4 files changed, 14 insertions(+), 56 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVCooperativeMatrixOps.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVCooperativeMatrixOps.td
index 4645765b66bab..9c3462a2e5bf1 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVCooperativeMatrixOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVCooperativeMatrixOps.td
@@ -39,6 +39,8 @@ def SPV_CooperativeMatrixLengthNVOp : SPV_Op<"CooperativeMatrixLengthNV",
     ```
   }];
 
+  let assemblyFormat = "attr-dict `:` $type";
+
   let availability = [
     MinVersion<SPV_V_1_0>,
     MaxVersion<SPV_V_1_5>,
@@ -139,7 +141,7 @@ def SPV_CooperativeMatrixLoadNVOp : SPV_Op<"CooperativeMatrixLoadNV", []> {
 // -----
 
 def SPV_CooperativeMatrixMulAddNVOp : SPV_Op<"CooperativeMatrixMulAddNV",
-  [NoSideEffect]> {
+  [NoSideEffect, AllTypesMatch<["c", "result"]>]> {
   let summary = "See extension SPV_NV_cooperative_matrix";
 
   let description = [{
@@ -188,6 +190,10 @@ def SPV_CooperativeMatrixMulAddNVOp : SPV_Op<"CooperativeMatrixMulAddNV",
     ```
   }];
 
+  let assemblyFormat = [{
+    operands attr-dict`:` type($a) `,` type($b) `->` type($c)
+  }];
+
   let availability = [
     MinVersion<SPV_V_1_0>,
     MaxVersion<SPV_V_1_5>,
diff --git a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp
index 4f48ef9d7d7cf..ac8fee8619b6e 100644
--- a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp
@@ -1134,12 +1134,11 @@ static LogicalResult verify(spirv::CompositeConstructOp compositeConstructOp) {
       return compositeConstructOp.emitError(
                  "has incorrect number of operands: expected ")
              << "1, but provided " << constituents.size();
-  } else {
-    if (constituents.size() != cType.getNumElements())
-      return compositeConstructOp.emitError(
-                 "has incorrect number of operands: expected ")
-             << cType.getNumElements() << ", but provided "
-             << constituents.size();
+  } else if (constituents.size() != cType.getNumElements()) {
+    return compositeConstructOp.emitError(
+               "has incorrect number of operands: expected ")
+           << cType.getNumElements() << ", but provided "
+           << constituents.size();
   }
 
   for (auto index : llvm::seq<uint32_t>(0, constituents.size())) {
@@ -2735,57 +2734,10 @@ static void print(spirv::CooperativeMatrixStoreNVOp coopMatrix,
   printer << " : " << coopMatrix.getOperand(1).getType();
 }
 
-//===----------------------------------------------------------------------===//
-// spv.CooperativeMatrixLengthNV
-//===----------------------------------------------------------------------===//
-
-static ParseResult parseCooperativeMatrixLengthNVOp(OpAsmParser &parser,
-                                                    OperationState &state) {
-  OpAsmParser::OperandType operandInfo;
-  Type dstType = parser.getBuilder().getIntegerType(32);
-  Type type;
-  if (parser.parseColonType(type)) {
-    return failure();
-  }
-  state.addAttribute(kTypeAttrName, TypeAttr::get(type));
-  state.addTypes(dstType);
-  return success();
-}
-
-static void print(spirv::CooperativeMatrixLengthNVOp coopMatrix,
-                  OpAsmPrinter &printer) {
-  printer << coopMatrix.getOperationName() << " : " << coopMatrix.type();
-}
-
 //===----------------------------------------------------------------------===//
 // spv.CooperativeMatrixMulAddNV
 //===----------------------------------------------------------------------===//
 
-static ParseResult parseCooperativeMatrixMulAddNVOp(OpAsmParser &parser,
-                                                    OperationState &state) {
-  SmallVector<OpAsmParser::OperandType, 3> ops;
-  SmallVector<Type, 3> types(3);
-  if (parser.parseOperandList(ops, 3) || parser.parseColon() ||
-      parser.parseType(types[0]) || parser.parseComma() ||
-      parser.parseType(types[1]) || parser.parseArrow() ||
-      parser.parseType(types[2]) ||
-      parser.resolveOperands(ops, types, parser.getNameLoc(), state.operands)) {
-    return failure();
-  }
-  state.addTypes(types[2]);
-  return success();
-}
-
-static void print(spirv::CooperativeMatrixMulAddNVOp coopMatrix,
-                  OpAsmPrinter &printer) {
-  printer << coopMatrix.getOperationName() << ' ' << coopMatrix.getOperand(0)
-          << ", " << coopMatrix.getOperand(1) << ", "
-          << coopMatrix.getOperand(2) << ", "
-          << " : " << coopMatrix.getOperand(0).getType() << ", "
-          << coopMatrix.getOperand(1).getType() << " -> "
-          << coopMatrix.getOperand(2).getType();
-}
-
 static LogicalResult
 verifyCoopMatrixMulAdd(spirv::CooperativeMatrixMulAddNVOp op) {
   if (op.c().getType() != op.result().getType())
diff --git a/mlir/test/Dialect/SPIRV/Serialization/cooperative-matrix.mlir b/mlir/test/Dialect/SPIRV/Serialization/cooperative-matrix.mlir
index 12f710ea1b465..0d58fea18a119 100644
--- a/mlir/test/Dialect/SPIRV/Serialization/cooperative-matrix.mlir
+++ b/mlir/test/Dialect/SPIRV/Serialization/cooperative-matrix.mlir
@@ -38,7 +38,7 @@ spv.module Logical GLSL450 requires #spv.vce<v1.0, [CooperativeMatrixNV], [SPV_N
 
   // CHECK-LABEL: @cooperative_matrix_muladd
   spv.func @cooperative_matrix_muladd(%a : !spv.coopmatrix<8x16xi32, Subgroup>, %b : !spv.coopmatrix<16x8xi32, Subgroup>, %c : !spv.coopmatrix<8x8xi32, Subgroup>) "None" {
-    // CHECK: {{%.*}} = spv.CooperativeMatrixMulAddNV {{%.*}}, {{%.*}}, {{%.*}},  : !spv.coopmatrix<8x16xi32, Subgroup>, !spv.coopmatrix<16x8xi32, Subgroup> -> !spv.coopmatrix<8x8xi32, Subgroup>
+    // CHECK: {{%.*}} = spv.CooperativeMatrixMulAddNV {{%.*}}, {{%.*}}, {{%.*}}  : !spv.coopmatrix<8x16xi32, Subgroup>, !spv.coopmatrix<16x8xi32, Subgroup> -> !spv.coopmatrix<8x8xi32, Subgroup>
     %r = spv.CooperativeMatrixMulAddNV %a, %b, %c : !spv.coopmatrix<8x16xi32, Subgroup>, !spv.coopmatrix<16x8xi32, Subgroup> -> !spv.coopmatrix<8x8xi32, Subgroup>
     spv.Return
   }
diff --git a/mlir/test/Dialect/SPIRV/cooperative-matrix.mlir b/mlir/test/Dialect/SPIRV/cooperative-matrix.mlir
index e30352625da67..51c709067f6f7 100644
--- a/mlir/test/Dialect/SPIRV/cooperative-matrix.mlir
+++ b/mlir/test/Dialect/SPIRV/cooperative-matrix.mlir
@@ -38,7 +38,7 @@ spv.func @cooperative_matrix_length() -> i32 "None" {
 
 // CHECK-LABEL: @cooperative_matrix_muladd
 spv.func @cooperative_matrix_muladd(%a : !spv.coopmatrix<8x16xi32, Subgroup>, %b : !spv.coopmatrix<16x8xi32, Subgroup>, %c : !spv.coopmatrix<8x8xi32, Subgroup>) "None" {
-  // CHECK: {{%.*}} = spv.CooperativeMatrixMulAddNV {{%.*}}, {{%.*}}, {{%.*}},  : !spv.coopmatrix<8x16xi32, Subgroup>, !spv.coopmatrix<16x8xi32, Subgroup> -> !spv.coopmatrix<8x8xi32, Subgroup>
+  // CHECK: {{%.*}} = spv.CooperativeMatrixMulAddNV {{%.*}}, {{%.*}}, {{%.*}}  : !spv.coopmatrix<8x16xi32, Subgroup>, !spv.coopmatrix<16x8xi32, Subgroup> -> !spv.coopmatrix<8x8xi32, Subgroup>
   %r = spv.CooperativeMatrixMulAddNV %a, %b, %c : !spv.coopmatrix<8x16xi32, Subgroup>, !spv.coopmatrix<16x8xi32, Subgroup> -> !spv.coopmatrix<8x8xi32, Subgroup>
   spv.Return
 }

From 14f49599ccafc7e318fac609387c4dd68430925f Mon Sep 17 00:00:00 2001
From: Tim Keith <tkeith@nvidia.com>
Date: Fri, 29 May 2020 16:39:13 -0700
Subject: [PATCH 598/770] [flang][NFC] Remove link-time dependency of Evaluate
 on Semantics

Summary:
Some Symbol-related functions used in Evaluate were moved to
Evaluate/tools.h. This includes changing some member functions that were
replaced by non-member functions `IsDummy`, `GetUsedModule`, and
`CountLenParameters`.

Some member functions were made inline in `Scope`, `Symbol`,
`ArraySpec`, and `DeclTypeSpec`. The definitions were preceded by a
comment explaining why they are inline.

`IsConstantShape` was expanded inline in `IsDescriptor` because it isn't
used anywhere else.

After this change, at least when compiling with clang on macOS,
`libFortranEvaluate.a` has no undefined symbols that are satisfied by
`libFortranSemantics.a`.

Reviewers: klausler, PeteSteinfeld, sscalpone, jdoerfert, DavidTruby

Reviewed By: PeteSteinfeld

Subscribers: llvm-commits

Tags: #flang, #llvm

Differential Revision: https://reviews.llvm.org/D80762
---
 flang/include/flang/Evaluate/tools.h        |  25 +++
 flang/include/flang/Semantics/scope.h       |  10 +-
 flang/include/flang/Semantics/symbol.h      |  86 +++++-----
 flang/include/flang/Semantics/tools.h       |  18 +--
 flang/include/flang/Semantics/type.h        |  79 +++++++--
 flang/lib/Evaluate/check-expression.cpp     |   2 +-
 flang/lib/Evaluate/tools.cpp                | 155 +++++++++++++++++-
 flang/lib/Evaluate/type.cpp                 |  12 +-
 flang/lib/Semantics/check-declarations.cpp  |   6 +-
 flang/lib/Semantics/compute-offsets.cpp     |   2 +-
 flang/lib/Semantics/mod-file.cpp            |   2 +-
 flang/lib/Semantics/resolve-names-utils.cpp |   2 +-
 flang/lib/Semantics/resolve-names.cpp       |  18 +--
 flang/lib/Semantics/scope.cpp               |  10 --
 flang/lib/Semantics/symbol.cpp              |  22 +--
 flang/lib/Semantics/tools.cpp               | 167 --------------------
 flang/lib/Semantics/type.cpp                |  70 --------
 17 files changed, 332 insertions(+), 354 deletions(-)

diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index a149a5fe5a050..c8d6da3dbf287 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -840,4 +840,29 @@ std::optional<std::string> FindImpureCall(
     const IntrinsicProcTable &, const ProcedureRef &);
 
 } // namespace Fortran::evaluate
+
+namespace Fortran::semantics {
+
+class Scope;
+
+// These functions are used in Evaluate so they are defined here rather than in
+// Semantics to avoid a link-time dependency on Semantics.
+
+bool IsVariableName(const Symbol &);
+bool IsPureProcedure(const Symbol &);
+bool IsPureProcedure(const Scope &);
+bool IsFunction(const Symbol &);
+bool IsProcedure(const Symbol &);
+bool IsProcedurePointer(const Symbol &);
+bool IsSaved(const Symbol &); // saved implicitly or explicitly
+bool IsDummy(const Symbol &);
+
+// Follow use, host, and construct associations to a variable, if any.
+const Symbol *GetAssociationRoot(const Symbol &);
+const Symbol *FindCommonBlockContaining(const Symbol &);
+int CountLenParameters(const DerivedTypeSpec &);
+const Symbol &GetUsedModule(const UseDetails &);
+
+} // namespace Fortran::semantics
+
 #endif // FORTRAN_EVALUATE_TOOLS_H_
diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h
index 92d74adc88154..878536aa06da4 100644
--- a/flang/include/flang/Semantics/scope.h
+++ b/flang/include/flang/Semantics/scope.h
@@ -89,7 +89,7 @@ class Scope {
   Symbol *symbol() { return symbol_; }
   const Symbol *symbol() const { return symbol_; }
 
-  const Symbol *GetSymbol() const;
+  inline const Symbol *GetSymbol() const;
   const Scope *GetDerivedTypeParent() const;
   const Scope &GetDerivedTypeBase() const;
   std::optional<SourceName> GetName() const;
@@ -255,5 +255,13 @@ class Scope {
 
   friend llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Scope &);
 };
+
+// Inline so that it can be called from Evaluate without a link-time dependency.
+
+inline const Symbol *Scope::GetSymbol() const {
+  return symbol_ ? symbol_
+                 : derivedTypeSpec_ ? &derivedTypeSpec_->typeSymbol() : nullptr;
+}
+
 } // namespace Fortran::semantics
 #endif // FORTRAN_SEMANTICS_SCOPE_H_
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 6ffa84ca184c5..0de6e462133d1 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -365,7 +365,6 @@ class UseDetails {
       : location_{location}, symbol_{symbol} {}
   const SourceName &location() const { return location_; }
   const Symbol &symbol() const { return symbol_; }
-  const Symbol &module() const;
 
 private:
   SourceName location_;
@@ -553,51 +552,13 @@ class Symbol {
   bool CanReplaceDetails(const Details &details) const;
 
   // Follow use-associations and host-associations to get the ultimate entity.
-  Symbol &GetUltimate() {
-    return const_cast<Symbol &>(
-        const_cast<const Symbol *>(this)->GetUltimate());
-  }
-  const Symbol &GetUltimate() const {
-    if (const auto *details{detailsIf<UseDetails>()}) {
-      return details->symbol().GetUltimate();
-    } else if (const auto *details{detailsIf<HostAssocDetails>()}) {
-      return details->symbol().GetUltimate();
-    } else {
-      return *this;
-    }
-  }
+  inline Symbol &GetUltimate();
+  inline const Symbol &GetUltimate() const;
 
-  DeclTypeSpec *GetType() {
-    return const_cast<DeclTypeSpec *>(
-        const_cast<const Symbol *>(this)->GetType());
-  }
-  const DeclTypeSpec *GetType() const {
-    return std::visit(
-        common::visitors{
-            [](const EntityDetails &x) { return x.type(); },
-            [](const ObjectEntityDetails &x) { return x.type(); },
-            [](const AssocEntityDetails &x) { return x.type(); },
-            [](const SubprogramDetails &x) {
-              return x.isFunction() ? x.result().GetType() : nullptr;
-            },
-            [](const ProcEntityDetails &x) {
-              if (const Symbol * symbol{x.interface().symbol()}) {
-                return symbol->GetType();
-              } else {
-                return x.interface().type();
-              }
-            },
-            [&](const ProcBindingDetails &x) { return x.symbol().GetType(); },
-            [](const TypeParamDetails &x) { return x.type(); },
-            [](const UseDetails &x) { return x.symbol().GetType(); },
-            [](const HostAssocDetails &x) { return x.symbol().GetType(); },
-            [](const auto &) -> const DeclTypeSpec * { return nullptr; },
-        },
-        details_);
-  }
+  inline DeclTypeSpec *GetType();
+  inline const DeclTypeSpec *GetType() const;
 
   void SetType(const DeclTypeSpec &);
-  bool IsDummy() const;
   bool IsFuncResult() const;
   bool IsObjectArray() const;
   bool IsSubprogram() const;
@@ -754,6 +715,45 @@ inline bool ProcEntityDetails::HasExplicitInterface() const {
   return false;
 }
 
+inline Symbol &Symbol::GetUltimate() {
+  return const_cast<Symbol &>(const_cast<const Symbol *>(this)->GetUltimate());
+}
+inline const Symbol &Symbol::GetUltimate() const {
+  if (const auto *details{detailsIf<UseDetails>()}) {
+    return details->symbol().GetUltimate();
+  } else if (const auto *details{detailsIf<HostAssocDetails>()}) {
+    return details->symbol().GetUltimate();
+  } else {
+    return *this;
+  }
+}
+
+inline DeclTypeSpec *Symbol::GetType() {
+  return const_cast<DeclTypeSpec *>(
+      const_cast<const Symbol *>(this)->GetType());
+}
+inline const DeclTypeSpec *Symbol::GetType() const {
+  return std::visit(
+      common::visitors{
+          [](const EntityDetails &x) { return x.type(); },
+          [](const ObjectEntityDetails &x) { return x.type(); },
+          [](const AssocEntityDetails &x) { return x.type(); },
+          [](const SubprogramDetails &x) {
+            return x.isFunction() ? x.result().GetType() : nullptr;
+          },
+          [](const ProcEntityDetails &x) {
+            const Symbol *symbol{x.interface().symbol()};
+            return symbol ? symbol->GetType() : x.interface().type();
+          },
+          [](const ProcBindingDetails &x) { return x.symbol().GetType(); },
+          [](const TypeParamDetails &x) { return x.type(); },
+          [](const UseDetails &x) { return x.symbol().GetType(); },
+          [](const HostAssocDetails &x) { return x.symbol().GetType(); },
+          [](const auto &) -> const DeclTypeSpec * { return nullptr; },
+      },
+      details_);
+}
+
 inline bool operator<(SymbolRef x, SymbolRef y) { return *x < *y; }
 inline bool operator<(MutableSymbolRef x, MutableSymbolRef y) {
   return *x < *y;
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index e6a3a2eb53f1e..1132fc0bfaf4e 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -31,7 +31,6 @@ class Scope;
 class Symbol;
 
 const Scope *FindModuleContaining(const Scope &);
-const Symbol *FindCommonBlockContaining(const Symbol &object);
 const Scope *FindProgramUnitContaining(const Scope &);
 const Scope *FindProgramUnitContaining(const Symbol &);
 const Scope *FindPureProcedureContaining(const Scope &);
@@ -50,9 +49,6 @@ const DeclTypeSpec *FindParentTypeSpec(const DeclTypeSpec &);
 const DeclTypeSpec *FindParentTypeSpec(const Scope &);
 const DeclTypeSpec *FindParentTypeSpec(const Symbol &);
 
-// Return the Symbol of the variable of a construct association, if it exists
-const Symbol *GetAssociationRoot(const Symbol &);
-
 enum class Tristate { No, Yes, Maybe };
 inline Tristate ToTristate(bool x) { return x ? Tristate::Yes : Tristate::No; }
 
@@ -78,21 +74,17 @@ bool DoesScopeContain(const Scope *maybeAncestor, const Scope &maybeDescendent);
 bool DoesScopeContain(const Scope *, const Symbol &);
 bool IsUseAssociated(const Symbol &, const Scope &);
 bool IsHostAssociated(const Symbol &, const Scope &);
-bool IsDummy(const Symbol &);
-bool IsStmtFunction(const Symbol &);
+inline bool IsStmtFunction(const Symbol &symbol) {
+  const auto *subprogram{symbol.detailsIf<SubprogramDetails>()};
+  return subprogram && subprogram->stmtFunction();
+}
 bool IsInStmtFunction(const Symbol &);
 bool IsStmtFunctionDummy(const Symbol &);
 bool IsStmtFunctionResult(const Symbol &);
 bool IsPointerDummy(const Symbol &);
-bool IsFunction(const Symbol &);
-bool IsPureProcedure(const Symbol &);
-bool IsPureProcedure(const Scope &);
 bool IsBindCProcedure(const Symbol &);
 bool IsBindCProcedure(const Scope &);
-bool IsProcedure(const Symbol &);
 bool IsProcName(const Symbol &symbol); // proc-name
-bool IsVariableName(const Symbol &symbol); // variable-name
-bool IsProcedurePointer(const Symbol &);
 bool IsFunctionResult(const Symbol &);
 bool IsFunctionResultWithSameNameAsFunction(const Symbol &);
 bool IsExtensibleType(const DerivedTypeSpec *);
@@ -103,8 +95,6 @@ bool IsTeamType(const DerivedTypeSpec *);
 bool IsIsoCType(const DerivedTypeSpec *);
 bool IsEventTypeOrLockType(const DerivedTypeSpec *);
 bool IsOrContainsEventOrLockComponent(const Symbol &);
-// Has an explicit or implied SAVE attribute
-bool IsSaved(const Symbol &);
 bool CanBeTypeBoundProc(const Symbol *);
 bool IsInitialized(const Symbol &);
 bool HasIntrinsicTypeName(const Symbol &);
diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h
index c99cdb715a7f9..36cd1bb73a915 100644
--- a/flang/include/flang/Semantics/type.h
+++ b/flang/include/flang/Semantics/type.h
@@ -217,13 +217,12 @@ class ShapeSpec {
 struct ArraySpec : public std::vector<ShapeSpec> {
   ArraySpec() {}
   int Rank() const { return size(); }
-  bool IsExplicitShape() const;
-  bool IsAssumedShape() const;
-  bool IsDeferredShape() const;
-  bool IsImpliedShape() const;
-  bool IsAssumedSize() const;
-  bool IsAssumedRank() const;
-  bool IsConstantShape() const; // explicit shape with constant bounds
+  inline bool IsExplicitShape() const;
+  inline bool IsAssumedShape() const;
+  inline bool IsDeferredShape() const;
+  inline bool IsImpliedShape() const;
+  inline bool IsAssumedSize() const;
+  inline bool IsAssumedRank() const;
 
 private:
   // Check non-empty and predicate is true for each element.
@@ -251,7 +250,6 @@ class DerivedTypeSpec {
   void ReplaceScope(const Scope &);
   RawParameters &rawParameters() { return rawParameters_; }
   const ParameterMapType &parameters() const { return parameters_; }
-  int NumLengthParameters() const;
 
   bool MightBeParameterized() const;
   bool IsForwardReferenced() const;
@@ -354,10 +352,10 @@ class DeclTypeSpec {
     return std::get<DerivedTypeSpec>(typeSpec_);
   }
 
-  IntrinsicTypeSpec *AsIntrinsic();
-  const IntrinsicTypeSpec *AsIntrinsic() const;
-  DerivedTypeSpec *AsDerived();
-  const DerivedTypeSpec *AsDerived() const;
+  inline IntrinsicTypeSpec *AsIntrinsic();
+  inline const IntrinsicTypeSpec *AsIntrinsic() const;
+  inline DerivedTypeSpec *AsDerived();
+  inline const DerivedTypeSpec *AsDerived() const;
 
   std::string AsFortran() const;
 
@@ -383,5 +381,62 @@ class ProcInterface {
   const Symbol *symbol_{nullptr};
   const DeclTypeSpec *type_{nullptr};
 };
+
+// Define some member functions here in the header so that they can be used by
+// lib/Evaluate without link-time dependency on Semantics.
+
+inline bool ArraySpec::IsExplicitShape() const {
+  return CheckAll([](const ShapeSpec &x) { return x.ubound().isExplicit(); });
+}
+inline bool ArraySpec::IsAssumedShape() const {
+  return CheckAll([](const ShapeSpec &x) { return x.ubound().isDeferred(); });
+}
+inline bool ArraySpec::IsDeferredShape() const {
+  return CheckAll([](const ShapeSpec &x) {
+    return x.lbound().isDeferred() && x.ubound().isDeferred();
+  });
+}
+inline bool ArraySpec::IsImpliedShape() const {
+  return !IsAssumedRank() &&
+      CheckAll([](const ShapeSpec &x) { return x.ubound().isAssumed(); });
+}
+inline bool ArraySpec::IsAssumedSize() const {
+  return !empty() && !IsAssumedRank() && back().ubound().isAssumed() &&
+      std::all_of(begin(), end() - 1,
+          [](const ShapeSpec &x) { return x.ubound().isExplicit(); });
+}
+inline bool ArraySpec::IsAssumedRank() const {
+  return Rank() == 1 && front().lbound().isAssumed();
+}
+
+inline IntrinsicTypeSpec *DeclTypeSpec::AsIntrinsic() {
+  switch (category_) {
+  case Numeric:
+    return &std::get<NumericTypeSpec>(typeSpec_);
+  case Logical:
+    return &std::get<LogicalTypeSpec>(typeSpec_);
+  case Character:
+    return &std::get<CharacterTypeSpec>(typeSpec_);
+  default:
+    return nullptr;
+  }
+}
+inline const IntrinsicTypeSpec *DeclTypeSpec::AsIntrinsic() const {
+  return const_cast<DeclTypeSpec *>(this)->AsIntrinsic();
+}
+
+inline DerivedTypeSpec *DeclTypeSpec::AsDerived() {
+  switch (category_) {
+  case TypeDerived:
+  case ClassDerived:
+    return &std::get<DerivedTypeSpec>(typeSpec_);
+  default:
+    return nullptr;
+  }
+}
+inline const DerivedTypeSpec *DeclTypeSpec::AsDerived() const {
+  return const_cast<DeclTypeSpec *>(this)->AsDerived();
+}
+
 } // namespace Fortran::semantics
 #endif // FORTRAN_SEMANTICS_TYPE_H_
diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp
index 6c8db0fe24194..5cd1fcb431d7d 100644
--- a/flang/lib/Evaluate/check-expression.cpp
+++ b/flang/lib/Evaluate/check-expression.cpp
@@ -208,7 +208,7 @@ class CheckSpecificationExprHelper
       return "derived type component or type parameter value not allowed to "
              "reference variable '"s +
           symbol.name().ToString() + "'";
-    } else if (symbol.IsDummy()) {
+    } else if (IsDummy(symbol)) {
       if (symbol.attrs().test(semantics::Attr::OPTIONAL)) {
         return "reference to OPTIONAL dummy argument '"s +
             symbol.name().ToString() + "'";
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 693d25bd31fa3..5b45f8447b171 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -823,7 +823,7 @@ parser::Message *AttachDeclaration(
   if (const auto *use{symbol.detailsIf<semantics::UseDetails>()}) {
     message.Attach(use->location(),
         "'%s' is USE-associated with '%s' in module '%s'"_en_US, symbol.name(),
-        unhosted->name(), use->module().name());
+        unhosted->name(), GetUsedModule(*use).name());
   } else {
     message.Attach(
         unhosted->name(), "Declaration of '%s'"_en_US, unhosted->name());
@@ -872,3 +872,156 @@ std::optional<std::string> FindImpureCall(
 }
 
 } // namespace Fortran::evaluate
+
+namespace Fortran::semantics {
+
+// When a construct association maps to a variable, and that variable
+// is not an array with a vector-valued subscript, return the base
+// Symbol of that variable, else nullptr.  Descends into other construct
+// associations when one associations maps to another.
+static const Symbol *GetAssociatedVariable(
+    const semantics::AssocEntityDetails &details) {
+  if (const auto &expr{details.expr()}) {
+    if (IsVariable(*expr) && !HasVectorSubscript(*expr)) {
+      if (const Symbol * varSymbol{GetFirstSymbol(*expr)}) {
+        return GetAssociationRoot(*varSymbol);
+      }
+    }
+  }
+  return nullptr;
+}
+
+const Symbol *GetAssociationRoot(const Symbol &symbol) {
+  const Symbol &ultimate{symbol.GetUltimate()};
+  const auto *details{ultimate.detailsIf<semantics::AssocEntityDetails>()};
+  return details ? GetAssociatedVariable(*details) : &ultimate;
+}
+
+bool IsVariableName(const Symbol &symbol) {
+  const Symbol *root{GetAssociationRoot(symbol)};
+  return root && root->has<ObjectEntityDetails>() && !IsNamedConstant(*root);
+}
+
+bool IsPureProcedure(const Symbol &symbol) {
+  if (const auto *procDetails{symbol.detailsIf<ProcEntityDetails>()}) {
+    if (const Symbol * procInterface{procDetails->interface().symbol()}) {
+      // procedure component with a pure interface
+      return IsPureProcedure(*procInterface);
+    }
+  } else if (const auto *details{symbol.detailsIf<ProcBindingDetails>()}) {
+    return IsPureProcedure(details->symbol());
+  } else if (!IsProcedure(symbol)) {
+    return false;
+  }
+  if (IsStmtFunction(symbol)) {
+    // Section 15.7(1) states that a statement function is PURE if it does not
+    // reference an IMPURE procedure or a VOLATILE variable
+    if (const auto &expr{symbol.get<SubprogramDetails>().stmtFunction()}) {
+      for (const SymbolRef &ref : evaluate::CollectSymbols(*expr)) {
+        if (IsFunction(*ref) && !IsPureProcedure(*ref)) {
+          return false;
+        }
+        const Symbol *root{GetAssociationRoot(*ref)};
+        if (root && root->attrs().test(Attr::VOLATILE)) {
+          return false;
+        }
+      }
+    }
+    return true; // statement function was not found to be impure
+  }
+  return symbol.attrs().test(Attr::PURE) ||
+      (symbol.attrs().test(Attr::ELEMENTAL) &&
+          !symbol.attrs().test(Attr::IMPURE));
+}
+
+bool IsPureProcedure(const Scope &scope) {
+  const Symbol *symbol{scope.GetSymbol()};
+  return symbol && IsPureProcedure(*symbol);
+}
+
+bool IsFunction(const Symbol &symbol) {
+  return std::visit(
+      common::visitors{
+          [](const SubprogramDetails &x) { return x.isFunction(); },
+          [&](const SubprogramNameDetails &) {
+            return symbol.test(Symbol::Flag::Function);
+          },
+          [](const ProcEntityDetails &x) {
+            const auto &ifc{x.interface()};
+            return ifc.type() || (ifc.symbol() && IsFunction(*ifc.symbol()));
+          },
+          [](const ProcBindingDetails &x) { return IsFunction(x.symbol()); },
+          [](const UseDetails &x) { return IsFunction(x.symbol()); },
+          [](const auto &) { return false; },
+      },
+      symbol.details());
+}
+
+bool IsProcedure(const Symbol &symbol) {
+  return std::visit(
+      common::visitors{
+          [](const SubprogramDetails &) { return true; },
+          [](const SubprogramNameDetails &) { return true; },
+          [](const ProcEntityDetails &) { return true; },
+          [](const GenericDetails &) { return true; },
+          [](const ProcBindingDetails &) { return true; },
+          [](const UseDetails &x) { return IsProcedure(x.symbol()); },
+          // TODO: FinalProcDetails?
+          [](const auto &) { return false; },
+      },
+      symbol.details());
+}
+
+const Symbol *FindCommonBlockContaining(const Symbol &object) {
+  const auto *details{object.detailsIf<ObjectEntityDetails>()};
+  return details ? details->commonBlock() : nullptr;
+}
+
+bool IsProcedurePointer(const Symbol &symbol) {
+  return symbol.has<ProcEntityDetails>() && IsPointer(symbol);
+}
+
+bool IsSaved(const Symbol &symbol) {
+  auto scopeKind{symbol.owner().kind()};
+  if (scopeKind == Scope::Kind::Module || scopeKind == Scope::Kind::BlockData) {
+    return true;
+  } else if (scopeKind == Scope::Kind::DerivedType) {
+    return false; // this is a component
+  } else if (IsNamedConstant(symbol)) {
+    return false;
+  } else if (symbol.attrs().test(Attr::SAVE)) {
+    return true;
+  } else if (const auto *object{symbol.detailsIf<ObjectEntityDetails>()};
+             object && object->init()) {
+    return true;
+  } else if (IsProcedurePointer(symbol) &&
+      symbol.get<ProcEntityDetails>().init()) {
+    return true;
+  } else if (const Symbol * block{FindCommonBlockContaining(symbol)};
+             block && block->attrs().test(Attr::SAVE)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool IsDummy(const Symbol &symbol) {
+  return std::visit(
+      common::visitors{[](const EntityDetails &x) { return x.isDummy(); },
+          [](const ObjectEntityDetails &x) { return x.isDummy(); },
+          [](const ProcEntityDetails &x) { return x.isDummy(); },
+          [](const HostAssocDetails &x) { return IsDummy(x.symbol()); },
+          [](const auto &) { return false; }},
+      symbol.details());
+}
+
+int CountLenParameters(const DerivedTypeSpec &type) {
+  return std::count_if(type.parameters().begin(), type.parameters().end(),
+      [](const auto &pair) { return pair.second.isLen(); });
+}
+
+const Symbol &GetUsedModule(const UseDetails &details) {
+  return DEREF(details.symbol().owner().symbol());
+}
+
+} // namespace Fortran::semantics
diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp
index 3360d30d9f933..6988a1b2e3d6d 100644
--- a/flang/lib/Evaluate/type.cpp
+++ b/flang/lib/Evaluate/type.cpp
@@ -23,6 +23,7 @@
 // IsDescriptor() predicate
 // TODO there's probably a better place for this predicate than here
 namespace Fortran::semantics {
+
 static bool IsDescriptor(const ObjectEntityDetails &details) {
   if (const auto *type{details.type()}) {
     if (auto dynamicType{evaluate::DynamicType::From(*type)}) {
@@ -32,7 +33,14 @@ static bool IsDescriptor(const ObjectEntityDetails &details) {
     }
   }
   // TODO: Automatic (adjustable) arrays - are they descriptors?
-  return !details.shape().empty() && !details.shape().IsConstantShape();
+  for (const ShapeSpec &shapeSpec : details.shape()) {
+    const auto &lb{shapeSpec.lbound().GetExplicit()};
+    const auto &ub{shapeSpec.ubound().GetExplicit()};
+    if (!lb || !ub || !IsConstantExpr(*lb) || !IsConstantExpr(*ub)) {
+      return true;
+    }
+  }
+  return false;
 }
 
 static bool IsDescriptor(const ProcEntityDetails &details) {
@@ -427,7 +435,7 @@ DynamicType DynamicType::ResultTypeForMultiply(const DynamicType &that) const {
 
 bool DynamicType::RequiresDescriptor() const {
   return IsPolymorphic() || IsUnknownLengthCharacter() ||
-      (derived_ && derived_->NumLengthParameters() > 0);
+      (derived_ && CountLenParameters(*derived_) > 0);
 }
 
 bool DynamicType::HasDeferredTypeParameter() const {
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index e29cdcae04479..55cf8cdb38967 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -341,7 +341,7 @@ void CheckHelper::CheckAssumedTypeEntity( // C709
     const Symbol &symbol, const ObjectEntityDetails &details) {
   if (const DeclTypeSpec * type{symbol.GetType()};
       type && type->category() == DeclTypeSpec::TypeStar) {
-    if (!symbol.IsDummy()) {
+    if (!IsDummy(symbol)) {
       messages_.Say(
           "Assumed-type entity '%s' must be a dummy argument"_err_en_US,
           symbol.name());
@@ -477,7 +477,7 @@ void CheckHelper::CheckObjectEntity(
   if (const DeclTypeSpec * type{details.type()}) { // C708
     if (type->IsPolymorphic() &&
         !(type->IsAssumedType() || IsAllocatableOrPointer(symbol) ||
-            symbol.IsDummy())) {
+            IsDummy(symbol))) {
       messages_.Say("CLASS entity '%s' must be a dummy argument or have "
                     "ALLOCATABLE or POINTER attribute"_err_en_US,
           symbol.name());
@@ -530,7 +530,7 @@ void CheckHelper::CheckArraySpec(
               " assumed rank"_err_en_US;
       }
     }
-  } else if (symbol.IsDummy()) {
+  } else if (IsDummy(symbol)) {
     if (isImplied && !isAssumedSize) { // C836
       msg = "Dummy array argument '%s' may not have implied shape"_err_en_US;
     }
diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp
index ee9956f53ecb3..6c2573d790a1a 100644
--- a/flang/lib/Semantics/compute-offsets.cpp
+++ b/flang/lib/Semantics/compute-offsets.cpp
@@ -212,7 +212,7 @@ auto ComputeOffsetsHelper::GetElementSize(
   if (IsDescriptor(symbol) || IsProcedure(symbol)) {
     int lenParams{0};
     if (const DerivedTypeSpec * derived{type->AsDerived()}) {
-      lenParams = derived->NumLengthParameters();
+      lenParams = CountLenParameters(*derived);
     }
     std::size_t size{
         runtime::Descriptor::SizeInBytes(symbol.Rank(), false, lenParams)};
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index ca1704e8012cb..1f8b9b2552a1e 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -389,7 +389,7 @@ void ModFileWriter::PutGeneric(const Symbol &symbol) {
 void ModFileWriter::PutUse(const Symbol &symbol) {
   auto &details{symbol.get<UseDetails>()};
   auto &use{details.symbol()};
-  uses_ << "use " << details.module().name();
+  uses_ << "use " << GetUsedModule(details).name();
   PutGenericName(uses_ << ",only:", symbol);
   // Can have intrinsic op with different local-name and use-name
   // (e.g. `operator(<)` and `operator(.lt.)`) but rename is not allowed
diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp
index 5d1014aacec5a..c63ed5c60b304 100644
--- a/flang/lib/Semantics/resolve-names-utils.cpp
+++ b/flang/lib/Semantics/resolve-names-utils.cpp
@@ -557,7 +557,7 @@ bool EquivalenceSets::CheckObject(const parser::Name &name) {
   if (symbol.owner().IsDerivedType()) { // C8107
     msg = "Derived type component '%s'"
           " is not allowed in an equivalence set"_err_en_US;
-  } else if (symbol.IsDummy()) { // C8106
+  } else if (IsDummy(symbol)) { // C8106
     msg = "Dummy argument '%s' is not allowed in an equivalence set"_err_en_US;
   } else if (symbol.IsFuncResult()) { // C8106
     msg = "Function result '%s' is not allow in an equivalence set"_err_en_US;
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 3b60969b122a7..5626163144d3c 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -931,7 +931,7 @@ class DeclarationVisitor : public ArraySpecVisitor,
     } else if (auto *details{symbol.detailsIf<UseDetails>()}) {
       Say(name.source,
           "'%s' is use-associated from module '%s' and cannot be re-declared"_err_en_US,
-          name.source, details->module().name());
+          name.source, GetUsedModule(*details).name());
     } else if (auto *details{symbol.detailsIf<SubprogramNameDetails>()}) {
       if (details->kind() == SubprogramKind::Module) {
         Say2(name,
@@ -1932,7 +1932,7 @@ void ScopeHandler::SayAlreadyDeclared(const SourceName &name, Symbol &prev) {
     Say(name, "'%s' is already declared in this scoping unit"_err_en_US)
         .Attach(details->location(),
             "It is use-associated with '%s' in module '%s'"_err_en_US,
-            details->symbol().name(), details->module().name());
+            details->symbol().name(), GetUsedModule(*details).name());
   } else {
     SayAlreadyDeclared(name, prev.name());
   }
@@ -2363,14 +2363,14 @@ void ModuleVisitor::AddUse(
         Say(location,
             "Generic interface '%s' has ambiguous specific procedures"
             " from modules '%s' and '%s'"_err_en_US,
-            localSymbol.name(), useDetails->module().name(),
+            localSymbol.name(), GetUsedModule(*useDetails).name(),
             useSymbol.owner().GetName().value());
       } else if (generic1.derivedType() && generic2.derivedType() &&
           generic1.derivedType() != generic2.derivedType()) {
         Say(location,
             "Generic interface '%s' has ambiguous derived types"
             " from modules '%s' and '%s'"_err_en_US,
-            localSymbol.name(), useDetails->module().name(),
+            localSymbol.name(), GetUsedModule(*useDetails).name(),
             useSymbol.owner().GetName().value());
       } else {
         generic1.CopyFrom(generic2);
@@ -4420,7 +4420,7 @@ void DeclarationVisitor::CheckSaveStmts() {
 // If SAVE attribute can't be set on symbol, return error message.
 std::optional<MessageFixedText> DeclarationVisitor::CheckSaveAttr(
     const Symbol &symbol) {
-  if (symbol.IsDummy()) {
+  if (IsDummy(symbol)) {
     return "SAVE attribute may not be applied to dummy argument '%s'"_err_en_US;
   } else if (symbol.IsFuncResult()) {
     return "SAVE attribute may not be applied to function result '%s'"_err_en_US;
@@ -4483,7 +4483,7 @@ void DeclarationVisitor::CheckCommonBlocks() {
     } else if (attrs.test(Attr::BIND_C)) {
       Say(name,
           "Variable '%s' with BIND attribute may not appear in a COMMON block"_err_en_US);
-    } else if (symbol->IsDummy()) {
+    } else if (IsDummy(*symbol)) {
       Say(name,
           "Dummy argument '%s' may not appear in a COMMON block"_err_en_US);
     } else if (symbol->IsFuncResult()) {
@@ -4609,7 +4609,7 @@ bool DeclarationVisitor::PassesLocalityChecks(
     return false;
   }
   if (const DeclTypeSpec * type{symbol.GetType()}) {
-    if (type->IsPolymorphic() && symbol.IsDummy() &&
+    if (type->IsPolymorphic() && IsDummy(symbol) &&
         !IsPointer(symbol)) { // C1128
       SayWithDecl(name, symbol,
           "Nonpointer polymorphic argument '%s' not allowed in a "
@@ -5499,7 +5499,7 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) {
     if (CheckUseError(name)) {
       return nullptr; // reported an error
     }
-    if (symbol->IsDummy() ||
+    if (IsDummy(*symbol) ||
         (!symbol->GetType() && FindCommonBlockContaining(*symbol))) {
       ConvertToObjectEntity(*symbol);
       ApplyImplicitRules(*symbol);
@@ -5841,7 +5841,7 @@ void ResolveNamesVisitor::NoteExecutablePartCall(
         ConvertToProcEntity(*symbol);
         if (symbol->has<ProcEntityDetails>()) {
           symbol->set(flag);
-          if (symbol->IsDummy()) {
+          if (IsDummy(*symbol)) {
             symbol->attrs().set(Attr::EXTERNAL);
           }
           ApplyImplicitRules(*symbol);
diff --git a/flang/lib/Semantics/scope.cpp b/flang/lib/Semantics/scope.cpp
index 92b09dd55ab4e..02637ba2add7f 100644
--- a/flang/lib/Semantics/scope.cpp
+++ b/flang/lib/Semantics/scope.cpp
@@ -362,16 +362,6 @@ const DeclTypeSpec *Scope::FindInstantiatedDerivedType(
   }
 }
 
-const Symbol *Scope::GetSymbol() const {
-  if (symbol_) {
-    return symbol_;
-  }
-  if (derivedTypeSpec_) {
-    return &derivedTypeSpec_->typeSymbol();
-  }
-  return nullptr;
-}
-
 const Scope *Scope::GetDerivedTypeParent() const {
   if (const Symbol * symbol{GetSymbol()}) {
     if (const DerivedTypeSpec * parent{symbol->GetParentTypeSpec(this)}) {
diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp
index c22f8d08e55f6..9983426670039 100644
--- a/flang/lib/Semantics/symbol.cpp
+++ b/flang/lib/Semantics/symbol.cpp
@@ -141,13 +141,8 @@ ProcEntityDetails::ProcEntityDetails(EntityDetails &&d) : EntityDetails(d) {
   }
 }
 
-const Symbol &UseDetails::module() const {
-  // owner is a module so it must have a symbol:
-  return *symbol_->owner().symbol();
-}
-
 UseErrorDetails::UseErrorDetails(const UseDetails &useDetails) {
-  add_occurrence(useDetails.location(), *useDetails.module().scope());
+  add_occurrence(useDetails.location(), *GetUsedModule(useDetails).scope());
 }
 UseErrorDetails &UseErrorDetails::add_occurrence(
     const SourceName &location, const Scope &module) {
@@ -287,16 +282,6 @@ void Symbol::SetType(const DeclTypeSpec &type) {
       details_);
 }
 
-bool Symbol::IsDummy() const {
-  return std::visit(
-      common::visitors{[](const EntityDetails &x) { return x.isDummy(); },
-          [](const ObjectEntityDetails &x) { return x.isDummy(); },
-          [](const ProcEntityDetails &x) { return x.isDummy(); },
-          [](const HostAssocDetails &x) { return x.symbol().IsDummy(); },
-          [](const auto &) { return false; }},
-      details_);
-}
-
 bool Symbol::IsFuncResult() const {
   return std::visit(
       common::visitors{[](const EntityDetails &x) { return x.isFuncResult(); },
@@ -389,7 +374,7 @@ llvm::raw_ostream &operator<<(
 
 llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Details &details) {
   os << DetailsToString(details);
-  std::visit(
+  std::visit( //
       common::visitors{
           [&](const UnknownDetails &) {},
           [&](const MainProgramDetails &) {},
@@ -413,7 +398,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Details &details) {
             os << ' ' << EnumToString(x.kind());
           },
           [&](const UseDetails &x) {
-            os << " from " << x.symbol().name() << " in " << x.module().name();
+            os << " from " << x.symbol().name() << " in "
+               << GetUsedModule(x).name();
           },
           [&](const UseErrorDetails &x) {
             os << " uses:";
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 3b68beaa557fc..707b88de1f2de 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -42,14 +42,6 @@ const Scope *FindModuleContaining(const Scope &start) {
       start, [](const Scope &scope) { return scope.IsModule(); });
 }
 
-const Symbol *FindCommonBlockContaining(const Symbol &object) {
-  if (const auto *details{object.detailsIf<ObjectEntityDetails>()}) {
-    return details->commonBlock();
-  } else {
-    return nullptr;
-  }
-}
-
 const Scope *FindProgramUnitContaining(const Scope &start) {
   return FindScopeContaining(start, [](const Scope &scope) {
     switch (scope.kind()) {
@@ -193,21 +185,6 @@ bool IsHostAssociated(const Symbol &symbol, const Scope &scope) {
       DoesScopeContain(FindProgramUnitContaining(symbol), *subprogram);
 }
 
-bool IsDummy(const Symbol &symbol) {
-  if (const auto *details{symbol.detailsIf<ObjectEntityDetails>()}) {
-    return details->isDummy();
-  } else if (const auto *details{symbol.detailsIf<ProcEntityDetails>()}) {
-    return details->isDummy();
-  } else {
-    return false;
-  }
-}
-
-bool IsStmtFunction(const Symbol &symbol) {
-  const auto *subprogram{symbol.detailsIf<SubprogramDetails>()};
-  return subprogram && subprogram->stmtFunction();
-}
-
 bool IsInStmtFunction(const Symbol &symbol) {
   if (const Symbol * function{symbol.owner().symbol()}) {
     return IsStmtFunction(*function);
@@ -227,80 +204,11 @@ bool IsPointerDummy(const Symbol &symbol) {
   return IsPointer(symbol) && IsDummy(symbol);
 }
 
-// variable-name
-bool IsVariableName(const Symbol &symbol) {
-  if (const Symbol * root{GetAssociationRoot(symbol)}) {
-    return root->has<ObjectEntityDetails>() && !IsNamedConstant(*root);
-  } else {
-    return false;
-  }
-}
-
 // proc-name
 bool IsProcName(const Symbol &symbol) {
   return symbol.GetUltimate().has<ProcEntityDetails>();
 }
 
-bool IsFunction(const Symbol &symbol) {
-  return std::visit(
-      common::visitors{
-          [](const SubprogramDetails &x) { return x.isFunction(); },
-          [&](const SubprogramNameDetails &) {
-            return symbol.test(Symbol::Flag::Function);
-          },
-          [](const ProcEntityDetails &x) {
-            const auto &ifc{x.interface()};
-            return ifc.type() || (ifc.symbol() && IsFunction(*ifc.symbol()));
-          },
-          [](const ProcBindingDetails &x) { return IsFunction(x.symbol()); },
-          [](const UseDetails &x) { return IsFunction(x.symbol()); },
-          [](const auto &) { return false; },
-      },
-      symbol.details());
-}
-
-bool IsPureProcedure(const Symbol &symbol) {
-  if (const auto *procDetails{symbol.detailsIf<ProcEntityDetails>()}) {
-    if (const Symbol * procInterface{procDetails->interface().symbol()}) {
-      // procedure component with a pure interface
-      return IsPureProcedure(*procInterface);
-    }
-  } else if (const auto *details{symbol.detailsIf<ProcBindingDetails>()}) {
-    return IsPureProcedure(details->symbol());
-  } else if (!IsProcedure(symbol)) {
-    return false;
-  }
-  if (IsStmtFunction(symbol)) {
-    // Section 15.7(1) states that a statement function is PURE if it does not
-    // reference an IMPURE procedure or a VOLATILE variable
-    const MaybeExpr &expr{symbol.get<SubprogramDetails>().stmtFunction()};
-    if (expr) {
-      for (const Symbol &refSymbol : evaluate::CollectSymbols(*expr)) {
-        if (IsFunction(refSymbol) && !IsPureProcedure(refSymbol)) {
-          return false;
-        }
-        if (const Symbol * root{GetAssociationRoot(refSymbol)}) {
-          if (root->attrs().test(Attr::VOLATILE)) {
-            return false;
-          }
-        }
-      }
-    }
-    return true; // statement function was not found to be impure
-  }
-  return symbol.attrs().test(Attr::PURE) ||
-      (symbol.attrs().test(Attr::ELEMENTAL) &&
-          !symbol.attrs().test(Attr::IMPURE));
-}
-
-bool IsPureProcedure(const Scope &scope) {
-  if (const Symbol * symbol{scope.GetSymbol()}) {
-    return IsPureProcedure(*symbol);
-  } else {
-    return false;
-  }
-}
-
 bool IsBindCProcedure(const Symbol &symbol) {
   if (const auto *procDetails{symbol.detailsIf<ProcEntityDetails>()}) {
     if (const Symbol * procInterface{procDetails->interface().symbol()}) {
@@ -319,25 +227,6 @@ bool IsBindCProcedure(const Scope &scope) {
   }
 }
 
-bool IsProcedure(const Symbol &symbol) {
-  return std::visit(
-      common::visitors{
-          [](const SubprogramDetails &) { return true; },
-          [](const SubprogramNameDetails &) { return true; },
-          [](const ProcEntityDetails &) { return true; },
-          [](const GenericDetails &) { return true; },
-          [](const ProcBindingDetails &) { return true; },
-          [](const UseDetails &x) { return IsProcedure(x.symbol()); },
-          // TODO: FinalProcDetails?
-          [](const auto &) { return false; },
-      },
-      symbol.details());
-}
-
-bool IsProcedurePointer(const Symbol &symbol) {
-  return symbol.has<ProcEntityDetails>() && IsPointer(symbol);
-}
-
 static const Symbol *FindPointerComponent(
     const Scope &scope, std::set<const Scope *> &visited) {
   if (!scope.IsDerivedType()) {
@@ -555,33 +444,6 @@ const DeclTypeSpec *FindParentTypeSpec(const Symbol &symbol) {
   return nullptr;
 }
 
-// When a construct association maps to a variable, and that variable
-// is not an array with a vector-valued subscript, return the base
-// Symbol of that variable, else nullptr.  Descends into other construct
-// associations when one associations maps to another.
-static const Symbol *GetAssociatedVariable(const AssocEntityDetails &details) {
-  if (const MaybeExpr & expr{details.expr()}) {
-    if (evaluate::IsVariable(*expr) && !evaluate::HasVectorSubscript(*expr)) {
-      if (const Symbol * varSymbol{evaluate::GetFirstSymbol(*expr)}) {
-        return GetAssociationRoot(*varSymbol);
-      }
-    }
-  }
-  return nullptr;
-}
-
-// Return the Symbol of the variable of a construct association, if it exists
-// Return nullptr if the name is associated with an expression
-const Symbol *GetAssociationRoot(const Symbol &symbol) {
-  const Symbol &ultimate{symbol.GetUltimate()};
-  if (const auto *details{ultimate.detailsIf<AssocEntityDetails>()}) {
-    // We have a construct association
-    return GetAssociatedVariable(*details);
-  } else {
-    return &ultimate;
-  }
-}
-
 bool IsExtensibleType(const DerivedTypeSpec *derived) {
   return derived && !IsIsoCType(derived) &&
       !derived->typeSymbol().attrs().test(Attr::BIND_C) &&
@@ -627,35 +489,6 @@ bool IsOrContainsEventOrLockComponent(const Symbol &symbol) {
   return false;
 }
 
-bool IsSaved(const Symbol &symbol) {
-  auto scopeKind{symbol.owner().kind()};
-  if (scopeKind == Scope::Kind::Module || scopeKind == Scope::Kind::BlockData) {
-    return true;
-  } else if (scopeKind == Scope::Kind::DerivedType) {
-    return false; // this is a component
-  } else if (IsNamedConstant(symbol)) {
-    return false;
-  } else if (symbol.attrs().test(Attr::SAVE)) {
-    return true;
-  } else {
-    if (const auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
-      if (object->init()) {
-        return true;
-      }
-    } else if (IsProcedurePointer(symbol)) {
-      if (symbol.get<ProcEntityDetails>().init()) {
-        return true;
-      }
-    }
-    if (const Symbol * block{FindCommonBlockContaining(symbol)}) {
-      if (block->attrs().test(Attr::SAVE)) {
-        return true;
-      }
-    }
-    return false;
-  }
-}
-
 // Check this symbol suitable as a type-bound procedure - C769
 bool CanBeTypeBoundProc(const Symbol *symbol) {
   if (!symbol || IsDummy(*symbol) || IsProcedurePointer(*symbol)) {
diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp
index d8c817bd92cc3..75f728eef4e13 100644
--- a/flang/lib/Semantics/type.cpp
+++ b/flang/lib/Semantics/type.cpp
@@ -165,16 +165,6 @@ void DerivedTypeSpec::AddParamValue(SourceName name, ParamValue &&value) {
   CHECK(pair.second); // name was not already present
 }
 
-int DerivedTypeSpec::NumLengthParameters() const {
-  int result{0};
-  for (const auto &pair : parameters_) {
-    if (pair.second.isLen()) {
-      ++result;
-    }
-  }
-  return result;
-}
-
 bool DerivedTypeSpec::MightBeParameterized() const {
   return !cooked_ || !parameters_.empty();
 }
@@ -487,37 +477,6 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &o, const ShapeSpec &x) {
   return o;
 }
 
-bool ArraySpec::IsExplicitShape() const {
-  return CheckAll([](const ShapeSpec &x) { return x.ubound().isExplicit(); });
-}
-bool ArraySpec::IsAssumedShape() const {
-  return CheckAll([](const ShapeSpec &x) { return x.ubound().isDeferred(); });
-}
-bool ArraySpec::IsDeferredShape() const {
-  return CheckAll([](const ShapeSpec &x) {
-    return x.lbound().isDeferred() && x.ubound().isDeferred();
-  });
-}
-bool ArraySpec::IsImpliedShape() const {
-  return !IsAssumedRank() &&
-      CheckAll([](const ShapeSpec &x) { return x.ubound().isAssumed(); });
-}
-bool ArraySpec::IsAssumedSize() const {
-  return !empty() && !IsAssumedRank() && back().ubound().isAssumed() &&
-      std::all_of(begin(), end() - 1,
-          [](const ShapeSpec &x) { return x.ubound().isExplicit(); });
-}
-bool ArraySpec::IsAssumedRank() const {
-  return Rank() == 1 && front().lbound().isAssumed();
-}
-bool ArraySpec::IsConstantShape() const {
-  return CheckAll([](const ShapeSpec &x) {
-    const auto &lb{x.lbound().GetExplicit()};
-    const auto &ub{x.ubound().GetExplicit()};
-    return lb && ub && IsConstantExpr(*lb) && IsConstantExpr(*ub);
-  });
-}
-
 llvm::raw_ostream &operator<<(
     llvm::raw_ostream &os, const ArraySpec &arraySpec) {
   char sep{'('};
@@ -634,35 +593,6 @@ bool DeclTypeSpec::IsSequenceType() const {
   return false;
 }
 
-IntrinsicTypeSpec *DeclTypeSpec::AsIntrinsic() {
-  switch (category_) {
-  case Numeric:
-    return &std::get<NumericTypeSpec>(typeSpec_);
-  case Logical:
-    return &std::get<LogicalTypeSpec>(typeSpec_);
-  case Character:
-    return &std::get<CharacterTypeSpec>(typeSpec_);
-  default:
-    return nullptr;
-  }
-}
-const IntrinsicTypeSpec *DeclTypeSpec::AsIntrinsic() const {
-  return const_cast<DeclTypeSpec *>(this)->AsIntrinsic();
-}
-
-DerivedTypeSpec *DeclTypeSpec::AsDerived() {
-  switch (category_) {
-  case TypeDerived:
-  case ClassDerived:
-    return &std::get<DerivedTypeSpec>(typeSpec_);
-  default:
-    return nullptr;
-  }
-}
-const DerivedTypeSpec *DeclTypeSpec::AsDerived() const {
-  return const_cast<DeclTypeSpec *>(this)->AsDerived();
-}
-
 const NumericTypeSpec &DeclTypeSpec::numericTypeSpec() const {
   CHECK(category_ == Numeric);
   return std::get<NumericTypeSpec>(typeSpec_);

From 7d77b0545dd850f59209c47ea03cde048673b8dd Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Fri, 29 May 2020 19:49:11 -0400
Subject: [PATCH 599/770] [gn build] (manually) port 0e265e31578

---
 llvm/utils/gn/secondary/clang/lib/Testing/BUILD.gn   | 11 +++++++++++
 llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 llvm/utils/gn/secondary/clang/lib/Testing/BUILD.gn

diff --git a/llvm/utils/gn/secondary/clang/lib/Testing/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Testing/BUILD.gn
new file mode 100644
index 0000000000000..2a00a2248babe
--- /dev/null
+++ b/llvm/utils/gn/secondary/clang/lib/Testing/BUILD.gn
@@ -0,0 +1,11 @@
+static_library("Testing") {
+  output_name = "clangTesting"
+  configs += [ "//llvm/utils/gn/build:clang_code" ]
+  deps = [
+    "//llvm/lib/Support",
+  ]
+  sources = [
+    # Make `gn format` not collapse this, for sync_source_lists_from_cmake.py.
+    "CommandLineArgs.cpp",
+  ]
+}
diff --git a/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn
index 47f8e952256bf..f25ead00165c0 100644
--- a/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/AST/BUILD.gn
@@ -8,6 +8,7 @@ unittest("ASTTests") {
     "//clang/lib/Analysis",
     "//clang/lib/Basic",
     "//clang/lib/Frontend",
+    "//clang/lib/Testing",
     "//clang/lib/Tooling",
     "//llvm/lib/Support",
     "//llvm/lib/Testing/Support",
@@ -30,7 +31,6 @@ unittest("ASTTests") {
     "DeclTest.cpp",
     "EvaluateAsRValueTest.cpp",
     "ExternalASTSourceTest.cpp",
-    "Language.cpp",
     "NamedDeclPrinterTest.cpp",
     "RecursiveASTVisitorTest.cpp",
     "SizelessTypesTest.cpp",

From 7318e2400009ca07f059a047674b010a0c77081f Mon Sep 17 00:00:00 2001
From: Tony <Tony.Tye@amd.com>
Date: Fri, 29 May 2020 19:03:50 -0400
Subject: [PATCH 600/770] [AMDGPU] Add loaded code object path URI definition
 to AMDGPUUsage

Differential Revision: https://reviews.llvm.org/D80407
---
 llvm/docs/AMDGPUUsage.rst | 54 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 7f8df7034d930..69c7f88f945e1 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1095,6 +1095,60 @@ the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``.
 There is no current OS loader support for 32-bit programs and so
 ``R_AMDGPU_ABS32`` is not used.
 
+.. _amdgpu-loaded-code-object-path-uniform-resource-identifier:
+
+Loaded Code Object Path Uniform Resource Identifier (URI)
+---------------------------------------------------------
+
+The AMD GPU code object loader represents the path of the ELF shared object from
+which the code object was loaded as a textual Uniform Resource Identifier (URI).
+Note that the code object is the in memory loaded relocated form of the ELF
+shared object.  Multiple code objects may be loaded at different memory
+addresses in the same process from the same ELF shared object.
+
+The loaded code object path URI syntax is defined by the following BNF syntax:
+
+.. code::
+
+  code_object_uri ::== file_uri | memory_uri
+  file_uri        ::== "file://" file_path [ range_specifier ]
+  memory_uri      ::== "memory://" process_id range_specifier
+  range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number
+  file_path       ::== URI_ENCODED_OS_FILE_PATH
+  process_id      ::== DECIMAL_NUMBER
+  number          ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
+
+**number**
+  Is a C integral literal where hexadecimal values are prefixed by "0x" or "0X",
+  and octal values by "0".
+
+**file_path**
+  Is the file's path specified as a URI encoded UTF-8 string. In URI encoding,
+  every character that is not in the regular expression ``[a-zA-Z0-9/_.~-]`` is
+  encoded as two uppercase hexadecimal digits preceded by "%".  Directories in
+  the path are separated by "/".
+
+**offset**
+  Is a 0-based byte offset to the start of the code object.  For a file URI, it
+  is from the start of the file specified by the ``file_path``, and if omitted
+  defaults to 0. For a memory URI, it is the memory address and is required.
+
+**size**
+  Is the number of bytes in the code object.  For a file URI, if omitted it
+  defaults to the size of the file.  It is required for a memory URI.
+
+**process_id**
+  Is the identity of the process owning the memory.  For Linux it is the C
+  unsigned integral decimal literal for the process ID (PID).
+
+For example:
+
+.. code::
+
+  file:///dir1/dir2/file1
+  file:///dir3/dir4/file2#offset=0x2000&size=3000
+  memory://1234#offset=0x20000&size=3000
+
 .. _amdgpu-dwarf-debug-information:
 
 DWARF Debug Information

From 26c070c8db165f8692751881fa3aa6793302717a Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 29 May 2020 23:53:22 +0000
Subject: [PATCH 601/770] [gn build] Port 34cfed24ebd

---
 llvm/utils/gn/secondary/llvm/unittests/MC/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/unittests/MC/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/MC/BUILD.gn
index 01be445487d28..6d117dce798d2 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/MC/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/MC/BUILD.gn
@@ -10,6 +10,7 @@ unittest("MCTests") {
   sources = [
     "Disassembler.cpp",
     "DwarfLineTables.cpp",
+    "MCDisassemblerTest.cpp",
     "MCInstPrinter.cpp",
     "StringTableBuilderTest.cpp",
     "TargetRegistry.cpp",

From 02f6f1ebb1f16e7e324df5201ce7003a4d9f2570 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 29 May 2020 23:53:23 +0000
Subject: [PATCH 602/770] [gn build] Port cf6cc662eee

---
 llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
index 10c0e0cf35ba2..8ec471e130e32 100644
--- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
@@ -68,6 +68,7 @@ static_library("Sema") {
     "SemaOpenMP.cpp",
     "SemaOverload.cpp",
     "SemaPseudoObject.cpp",
+    "SemaSYCL.cpp",
     "SemaStmt.cpp",
     "SemaStmtAsm.cpp",
     "SemaStmtAttr.cpp",

From 77b1ed4b4a492d5236f936f14caedd44b275e472 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 16:17:40 -0700
Subject: [PATCH 603/770] [SVE] Eliminate calls to default-false
 VectorType::get() from Linker

Reviewers: efriedma, tejohnson, sdesmalen, c-rhodes

Reviewed By: efriedma

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80326
---
 llvm/lib/Linker/IRMover.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index ce1133583f7a7..055689b16e8f4 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -305,10 +305,11 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
   case Type::ArrayTyID:
     return *Entry = ArrayType::get(ElementTypes[0],
                                    cast<ArrayType>(Ty)->getNumElements());
-  case Type::FixedVectorTyID:
   case Type::ScalableVectorTyID:
-    return *Entry = VectorType::get(ElementTypes[0],
-                                    cast<VectorType>(Ty)->getNumElements());
+    // FIXME: handle scalable vectors
+  case Type::FixedVectorTyID:
+    return *Entry = FixedVectorType::get(
+               ElementTypes[0], cast<FixedVectorType>(Ty)->getNumElements());
   case Type::PointerTyID:
     return *Entry = PointerType::get(ElementTypes[0],
                                      cast<PointerType>(Ty)->getAddressSpace());

From 3f0841f6d0a0eb86a1c36cc0c76931ae9d7bc77a Mon Sep 17 00:00:00 2001
From: Jared Wyles <jared.wyles@gmail.com>
Date: Sat, 30 May 2020 09:11:42 +1000
Subject: [PATCH 604/770] [jitlink] R_X86_64_PC32 support for the elf x86
 jitlinker

Summary:

Adding in our first relocation type, and all the required plumbing to support the rest in following patches

Differential Revision: https://reviews.llvm.org/D80613

Reviewer: lhames
---
 .../llvm/ExecutionEngine/JITLink/ELF_x86_64.h |  43 +++--
 .../ExecutionEngine/JITLink/ELF_x86_64.cpp    | 161 +++++++++++++-----
 .../JITLink/X86/ELF_x86-64_relocations.s      |  24 ++-
 3 files changed, 164 insertions(+), 64 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
index ee43c356aebe3..7860088f35692 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h
@@ -19,36 +19,33 @@ namespace llvm {
 namespace jitlink {
 
 namespace ELF_x86_64_Edges {
-
 enum ELFX86RelocationKind : Edge::Kind {
-  R_AMD64_NONE = Edge::FirstRelocation,
-  R_AMD64_64,
-  R_AMD64_PC32,
-  R_AMD64_GOT32,
-  R_AMD64_PLT32,
-  R_AMD64_COPY,
-  R_AMD64_GLOB_DAT,
-  R_AMD64_JUMP_SLOT,
-  R_AMD64_RELATIVE,
-  R_AMD64_GOTPCREL,
-  R_AMD64_32,
-  R_AMD64_32S,
-  R_AMD64_16,
-  R_AMD64_PC16,
-  R_AMD64_8,
-  R_AMD64_PC8,
-  R_AMD64_PC64,
-  R_AMD64_GOTOFF64,
-  R_AMD64_GOTPC32,
-  R_AMD64_SIZE32,
-  R_AMD64_SIZE64
+  Branch32 = Edge::FirstRelocation,
+  Branch32ToStub,
+  Pointer32,
+  Pointer64,
+  Pointer64Anon,
+  PCRel32,
+  PCRel32Minus1,
+  PCRel32Minus2,
+  PCRel32Minus4,
+  PCRel32Anon,
+  PCRel32Minus1Anon,
+  PCRel32Minus2Anon,
+  PCRel32Minus4Anon,
+  PCRel32GOTLoad,
+  PCRel32GOT,
+  PCRel32TLV,
+  Delta32,
+  Delta64,
+  NegDelta32,
+  NegDelta64,
 };
 
 } // end namespace ELF_x86_64_Edges
 
 /// jit-link the given object buffer, which must be a ELF x86-64 object file.
 void jitLink_ELF_x86_64(std::unique_ptr<JITLinkContext> Ctx);
-StringRef getELFX86RelocationKindName(Edge::Kind R);
 } // end namespace jitlink
 } // end namespace llvm
 
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
index a7118eb9b563f..505f03590b6b0 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
@@ -24,12 +24,19 @@ static const char *CommonSectionName = "__common";
 
 namespace llvm {
 namespace jitlink {
+
 // This should become a template as the ELFFile is so a lot of this could become
 // generic
 class ELFLinkGraphBuilder_x86_64 {
 
 private:
   Section *CommonSection = nullptr;
+  // TODO hack to get this working
+  // Find a better way
+  using SymbolTable = object::ELFFile<object::ELF64LE>::Elf_Shdr;
+  // For now we just assume
+  std::map<int32_t, Symbol *> JITSymbolTable;
+
   Section &getCommonSection() {
     if (!CommonSection) {
       auto Prot = static_cast<sys::Memory::ProtectionFlags>(
@@ -39,10 +46,21 @@ class ELFLinkGraphBuilder_x86_64 {
     return *CommonSection;
   }
 
+  static Expected<ELF_x86_64_Edges::ELFX86RelocationKind>
+  getRelocationKind(const uint32_t Type) {
+    switch (Type) {
+    case ELF::R_X86_64_PC32:
+      return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32;
+    }
+    return make_error<JITLinkError>("Unsupported x86-64 relocation:" +
+                                    formatv("{0:d}", Type));
+  }
+
   std::unique_ptr<LinkGraph> G;
   // This could be a template
   const object::ELFFile<object::ELF64LE> &Obj;
   object::ELFFile<object::ELF64LE>::Elf_Shdr_Range sections;
+  SymbolTable SymTab;
 
   bool isRelocatable() { return Obj.getHeader()->e_type == llvm::ELF::ET_REL; }
 
@@ -88,11 +106,11 @@ class ELFLinkGraphBuilder_x86_64 {
         // FIXME: Read size.
         (void)Size;
 
-        if (auto NameOrErr = SymRef.getName(*StringTable)) {
+        if (auto NameOrErr = SymRef.getName(*StringTable))
           Name = *NameOrErr;
-        } else {
+        else
           return NameOrErr.takeError();
-        }
+
         LLVM_DEBUG({
           dbgs() << "  ";
           if (!Name)
@@ -157,12 +175,93 @@ class ELFLinkGraphBuilder_x86_64 {
         // Do this here because we have it, but move it into graphify later
         G->createContentBlock(section, StringRef(Data, Size), Address,
                               Alignment, 0);
+        if (SecRef.sh_type == ELF::SHT_SYMTAB)
+          // TODO: Dynamic?
+          SymTab = SecRef;
       }
     }
 
     return Error::success();
   }
 
+  Error addRelocations() {
+    LLVM_DEBUG(dbgs() << "Adding relocations\n");
+    // TODO a pattern is forming of iterate some sections but only give me
+    // ones I am interested in, I should abstract that concept somewhere
+    for (auto &SecRef : sections) {
+      if (SecRef.sh_type != ELF::SHT_RELA && SecRef.sh_type != ELF::SHT_REL)
+        continue;
+      // TODO can the elf obj file do this for me?
+      if (SecRef.sh_type == ELF::SHT_REL)
+        return make_error<llvm::StringError>("Shouldn't have REL in x64",
+                                             llvm::inconvertibleErrorCode());
+
+      auto RelSectName = Obj.getSectionName(&SecRef);
+      if (!RelSectName)
+        return RelSectName.takeError();
+      // Deal with .eh_frame later
+      if (*RelSectName == StringRef(".rela.eh_frame"))
+        continue;
+
+      auto UpdateSection = Obj.getSection(SecRef.sh_info);
+      if (!UpdateSection)
+        return UpdateSection.takeError();
+
+      auto UpdateSectionName = Obj.getSectionName(*UpdateSection);
+      if (!UpdateSectionName)
+        return UpdateSectionName.takeError();
+
+      auto JITSection = G->findSectionByName(*UpdateSectionName);
+      if (!JITSection)
+        return make_error<llvm::StringError>(
+            "Refencing a a section that wasn't added to graph" +
+                *UpdateSectionName,
+            llvm::inconvertibleErrorCode());
+
+      auto Relocations = Obj.relas(&SecRef);
+      if (!Relocations)
+        return Relocations.takeError();
+
+      for (const auto &Rela : *Relocations) {
+        auto Type = Rela.getType(false);
+
+        LLVM_DEBUG({
+          dbgs() << "Relocation Type: " << Type << "\n"
+                 << "Name: " << Obj.getRelocationTypeName(Type) << "\n";
+        });
+
+        auto Symbol = Obj.getRelocationSymbol(&Rela, &SymTab);
+        if (!Symbol)
+          return Symbol.takeError();
+
+        auto BlockToFix = *(JITSection->blocks().begin());
+        auto TargetSymbol = JITSymbolTable[(*Symbol)->st_shndx];
+        uint64_t Addend = Rela.r_addend;
+        JITTargetAddress FixupAddress =
+            (*UpdateSection)->sh_addr + Rela.r_offset;
+
+        LLVM_DEBUG({
+          dbgs() << "Processing relocation at "
+                 << format("0x%016" PRIx64, FixupAddress) << "\n";
+        });
+        auto Kind = getRelocationKind(Type);
+        if (!Kind)
+          return Kind.takeError();
+
+        LLVM_DEBUG({
+          Edge GE(*Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol,
+                  Addend);
+          // TODO a mapping of KIND => type then call getRelocationTypeName4
+          printEdge(dbgs(), *BlockToFix, GE, StringRef(""));
+          dbgs() << "\n";
+        });
+        BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(),
+                            *TargetSymbol, Addend);
+      }
+    }
+    return Error::success();
+  }
+
   Error graphifyRegularSymbols() {
 
     // TODO: ELF supports beyond SHN_LORESERVE,
@@ -219,11 +318,10 @@ class ELFLinkGraphBuilder_x86_64 {
         if (!Name)
           return Name.takeError();
         // TODO: weak and hidden
-        if (SymRef.isExternal()) {
+        if (SymRef.isExternal())
           bindings = {Linkage::Strong, Scope::Default};
-        } else {
+        else
           bindings = {Linkage::Strong, Scope::Local};
-        }
 
         if (SymRef.isDefined() &&
             (Type == ELF::STT_FUNC || Type == ELF::STT_OBJECT)) {
@@ -247,9 +345,10 @@ class ELFLinkGraphBuilder_x86_64 {
           auto B = *bs.begin();
           LLVM_DEBUG({ dbgs() << "  " << *Name << ": "; });
 
-          G->addDefinedSymbol(*B, SymRef.getValue(), *Name, SymRef.st_size,
-                              bindings.first, bindings.second,
-                              SymRef.getType() == ELF::STT_FUNC, false);
+          auto &S = G->addDefinedSymbol(
+              *B, SymRef.getValue(), *Name, SymRef.st_size, bindings.first,
+              bindings.second, SymRef.getType() == ELF::STT_FUNC, false);
+          JITSymbolTable[SymRef.st_shndx] = &S;
         }
         //TODO: The following has to be implmented.
         // leaving commented out to save time for future patchs
@@ -298,6 +397,9 @@ class ELFLinkGraphBuilder_x86_64 {
     if (auto Err = graphifyRegularSymbols())
       return std::move(Err);
 
+    if (auto Err = addRelocations())
+      return std::move(Err);
+
     return std::move(G);
   }
 };
@@ -311,9 +413,7 @@ class ELFJITLinker_x86_64 : public JITLinker<ELFJITLinker_x86_64> {
       : JITLinker(std::move(Ctx), std::move(PassConfig)) {}
 
 private:
-  StringRef getEdgeKindName(Edge::Kind R) const override {
-    return getELFX86RelocationKindName(R);
-  }
+  StringRef getEdgeKindName(Edge::Kind R) const override { return StringRef(); }
 
   Expected<std::unique_ptr<LinkGraph>>
   buildGraph(MemoryBufferRef ObjBuffer) override {
@@ -329,7 +429,17 @@ class ELFJITLinker_x86_64 : public JITLinker<ELFJITLinker_x86_64> {
   }
 
   Error applyFixup(Block &B, const Edge &E, char *BlockWorkingMem) const {
-    //TODO: add relocation handling
+    using namespace ELF_x86_64_Edges;
+    char *FixupPtr = BlockWorkingMem + E.getOffset();
+    JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
+    switch (E.getKind()) {
+
+    case ELFX86RelocationKind::PCRel32:
+      int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
+      // verify
+      *(support::little32_t *)FixupPtr = Value;
+      break;
+    }
     return Error::success();
   }
 };
@@ -349,30 +459,5 @@ void jitLink_ELF_x86_64(std::unique_ptr<JITLinkContext> Ctx) {
 
   ELFJITLinker_x86_64::link(std::move(Ctx), std::move(Config));
 }
-
-StringRef getELFX86RelocationKindName(Edge::Kind R) {
-  // case R_AMD64_NONE:
-  //   return "None";
-  // case R_AMD64_PC32:
-  // case R_AMD64_GOT32:
-  // case R_AMD64_PLT32,
-  // R_AMD64_COPY,
-  // R_AMD64_GLOB_DAT,
-  // R_AMD64_JUMP_SLOT,
-  // R_AMD64_RELATIVE,
-  // R_AMD64_GOTPCREL,
-  // R_AMD64_32,
-  // R_AMD64_32S,
-  // R_AMD64_16,
-  // R_AMD64_PC16,
-  // R_AMD64_8,
-  // R_AMD64_PC8,
-  // R_AMD64_PC64,
-  // R_AMD64_GOTOFF64,
-  // R_AMD64_GOTPC32,
-  // R_AMD64_SIZE32,
-  // R_AMD64_SIZE64
-  return getGenericEdgeKindName(static_cast<Edge::Kind>(R));
-}
 } // end namespace jitlink
 } // end namespace llvm
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_relocations.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_relocations.s
index 12186e4f5433c..20b26b1826ba9 100644
--- a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_relocations.s
@@ -1,20 +1,38 @@
 # RUN: rm -rf %t && mkdir -p %t
 # RUN: llvm-mc -triple=x86_64-unknown-linux -filetype=obj -o %t/elf_reloc.o %s
-# RUN: llvm-jitlink -noexec %t/elf_reloc.o
+# RUN: llvm-jitlink -noexec -check %s %t/elf_reloc.o
 #
 # Test standard ELF relocations.
 
         .text
         .file   "testcase.c"
+
+# Empty main entry point.
         .globl  main
         .p2align        4, 0x90
         .type   main,@function
 main:
-        movl    $42, %eax
         retq
 .Lfunc_end0:
         .size   main, .Lfunc_end0-main
 
+# Test PCRel32 / R_X86_64_PC32 handling.
+# jitlink-check: decode_operand(test_pcrel32, 4) = named_data - next_pc(test_pcrel32)
+        .globl  test_pcrel32
+        .p2align       4, 0x90
+        .type  test_pcrel32,@function
+test_pcrel32:
+        movl    named_data(%rip), %eax
+.Ltest_pcrel32_end:
+        .size   test_pcrel32, .Ltest_pcrel32_end-test_pcrel32
+
+        .type   named_data,@object
+        .data
+        .p2align        2
+named_data:
+        .long   42
+        .size   named_data, 4
+
         .ident  "clang version 10.0.0-4ubuntu1 "
         .section        ".note.GNU-stack","",@progbits
-        .addrsig
\ No newline at end of file
+        .addrsig

From aad936548247107b1afd92ea9229b396564063b9 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Fri, 29 May 2020 17:44:51 -0700
Subject: [PATCH 605/770] [SVE] Eliminate calls to default-false
 VectorType::get() from AMDGPU

Reviewers: efriedma, david-arm, fpetrogalli, arsenm

Reviewed By: david-arm

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, tschuett, hiraditya, rkruppe, psnobl, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80328
---
 llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp      | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp             | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp              | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp  | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 46ffc77d8edd4..c9d25d4250d55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -438,7 +438,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
+  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 956508e12227b..0ef8586930276 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -598,7 +598,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
   if (Size <= 8)
     PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
   else
-    PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8);
+    PtrElemTy = FixedVectorType::get(Type::getInt64Ty(Ctx), Size / 8);
   unsigned PtrArgLoc = CI->getNumArgOperands() - 3;
   auto PtrArg = CI->getArgOperand(PtrArgLoc);
   unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index b39039861f517..2b5143ba7506c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -902,7 +902,7 @@ static Type* getIntrinsicParamType(
     return nullptr;
   }
   if (P.VectorSize > 1)
-    T = VectorType::get(T, P.VectorSize);
+    T = FixedVectorType::get(T, P.VectorSize);
   if (P.PtrKind != AMDGPULibFunc::BYVALUE)
     T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
                                        - 1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 52e192e576dda..58bd6e5f3b2b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -167,7 +167,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
     }
 
     if (IsV3 && Size >= 32) {
-      V4Ty = VectorType::get(VT->getElementType(), 4);
+      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
       // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
       AdjustedArgTy = V4Ty;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index ab5b62ccf82e0..524a34be876ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -516,7 +516,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
             break;
           }
           if (EleCount > 1) {
-            IType = dyn_cast<Type>(VectorType::get(IType, EleCount));
+            IType = FixedVectorType::get(IType, EleCount);
           }
           Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch);
           WhatToStore.push_back(Arg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index eeedfe7a8c029..9e738dd6fdb30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -334,12 +334,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
       SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
       DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    return VectorType::get(Type::getInt32Ty(Context), 2);
+    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
   }
 
   // Global memory works best with 16-byte accesses. Private memory will also
   // hit this, although they'll be decomposed.
-  return VectorType::get(Type::getInt32Ty(Context), 4);
+  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
 }
 
 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(

From f012c58abdb086ea093d48e7a705f98f9e9561c6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 28 May 2020 20:38:16 -0400
Subject: [PATCH 606/770] AMDGPU: Move MIMG MMO check to verifier

---
 llvm/lib/Target/AMDGPU/MIMGInstructions.td    |   1 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  10 --
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   5 +
 .../CodeGen/AMDGPU/coalescer-subreg-join.mir  |   4 +-
 llvm/test/CodeGen/AMDGPU/memory_clause.mir    |  14 +--
 llvm/test/CodeGen/AMDGPU/merge-image-load.mir |   4 +-
 .../CodeGen/AMDGPU/merge-image-sample.mir     | 104 +++++++++---------
 llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir  |  10 +-
 .../CodeGen/AMDGPU/postra-bundle-memops.mir   |   8 +-
 .../CodeGen/AMDGPU/waitcnt-preexisting.mir    |   2 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir |   8 +-
 11 files changed, 82 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index e87d672432319..a46ca9b34be39 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -132,7 +132,6 @@ class MIMG_Base <dag outs, string dns = "">
 
   let DecoderNamespace = dns;
   let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
-  let usesCustomInserter = 1;
 }
 
 class MIMG <dag outs, string dns = "">
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 43588c7de45a4..619ce1abeb813 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3716,16 +3716,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   MachineFunction *MF = BB->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
-  if (TII->isMIMG(MI)) {
-    if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
-      report_fatal_error("missing mem operand from MIMG instruction");
-    }
-    // Add a memoperand for mimg instructions so that they aren't assumed to
-    // be ordered memory instuctions.
-
-    return BB;
-  }
-
   switch (MI.getOpcode()) {
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0300de69caea8..d7508c3b1ea2a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3448,6 +3448,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     return true;
   }
 
+  if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
+    ErrInfo = "missing memory operand from MIMG instruction.";
+    return false;
+  }
+
   // Make sure the register classes are correct.
   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
     if (MI.getOperand(i).isFPImm()) {
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
index 33cfffa58d8a3..34c83f9bae497 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -61,7 +61,7 @@ body:             |
     %11.sub6 = COPY %1
     %11.sub7 = COPY %1
     %11.sub8 = COPY %1
-    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
+    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4)
     %20.sub1 = COPY %2
     %20.sub2 = COPY %2
     %20.sub3 = COPY %2
@@ -70,6 +70,6 @@ body:             |
     %20.sub6 = COPY %2
     %20.sub7 = COPY %2
     %20.sub8 = COPY %2
-    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
+    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4)
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
index 9f154ed8bddb0..efa042574f9a4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
@@ -325,12 +325,12 @@ body:             |
   bb.0:
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
-    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    IMAGE_STORE_V4_V2 %4, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    IMAGE_STORE_V4_V2 %5, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 %4, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 %5, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
 ...
 
 # GCN-LABEL: {{^}}name: mixed_clause{{$}}
@@ -355,7 +355,7 @@ body:             |
     %0 = IMPLICIT_DEF
     %1 = IMPLICIT_DEF
     %2 = IMPLICIT_DEF
-    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
     %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
     %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
index 80b8e195b1baa..fc91d650f972b 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir
@@ -148,7 +148,7 @@ body:             |
     %4:vreg_128 = COPY %2
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
@@ -372,7 +372,7 @@ body:             |
 ---
 
 # GFX9-LABEL: name: image_load_mip_merged_v1v3
-# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) 
+# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4)
 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0
 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3
 
diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
index 3d0c5932925e8..304b67786fa39 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir
@@ -15,7 +15,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 # GFX9-LABEL: name: image_sample_l_merged_v1v3_reversed
@@ -33,7 +33,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -52,7 +52,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) 
+    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
 ---
 
@@ -71,7 +71,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
-    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) 
+    %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4)
 ...
 ---
 
@@ -90,7 +90,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) 
+    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
 ---
 
@@ -109,7 +109,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
-    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) 
+    %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
 ...
 ---
 
@@ -128,9 +128,9 @@ body:             |
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %9:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %7:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %9:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %7:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
     %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %11:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %11:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -148,8 +148,8 @@ body:             |
     %4:vreg_128 = COPY %2
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -167,7 +167,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -185,7 +185,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -242,7 +242,7 @@ body:             |
     %5:vgpr_32 = COPY %2.sub3
     %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -408,7 +408,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -428,7 +428,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -448,7 +448,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -468,7 +468,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -488,7 +488,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -508,7 +508,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -528,7 +528,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -548,7 +548,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -568,7 +568,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -588,7 +588,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -608,7 +608,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -628,7 +628,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -648,7 +648,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -668,7 +668,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -688,7 +688,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -708,7 +708,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -728,7 +728,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -748,7 +748,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -768,7 +768,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -788,7 +788,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -808,7 +808,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -828,7 +828,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -848,7 +848,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -868,7 +868,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -888,7 +888,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -908,7 +908,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -928,7 +928,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -948,7 +948,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -968,7 +968,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -988,7 +988,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1008,7 +1008,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1028,7 +1028,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1048,7 +1048,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1068,7 +1068,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1088,7 +1088,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1108,7 +1108,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1128,7 +1128,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1148,7 +1148,7 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
 
@@ -1168,6 +1168,6 @@ body:             |
     %4:vgpr_32 = COPY %2.sub3
     %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
-    %7:vreg_96 = IMAGE_SAMPLE_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+    %7:vreg_96 = IMAGE_SAMPLE_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
index 5829cf60a58be..89ce7fd3100f3 100644
--- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
@@ -8,7 +8,7 @@
 name:            hazard_image_sample_d_buf_off6
 body:            |
   bb.0:
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...
 
@@ -19,7 +19,7 @@ body:            |
 name:            no_hazard_image_sample_d_buf_off1
 body:            |
   bb.0:
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, implicit $exec
 ...
 
@@ -31,7 +31,7 @@ body:            |
 name:            no_hazard_image_sample_d_buf_far
 body:            |
   bb.0:
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
     V_NOP_e32 implicit $exec
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...
@@ -44,7 +44,7 @@ body:            |
 name:            no_hazard_image_sample_v4_v2_buf_off6
 body:            |
   bb.0:
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 undef $vgpr1_vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 undef $vgpr1_vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load 16)
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...
 
@@ -56,6 +56,6 @@ body:            |
 name:            no_hazard_image_sample_v4_v3_buf_off6
 body:            |
   bb.0:
-    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V3_nsa_gfx10 undef $vgpr1, undef $vgpr2, undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V3_nsa_gfx10 undef $vgpr1, undef $vgpr2, undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index 00742ebdfe4e9..3202fad8f70ec 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -101,10 +101,10 @@ body:             |
     $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3,  undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
-    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4)
+    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
+    IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     S_NOP 0
     $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0
     $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 128, 0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
index 92a613b1cc74e..40b381383aa79 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
@@ -32,6 +32,6 @@ body:             |
     $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
     $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
     $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec
-    IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+    IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16)
     S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
index 90009b6084289..c72a4a95c87a6 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
@@ -46,8 +46,8 @@ body: |
     ; GFX9: S_WAITCNT 0
     ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
 ...
 
 # Image load vs image sample. Waitcnt required because they are not guaranteed
@@ -65,6 +65,6 @@ body: |
     ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GFX9: S_WAITCNT 3952
     ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
-    $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
+    $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16)
+    $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16)
 ...

From 4f300d499631504acdd32219254e939697202285 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 29 May 2020 16:11:58 -0400
Subject: [PATCH 607/770] AMDGPU: Add new baseline tests for setreg handling

Most of these should be identical and use a common prefix, but
update_llc_test_checks is failing to generate shared checks for some
reason.
---
 .../AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll | 892 +++++++++++++++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll    | 865 ++++++++++++++++-
 2 files changed, 1705 insertions(+), 52 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
index 85ed95eec0ae3..72de32e5a5ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@@ -1,56 +1,890 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+
+; FIXME: This test has a DAG duplicate
+
+; Immediate values:
+; (mode register ID = 1) | (Offset << 6) | ((Width - 1) << 11)
+; Offset: fp_round = 0, fp_denorm = 4, dx10_clamp = 8, ieee_mode = 9
+
 
 ; Set FP32 fp_round to round to zero
 define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() {
-; GCN-LABEL: test_setreg_f32_round_mode_rtz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 3), 3
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 4097, i32 3)
+; GFX6789-LABEL: test_setreg_f32_round_mode_rtz:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f32_round_mode_rtz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2049, i32 3)
+  call void asm sideeffect "", ""()
   ret void
 }
 
 ; Set FP64/FP16 fp_round to round to zero
 define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() {
-; GCN-LABEL: test_setreg_f64_round_mode_rtz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 3), 3
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 4225, i32 3)
+; GFX6789-LABEL: test_setreg_f64_round_mode_rtz:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f64_round_mode_rtz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2177, i32 3)
+  call void asm sideeffect "", ""()
   ret void
 }
 
 ; Set all fp_round to round to zero
 define amdgpu_kernel void @test_setreg_all_round_mode_rtz() {
-; GCN-LABEL: test_setreg_all_round_mode_rtz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 5), 7
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 8193, i32 7)
+; GFX6789-LABEL: test_setreg_all_round_mode_rtz:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_all_round_mode_rtz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6273, i32 7)
+  call void asm sideeffect "", ""()
   ret void
 }
 
 ; Set FP32 fp_round to dynamic mode
 define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) {
-; GCN-LABEL: test_setreg_roundingmode_var:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+; GFX6789-LABEL: test_setreg_roundingmode_var:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_roundingmode_var:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2049, i32 %var.mode)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_ieee_mode_off() {
+; GFX6789-LABEL: test_setreg_ieee_mode_off:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_ieee_mode_off:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 577, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_ieee_mode_on() {
+; GFX6789-LABEL: test_setreg_ieee_mode_on:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_ieee_mode_on:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 577, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_dx10_clamp_off() {
+; GFX6789-LABEL: test_setreg_dx10_clamp_off:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_dx10_clamp_off:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 513, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_dx10_clamp_on() {
+; GFX6789-LABEL: test_setreg_dx10_clamp_on:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_dx10_clamp_on:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 513, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Sets full width of fp round and fp denorm fields, to a variable
+define amdgpu_cs void @test_setreg_full_both_round_mode_and_denorm_mode(i32 inreg %mode) {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 inreg %mode)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Does not cover last bit of denorm field
+define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode() {
+; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 12289, i32 6)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Does not cover first bit of round field (offset 1, width 3; denorm field untouched)
+define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode_6() {
+; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4161, i32 6)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_cs void @test_setreg_f32_denorm_mode(i32 inreg %val) {
+; GFX6789-LABEL: test_setreg_f32_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f32_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2305, i32 %val)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_cs void @test_setreg_f64_denorm_mode(i32 inreg %val) {
+; GFX6789-LABEL: test_setreg_f64_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f64_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2433, i32 %val)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_cs void @test_setreg_full_denorm_mode(i32 inreg %val) {
+; GFX6789-LABEL: test_setreg_full_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 %val)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_0() {
+; GFX6789-LABEL: test_setreg_full_round_mode_0:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_1() {
+; GFX6789-LABEL: test_setreg_full_round_mode_1:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_2() {
+; GFX6789-LABEL: test_setreg_full_round_mode_2:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 2)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_4() {
+; GFX6789-LABEL: test_setreg_full_round_mode_4:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 4)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_8() {
+; GFX6789-LABEL: test_setreg_full_round_mode_8:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 8)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_15() {
+; GFX6789-LABEL: test_setreg_full_round_mode_15:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_15:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 15)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; 42 does not fit in the 4-bit field; the immediate being set should be truncated
+define amdgpu_kernel void @test_setreg_full_round_mode_42() {
+; GFX6789-LABEL: test_setreg_full_round_mode_42:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_42:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 42)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_0() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_0:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_1() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_1:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_2() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_2:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 2)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_4() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_4:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 4)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_8() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_8:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 8)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_15() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_15:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_15:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 15)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_42() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_42:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_42:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 42)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Sets all fp round and fp denorm bits.
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_0() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_1() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_2() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 2)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_4() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 4)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_8() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 8)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_16() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 16)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_32() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 32)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_64() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 64)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_128() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 128)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_15() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 15)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 255)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Truncate extra high bit
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_597() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 597)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_set_8_bits_straddles_round_and_denorm() {
+; GFX6789-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14465, i32 255)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_set_4_bits_straddles_round_and_denorm() {
+; GFX6789-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6273, i32 15)
+  call void asm sideeffect "", ""()
   ret void
 }
 
+; FIXME: Broken for DAG
 define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
-; GCN-LABEL: test_setreg_roundingmode_var_vgpr:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX6789-LABEL: test_setreg_roundingmode_var_vgpr:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6789-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_setreg_roundingmode_var_vgpr:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+  call void asm sideeffect "", ""()
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
index 88e6bd4adb7bf..934e39e5987ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
@@ -1,52 +1,871 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+
 ; FIXME: This copy of the test is a subset of the -global-isel version, since the VGPR case doesn't work.
 
+; Encoding of the first (hwreg) immediate operand of llvm.amdgcn.s.setreg:
+; (mode register ID = 1) | (Offset << 6) | ((Width - 1) << 11)
+; Offset: fp_round = 0, fp_denorm = 4, dx10_clamp = 8, ieee_mode = 9
+
+
 ; Set FP32 fp_round to round to zero
 define amdgpu_kernel void @test_setreg_f32_round_mode_rtz() {
-; GCN-LABEL: test_setreg_f32_round_mode_rtz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 3), 3
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 4097, i32 3)
+; GFX6789-LABEL: test_setreg_f32_round_mode_rtz:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f32_round_mode_rtz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2049, i32 3)
+  call void asm sideeffect "", ""()
   ret void
 }
 
 ; Set FP64/FP16 fp_round to round to zero
 define amdgpu_kernel void @test_setreg_f64_round_mode_rtz() {
-; GCN-LABEL: test_setreg_f64_round_mode_rtz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 3), 3
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 4225, i32 3)
+; GFX6789-LABEL: test_setreg_f64_round_mode_rtz:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f64_round_mode_rtz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2177, i32 3)
+  call void asm sideeffect "", ""()
   ret void
 }
 
 ; Set all fp_round to round to zero
 define amdgpu_kernel void @test_setreg_all_round_mode_rtz() {
-; GCN-LABEL: test_setreg_all_round_mode_rtz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 5), 7
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 8193, i32 7)
+; GFX6789-LABEL: test_setreg_all_round_mode_rtz:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_all_round_mode_rtz:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6273, i32 7)
+  call void asm sideeffect "", ""()
   ret void
 }
 
 ; Set FP32 fp_round to dynamic mode
 define amdgpu_cs void @test_setreg_roundingmode_var(i32 inreg %var.mode) {
-; GCN-LABEL: test_setreg_roundingmode_var:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0
-; GCN-NEXT:    s_endpgm
-  call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+; GFX6789-LABEL: test_setreg_roundingmode_var:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_roundingmode_var:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2049, i32 %var.mode)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_ieee_mode_off() {
+; GFX6789-LABEL: test_setreg_ieee_mode_off:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_ieee_mode_off:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 577, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_ieee_mode_on() {
+; GFX6789-LABEL: test_setreg_ieee_mode_on:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_ieee_mode_on:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 577, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_dx10_clamp_off() {
+; GFX6789-LABEL: test_setreg_dx10_clamp_off:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_dx10_clamp_off:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 513, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_dx10_clamp_on() {
+; GFX6789-LABEL: test_setreg_dx10_clamp_on:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_dx10_clamp_on:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 513, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Sets full width of fp round and fp denorm fields, to a variable
+define amdgpu_cs void @test_setreg_full_both_round_mode_and_denorm_mode(i32 inreg %mode) {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 inreg %mode)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Does not cover last bit of denorm field
+define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode() {
+; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 12289, i32 6)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Offset 1, width 3: skips the first bit of the round field, never reaches denorm
+define amdgpu_cs void @test_setreg_most_both_round_mode_and_denorm_mode_6() {
+; GFX6789-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 4161, i32 6)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_cs void @test_setreg_f32_denorm_mode(i32 inreg %val) {
+; GFX6789-LABEL: test_setreg_f32_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f32_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2305, i32 %val)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_cs void @test_setreg_f64_denorm_mode(i32 inreg %val) {
+; GFX6789-LABEL: test_setreg_f64_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_f64_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 2433, i32 %val)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_cs void @test_setreg_full_denorm_mode(i32 inreg %val) {
+; GFX6789-LABEL: test_setreg_full_denorm_mode:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 %val)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_0() {
+; GFX6789-LABEL: test_setreg_full_round_mode_0:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_1() {
+; GFX6789-LABEL: test_setreg_full_round_mode_1:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_2() {
+; GFX6789-LABEL: test_setreg_full_round_mode_2:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 2)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_4() {
+; GFX6789-LABEL: test_setreg_full_round_mode_4:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 4)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_8() {
+; GFX6789-LABEL: test_setreg_full_round_mode_8:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 8)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_round_mode_15() {
+; GFX6789-LABEL: test_setreg_full_round_mode_15:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_15:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 15)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Should truncate set immediate value
+define amdgpu_kernel void @test_setreg_full_round_mode_42() {
+; GFX6789-LABEL: test_setreg_full_round_mode_42:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_round_mode_42:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6145, i32 42)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_0() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_0:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_1() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_1:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_2() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_2:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 2)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_4() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_4:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 4)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_8() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_8:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 8)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_15() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_15:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_15:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 15)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_denorm_mode_42() {
+; GFX6789-LABEL: test_setreg_full_denorm_mode_42:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_denorm_mode_42:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6401, i32 42)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Sets all fp round and fp denorm bits.
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_0() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 0)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_1() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 1)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_2() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 2)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_4() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 4)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_8() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 8)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_16() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 16)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_32() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 32)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_64() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 64)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_128() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 128)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_15() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 15)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 255)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+; Truncate extra high bit
+define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_597() {
+; GFX6789-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14337, i32 597)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_set_8_bits_straddles_round_and_denorm() {
+; GFX6789-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 14465, i32 255)
+  call void asm sideeffect "", ""()
+  ret void
+}
+
+define amdgpu_kernel void @test_setreg_set_4_bits_straddles_round_and_denorm() {
+; GFX6789-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm:
+; GFX6789:       ; %bb.0:
+; GFX6789-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15
+; GFX6789-NEXT:    ;;#ASMSTART
+; GFX6789-NEXT:    ;;#ASMEND
+; GFX6789-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.setreg(i32 6273, i32 15)
+  call void asm sideeffect "", ""()
   ret void
 }
 
 ; FIXME: Broken for DAG
 ; define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
 ;   call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
+;   call void asm sideeffect "", ""()
 ;   ret void
 ; }
 

From 0892a96a05a8943457a4a3e2547923087aa06226 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 28 May 2020 20:55:45 -0400
Subject: [PATCH 608/770] AMDGPU: Optimize s_setreg_b32 to
 s_denorm_mode/s_round_mode

This is a custom inserter because it was less work than teaching
tablegen a way to indicate that it is sometimes OK to have a no side
effect instruction in the output of a side effecting pattern.

The asm is needed to look like a read of the mode register to prevent
it from being deleted. However, there seems to be a bug where the mode
register def instructions are moved across the asm sideeffect by the
post-RA scheduler.

Another oddity is the immediate is formatted differently between
s_denorm_mode and s_round_mode.
---
 llvm/lib/Target/AMDGPU/SIDefines.h            | 22 ++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 69 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  4 ++
 .../AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll | 64 ++++++++++-------
 .../CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll    | 64 ++++++++++-------
 5 files changed, 171 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 23ef56afc39c9..c8d1542f2a1ad 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -366,6 +366,28 @@ enum Width : unsigned {
   WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
 };
 
+enum ModeRegisterMasks : uint32_t {
+  FP_ROUND_MASK = 0xf << 0,  // Bits 0..3
+  FP_DENORM_MASK = 0xf << 4, // Bits 4..7
+  DX10_CLAMP_MASK = 1 << 8,
+  IEEE_MODE_MASK = 1 << 9,
+  LOD_CLAMP_MASK = 1 << 10,
+  DEBUG_MASK = 1 << 11,
+
+  // EXCP_EN fields.
+  EXCP_EN_INVALID_MASK = 1 << 12,
+  EXCP_EN_INPUT_DENORMAL_MASK = 1 << 13,
+  EXCP_EN_FLOAT_DIV0_MASK = 1 << 14,
+  EXCP_EN_OVERFLOW_MASK = 1 << 15,
+  EXCP_EN_UNDERFLOW_MASK = 1 << 16,
+  EXCP_EN_INEXACT_MASK = 1 << 17,
+  EXCP_EN_INT_DIV0_MASK = 1 << 18,
+
+  GPR_IDX_EN_MASK = 1 << 27,
+  VSKIP_MASK = 1 << 28,
+  CSP_MASK = 0x7u << 29 // Bits 29..31
+};
+
 } // namespace Hwreg
 
 namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 619ce1abeb813..452ff785ec064 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4119,6 +4119,75 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     }
 
     return emitGWSMemViolTestLoop(MI, BB);
+  case AMDGPU::S_SETREG_B32: {
+    if (!getSubtarget()->hasDenormModeInst())
+      return BB;
+
+    // Try to optimize cases that only set the denormal mode or rounding mode.
+    //
+    // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
+    // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
+    // instead.
+    //
+    // FIXME: This could be done with predicates on the immediate, but tablegen
+    // doesn't allow you to have a no side effect instruction in the output of
+    // a side-effecting pattern.
+
+    // TODO: Should also emit a no side effects pseudo if only FP bits are
+    // touched, even if only some of them are set or the value is a variable.
+    unsigned ID, Offset, Width;
+    AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
+    if (ID != AMDGPU::Hwreg::ID_MODE)
+      return BB;
+
+    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
+    const unsigned SetMask = WidthMask << Offset;
+    unsigned SetDenormOp = 0;
+    unsigned SetRoundOp = 0;
+
+    // The dedicated instructions can only set the whole denorm or round mode at
+    // once, not a subset of bits in either.
+    if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
+                                  AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) {
+      // If this fully sets both the round and denorm mode, emit the two
+      // dedicated instructions for these.
+      assert(Offset == 0);
+      SetRoundOp = AMDGPU::S_ROUND_MODE;
+      SetDenormOp = AMDGPU::S_DENORM_MODE;
+    } else if (Width == 4) {
+      if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) {
+        SetRoundOp = AMDGPU::S_ROUND_MODE;
+        assert(Offset == 0);
+      } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) {
+        SetDenormOp = AMDGPU::S_DENORM_MODE;
+        assert(Offset == 4);
+      }
+    }
+
+    if (SetRoundOp || SetDenormOp) {
+      MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+      MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
+      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
+        unsigned ImmVal = Def->getOperand(1).getImm();
+        if (SetRoundOp) {
+          BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
+            .addImm(ImmVal & 0xf);
+
+          // If we also have the denorm mode, get just the denorm mode bits.
+          ImmVal >>= 4;
+        }
+
+        if (SetDenormOp) {
+          BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
+            .addImm(ImmVal & 0xf);
+        }
+
+        MI.eraseFromParent();
+      }
+    }
+
+    return BB;
+  }
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index dbafea5a1347e..774b9cf027853 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -808,6 +808,10 @@ def S_SETREG_B32 : SOPK_Pseudo <
   (outs), (ins SReg_32:$sdst, hwreg:$simm16),
   "$simm16, $sdst",
   [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
+
+  // Use custom inserter to optimize some cases to
+  // S_DENORM_MODE/S_ROUND_MODE.
+  let usesCustomInserter = 1;
   let Defs = [MODE];
   let Uses = [MODE];
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
index 72de32e5a5ff4..531495c53b5c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@@ -309,7 +309,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_0() {
 ; GFX10-LABEL: test_setreg_full_round_mode_0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -329,7 +329,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_1() {
 ; GFX10-LABEL: test_setreg_full_round_mode_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX10-NEXT:    s_round_mode 0x1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -349,7 +349,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_2() {
 ; GFX10-LABEL: test_setreg_full_round_mode_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX10-NEXT:    s_round_mode 0x2
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -369,7 +369,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_4() {
 ; GFX10-LABEL: test_setreg_full_round_mode_4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX10-NEXT:    s_round_mode 0x4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -389,7 +389,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_8() {
 ; GFX10-LABEL: test_setreg_full_round_mode_8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX10-NEXT:    s_round_mode 0x8
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -409,7 +409,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_15() {
 ; GFX10-LABEL: test_setreg_full_round_mode_15:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX10-NEXT:    s_round_mode 0xf
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -430,7 +430,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_42() {
 ; GFX10-LABEL: test_setreg_full_round_mode_42:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42
+; GFX10-NEXT:    s_round_mode 0xa
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -450,7 +450,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_0() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -470,7 +470,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_1() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1
+; GFX10-NEXT:    s_denorm_mode 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -491,7 +491,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_2() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2
+; GFX10-NEXT:    s_denorm_mode 2
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -511,7 +511,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_4() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4
+; GFX10-NEXT:    s_denorm_mode 4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -531,7 +531,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_8() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8
+; GFX10-NEXT:    s_denorm_mode 8
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -551,7 +551,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_15() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_15:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15
+; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -571,7 +571,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_42() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_42:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42
+; GFX10-NEXT:    s_denorm_mode 10
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -591,10 +591,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_0()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 0)
   call void asm sideeffect "", ""()
@@ -611,10 +612,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_1()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x1
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 1)
   call void asm sideeffect "", ""()
@@ -631,10 +633,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_2()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 2)
   call void asm sideeffect "", ""()
@@ -651,10 +654,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_4()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x4
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 4)
   call void asm sideeffect "", ""()
@@ -671,10 +675,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_8()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x8
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 8)
   call void asm sideeffect "", ""()
@@ -691,10 +696,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_16()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 1
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 16)
   call void asm sideeffect "", ""()
@@ -711,10 +717,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_32()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 2
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 32)
   call void asm sideeffect "", ""()
@@ -731,10 +738,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_64()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 4
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 64)
   call void asm sideeffect "", ""()
@@ -751,10 +759,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_128(
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 8
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 128)
   call void asm sideeffect "", ""()
@@ -771,10 +780,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_15()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0xf
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 15)
   call void asm sideeffect "", ""()
@@ -791,10 +801,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255(
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0xf
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 255)
   call void asm sideeffect "", ""()
@@ -812,10 +823,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_597(
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x5
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 5
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 597)
   call void asm sideeffect "", ""()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
index 934e39e5987ff..515b41d066c63 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll
@@ -309,7 +309,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_0() {
 ; GFX10-LABEL: test_setreg_full_round_mode_0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -329,7 +329,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_1() {
 ; GFX10-LABEL: test_setreg_full_round_mode_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 1
+; GFX10-NEXT:    s_round_mode 0x1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -349,7 +349,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_2() {
 ; GFX10-LABEL: test_setreg_full_round_mode_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 2
+; GFX10-NEXT:    s_round_mode 0x2
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -369,7 +369,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_4() {
 ; GFX10-LABEL: test_setreg_full_round_mode_4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 4
+; GFX10-NEXT:    s_round_mode 0x4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -389,7 +389,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_8() {
 ; GFX10-LABEL: test_setreg_full_round_mode_8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 8
+; GFX10-NEXT:    s_round_mode 0x8
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -409,7 +409,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_15() {
 ; GFX10-LABEL: test_setreg_full_round_mode_15:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 15
+; GFX10-NEXT:    s_round_mode 0xf
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -430,7 +430,7 @@ define amdgpu_kernel void @test_setreg_full_round_mode_42() {
 ; GFX10-LABEL: test_setreg_full_round_mode_42:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 42
+; GFX10-NEXT:    s_round_mode 0xa
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -450,7 +450,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_0() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -470,7 +470,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_1() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 1
+; GFX10-NEXT:    s_denorm_mode 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -491,7 +491,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_2() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 2
+; GFX10-NEXT:    s_denorm_mode 2
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -511,7 +511,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_4() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 4
+; GFX10-NEXT:    s_denorm_mode 4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -531,7 +531,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_8() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 8
+; GFX10-NEXT:    s_denorm_mode 8
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -551,7 +551,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_15() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_15:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 15
+; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -571,7 +571,7 @@ define amdgpu_kernel void @test_setreg_full_denorm_mode_42() {
 ; GFX10-LABEL: test_setreg_full_denorm_mode_42:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 42
+; GFX10-NEXT:    s_denorm_mode 10
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
@@ -591,10 +591,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_0()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 0)
   call void asm sideeffect "", ""()
@@ -611,10 +612,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_1()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x1
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 1)
   call void asm sideeffect "", ""()
@@ -631,10 +633,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_2()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x2
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 2
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 2)
   call void asm sideeffect "", ""()
@@ -651,10 +654,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_4()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x4
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 4
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 4)
   call void asm sideeffect "", ""()
@@ -671,10 +675,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_8()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x8
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 8
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 8)
   call void asm sideeffect "", ""()
@@ -691,10 +696,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_16()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 16
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 1
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 16)
   call void asm sideeffect "", ""()
@@ -711,10 +717,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_32()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 32
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 2
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 32)
   call void asm sideeffect "", ""()
@@ -731,10 +738,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_64()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 64
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 4
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 64)
   call void asm sideeffect "", ""()
@@ -751,10 +759,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_128(
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x80
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 8
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 128)
   call void asm sideeffect "", ""()
@@ -771,10 +780,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_15()
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0xf
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 15
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 0
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 15)
   call void asm sideeffect "", ""()
@@ -791,10 +801,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_255(
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0xf
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0xff
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 15
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 255)
   call void asm sideeffect "", ""()
@@ -812,10 +823,11 @@ define amdgpu_kernel void @test_setreg_full_both_round_mode_and_denorm_mode_597(
 ;
 ; GFX10-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_round_mode 0x5
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 8), 0x255
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_denorm_mode 5
 ; GFX10-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.setreg(i32 14337, i32 597)
   call void asm sideeffect "", ""()

From d04147789ff0b838f9dd6c592207d5f70bc0d025 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson@amd.com>
Date: Sat, 30 May 2020 11:15:39 +0900
Subject: [PATCH 609/770] [AMDGPU] Remove assertion on S1024 SGPR to VGPR spill

Summary:
Replace an assertion that blocks S1024 SGPR to VGPR spill.
The assertion pre-dates S1024 and is not wave size dependent.

Reviewers: arsenm, sameerds, rampitec

Reviewed By: arsenm

Subscribers: qcolombet, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80783
---
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  11 +-
 llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll   | 244 +++++++++++++++++-
 2 files changed, 238 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 2a3ba523f8c2f..5515e15ec66bb 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -287,16 +287,19 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
   unsigned Size = FrameInfo.getObjectSize(FI);
-  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
-  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+  unsigned NumLanes = Size / 4;
 
-  int NumLanes = Size / 4;
+  if (NumLanes > WaveSize)
+    return false;
+
+  assert(Size >= 4 && "invalid sgpr spill size");
+  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
 
   const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
 
   // Make sure to handle the case where a wide SGPR spill may span between two
   // VGPRs.
-  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+  for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
     Register LaneVGPR;
     unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index c262f353dcc09..f1085cfb395b7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -193,18 +193,236 @@ ret:
   ret void
 }
 
-; FIXME: x16 inlineasm seems broken
-; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
-;   %wide.sgpr = call <16 x i32>  asm sideeffect "; def $0", "=s" () #0
-;   %cmp = icmp eq i32 %in, 0
-;   br i1 %cmp, label %bb0, label %ret
-
-; bb0:
-;   call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
-;   br label %ret
-
-; ret:
-;   ret void
-; }
+; ALL-LABEL: {{^}}spill_sgpr_x16:
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 8
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 9
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 10
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 11
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 12
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 13
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 14
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 15
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 8
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 9
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 10
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 11
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 12
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 13
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 14
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 15
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <16 x i32>  asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+ ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x32:
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 8
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 9
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 10
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 11
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 12
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 13
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 14
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 15
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 16
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 17
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 18
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 19
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 20
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 21
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 22
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 23
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 24
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 25
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 26
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 27
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 28
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 29
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 30
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 31
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 8
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 9
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 10
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 11
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 12
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 13
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 14
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 15
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 16
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 17
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 18
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 19
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 20
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 21
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 22
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 23
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 24
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 25
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 26
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 27
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 28
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 29
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 30
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 31
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x32(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <32 x i32>  asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<32 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+ ret void
+}
 
 attributes #0 = { nounwind }

From 034a7b6604067b0ccb36c761a5782456b76c447e Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Fri, 29 May 2020 19:48:33 -0700
Subject: [PATCH 610/770] [ValueLattice] Fix uninitialized-value after D79036

Many check-clang-codegen tests failed.
---
 llvm/include/llvm/Analysis/ValueLattice.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueLattice.h b/llvm/include/llvm/Analysis/ValueLattice.h
index 00a230fb08c52..bf5bab9ced228 100644
--- a/llvm/include/llvm/Analysis/ValueLattice.h
+++ b/llvm/include/llvm/Analysis/ValueLattice.h
@@ -142,11 +142,12 @@ class ValueLatticeElement {
   };
 
   // ConstVal and Range are initialized on-demand.
-  ValueLatticeElement() : Tag(unknown) {}
+  ValueLatticeElement() : Tag(unknown), NumRangeExtensions(0) {}
 
   ~ValueLatticeElement() { destroy(); }
 
-  ValueLatticeElement(const ValueLatticeElement &Other) : Tag(Other.Tag) {
+  ValueLatticeElement(const ValueLatticeElement &Other)
+      : Tag(Other.Tag), NumRangeExtensions(0) {
     switch (Other.Tag) {
     case constantrange:
     case constantrange_including_undef:
@@ -164,7 +165,8 @@ class ValueLatticeElement {
     }
   }
 
-  ValueLatticeElement(ValueLatticeElement &&Other) : Tag(Other.Tag) {
+  ValueLatticeElement(ValueLatticeElement &&Other)
+      : Tag(Other.Tag), NumRangeExtensions(0) {
     switch (Other.Tag) {
     case constantrange:
     case constantrange_including_undef:

From c554c5e159aee43c5cd8236e077817e9f29dea78 Mon Sep 17 00:00:00 2001
From: Eric Christopher <echristo@gmail.com>
Date: Mon, 4 May 2020 13:48:56 -0700
Subject: [PATCH 611/770] Fix full unrolling with new pass manager.

When we last looked at this we couldn't come up with a reason to change
it, but with a pragma for full loop unrolling we bypass every other
loop unroll and then fail to fully unroll a loop when the pragma is set.

Move the OnlyWhenForced out of the check and into the initialization
of the full unroll pass in the new pass manager. This doesn't show up
with the old pass manager.

Add a new option to opt so that we can turn off loop unrolling
manually since this is a difference between clang and opt.

Tested with check-clang and check-llvm.
---
 clang/test/Misc/loop-opt-setup.c              | 36 ++++++++-
 llvm/lib/Passes/PassBuilder.cpp               | 11 +--
 llvm/test/Transforms/LoopUnroll/FullUnroll.ll | 77 +++++++++++++++++++
 llvm/tools/opt/NewPMDriver.cpp                |  9 +++
 4 files changed, 125 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/FullUnroll.ll

diff --git a/clang/test/Misc/loop-opt-setup.c b/clang/test/Misc/loop-opt-setup.c
index f283e803a7d1e..868c716c6ed74 100644
--- a/clang/test/Misc/loop-opt-setup.c
+++ b/clang/test/Misc/loop-opt-setup.c
@@ -1,5 +1,5 @@
-// RUN: %clang -O1 -fexperimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s
-// RUN: %clang -O1 -fno-experimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s
+// RUN: %clang -O1 -fexperimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-NEWPM
+// RUN: %clang -O1 -fno-experimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-OLDPM
 extern int a[16];
 int b = 0;
 int foo(void) {
@@ -8,5 +8,35 @@ int foo(void) {
     a[i] = b += 2;
   return b;
 }
-// CHECK-NOT: br i1
+// Check br i1 to make sure that the loop is fully unrolled
+// CHECK-LABEL-NEWPM: foo
+// CHECK-NOT-NEWPM: br i1
+// CHECK-LABEL-OLDPM: foo
+// CHECK-NOT-OLDPM: br i1
 
+void Helper() {
+  const int *nodes[5];
+  int num_active = 5;
+
+  while (num_active)
+#pragma clang loop unroll(full)
+    for (int i = 0; i < 5; ++i)
+      if (nodes[i])
+        --num_active;
+}
+
+// Check br i1 to make sure the loop is gone, there will still be a label branch for the infinite loop.
+// CHECK-LABEL-NEWPM: Helper
+// CHECK-NEWPM: br label
+// CHECK-NEWPM-NOT: br i1
+// CHECK-NEWPM: br label
+
+// The old pass manager doesn't remove the while loop so check for 5 load i32*.
+// CHECK-LABEL-OLDPM: Helper
+// CHECK-OLDPM: br label
+// CHECK-OLDPM: load i32*
+// CHECK-OLDPM: load i32*
+// CHECK-OLDPM: load i32*
+// CHECK-OLDPM: load i32*
+// CHECK-OLDPM: load i32*
+// CHECK-OLDPM: ret
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 1b1701cbe2619..fda2af7e80b32 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -519,12 +519,13 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   LPM2.addPass(LoopDeletionPass());
   // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
   // because it changes IR to makes profile annotation in back compile
-  // inaccurate.
-  if ((Phase != ThinLTOPhase::PreLink || !PGOOpt ||
-       PGOOpt->Action != PGOOptions::SampleUse) &&
-      PTO.LoopUnrolling)
+  // inaccurate. The normal unroller doesn't pay attention to forced full unroll
+  // attributes so we need to make sure and allow the full unroll pass to pay
+  // attention to it.
+  if (Phase != ThinLTOPhase::PreLink || !PGOOpt ||
+      PGOOpt->Action != PGOOptions::SampleUse)
     LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
-                                    /*OnlyWhenForced=*/false,
+                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
                                     PTO.ForgetAllSCEVInLoopUnroll));
 
   for (auto &C : LoopOptimizerEndEPCallbacks)
diff --git a/llvm/test/Transforms/LoopUnroll/FullUnroll.ll b/llvm/test/Transforms/LoopUnroll/FullUnroll.ll
new file mode 100644
index 0000000000000..01936e487682b
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/FullUnroll.ll
@@ -0,0 +1,77 @@
+; RUN: opt -passes='default<O1>' -disable-verify --mtriple x86_64-pc-linux-gnu -new-pm-disable-loop-unrolling=true \
+; RUN: -S -o - %s | FileCheck %s
+
+; This checks that the loop full unroller will fire in the new pass manager
+; when forced via #pragma in the source (or annotation in the code).
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; We don't end up deleting the loop, merely turning it infinite, but we remove
+; everything inside of it so check for the loop structure and absence of
+; conditional branches.
+; CHECK-LABEL: bb
+; CHECK: br label
+; CHECK-NOT: br i1
+; CHECK: br label
+; CHECK-NOT: br i1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define void @foo() #0 {
+bb:
+  %tmp = alloca [5 x i32*], align 16
+  %tmp1 = alloca i32, align 4
+  %tmp2 = alloca i32, align 4
+  store i32 5, i32* %tmp1, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb23, %bb
+  %tmp4 = load i32, i32* %tmp1, align 4
+  %tmp5 = icmp ne i32 %tmp4, 0
+  br i1 %tmp5, label %bb6, label %bb24
+
+bb6:                                              ; preds = %bb3
+  store i32 0, i32* %tmp2, align 4
+  br label %bb7
+
+bb7:                                              ; preds = %bb20, %bb6
+  %tmp8 = load i32, i32* %tmp2, align 4
+  %tmp9 = icmp slt i32 %tmp8, 5
+  br i1 %tmp9, label %bb10, label %bb23
+
+bb10:                                             ; preds = %bb7
+  %tmp11 = load i32, i32* %tmp2, align 4
+  %tmp12 = sext i32 %tmp11 to i64
+  %tmp13 = getelementptr inbounds [5 x i32*], [5 x i32*]* %tmp, i64 0, i64 %tmp12
+  %tmp14 = load i32*, i32** %tmp13, align 8
+  %tmp15 = icmp ne i32* %tmp14, null
+  br i1 %tmp15, label %bb16, label %bb19
+
+bb16:                                             ; preds = %bb10
+  %tmp17 = load i32, i32* %tmp1, align 4
+  %tmp18 = add nsw i32 %tmp17, -1
+  store i32 %tmp18, i32* %tmp1, align 4
+  br label %bb19
+
+bb19:                                             ; preds = %bb16, %bb10
+  br label %bb20
+
+bb20:                                             ; preds = %bb19
+  %tmp21 = load i32, i32* %tmp2, align 4
+  %tmp22 = add nsw i32 %tmp21, 1
+  store i32 %tmp22, i32* %tmp2, align 4
+  br label %bb7, !llvm.loop !1
+
+bb23:                                             ; preds = %bb7
+  br label %bb3
+
+bb24:                                             ; preds = %bb3
+  ret void
+}
+
+attributes #0 = { noinline nounwind optnone uwtable }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.unroll.full"}
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index c99ad2f7b4dcf..ce86c3d615846 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -100,6 +100,11 @@ static cl::opt<std::string> OptimizerLastEPPipeline(
              "the OptimizerLast extension point into default pipelines"),
     cl::Hidden);
 
+// Individual pipeline tuning options.
+static cl::opt<bool> DisableLoopUnrolling(
+    "new-pm-disable-loop-unrolling",
+    cl::desc("Disable loop unrolling in all relevant passes"), cl::init(false));
+
 extern cl::opt<PGOKind> PGOKindFlag;
 extern cl::opt<std::string> ProfileFile;
 extern cl::opt<CSPGOKind> CSPGOKindFlag;
@@ -260,6 +265,10 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   SI.registerCallbacks(PIC);
 
   PipelineTuningOptions PTO;
+  // LoopUnrolling defaults to true and DisableLoopUnrolling is initialized
+  // to false above so we shouldn't necessarily need to check whether or not the
+  // option has been enabled.
+  PTO.LoopUnrolling = !DisableLoopUnrolling;
   PTO.Coroutines = Coroutines;
   PassBuilder PB(TM, PTO, P, &PIC);
   registerEPCallbacks(PB, VerifyEachPass, DebugPM);

From c2bb26d8613338b93a1aab54631d01e6a690bc29 Mon Sep 17 00:00:00 2001
From: Eric Christopher <echristo@gmail.com>
Date: Mon, 4 May 2020 18:33:49 -0700
Subject: [PATCH 612/770] NFC: Simplify O1 pass pipeline construction.

Pull O1 pass pipeline out into a separate function and simplify
buildFunctionSimplificationPipeline accordingly.
---
 llvm/include/llvm/Passes/PassBuilder.h |   4 +
 llvm/lib/Passes/PassBuilder.cpp        | 189 ++++++++++++++++++++-----
 2 files changed, 158 insertions(+), 35 deletions(-)

diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index d5a70c2ae132d..295a5aacfe7ee 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -683,6 +683,10 @@ class PassBuilder {
   }
 
 private:
+  // O1 pass pipeline
+  FunctionPassManager buildO1FunctionSimplificationPipeline(
+      OptimizationLevel Level, ThinLTOPhase Phase, bool DebugLogging = false);
+
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index fda2af7e80b32..10f92f7148025 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -426,11 +426,139 @@ void PassBuilder::registerLoopAnalyses(LoopAnalysisManager &LAM) {
     C(LAM);
 }
 
+// TODO: Investigate the cost/benefit of tail call elimination on debugging.
+FunctionPassManager PassBuilder::buildO1FunctionSimplificationPipeline(
+    OptimizationLevel Level, ThinLTOPhase Phase, bool DebugLogging) {
+
+  FunctionPassManager FPM(DebugLogging);
+
+  // Form SSA out of local memory accesses after breaking apart aggregates into
+  // scalars.
+  FPM.addPass(SROA());
+
+  // Catch trivial redundancies
+  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
+
+  // Hoisting of scalars and load expressions.
+  FPM.addPass(SimplifyCFGPass());
+  FPM.addPass(InstCombinePass());
+
+  FPM.addPass(LibCallsShrinkWrapPass());
+
+  invokePeepholeEPCallbacks(FPM, Level);
+
+  FPM.addPass(SimplifyCFGPass());
+
+  // Form canonically associated expression trees, and simplify the trees using
+  // basic mathematical properties. For example, this will form (nearly)
+  // minimal multiplication trees.
+  FPM.addPass(ReassociatePass());
+
+  // Add the primary loop simplification pipeline.
+  // FIXME: Currently this is split into two loop pass pipelines because we run
+  // some function passes in between them. These can and should be removed
+  // and/or replaced by scheduling the loop pass equivalents in the correct
+  // positions. But those equivalent passes aren't powerful enough yet.
+  // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
+  // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough to
+  // fully replace `SimplifyCFGPass`, and the closest to the other we have is
+  // `LoopInstSimplify`.
+  LoopPassManager LPM1(DebugLogging), LPM2(DebugLogging);
+
+  // Simplify the loop body. We do this initially to clean up after other loop
+  // passes run, either when iterating on a loop or on inner loops with
+  // implications on the outer loop.
+  LPM1.addPass(LoopInstSimplifyPass());
+  LPM1.addPass(LoopSimplifyCFGPass());
+
+  LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true));
+  // TODO: Investigate promotion cap for O1.
+  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+  LPM1.addPass(SimpleLoopUnswitchPass());
+  LPM2.addPass(IndVarSimplifyPass());
+  LPM2.addPass(LoopIdiomRecognizePass());
+
+  for (auto &C : LateLoopOptimizationsEPCallbacks)
+    C(LPM2, Level);
+
+  LPM2.addPass(LoopDeletionPass());
+  // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
+  // because it changes IR to make profile annotation in back compile
+  // inaccurate. The normal unroller doesn't pay attention to forced full unroll
+  // attributes so we need to make sure and allow the full unroll pass to pay
+  // attention to it.
+  if (Phase != ThinLTOPhase::PreLink || !PGOOpt ||
+      PGOOpt->Action != PGOOptions::SampleUse)
+    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
+                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
+                                    PTO.ForgetAllSCEVInLoopUnroll));
+
+  for (auto &C : LoopOptimizerEndEPCallbacks)
+    C(LPM2, Level);
+
+  // We provide the opt remark emitter pass for LICM to use. We only need to do
+  // this once as it is immutable.
+  FPM.addPass(
+      RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+  FPM.addPass(createFunctionToLoopPassAdaptor(
+      std::move(LPM1), EnableMSSALoopDependency, DebugLogging));
+  FPM.addPass(SimplifyCFGPass());
+  FPM.addPass(InstCombinePass());
+  // The loop passes in LPM2 (IndVarSimplifyPass, LoopIdiomRecognizePass,
+  // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
+  // *All* loop passes must preserve it, in order to be able to use it.
+  FPM.addPass(createFunctionToLoopPassAdaptor(
+      std::move(LPM2), /*UseMemorySSA=*/false, DebugLogging));
+
+  // Delete small array after loop unroll.
+  FPM.addPass(SROA());
+
+  // Specially optimize memory movement as it doesn't look like dataflow in SSA.
+  FPM.addPass(MemCpyOptPass());
+
+  // Sparse conditional constant propagation.
+  // FIXME: It isn't clear why we do this *after* loop passes rather than
+  // before...
+  FPM.addPass(SCCPPass());
+
+  // Delete dead bit computations (instcombine runs after to fold away the dead
+  // computations, and then ADCE will run later to exploit any new DCE
+  // opportunities that creates).
+  FPM.addPass(BDCEPass());
+
+  // Run instcombine after redundancy and dead bit elimination to exploit
+  // opportunities opened up by them.
+  FPM.addPass(InstCombinePass());
+  invokePeepholeEPCallbacks(FPM, Level);
+
+  if (PTO.Coroutines)
+    FPM.addPass(CoroElidePass());
+
+  for (auto &C : ScalarOptimizerLateEPCallbacks)
+    C(FPM, Level);
+
+  // Finally, do an expensive DCE pass to catch all the dead code exposed by
+  // the simplifications and basic cleanup after all the simplifications.
+  // TODO: Investigate if this is too expensive.
+  FPM.addPass(ADCEPass());
+  FPM.addPass(SimplifyCFGPass());
+  FPM.addPass(InstCombinePass());
+  invokePeepholeEPCallbacks(FPM, Level);
+
+  return FPM;
+}
+
 FunctionPassManager
 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
                                                  ThinLTOPhase Phase,
                                                  bool DebugLogging) {
   assert(Level != OptimizationLevel::O0 && "Must request optimizations!");
+
+  // The O1 pipeline has a separate pipeline creation function to simplify
+  // construction readability.
+  if (Level.getSpeedupLevel() == 1)
+    return buildO1FunctionSimplificationPipeline(Level, Phase, DebugLogging);
+
   FunctionPassManager FPM(DebugLogging);
 
   // Form SSA out of local memory accesses after breaking apart aggregates into
@@ -443,25 +571,22 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
     FPM.addPass(AssumeSimplifyPass());
 
   // Hoisting of scalars and load expressions.
-  if (Level.getSpeedupLevel() > 1) {
-    if (EnableGVNHoist)
-      FPM.addPass(GVNHoistPass());
+  if (EnableGVNHoist)
+    FPM.addPass(GVNHoistPass());
 
-    // Global value numbering based sinking.
-    if (EnableGVNSink) {
-      FPM.addPass(GVNSinkPass());
-      FPM.addPass(SimplifyCFGPass());
-    }
+  // Global value numbering based sinking.
+  if (EnableGVNSink) {
+    FPM.addPass(GVNSinkPass());
+    FPM.addPass(SimplifyCFGPass());
   }
 
   // Speculative execution if the target has divergent branches; otherwise nop.
-  if (Level.getSpeedupLevel() > 1) {
-    FPM.addPass(SpeculativeExecutionPass());
+  FPM.addPass(SpeculativeExecutionPass());
+
+  // Optimize based on known information about branches, and cleanup afterward.
+  FPM.addPass(JumpThreadingPass());
+  FPM.addPass(CorrelatedValuePropagationPass());
 
-    // Optimize based on known information about branches, and cleanup afterward.
-    FPM.addPass(JumpThreadingPass());
-    FPM.addPass(CorrelatedValuePropagationPass());
-  }
   FPM.addPass(SimplifyCFGPass());
   if (Level == OptimizationLevel::O3)
     FPM.addPass(AggressiveInstCombinePass());
@@ -475,12 +600,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
   // using the size value profile. Don't perform this when optimizing for size.
   if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
-      (Level.getSpeedupLevel() > 1 && !Level.isOptimizingForSize()))
+      !Level.isOptimizingForSize())
     FPM.addPass(PGOMemOPSizeOpt());
 
-  // TODO: Investigate the cost/benefit of tail call elimination on debugging.
-  if (Level.getSpeedupLevel() > 1)
-    FPM.addPass(TailCallElimPass());
+  FPM.addPass(TailCallElimPass());
   FPM.addPass(SimplifyCFGPass());
 
   // Form canonically associated expression trees, and simplify the trees using
@@ -533,7 +656,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
 
   // We provide the opt remark emitter pass for LICM to use. We only need to do
   // this once as it is immutable.
-  FPM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+  FPM.addPass(
+      RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
   FPM.addPass(createFunctionToLoopPassAdaptor(
       std::move(LPM1), EnableMSSALoopDependency, DebugLogging));
   FPM.addPass(SimplifyCFGPass());
@@ -548,14 +672,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(SROA());
 
   // Eliminate redundancies.
-  if (Level != OptimizationLevel::O1) {
-    // These passes add substantial compile time so skip them at O1.
-    FPM.addPass(MergedLoadStoreMotionPass());
-    if (RunNewGVN)
-      FPM.addPass(NewGVNPass());
-    else
-      FPM.addPass(GVN());
-  }
+  FPM.addPass(MergedLoadStoreMotionPass());
+  if (RunNewGVN)
+    FPM.addPass(NewGVNPass());
+  else
+    FPM.addPass(GVN());
 
   // Specially optimize memory movement as it doesn't look like dataflow in SSA.
   FPM.addPass(MemCpyOptPass());
@@ -577,14 +698,12 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
 
   // Re-consider control flow based optimizations after redundancy elimination,
   // redo DCE, etc.
-  if (Level.getSpeedupLevel() > 1) {
-    FPM.addPass(JumpThreadingPass());
-    FPM.addPass(CorrelatedValuePropagationPass());
-    FPM.addPass(DSEPass());
-    FPM.addPass(createFunctionToLoopPassAdaptor(
-        LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
-        EnableMSSALoopDependency, DebugLogging));
-  }
+  FPM.addPass(JumpThreadingPass());
+  FPM.addPass(CorrelatedValuePropagationPass());
+  FPM.addPass(DSEPass());
+  FPM.addPass(createFunctionToLoopPassAdaptor(
+      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+      EnableMSSALoopDependency, DebugLogging));
 
   if (PTO.Coroutines)
     FPM.addPass(CoroElidePass());

From 21fee0921d563f407e07b5e28592c2925da3704d Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Sat, 30 May 2020 03:36:22 +0000
Subject: [PATCH 613/770] Use .empty() instead of .size() == 0 (NFC)

Cleanup / Fix a clang-tidy warning
---
 mlir/lib/Transforms/BufferPlacement.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Transforms/BufferPlacement.cpp b/mlir/lib/Transforms/BufferPlacement.cpp
index 60f49d4e305c4..337e26e2a65b0 100644
--- a/mlir/lib/Transforms/BufferPlacement.cpp
+++ b/mlir/lib/Transforms/BufferPlacement.cpp
@@ -392,7 +392,7 @@ struct BufferPlacementPass
       // If the Dealloc position is at the terminator operation of the block,
       // then the value should escape from a deallocation.
       if (!nextOp) {
-        assert(deallocs.size() == 0 &&
+        assert(deallocs.empty() &&
                "There should be no dealloc for the returned buffer");
         continue;
       }

From 0800529fe605a03e9da1aca241a377eebcaa8cad Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 29 May 2020 22:10:05 -0700
Subject: [PATCH 614/770] [lldb/Bindings] Raise exception when using properties
 that rely on lldb.target

Several SBAddress properties use the lldb.target or lldb.process
convenience variables which are only set under the interactive script
interpreter. Unfortunately, users have been using these properties in
Python script and commands. This patch raises a Python exception to
force users to use GetLoadAddress instead.

Differential revision: https://reviews.llvm.org/D80848
---
 lldb/bindings/interface/SBAddress.i           | 25 ++++++++++++-------
 .../Python/Inputs/sbaddress.py                |  7 ++++++
 .../Python/sb_address_exception.test          |  8 ++++++
 3 files changed, 31 insertions(+), 9 deletions(-)
 create mode 100644 lldb/test/Shell/ScriptInterpreter/Python/Inputs/sbaddress.py
 create mode 100644 lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test

diff --git a/lldb/bindings/interface/SBAddress.i b/lldb/bindings/interface/SBAddress.i
index de277607d8f5e..6fd06c83d293e 100644
--- a/lldb/bindings/interface/SBAddress.i
+++ b/lldb/bindings/interface/SBAddress.i
@@ -144,27 +144,34 @@ public:
 
 #ifdef SWIGPYTHON
     %pythoncode %{
+        __runtime_error_str = 'This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.'
+
         def __get_load_addr_property__ (self):
-            '''Get the load address for a lldb.SBAddress using the current target.'''
+            '''Get the load address for a lldb.SBAddress using the current target. This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.'''
+            if not target:
+                raise RuntimeError(self.__runtime_error_str)
             return self.GetLoadAddress (target)
 
         def __set_load_addr_property__ (self, load_addr):
-            '''Set the load address for a lldb.SBAddress using the current target.'''
+            '''Set the load address for a lldb.SBAddress using the current target. This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.'''
+            if not target:
+                raise RuntimeError(self.__runtime_error_str)
             return self.SetLoadAddress (load_addr, target)
 
         def __int__(self):
-            '''Convert an address to a load address if there is a process and that process is alive, or to a file address otherwise.'''
-            if process and process.is_alive:
+            '''Convert an address to a load address if there is a process and that process is alive, or to a file address otherwise. This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.'''
+            if not process or not target:
+                raise RuntimeError(self.__runtime_error_str)
+            if process.is_alive:
                 return self.GetLoadAddress (target)
-            else:
-                return self.GetFileAddress ()
+            return self.GetFileAddress ()
 
         def __oct__(self):
-            '''Convert the address to an octal string'''
+            '''Convert the address to an octal string. This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.'''
             return '%o' % int(self)
 
         def __hex__(self):
-            '''Convert the address to an hex string'''
+            '''Convert the address to a hex string. This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.'''
             return '0x%x' % int(self)
 
         module = property(GetModule, None, doc='''A read only property that returns an lldb object that represents the module (lldb.SBModule) that this address resides within.''')
@@ -176,7 +183,7 @@ public:
         offset = property(GetOffset, None, doc='''A read only property that returns the section offset in bytes as an integer.''')
         section = property(GetSection, None, doc='''A read only property that returns an lldb object that represents the section (lldb.SBSection) that this address resides within.''')
         file_addr = property(GetFileAddress, None, doc='''A read only property that returns file address for the section as an integer. This is the address that represents the address as it is found in the object file that defines it.''')
-        load_addr = property(__get_load_addr_property__, __set_load_addr_property__, doc='''A read/write property that gets/sets the SBAddress using load address. The setter resolves SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command) and not in Python based commands, or breakpoint commands.''')
+        load_addr = property(__get_load_addr_property__, __set_load_addr_property__, doc='''A read/write property that gets/sets the SBAddress using load address. This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.''')
     %}
 #endif
 
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Inputs/sbaddress.py b/lldb/test/Shell/ScriptInterpreter/Python/Inputs/sbaddress.py
new file mode 100644
index 0000000000000..132d284549024
--- /dev/null
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Inputs/sbaddress.py
@@ -0,0 +1,7 @@
+import lldb
+
+def test(debugger, command, result, internal_dict):
+    return int(lldb.SBAddress())
+
+def __lldb_init_module(debugger, internal_dict):
+    debugger.HandleCommand('command script add -f sbaddress.test test')
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test b/lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test
new file mode 100644
index 0000000000000..ffcec11d3913d
--- /dev/null
+++ b/lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test
@@ -0,0 +1,8 @@
+# REQUIRES: python
+# UNSUPPORTED: lldb-repro
+#
+# Test that the SBAddress properties throw an exception when used outside of
+# the interactive script interpreter.
+#
+# RUN: %lldb --script-language python -o 'command script import %S/Inputs/sbaddress.py' -o 'test' 2>&1 | FileCheck %s
+# CHECK: RuntimeError: This resolves the SBAddress using the SBTarget from lldb.target so this property can ONLY be used in the interactive script interpreter (i.e. under the lldb script command). For things like Python based commands and breakpoint callbacks use GetLoadAddress instead.

From 20c9bb44ec1a4a795215ff6964d264219f9b05f2 Mon Sep 17 00:00:00 2001
From: Sourabh Singh Tomar <SourabhSingh.Tomar@amd.com>
Date: Sat, 30 May 2020 11:11:09 +0530
Subject: [PATCH 615/770] [DWARF5] Added support for emission of
 .debug_macro.dwo section

This patch adds support for emission of following DWARFv5 macro
forms in .debug_macro.dwo section:

- DW_MACRO_start_file
- DW_MACRO_end_file
- DW_MACRO_define_strx
- DW_MACRO_undef_strx

Reviewed By: dblaikie

Differential Revision: https://reviews.llvm.org/D78866
---
 llvm/include/llvm/MC/MCObjectFileInfo.h    |  2 +
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 16 +++----
 llvm/lib/MC/MCObjectFileInfo.cpp           |  7 ++++
 llvm/test/DebugInfo/X86/debug-macro-dwo.ll | 49 ++++++++++++++++++++++
 4 files changed, 67 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/DebugInfo/X86/debug-macro-dwo.ll

diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h
index e4eb2a06404e9..ca04d8e8d3b68 100644
--- a/llvm/include/llvm/MC/MCObjectFileInfo.h
+++ b/llvm/include/llvm/MC/MCObjectFileInfo.h
@@ -113,6 +113,7 @@ class MCObjectFileInfo {
   MCSection *DwarfLocDWOSection = nullptr;
   MCSection *DwarfStrOffDWOSection = nullptr;
   MCSection *DwarfMacinfoDWOSection = nullptr;
+  MCSection *DwarfMacroDWOSection = nullptr;
 
   /// The DWARF v5 string offset and address table sections.
   MCSection *DwarfStrOffSection = nullptr;
@@ -309,6 +310,7 @@ class MCObjectFileInfo {
   MCSection *getDwarfLoclistsDWOSection() const {
     return DwarfLoclistsDWOSection;
   }
+  MCSection *getDwarfMacroDWOSection() const { return DwarfMacroDWOSection; }
   MCSection *getDwarfMacinfoDWOSection() const {
     return DwarfMacinfoDWOSection;
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 84bc1a13c984e..99883016b5d96 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1291,9 +1291,11 @@ void DwarfDebug::finalizeModuleInfo() {
     // attribute.
     if (CUNode->getMacros()) {
       if (getDwarfVersion() >= 5) {
-        // FIXME: Add support for DWARFv5 DW_AT_macros attribute for split
-        // case.
-        if (!useSplitDwarf())
+        if (useSplitDwarf())
+          TheCU.addSectionDelta(
+              TheCU.getUnitDie(), dwarf::DW_AT_macros, U.getMacroLabelBegin(),
+              TLOF.getDwarfMacroDWOSection()->getBeginSymbol());
+        else
           U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macros,
                             U.getMacroLabelBegin(),
                             TLOF.getDwarfMacroSection()->getBeginSymbol());
@@ -3014,10 +3016,10 @@ void DwarfDebug::emitDebugMacinfo() {
 }
 
 void DwarfDebug::emitDebugMacinfoDWO() {
-  // FIXME: Add support for macro.dwo section.
-  if (getDwarfVersion() >= 5)
-    return;
-  emitDebugMacinfoImpl(Asm->getObjFileLowering().getDwarfMacinfoDWOSection());
+  auto &ObjLower = Asm->getObjFileLowering();
+  emitDebugMacinfoImpl(getDwarfVersion() >= 5
+                           ? ObjLower.getDwarfMacroDWOSection()
+                           : ObjLower.getDwarfMacinfoDWOSection());
 }
 
 // DWARF5 Experimental Separate Dwarf emitters.
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index f79b068349e3f..b77a9635f64c4 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -473,6 +473,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
       Ctx->getELFSection(".debug_rnglists.dwo", DebugSecType, ELF::SHF_EXCLUDE);
   DwarfMacinfoDWOSection =
       Ctx->getELFSection(".debug_macinfo.dwo", DebugSecType, ELF::SHF_EXCLUDE);
+  DwarfMacroDWOSection =
+      Ctx->getELFSection(".debug_macro.dwo", DebugSecType, ELF::SHF_EXCLUDE);
 
   DwarfLoclistsDWOSection =
       Ctx->getELFSection(".debug_loclists.dwo", DebugSecType, ELF::SHF_EXCLUDE);
@@ -649,6 +651,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
       COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
           COFF::IMAGE_SCN_MEM_READ,
       SectionKind::getMetadata(), "debug_macinfo.dwo");
+  DwarfMacroDWOSection = Ctx->getCOFFSection(
+      ".debug_macro.dwo",
+      COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+          COFF::IMAGE_SCN_MEM_READ,
+      SectionKind::getMetadata(), "debug_macro.dwo");
   DwarfInfoDWOSection = Ctx->getCOFFSection(
       ".debug_info.dwo",
       COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
diff --git a/llvm/test/DebugInfo/X86/debug-macro-dwo.ll b/llvm/test/DebugInfo/X86/debug-macro-dwo.ll
new file mode 100644
index 0000000000000..b11f2f9513eda
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/debug-macro-dwo.ll
@@ -0,0 +1,49 @@
+; This test checks emission of .debug_macro.dwo section when
+; -gdwarf-5 -gsplit-dwarf -fdebug-macro is specified.
+
+; RUN: %llc_dwarf -dwarf-version=5 -O0 -filetype=obj \
+; RUN: -split-dwarf-file=foo.dwo < %s | llvm-dwarfdump -v - | FileCheck %s
+
+; CHECK-LABEL:  .debug_info contents:
+; CHECK: DW_AT_macros [DW_FORM_sec_offset] (0x00000000)
+
+; CHECK-LABEL:  .debug_macro.dwo contents:
+; CHECK-NEXT: 0x00000000:
+; CHECK-NEXT: macro header: version = 0x0005, flags = 0x02
+; CHECK-NEXT: DW_MACRO_start_file - lineno: 0 filenum: 0
+; CHECK-NEXT:   DW_MACRO_start_file - lineno: 1 filenum: 1
+; CHECK-NEXT:     DW_MACRO_define_strx - lineno: 1 macro: FOO 5
+; CHECK-NEXT:   DW_MACRO_end_file
+; CHECK-NEXT:   DW_MACRO_start_file - lineno: 2 filenum: 2
+; CHECK-NEXT:     DW_MACRO_undef_strx - lineno: 14 macro: YEA
+; CHECK-NEXT:   DW_MACRO_end_file
+; CHECK-NEXT:   DW_MACRO_undef_strx - lineno: 14 macro: YEA
+; CHECK-NEXT: DW_MACRO_end_file
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p200:32:32-p201:32:32-p202:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, splitDebugFilename: "test.dwo", emissionKind: FullDebug, enums: !2, macros: !3, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/home/", checksumkind: CSK_MD5, checksum: "ef6a7032e0c7ceeef614583f2c00dc80")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIMacroFile(file: !1, nodes: !5)
+!5 = !{!6, !10, !13}
+!6 = !DIMacroFile(line: 1, file: !7, nodes: !8)
+!7 = !DIFile(filename: "./foo.h", directory: "/home/", checksumkind: CSK_MD5, checksum: "0f0cd0e15b44f49d3944992c8dc28661")
+!8 = !{!9}
+!9 = !DIMacro(type: DW_MACINFO_define, line: 1, name: "FOO", value: "5")
+!10 = !DIMacroFile(line: 2, file: !11, nodes: !12)
+!11 = !DIFile(filename: "./bar.h", directory: "/home/", checksumkind: CSK_MD5, checksum: "bf4b34c263eaaa1d7085c18243b8d100")
+!12 = !{!13}
+!13 = !DIMacro(type: DW_MACINFO_undef, line: 14, name: "YEA")
+!14 = !{i32 7, !"Dwarf Version", i32 5}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}

From 2388a096e7865c043e83ece4e26654bd3d1a20d5 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 29 May 2020 23:01:09 -0700
Subject: [PATCH 616/770] [lldb/Test] use GetLoadAddress from scripted thread
 plan

Commit 0800529fe605 adds a runtime error which triggers when using
SBAddress properties that use the current process/target from a
non-interactive session. TestThreadPlanCommands.py was doing exactly
this and this patch fixes that by using GetLoadAddress instead.
---
 .../API/functionalities/thread_plan/wrap_step_over.py     | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lldb/test/API/functionalities/thread_plan/wrap_step_over.py b/lldb/test/API/functionalities/thread_plan/wrap_step_over.py
index a7b39764ceca0..bce34d6916eec 100644
--- a/lldb/test/API/functionalities/thread_plan/wrap_step_over.py
+++ b/lldb/test/API/functionalities/thread_plan/wrap_step_over.py
@@ -3,14 +3,16 @@
 class WrapStepOver():
     def __init__(self, thread_plan, args_data, dict):
         self.plan = thread_plan
-        frame_0 = thread_plan.GetThread().frames[0]
+        thread = thread_plan.GetThread()
+        target = thread.GetProcess().GetTarget()
+        frame_0 = thread.frames[0]
         line_entry = frame_0.line_entry
         start_addr = line_entry.addr
         end_addr = line_entry.end_addr
-        range_size = int(end_addr) - int(start_addr)
+        range_size = end_addr.GetLoadAddress(target) - start_addr.GetLoadAddress(target)
         error = lldb.SBError()
         self.sub_plan = thread_plan.QueueThreadPlanForStepOverRange(start_addr, range_size)
-        
+
     def should_step(self):
         return False
 

From cf97e0ec42b800ade5a18401a35ada96f355693f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 30 Apr 2019 11:50:09 +0300
Subject: [PATCH 617/770] [AArch64] Treat x18 as callee-saved in functions with
 windows calling convention on non-windows OSes

Treat it as callee-saved, and always back it up. When windows code calls
entry points in unix code, marked with the windows calling convention,
that unix code can call other functions that aren't compiled with
-ffixed-x18 which may clobber x18 freely. By backing it up and restoring
it on return, we preserve the register across the function call,
fulfilling this part of the windows calling convention on another OS.

This isn't enough for making sure that x18 is preserved when non-windows
code does a callback to windows code, but is a clear improvement over
the current status quo. Additionally, wine is nowadays building many
modules as PE DLLs, which avoids the callback issue altogether for those
DLLs.

Differential Revision: https://reviews.llvm.org/D61892
---
 .../AArch64/AArch64CallingConvention.td       |  4 +++
 .../Target/AArch64/AArch64FrameLowering.cpp   | 11 +++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    |  7 +++++
 .../Target/AArch64/AArch64RegisterInfo.cpp    |  4 +++
 .../CodeGen/AArch64/aarch64_win64cc_vararg.ll | 30 ++++++++++---------
 .../CodeGen/AArch64/win64cc-backup-x18.ll     | 26 ++++++++++++++++
 6 files changed, 68 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/win64cc-backup-x18.ll

diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index eed87946dab9e..a2219a240b9bc 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -384,6 +384,10 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
                                            D8,  D9,  D10, D11,
                                            D12, D13, D14, D15)>;
 
+// A variant for treating X18 as callee saved, when interfacing with
+// code that needs X18 to be preserved.
+def CSR_AArch64_AAPCS_X18 : CalleeSavedRegs<(add X18, CSR_AArch64_AAPCS)>;
+
 // Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
 // We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
 // and not (LR,FP) pairs.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3abc03c0aaac3..b42bd5769a99c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2389,6 +2389,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned UnspilledCSGPR = AArch64::NoRegister;
   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2437,6 +2438,16 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     }
   }
 
+  if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
+      !Subtarget.isTargetWindows()) {
+    // For Windows calling convention on a non-windows OS, where X18 is treated
+    // as reserved, back up X18 when entering non-windows code (marked with the
+    // Windows calling convention) and restore when returning regardless of
+    // whether the individual function uses it - it might call other functions
+    // that clobber it.
+    SavedRegs.set(AArch64::X18);
+  }
+
   // Calculates the callee saved stack size.
   unsigned CSStackSize = 0;
   unsigned SVECSStackSize = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dfa4b493c2216..6e708f5ea7169 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3958,6 +3958,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   CallingConv::ID CallerCC = CallerF.getCallingConv();
   bool CCMatch = CallerCC == CalleeCC;
 
+  // When using the Windows calling convention on a non-windows OS, we want
+  // to back up and restore X18 in such functions; we can't do a tail call
+  // from those functions.
+  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
+      CalleeCC != CallingConv::Win64)
+    return false;
+
   // Byval parameters hand the function a pointer directly into the stack area
   // we want to reuse during a tail call. Working around this *is* possible (see
   // X86) but less efficient and uglier in LowerCall.
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index f33695e5ac520..886158ca44901 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -71,6 +71,10 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_AArch64_AAPCS_SwiftError_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
     return CSR_AArch64_RT_MostRegs_SaveList;
+  if (MF->getFunction().getCallingConv() == CallingConv::Win64)
+    // This is for OSes other than Windows; Windows is a separate case further
+    // above.
+    return CSR_AArch64_AAPCS_X18_SaveList;
   return CSR_AArch64_AAPCS_SaveList;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
index a45ae74ac49b9..4162329784862 100644
--- a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
@@ -2,16 +2,18 @@
 
 define win64cc void @pass_va(i32 %count, ...) nounwind {
 entry:
-; CHECK: str     x30, [sp, #-80]!
-; CHECK: add     x8, sp, #24
-; CHECK: add     x0, sp, #24
-; CHECK: stp     x1, x2, [sp, #24]
-; CHECK: stp     x3, x4, [sp, #40]
-; CHECK: stp     x5, x6, [sp, #56]
-; CHECK: str     x7, [sp, #72]
+; CHECK: sub     sp, sp, #96
+; CHECK: add     x8, sp, #40
+; CHECK: add     x0, sp, #40
+; CHECK: stp     x30, x18, [sp, #16]
+; CHECK: stp     x1, x2, [sp, #40]
+; CHECK: stp     x3, x4, [sp, #56]
+; CHECK: stp     x5, x6, [sp, #72]
+; CHECK: str     x7, [sp, #88]
 ; CHECK: str     x8, [sp, #8]
 ; CHECK: bl      other_func
-; CHECK: ldr     x30, [sp], #80
+; CHECK: ldp     x30, x18, [sp, #16]
+; CHECK: add     sp, sp, #96
 ; CHECK: ret
   %ap = alloca i8*, align 8
   %ap1 = bitcast i8** %ap to i8*
@@ -27,11 +29,11 @@ declare void @llvm.va_start(i8*) nounwind
 declare void @llvm.va_copy(i8*, i8*) nounwind
 
 ; CHECK-LABEL: f9:
-; CHECK: sub     sp, sp, #16
+; CHECK: str     x18, [sp, #-16]!
 ; CHECK: add     x8, sp, #24
 ; CHECK: add     x0, sp, #24
 ; CHECK: str     x8, [sp, #8]
-; CHECK: add     sp, sp, #16
+; CHECK: ldr     x18, [sp], #16
 ; CHECK: ret
 define win64cc i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind {
 entry:
@@ -43,11 +45,11 @@ entry:
 }
 
 ; CHECK-LABEL: f8:
-; CHECK: sub     sp, sp, #16
+; CHECK: str     x18, [sp, #-16]!
 ; CHECK: add     x8, sp, #16
 ; CHECK: add     x0, sp, #16
 ; CHECK: str     x8, [sp, #8]
-; CHECK: add     sp, sp, #16
+; CHECK: ldr     x18, [sp], #16
 ; CHECK: ret
 define win64cc i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind {
 entry:
@@ -59,12 +61,12 @@ entry:
 }
 
 ; CHECK-LABEL: f7:
-; CHECK: sub     sp, sp, #32
+; CHECK: str     x18, [sp, #-32]!
 ; CHECK: add     x8, sp, #24
 ; CHECK: str     x7, [sp, #24]
 ; CHECK: add     x0, sp, #24
 ; CHECK: str     x8, [sp, #8]
-; CHECK: add     sp, sp, #32
+; CHECK: ldr     x18, [sp], #32
 ; CHECK: ret
 define win64cc i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) nounwind {
 entry:
diff --git a/llvm/test/CodeGen/AArch64/win64cc-backup-x18.ll b/llvm/test/CodeGen/AArch64/win64cc-backup-x18.ll
new file mode 100644
index 0000000000000..936ee3ca9e392
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/win64cc-backup-x18.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+;; Testing that x18 is backed up and restored, and that x29 (if used) still
+;; points to the x29,x30 pair on the stack.
+
+; RUN: llc < %s -mtriple=aarch64-linux-gnu --frame-pointer=non-leaf | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnu --frame-pointer=non-leaf -mattr=+reserve-x18 | FileCheck %s
+
+declare dso_local void @other()
+
+define dso_local win64cc void @func() #0 {
+; CHECK-LABEL: func:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x18, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    bl other
+; CHECK-NEXT:    ldr x18, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  tail call void @other()
+  ret void
+}
+
+attributes #0 = { nounwind }

From 51089db6d7554cefc6c57e6f10a7f876e2dd629e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Sat, 30 May 2020 01:20:14 +0300
Subject: [PATCH 618/770] [test] Regenerate checks in aarch64_win64cc_vararg.ll
 with update_llc_test_checks.py. NFC.

---
 .../CodeGen/AArch64/aarch64_win64cc_vararg.ll | 76 ++++++++++---------
 1 file changed, 41 insertions(+), 35 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
index 4162329784862..fe1a1f0e58136 100644
--- a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
@@ -1,20 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
 
 define win64cc void @pass_va(i32 %count, ...) nounwind {
+; CHECK-LABEL: pass_va:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #96 // =96
+; CHECK-NEXT:    add x8, sp, #40 // =40
+; CHECK-NEXT:    add x0, sp, #40 // =40
+; CHECK-NEXT:    stp x30, x18, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x1, x2, [sp, #40]
+; CHECK-NEXT:    stp x3, x4, [sp, #56]
+; CHECK-NEXT:    stp x5, x6, [sp, #72]
+; CHECK-NEXT:    str x7, [sp, #88]
+; CHECK-NEXT:    str x8, [sp, #8]
+; CHECK-NEXT:    bl other_func
+; CHECK-NEXT:    ldp x30, x18, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96 // =96
+; CHECK-NEXT:    ret
 entry:
-; CHECK: sub     sp, sp, #96
-; CHECK: add     x8, sp, #40
-; CHECK: add     x0, sp, #40
-; CHECK: stp     x30, x18, [sp, #16]
-; CHECK: stp     x1, x2, [sp, #40]
-; CHECK: stp     x3, x4, [sp, #56]
-; CHECK: stp     x5, x6, [sp, #72]
-; CHECK: str     x7, [sp, #88]
-; CHECK: str     x8, [sp, #8]
-; CHECK: bl      other_func
-; CHECK: ldp     x30, x18, [sp, #16]
-; CHECK: add     sp, sp, #96
-; CHECK: ret
   %ap = alloca i8*, align 8
   %ap1 = bitcast i8** %ap to i8*
   call void @llvm.va_start(i8* %ap1)
@@ -28,14 +31,15 @@ declare void @other_func(i8*) local_unnamed_addr
 declare void @llvm.va_start(i8*) nounwind
 declare void @llvm.va_copy(i8*, i8*) nounwind
 
-; CHECK-LABEL: f9:
-; CHECK: str     x18, [sp, #-16]!
-; CHECK: add     x8, sp, #24
-; CHECK: add     x0, sp, #24
-; CHECK: str     x8, [sp, #8]
-; CHECK: ldr     x18, [sp], #16
-; CHECK: ret
 define win64cc i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind {
+; CHECK-LABEL: f9:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x18, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    add x8, sp, #24 // =24
+; CHECK-NEXT:    add x0, sp, #24 // =24
+; CHECK-NEXT:    str x8, [sp, #8]
+; CHECK-NEXT:    ldr x18, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 entry:
   %ap = alloca i8*, align 8
   %ap1 = bitcast i8** %ap to i8*
@@ -44,14 +48,15 @@ entry:
   ret i8* %ap2
 }
 
-; CHECK-LABEL: f8:
-; CHECK: str     x18, [sp, #-16]!
-; CHECK: add     x8, sp, #16
-; CHECK: add     x0, sp, #16
-; CHECK: str     x8, [sp, #8]
-; CHECK: ldr     x18, [sp], #16
-; CHECK: ret
 define win64cc i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind {
+; CHECK-LABEL: f8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x18, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    add x8, sp, #16 // =16
+; CHECK-NEXT:    add x0, sp, #16 // =16
+; CHECK-NEXT:    str x8, [sp, #8]
+; CHECK-NEXT:    ldr x18, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 entry:
   %ap = alloca i8*, align 8
   %ap1 = bitcast i8** %ap to i8*
@@ -60,15 +65,16 @@ entry:
   ret i8* %ap2
 }
 
-; CHECK-LABEL: f7:
-; CHECK: str     x18, [sp, #-32]!
-; CHECK: add     x8, sp, #24
-; CHECK: str     x7, [sp, #24]
-; CHECK: add     x0, sp, #24
-; CHECK: str     x8, [sp, #8]
-; CHECK: ldr     x18, [sp], #32
-; CHECK: ret
 define win64cc i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) nounwind {
+; CHECK-LABEL: f7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x18, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    add x8, sp, #24 // =24
+; CHECK-NEXT:    str x7, [sp, #24]
+; CHECK-NEXT:    add x0, sp, #24 // =24
+; CHECK-NEXT:    str x8, [sp, #8]
+; CHECK-NEXT:    ldr x18, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 entry:
   %ap = alloca i8*, align 8
   %ap1 = bitcast i8** %ap to i8*

From c65c1d78931e262b5117278a8ee0a703d1be073c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Fri, 29 May 2020 22:46:57 -0700
Subject: [PATCH 619/770] [X86] Autogenerate complete checks. NFC

---
 llvm/test/CodeGen/X86/i1narrowfail.ll |  7 +++++--
 llvm/test/CodeGen/X86/narrow_op-1.ll  | 15 +++++++++------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/llvm/test/CodeGen/X86/i1narrowfail.ll b/llvm/test/CodeGen/X86/i1narrowfail.ll
index 4f9a75672bfc6..282d1ac28f596 100644
--- a/llvm/test/CodeGen/X86/i1narrowfail.ll
+++ b/llvm/test/CodeGen/X86/i1narrowfail.ll
@@ -1,8 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: @foo
-; CHECK: orb     $16
 define void @foo(i64* %ptr) {
+; CHECK-LABEL: foo:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    orb $16, (%rdi)
+; CHECK-NEXT:    retq
   %r11 = load i64, i64* %ptr, align 8
   %r12 = or i64 16, %r11
   store i64 %r12, i64* %ptr, align 8
diff --git a/llvm/test/CodeGen/X86/narrow_op-1.ll b/llvm/test/CodeGen/X86/narrow_op-1.ll
index 96751abde28da..dc24b190ea913 100644
--- a/llvm/test/CodeGen/X86/narrow_op-1.ll
+++ b/llvm/test/CodeGen/X86/narrow_op-1.ll
@@ -1,28 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
 
 	%struct.bf = type { i64, i16, i16, i32 }
 @bfi = common global %struct.bf zeroinitializer, align 16
 
 define void @t1() nounwind optsize ssp {
+; CHECK-LABEL: t1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    orb $1, bfi+{{.*}}(%rip)
+; CHECK-NEXT:    retq
 entry:
 	%0 = load i32, i32* bitcast (i16* getelementptr (%struct.bf, %struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
 	%1 = or i32 %0, 65536
 	store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf, %struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
 	ret void
 
-; CHECK-LABEL: t1:
-; CHECK: orb $1
-; CHECK-NEXT: ret
 }
 
 define void @t2() nounwind optsize ssp {
+; CHECK-LABEL: t2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    orl $16842752, bfi+{{.*}}(%rip) # imm = 0x1010000
+; CHECK-NEXT:    retq
 entry:
 	%0 = load i32, i32* bitcast (i16* getelementptr (%struct.bf, %struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
 	%1 = or i32 %0, 16842752
 	store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf, %struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
 	ret void
 
-; CHECK-LABEL: t2:
-; CHECK: orl $16842752
-; CHECK-NEXT: ret
 }

From 92063228f85bfe22a6dfe20bf01c99ffe6ff3130 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 29 May 2020 23:50:26 -0700
Subject: [PATCH 620/770] [lldb/CMake] Fix typo that prevented regenerating the
 bindings

A typo in the GLOB pattern prevented us from detecting changes in the
interface files and triggering SWIG to regenerate the bindings.
---
 lldb/bindings/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/bindings/CMakeLists.txt b/lldb/bindings/CMakeLists.txt
index 92ae402c478e9..ace34191a3c57 100644
--- a/lldb/bindings/CMakeLists.txt
+++ b/lldb/bindings/CMakeLists.txt
@@ -1,4 +1,4 @@
-file(GLOB SWIG_INTERFACES interfaces/*.i)
+file(GLOB SWIG_INTERFACES interface/*.i)
 file(GLOB_RECURSE SWIG_SOURCES *.swig)
 file(GLOB SWIG_HEADERS
   ${LLDB_SOURCE_DIR}/include/lldb/API/*.h

From 81b50a782375cb0c1a4541e6b5bac01c617fd20a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 10:21:36 +0100
Subject: [PATCH 621/770] [SelectionDAG] Remove repeated getOperand() call.
 NFC.

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a108c95bd286..3cba91a11d424 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3546,7 +3546,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
         continue;
 
       SDValue SrcOp = Op.getOperand(i);
-      Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
+      Tmp2 = ComputeNumSignBits(SrcOp, Depth + 1);
 
       // BUILD_VECTOR can implicitly truncate sources, we must handle this.
       if (SrcOp.getValueSizeInBits() != VTBits) {

From bd85b7d6688725e854a694f9f3e8baa6a3077a4a Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Sat, 30 May 2020 12:27:47 +0300
Subject: [PATCH 622/770] [OpenMP][SYCL] Do not crash on attempt to diagnose
 unsupported type use

Summary:
Do not ask for the size of a type if it is dependent. ASTContext doesn't seem
to expect this.

Reviewers: jdoerfert, ABataev, bader

Reviewed By: ABataev

Subscribers: yaxunl, guansong, ebevhan, Anastasia, sstefan1, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80829
---
 clang/lib/Sema/Sema.cpp                               |  3 +++
 clang/test/OpenMP/nvptx_unsupported_type_messages.cpp | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 8c11a1a59e9c9..ffe2e4d4d56ad 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1725,6 +1725,9 @@ void Sema::checkDeviceDecl(const ValueDecl *D, SourceLocation Loc) {
   }
 
   auto CheckType = [&](QualType Ty) {
+    if (Ty->isDependentType())
+      return;
+
     if ((Ty->isFloat16Type() && !Context.getTargetInfo().hasFloat16Type()) ||
         ((Ty->isFloat128Type() ||
           (Ty->isRealFloatingType() && Context.getTypeSize(Ty) == 128)) &&
diff --git a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
index 22ce8175fd05a..e56105adeb834 100644
--- a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
+++ b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
@@ -120,3 +120,14 @@ void hostFoo() {
 long double qa, qb;
 decltype(qa + qb) qc;
 double qd[sizeof(-(-(qc * 2)))];
+
+struct A { };
+
+template <bool>
+struct A_type { typedef A type; };
+
+template <class Sp, class Tp>
+struct B {
+  enum { value = bool(Sp::value) || bool(Tp::value) };
+  typedef typename A_type<value>::type type;
+};

From 1c4238e7a00b28043e209d0b2c0dda76ee5af82e Mon Sep 17 00:00:00 2001
From: David Zarzycki <dave@znu.io>
Date: Sat, 30 May 2020 06:06:43 -0400
Subject: [PATCH 623/770] [libcxx testing] Stop using arbitrary timeouts in one
 test

On a busy and/or slow system, 100ms might not be long enough. Instead,
we now use atomic variables to communicate between threads.
---
 .../notify_one.pass.cpp                       | 122 +++++++++---------
 1 file changed, 64 insertions(+), 58 deletions(-)

diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/notify_one.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/notify_one.pass.cpp
index 07f9868177a8c..60e44485deb3d 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/notify_one.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/notify_one.pass.cpp
@@ -28,75 +28,81 @@ std::mutex mut;
 
 std::atomic_int test1(0);
 std::atomic_int test2(0);
+std::atomic_int ready(2);
+std::atomic_int which(0);
 
 void f1()
 {
-    std::unique_lock<std::mutex> lk(mut);
-    assert(test1 == 0);
-    while (test1 == 0)
-        cv.wait(lk);
-    assert(test1 == 1);
-    test1 = 2;
+  --ready;
+  std::unique_lock<std::mutex> lk(mut);
+  assert(test1 == 0);
+  while (test1 == 0)
+    cv.wait(lk);
+  which = 1;
+  assert(test1 == 1);
+  test1 = 2;
 }
 
 void f2()
 {
-    std::unique_lock<std::mutex> lk(mut);
-    assert(test2 == 0);
-    while (test2 == 0)
-        cv.wait(lk);
-    assert(test2 == 1);
-    test2 = 2;
+  --ready;
+  std::unique_lock<std::mutex> lk(mut);
+  assert(test2 == 0);
+  while (test2 == 0)
+    cv.wait(lk);
+  which = 2;
+  assert(test2 == 1);
+  test2 = 2;
 }
 
 int main(int, char**)
 {
-    std::thread t1(f1);
-    std::thread t2(f2);
-    std::this_thread::sleep_for(std::chrono::milliseconds(100));
-    {
-        std::unique_lock<std::mutex>lk(mut);
-        test1 = 1;
-        test2 = 1;
-    }
-    cv.notify_one();
-    {
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-        std::unique_lock<std::mutex>lk(mut);
-    }
-    if (test1 == 2)
-    {
-        assert(test2 == 1);
-        t1.join();
-        test1 = 0;
-    }
-    else if (test2 == 2)
-    {
-        assert(test1 == 1);
-        t2.join();
-        test2 = 0;
-    }
-    else
-        assert(false);
-    cv.notify_one();
-    {
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-        std::unique_lock<std::mutex>lk(mut);
-    }
-    if (test1 == 2)
-    {
-        assert(test2 == 0);
-        t1.join();
-        test1 = 0;
-    }
-    else if (test2 == 2)
-    {
-        assert(test1 == 0);
-        t2.join();
-        test2 = 0;
-    }
-    else
-        assert(false);
+  std::thread t1(f1);
+  std::thread t2(f2);
+  while (ready > 0)
+    std::this_thread::yield();
+  // In case the threads were preempted right after the atomic decrement but
+  // before cv.wait(), we yield one more time.
+  std::this_thread::yield();
+  {
+    std::unique_lock<std::mutex>lk(mut);
+    test1 = 1;
+    test2 = 1;
+    ready = 1;
+  }
+  cv.notify_one();
+  {
+    while (which == 0)
+      std::this_thread::yield();
+    std::unique_lock<std::mutex>lk(mut);
+  }
+  if (test1 == 2) {
+    assert(test2 == 1);
+    t1.join();
+    test1 = 0;
+  } else {
+    assert(test1 == 1);
+    assert(test2 == 2);
+    t2.join();
+    test2 = 0;
+  }
+  which = 0;
+  cv.notify_one();
+  {
+    while (which == 0)
+      std::this_thread::yield();
+    std::unique_lock<std::mutex>lk(mut);
+  }
+  if (test1 == 2) {
+    assert(test2 == 0);
+    t1.join();
+    test1 = 0;
+  } else {
+    assert(test1 == 0);
+    assert(test2 == 2);
+    t2.join();
+    test2 = 0;
+  }
 
   return 0;
 }

From f5a59bea4ff9369c6dfcf947c7a58a9d570491b9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 11:29:17 +0100
Subject: [PATCH 624/770] IPDBRawSymbol.h - reduce StringRef.h include to
 forward declaration. NFC.

---
 llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
index b24e712e3b781..5480d6b1d6fc7 100644
--- a/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
@@ -12,12 +12,12 @@
 #include "PDBTypes.h"
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include <memory>
 
 namespace llvm {
 class raw_ostream;
+class StringRef;
 
 namespace pdb {
 

From f75e91fa131cd91efdb0935d163d8858390f459a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 12:00:17 +0100
Subject: [PATCH 625/770] IPDBRawSymbol.h - remove already declared forward
 declarations. NFC.

PDBTypes.h holds most PDB forward declarations already, move IPDBSession in there as well.
---
 llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h | 4 ----
 llvm/include/llvm/DebugInfo/PDB/PDBTypes.h      | 1 +
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
index 5480d6b1d6fc7..f59e933ca575f 100644
--- a/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h
@@ -21,10 +21,6 @@ class StringRef;
 
 namespace pdb {
 
-class IPDBSession;
-class PDBSymbolTypeVTable;
-class PDBSymbolTypeVTableShape;
-
 enum class PdbSymbolIdField : uint32_t {
   None = 0,
   SymIndexId = 1 << 0,
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h b/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
index c26d8d1ed10c9..66c842336e901 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -28,6 +28,7 @@ class IPDBDataStream;
 class IPDBInjectedSource;
 class IPDBLineNumber;
 class IPDBSectionContrib;
+class IPDBSession;
 class IPDBSourceFile;
 class IPDBTable;
 class PDBSymDumper;

From a8eb06ccffdd3ee1b004be489cf9a970035c523c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 12:17:13 +0100
Subject: [PATCH 626/770] Architecture.h - reduce includes to forward
 declarations. NFC.

Move includes to Architecture.cpp.
---
 llvm/include/llvm/TextAPI/MachO/Architecture.h | 9 ++++++---
 llvm/lib/TextAPI/MachO/Architecture.cpp        | 2 ++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/TextAPI/MachO/Architecture.h b/llvm/include/llvm/TextAPI/MachO/Architecture.h
index 3898cbada68f7..c7ffea78962da 100644
--- a/llvm/include/llvm/TextAPI/MachO/Architecture.h
+++ b/llvm/include/llvm/TextAPI/MachO/Architecture.h
@@ -13,11 +13,14 @@
 #ifndef LLVM_TEXTAPI_MACHO_ARCHITECTURE_H
 #define LLVM_TEXTAPI_MACHO_ARCHITECTURE_H
 
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
+class raw_ostream;
+class StringRef;
+class Triple;
+
 namespace MachO {
 
 /// Defines the architecture slices that are supported by Text-based Stub files.
diff --git a/llvm/lib/TextAPI/MachO/Architecture.cpp b/llvm/lib/TextAPI/MachO/Architecture.cpp
index 699fb5f4587ad..e1c2d42927ec8 100644
--- a/llvm/lib/TextAPI/MachO/Architecture.cpp
+++ b/llvm/lib/TextAPI/MachO/Architecture.cpp
@@ -12,7 +12,9 @@
 
 #include "llvm/TextAPI/MachO/Architecture.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 namespace MachO {

From d1765cf1974bdf298d7cec8302013f314a70a032 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 12:36:16 +0100
Subject: [PATCH 627/770] ArchitectureSet.h - reduce raw_ostream.h include to
 forward declaration. NFC.

Move raw_ostream.h include to ArchitectureSet.cpp.
---
 llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h | 4 +++-
 llvm/lib/TextAPI/MachO/ArchitectureSet.cpp        | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h b/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h
index 6e4ede6275b4f..e3a8dad45d42e 100644
--- a/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h
+++ b/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h
@@ -13,14 +13,16 @@
 #ifndef LLVM_TEXTAPI_MACHO_ARCHITECTURE_SET_H
 #define LLVM_TEXTAPI_MACHO_ARCHITECTURE_SET_H
 
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/TextAPI/MachO/Architecture.h"
 #include <cstddef>
 #include <iterator>
 #include <limits>
+#include <string>
 #include <vector>
 
 namespace llvm {
+class raw_ostream;
+
 namespace MachO {
 
 class ArchitectureSet {
diff --git a/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp b/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp
index a05d3ebd67bc9..f665706fad811 100644
--- a/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp
+++ b/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/TextAPI/MachO/ArchitectureSet.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 namespace MachO {

From 5a918b7e134cbaa29d272ed38c5dabcecac8aac8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 12:48:46 +0100
Subject: [PATCH 628/770] ArchitectureSet.h - add missing <tuple> include.

MSVC seems to implicitly include this from <utility> but other toolchains don't
---
 llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h b/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h
index e3a8dad45d42e..c48a4a7023631 100644
--- a/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h
+++ b/llvm/include/llvm/TextAPI/MachO/ArchitectureSet.h
@@ -18,6 +18,7 @@
 #include <iterator>
 #include <limits>
 #include <string>
+#include <tuple>
 #include <vector>
 
 namespace llvm {

From 0f461c39df70a0b555745770a445850e39e1dff7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 13:07:57 +0100
Subject: [PATCH 629/770] TBEHandler.h - remove unnecessary VersionTuple
 forward declaration. NFC.

We already have to include VersionTuple.h
---
 llvm/include/llvm/TextAPI/ELF/TBEHandler.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/include/llvm/TextAPI/ELF/TBEHandler.h b/llvm/include/llvm/TextAPI/ELF/TBEHandler.h
index 1748fd13f3dce..76484410987fa 100644
--- a/llvm/include/llvm/TextAPI/ELF/TBEHandler.h
+++ b/llvm/include/llvm/TextAPI/ELF/TBEHandler.h
@@ -24,7 +24,6 @@ namespace llvm {
 class raw_ostream;
 class Error;
 class StringRef;
-class VersionTuple;
 
 namespace elfabi {
 

From 50bdd60731130dbde81fa477ba8916c58039d73a Mon Sep 17 00:00:00 2001
From: mydeveloperday <mydeveloperday@gmail.com>
Date: Sat, 30 May 2020 13:00:35 +0100
Subject: [PATCH 630/770] [clang-format] [PR46130] When editing a file with
 unbalanced {} the namespace comment fixer can incorrectly comment the wrong
 closing brace

Summary:
https://bugs.llvm.org/show_bug.cgi?id=46130   from Twitter https://twitter.com/ikautak/status/1265998988232159232

I have seen this myself many times, typically when you have format-on-save enabled and work in an editor where you are constantly saving (:w muscle memory).

If you are in the middle of editing and have somehow missed a { or } somewhere in your code, the namespace comment fixer will have put the namespace comment on the previous closing brace, often way below where you are, at the bottom of your file.

This leads to you having to fix up the bottom of the file.

This revision prevents that from happening by performing an initial pass over the tokens, simply counting the number of `{` and `}` and ensuring they balance.

If they don't balance we don't do any namespace fixing as it will likely be unstable and incorrect.

Reviewed By: curdeius

Subscribers: cfe-commits

Tags: #clang, #clang-format

Differential Revision: https://reviews.llvm.org/D80830
---
 .../lib/Format/NamespaceEndCommentsFixer.cpp  | 17 +++++++++++
 .../Format/NamespaceEndCommentsFixerTest.cpp  | 28 +++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/clang/lib/Format/NamespaceEndCommentsFixer.cpp b/clang/lib/Format/NamespaceEndCommentsFixer.cpp
index 92707150fcdba..97de45bd19659 100644
--- a/clang/lib/Format/NamespaceEndCommentsFixer.cpp
+++ b/clang/lib/Format/NamespaceEndCommentsFixer.cpp
@@ -205,6 +205,23 @@ std::pair<tooling::Replacements, unsigned> NamespaceEndCommentsFixer::analyze(
   const SourceManager &SourceMgr = Env.getSourceManager();
   AffectedRangeMgr.computeAffectedLines(AnnotatedLines);
   tooling::Replacements Fixes;
+
+  // Spin through the lines and ensure we have balanced braces.
+  int Braces = 0;
+  for (size_t I = 0, E = AnnotatedLines.size(); I != E; ++I) {
+    FormatToken *Tok = AnnotatedLines[I]->First;
+    while (Tok) {
+      Braces += Tok->is(tok::l_brace) ? 1 : Tok->is(tok::r_brace) ? -1 : 0;
+      Tok = Tok->Next;
+    }
+  }
+  // Don't attempt to comment unbalanced braces or this can
+  // lead to comments being placed on the closing brace which isn't
+  // the matching brace of the namespace. (occurs during incomplete editing).
+  if (Braces != 0) {
+    return {Fixes, 0};
+  }
+
   std::string AllNamespaceNames = "";
   size_t StartLineIndex = SIZE_MAX;
   StringRef NamespaceTokenText;
diff --git a/clang/unittests/Format/NamespaceEndCommentsFixerTest.cpp b/clang/unittests/Format/NamespaceEndCommentsFixerTest.cpp
index fee8597b4330e..463afa67e8b02 100644
--- a/clang/unittests/Format/NamespaceEndCommentsFixerTest.cpp
+++ b/clang/unittests/Format/NamespaceEndCommentsFixerTest.cpp
@@ -1089,6 +1089,34 @@ TEST_F(NamespaceEndCommentsFixerTest, HandlesInlineAtEndOfLine_PR32438) {
                                     "void d() {\n"
                                     "}\n"));
 }
+
+TEST_F(NamespaceEndCommentsFixerTest, IgnoreUnbalanced) {
+  EXPECT_EQ("namespace A {\n"
+            "class Foo {\n"
+            "}\n"
+            "}// namespace A\n",
+            fixNamespaceEndComments("namespace A {\n"
+                                    "class Foo {\n"
+                                    "}\n"
+                                    "}\n"));
+  EXPECT_EQ("namespace A {\n"
+            "class Foo {\n"
+            "}\n",
+            fixNamespaceEndComments("namespace A {\n"
+                                    "class Foo {\n"
+                                    "}\n"));
+
+  EXPECT_EQ("namespace A {\n"
+            "class Foo {\n"
+            "}\n"
+            "}\n"
+            "}\n",
+            fixNamespaceEndComments("namespace A {\n"
+                                    "class Foo {\n"
+                                    "}\n"
+                                    "}\n"
+                                    "}\n"));
+}
 } // end namespace
 } // end namespace format
 } // end namespace clang

From 179f8ad08de07d52dbccd4da339103a4d03937c7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 13:17:19 +0100
Subject: [PATCH 631/770] PackedVersion.h - reduce includes to forward
 declarations. NFC.

---
 llvm/include/llvm/TextAPI/MachO/PackedVersion.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/TextAPI/MachO/PackedVersion.h b/llvm/include/llvm/TextAPI/MachO/PackedVersion.h
index 2d0138097dd92..0d9158ae5f0d3 100644
--- a/llvm/include/llvm/TextAPI/MachO/PackedVersion.h
+++ b/llvm/include/llvm/TextAPI/MachO/PackedVersion.h
@@ -13,10 +13,13 @@
 #ifndef LLVM_TEXTAPI_MACHO_PACKED_VERSION_H
 #define LLVM_TEXTAPI_MACHO_PACKED_VERSION_H
 
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
+class raw_ostream;
+class StringRef;
+
 namespace MachO {
 
 class PackedVersion {

From 9d0bfcec8377acc29e8a9983e8343dba8e7e900d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 14:02:14 +0100
Subject: [PATCH 632/770] [SelectionDAG] ComputeNumSignBits - use Valid Min/Max
 shift amount helpers directly. NFCI.

We are calling getValidShiftAmountConstant first followed by getValidMinimumShiftAmountConstant/getValidMaximumShiftAmountConstant if that failed. But both are used in the same way in ComputeNumSignBits and the Min/Max variants call getValidShiftAmountConstant internally anyhow.
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3cba91a11d424..232b1ea6bb78e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3657,23 +3657,17 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
   case ISD::SRA:
     Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
     // SRA X, C -> adds C sign bits.
-    if (const APInt *ShAmt = getValidShiftAmountConstant(Op, DemandedElts))
-      Tmp = std::min<uint64_t>(Tmp + ShAmt->getZExtValue(), VTBits);
-    else if (const APInt *ShAmt =
-                 getValidMinimumShiftAmountConstant(Op, DemandedElts))
+    if (const APInt *ShAmt =
+            getValidMinimumShiftAmountConstant(Op, DemandedElts))
       Tmp = std::min<uint64_t>(Tmp + ShAmt->getZExtValue(), VTBits);
     return Tmp;
   case ISD::SHL:
-    if (const APInt *ShAmt = getValidShiftAmountConstant(Op, DemandedElts)) {
+    if (const APInt *ShAmt =
+            getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
       // shl destroys sign bits, ensure it doesn't shift out all sign bits.
       Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
       if (ShAmt->ult(Tmp))
         return Tmp - ShAmt->getZExtValue();
-    } else if (const APInt *ShAmt =
-                   getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
-      Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
-      if (ShAmt->ult(Tmp))
-        return Tmp - ShAmt->getZExtValue();
     }
     break;
   case ISD::AND:

From 63824ad94778c54c4c17210591448af746d092ad Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 14:04:55 +0100
Subject: [PATCH 633/770] [TargetLowering] SimplifyDemandedBits - remove shift
 amount clamps from getValidShiftAmountConstant calls. NFC.

getValidShiftAmountConstant only returns a value if the shift amount is in range, so we don't need to check it again.
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 51 ++++++++-----------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b6fdddc46ede5..c9170fcdb48b6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1366,19 +1366,16 @@ bool TargetLowering::SimplifyDemandedBits(
         if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) {
           if (const APInt *SA2 =
                   TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
-            if (SA2->ult(BitWidth)) {
-              unsigned C1 = SA2->getZExtValue();
-              unsigned Opc = ISD::SHL;
-              int Diff = ShAmt - C1;
-              if (Diff < 0) {
-                Diff = -Diff;
-                Opc = ISD::SRL;
-              }
-
-              SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
-              return TLO.CombineTo(
-                  Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
+            unsigned C1 = SA2->getZExtValue();
+            unsigned Opc = ISD::SHL;
+            int Diff = ShAmt - C1;
+            if (Diff < 0) {
+              Diff = -Diff;
+              Opc = ISD::SRL;
             }
+            SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
+            return TLO.CombineTo(
+                Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
           }
         }
       }
@@ -1412,7 +1409,7 @@ bool TargetLowering::SimplifyDemandedBits(
             InnerOp.hasOneUse()) {
           if (const APInt *SA2 =
                   TLO.DAG.getValidShiftAmountConstant(InnerOp, DemandedElts)) {
-            unsigned InnerShAmt = SA2->getLimitedValue(InnerBits);
+            unsigned InnerShAmt = SA2->getZExtValue();
             if (InnerShAmt < ShAmt && InnerShAmt < InnerBits &&
                 DemandedBits.getActiveBits() <=
                     (InnerBits - InnerShAmt + ShAmt) &&
@@ -1462,23 +1459,19 @@ bool TargetLowering::SimplifyDemandedBits(
       // are never demanded.
       // TODO - support non-uniform vector amounts.
       if (Op0.getOpcode() == ISD::SHL) {
-        if (const APInt *SA2 =
-                TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
-          if (!DemandedBits.intersects(
-                  APInt::getHighBitsSet(BitWidth, ShAmt))) {
-            if (SA2->ult(BitWidth)) {
-              unsigned C1 = SA2->getZExtValue();
-              unsigned Opc = ISD::SRL;
-              int Diff = ShAmt - C1;
-              if (Diff < 0) {
-                Diff = -Diff;
-                Opc = ISD::SHL;
-              }
-
-              SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
-              return TLO.CombineTo(
-                  Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
+        if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
+          if (const APInt *SA2 =
+                  TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
+            unsigned C1 = SA2->getZExtValue();
+            unsigned Opc = ISD::SRL;
+            int Diff = ShAmt - C1;
+            if (Diff < 0) {
+              Diff = -Diff;
+              Opc = ISD::SHL;
             }
+            SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
+            return TLO.CombineTo(
+                Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
           }
         }
       }

From e5bc07634d6277bd2e071511d1b826950016972c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 14:30:19 +0100
Subject: [PATCH 634/770] SafeStackLayout.cpp - remove includes directly
 defined in SafeStackLayout.h module header. NFC.

---
 llvm/lib/CodeGen/SafeStackLayout.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SafeStackLayout.cpp b/llvm/lib/CodeGen/SafeStackLayout.cpp
index 09964866e4d3b..1e658ed9e3a1b 100644
--- a/llvm/lib/CodeGen/SafeStackLayout.cpp
+++ b/llvm/lib/CodeGen/SafeStackLayout.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "SafeStackLayout.h"
-#include "SafeStackColoring.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"

From 2b881f7911d585086a92fdb57e93109c432be1fb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 14:32:36 +0100
Subject: [PATCH 635/770] CriticalAntiDepBreaker.cpp - remove includes directly
 defined in CriticalAntiDepBreaker.h header. NFC.

---
 llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 9b1792650a9ee..7ae42b0102611 100644
--- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -14,7 +14,6 @@
 
 #include "CriticalAntiDepBreaker.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -33,9 +32,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
-#include <map>
 #include <utility>
-#include <vector>
 
 using namespace llvm;
 

From e6aba43cda848f4a8cfa5ce9f174b77def10e9df Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 30 May 2020 14:38:02 +0100
Subject: [PATCH 636/770] SafeStackColoring.h - reduce Instructions.h include
 to forward declaration. NFC.

SafeStackColoring.cpp - remove includes directly defined in SafeStackColoring.h header. NFC.
---
 llvm/lib/CodeGen/SafeStackColoring.cpp | 7 -------
 llvm/lib/CodeGen/SafeStackColoring.h   | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SafeStackColoring.cpp b/llvm/lib/CodeGen/SafeStackColoring.cpp
index 04a5c4b6d892f..27600f2d6eac3 100644
--- a/llvm/lib/CodeGen/SafeStackColoring.cpp
+++ b/llvm/lib/CodeGen/SafeStackColoring.cpp
@@ -7,14 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "SafeStackColoring.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
-#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
@@ -23,10 +19,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
 #include <tuple>
-#include <utility>
 
 using namespace llvm;
 using namespace llvm::safestack;
diff --git a/llvm/lib/CodeGen/SafeStackColoring.h b/llvm/lib/CodeGen/SafeStackColoring.h
index b696b1b6baed1..d917d02099bc6 100644
--- a/llvm/lib/CodeGen/SafeStackColoring.h
+++ b/llvm/lib/CodeGen/SafeStackColoring.h
@@ -13,13 +13,13 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <utility>
 
 namespace llvm {
 
+class AllocaInst;
 class BasicBlock;
 class Function;
 class Instruction;

From d99a1848c4f8ca164c0c0768e10eafc850b2a68a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 30 May 2020 16:20:42 +0100
Subject: [PATCH 637/770] [BasicAA] Use known lower bounds for index values for
 size based check.

Currently, BasicAA does not exploit information about value ranges of
indexes. For example, consider the 2 pointers %a = %base and
%b = %base + %stride below, assuming they are used to access 4 elements.

If we know that %stride >= 4, we know the accesses do not alias. If
%stride is a constant, BasicAA currently gets that. But if the >= 4
constraint is encoded using an assume, it misses the NoAlias.

This patch extends DecomposedGEP to include an additional MinOtherOffset
field, which tracks the constant offset similar to the existing
OtherOffset, with the difference that it also includes non-negative
lower bounds on the range of the index value. When checking if the
distance between 2 accesses exceeds the access size, we can use this
improved bound.

For now this is limited to using non-negative lower bounds for indices,
as this conveniently skips cases where we do not have a useful lower
bound (because it is not constrained). We potentially miss out in cases
where the lower bound is constrained but negative, but that can be
exploited in the future.

Reviewers: sanjoy, hfinkel, reames, asbirlea

Reviewed By: asbirlea

Differential Revision: https://reviews.llvm.org/D76194
---
 .../llvm/Analysis/BasicAliasAnalysis.h        |  5 ++
 llvm/lib/Analysis/BasicAliasAnalysis.cpp      | 33 ++++++--
 .../Analysis/BasicAA/assume-index-positive.ll | 82 +++++++++++++++++++
 llvm/test/Analysis/BasicAA/bug.23626.ll       |  4 +-
 4 files changed, 115 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
index 403510dbb3a3a..fdab033f70038 100644
--- a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
@@ -140,6 +140,11 @@ class BasicAAResult : public AAResultBase<BasicAAResult> {
     // Total constant offset w.r.t the base from indexing through
     // pointers/arrays/vectors
     APInt OtherOffset;
+    // Constant offset w.r.t the base from indexing through
+    // pointers/arrays/vectors, including the lower bounds of index variables,
+    // if there are any. Currently only known non-negative lower bounds are
+    // added.
+    APInt MinOtherOffset;
     // Scaled variable (non-constant) indices.
     SmallVector<VariableGEPIndex, 4> VarIndices;
     // Is GEP index scale compile-time constant.
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 5fdfb7d490731..a55443b7ec6fd 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -31,6 +31,7 @@
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -564,10 +565,11 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
       if (const ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
         if (CIdx->isZero())
           continue;
-        Decomposed.OtherOffset +=
-            (DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize() *
-             CIdx->getValue().sextOrSelf(MaxPointerSize))
-                .sextOrTrunc(MaxPointerSize);
+        APInt Offset = (DL.getTypeAllocSize(GTI.getIndexedType()) *
+                        CIdx->getValue().sextOrSelf(MaxPointerSize))
+                           .sextOrTrunc(MaxPointerSize);
+        Decomposed.OtherOffset += Offset;
+        Decomposed.MinOtherOffset += Offset;
         continue;
       }
 
@@ -611,7 +613,18 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
         if (PointerSize > Width)
           SExtBits += PointerSize - Width;
       } else {
-        Decomposed.OtherOffset += IndexOffset.sextOrTrunc(MaxPointerSize) * Scale;
+        APInt Offset = IndexOffset.sextOrTrunc(MaxPointerSize) * Scale;
+        Decomposed.OtherOffset += Offset;
+        APInt IndexBound =
+            computeConstantRange(Index, true, AC, dyn_cast<Instruction>(GEPOp))
+                .getLower()
+                .sextOrTrunc(MaxPointerSize);
+        // If we find a non-negative lower bound for the index value, we can
+        // improve the known offset to include it. By just using non-negative
+        // lower bounds, we conveniently skip any index values for which we do
+        // not find a useful lower bound.
+        if (IndexBound.isNonNegative())
+          Decomposed.MinOtherOffset += Offset + IndexBound * Scale;
         Scale *= IndexScale.sextOrTrunc(MaxPointerSize);
       }
 
@@ -1328,6 +1341,8 @@ AliasResult BasicAAResult::aliasGEP(
   DecompGEP2.StructOffset = DecompGEP2.OtherOffset = APInt(MaxPointerSize, 0);
   DecompGEP1.HasCompileTimeConstantScale =
       DecompGEP2.HasCompileTimeConstantScale = true;
+  DecompGEP1.MinOtherOffset = APInt(MaxPointerSize, 0);
+  DecompGEP2.MinOtherOffset = APInt(MaxPointerSize, 0);
 
   bool GEP1MaxLookupReached =
     DecomposeGEPExpression(GEP1, DecompGEP1, DL, &AC, DT);
@@ -1342,6 +1357,8 @@ AliasResult BasicAAResult::aliasGEP(
 
   APInt GEP1BaseOffset = DecompGEP1.StructOffset + DecompGEP1.OtherOffset;
   APInt GEP2BaseOffset = DecompGEP2.StructOffset + DecompGEP2.OtherOffset;
+  APInt GEP1BaseOffsetMin = DecompGEP1.StructOffset + DecompGEP1.MinOtherOffset;
+  APInt GEP2BaseOffsetMin = DecompGEP2.StructOffset + DecompGEP2.MinOtherOffset;
 
   assert(DecompGEP1.Base == UnderlyingV1 && DecompGEP2.Base == UnderlyingV2 &&
          "DecomposeGEPExpression returned a result different from "
@@ -1416,6 +1433,7 @@ AliasResult BasicAAResult::aliasGEP(
     // Subtract the GEP2 pointer from the GEP1 pointer to find out their
     // symbolic difference.
     GEP1BaseOffset -= GEP2BaseOffset;
+    GEP1BaseOffsetMin -= GEP2BaseOffsetMin;
     GetIndexDifference(DecompGEP1.VarIndices, DecompGEP2.VarIndices);
 
   } else {
@@ -1534,10 +1552,11 @@ AliasResult BasicAAResult::aliasGEP(
     // If we know all the variables are positive, then GEP1 >= GEP1BasePtr.
     // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers
     // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr.
-    if (AllPositive && GEP1BaseOffset.sgt(0) &&
+    if (AllPositive && GEP1BaseOffsetMin.sgt(0) &&
         V2Size != LocationSize::unknown() &&
-        GEP1BaseOffset.uge(V2Size.getValue()))
+        GEP1BaseOffsetMin.uge(V2Size.getValue())) {
       return NoAlias;
+    }
 
     if (constantOffsetHeuristic(DecompGEP1.VarIndices, V1Size, V2Size,
                                 GEP1BaseOffset, &AC, DT))
diff --git a/llvm/test/Analysis/BasicAA/assume-index-positive.ll b/llvm/test/Analysis/BasicAA/assume-index-positive.ll
index d89738a23b034..630620f0d8bde 100644
--- a/llvm/test/Analysis/BasicAA/assume-index-positive.ll
+++ b/llvm/test/Analysis/BasicAA/assume-index-positive.ll
@@ -113,4 +113,86 @@ define void @test4(double* %ptr, i32 %skip) {
   ret void
 }
 
+
+define void @test5(double* %ptr, i32 %stride) {
+; CHECK-LABEL: Function: test5: 4 pointers, 1 call sites
+; CHECK-NEXT:  MustAlias:   <6 x double>* %col.ptr.1, double* %ptr
+; CHECK-NEXT:  NoAlias:    double* %col.ptr.2, double* %ptr
+; CHECK-NEXT:  MayAlias:   <6 x double>* %col.ptr.1, double* %col.ptr.2
+; CHECK-NEXT:  NoAlias:    <6 x double>* %col.ptr.2.cast, double* %ptr
+; CHECK-NEXT:  MayAlias:   <6 x double>* %col.ptr.1, <6 x double>* %col.ptr.2.cast
+; CHECK-NEXT:  MustAlias:  <6 x double>* %col.ptr.2.cast, double* %col.ptr.2
+; CHECK-NEXT:  NoModRef:  Ptr: double* %ptr <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: <6 x double>* %col.ptr.1 <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: double* %col.ptr.2   <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: <6 x double>* %col.ptr.2.cast    <->  call void @llvm.assume(i1 %gt)
+;
+  %gt = icmp sge i32 %stride, 5
+  call void @llvm.assume(i1 %gt)
+  %col.ptr.1 = bitcast double* %ptr to <6 x double>*
+  %lv.1 = load <6 x double>, <6 x double>* %col.ptr.1, align 8
+  %col.ptr.2= getelementptr double, double* %ptr, i32 %stride
+  %col.ptr.2.cast = bitcast double* %col.ptr.2 to <6 x double>*
+  %lv.2 = load <6 x double>, <6 x double>* %col.ptr.2.cast, align 8
+  %res.1 = fadd <6 x double> %lv.1, %lv.1
+  %res.2 = fadd <6 x double> %lv.2, %lv.2
+  store <6 x double> %res.1, <6 x double>* %col.ptr.1, align 8
+  store <6 x double> %res.2, <6 x double>* %col.ptr.2.cast, align 8
+  ret void
+}
+
+define void @test6(double* %ptr, i32 %stride) {
+; CHECK-LABEL: Function: test6: 4 pointers, 1 call sites
+; CHECK-NEXT:  MustAlias:  <6 x double>* %col.ptr.1, double* %ptr
+; CHECK-NEXT:  NoAlias:    double* %col.ptr.2, double* %ptr
+; CHECK-NEXT:  NoAlias:    <6 x double>* %col.ptr.1, double* %col.ptr.2
+; CHECK-NEXT:  NoAlias:    <6 x double>* %col.ptr.2.cast, double* %ptr
+; CHECK-NEXT:  NoAlias:    <6 x double>* %col.ptr.1, <6 x double>* %col.ptr.2.cast
+; CHECK-NEXT:  MustAlias:  <6 x double>* %col.ptr.2.cast, double* %col.ptr.2
+; CHECK-NEXT:  NoModRef:  Ptr: double* %ptr <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: <6 x double>* %col.ptr.1 <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: double* %col.ptr.2   <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: <6 x double>* %col.ptr.2.cast    <->  call void @llvm.assume(i1 %gt)
+;
+  %gt = icmp sge i32 %stride, 6
+  call void @llvm.assume(i1 %gt)
+  %col.ptr.1 = bitcast double* %ptr to <6 x double>*
+  %lv.1 = load <6 x double>, <6 x double>* %col.ptr.1, align 8
+  %col.ptr.2= getelementptr double, double* %ptr, i32 %stride
+  %col.ptr.2.cast = bitcast double* %col.ptr.2 to <6 x double>*
+  %lv.2 = load <6 x double>, <6 x double>* %col.ptr.2.cast, align 8
+  %res.1 = fadd <6 x double> %lv.1, %lv.1
+  %res.2 = fadd <6 x double> %lv.2, %lv.2
+  store <6 x double> %res.1, <6 x double>* %col.ptr.1, align 8
+  store <6 x double> %res.2, <6 x double>* %col.ptr.2.cast, align 8
+  ret void
+}
+
+define void @test7(double* %ptr, i32 %stride) {
+; CHECK-LABEL: Function: test7: 4 pointers, 1 call sites
+; CHECK-NEXT:  MustAlias:   <6 x double>* %col.ptr.1, double* %ptr
+; CHECK-NEXT:  MayAlias:    double* %col.ptr.2, double* %ptr
+; CHECK-NEXT:  MayAlias:    <6 x double>* %col.ptr.1, double* %col.ptr.2
+; CHECK-NEXT:  MayAlias:    <6 x double>* %col.ptr.2.cast, double* %ptr
+; CHECK-NEXT:  MayAlias:    <6 x double>* %col.ptr.1, <6 x double>* %col.ptr.2.cast
+; CHECK-NEXT:  MustAlias:   <6 x double>* %col.ptr.2.cast, double* %col.ptr.2
+; CHECK-NEXT:  NoModRef:  Ptr: double* %ptr <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: <6 x double>* %col.ptr.1 <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: double* %col.ptr.2   <->  call void @llvm.assume(i1 %gt)
+; CHECK-NEXT:  NoModRef:  Ptr: <6 x double>* %col.ptr.2.cast    <->  call void @llvm.assume(i1 %gt)
+;
+  %gt = icmp sge i32 %stride, 0
+  call void @llvm.assume(i1 %gt)
+  %col.ptr.1 = bitcast double* %ptr to <6 x double>*
+  %lv.1 = load <6 x double>, <6 x double>* %col.ptr.1, align 8
+  %col.ptr.2= getelementptr double, double* %ptr, i32 %stride
+  %col.ptr.2.cast = bitcast double* %col.ptr.2 to <6 x double>*
+  %lv.2 = load <6 x double>, <6 x double>* %col.ptr.2.cast, align 8
+  %res.1 = fadd <6 x double> %lv.1, %lv.1
+  %res.2 = fadd <6 x double> %lv.2, %lv.2
+  store <6 x double> %res.1, <6 x double>* %col.ptr.1, align 8
+  store <6 x double> %res.2, <6 x double>* %col.ptr.2.cast, align 8
+  ret void
+}
+
 declare void @llvm.assume(i1 %cond)
diff --git a/llvm/test/Analysis/BasicAA/bug.23626.ll b/llvm/test/Analysis/BasicAA/bug.23626.ll
index 7d5b5ad06698d..da423442759a6 100644
--- a/llvm/test/Analysis/BasicAA/bug.23626.ll
+++ b/llvm/test/Analysis/BasicAA/bug.23626.ll
@@ -4,9 +4,9 @@ target triple = "x86_64-apple-darwin13.4.0"
 
 ; CHECK-LABEL: compute1
 ; CHECK: MayAlias:	i32* %arrayidx8, i32* %out
-; CHECK: MayAlias:	i32* %arrayidx11, i32* %out
+; CHECK: NoAlias:	i32* %arrayidx11, i32* %out
 ; CHECK: MayAlias:	i32* %arrayidx11, i32* %arrayidx8
-; CHECK: MayAlias:	i32* %arrayidx14, i32* %out
+; CHECK: NoAlias:	i32* %arrayidx14, i32* %out
 ; CHECK: MayAlias:	i32* %arrayidx14, i32* %arrayidx8
 ; CHECK: MayAlias:	i32* %arrayidx11, i32* %arrayidx14
 define void @compute1(i32 %num.0.lcssa, i32* %out) {

From fd0ab3b3eb88de3fe4792c34b50084595e22d68d Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Fri, 29 May 2020 17:08:28 +0200
Subject: [PATCH 638/770] [lldb] Pass -fPIC flag even when DYLIB_ONLY is set

Summary:
It seems that when we rewrite a few rules to only build a dylib (i.e., when DYLIB_ONLY is set),
the rule for setting the CFLAGS for the dylib's object file compilation will no longer work. From what I can
see this is because in DYLIB_ONLY mode we pretend to compile the main executable so
the DYLIB_OBJECTS scope is actually never used.

This patch makes `-fPIC` unscoped if DYLIB_ONLY is set so that -fPIC actually ends up in the
CFLAGS for the dylib object file compilation.

The test for this is D80798 which only compiles on Linux with this patch.

Reviewers: friss, labath

Reviewed By: friss

Subscribers: JDevlieghere

Differential Revision: https://reviews.llvm.org/D80789
---
 lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index ea0fa748bc361..5e3f478849901 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -657,9 +657,14 @@ endif
 $(DYLIB_OBJECTS) : CFLAGS += -DCOMPILING_LLDB_TEST_DLL
 
 ifneq "$(OS)" "Windows_NT"
+ifneq "$(DYLIB_ONLY)" ""
+CFLAGS += -fPIC
+CXXFLAGS += -fPIC
+else
 $(DYLIB_OBJECTS) : CFLAGS += -fPIC
 $(DYLIB_OBJECTS) : CXXFLAGS += -fPIC
 endif
+endif
 
 $(DYLIB_FILENAME) : $(DYLIB_OBJECTS)
 ifeq "$(OS)" "Darwin"

From 065bf124fde8fd9d37eba0a387833cdfe9e68d08 Mon Sep 17 00:00:00 2001
From: zoecarver <z.zoelec2@gmail.com>
Date: Sat, 30 May 2020 09:56:04 -0700
Subject: [PATCH 639/770] [DSE] Remove noop stores in MSSA.

Adds a simple fast-path check for the pattern:
v = load ptr
store v to ptr

I took the tests from the bugzilla post, I can add more if needed (but I think these should be sufficient).

Refs: https://bugs.llvm.org/show_bug.cgi?id=45795

Differential Revision: https://reviews.llvm.org/D79391
---
 .../Scalar/DeadStoreElimination.cpp           |  28 +++
 .../DeadStoreElimination/MSSA/atomic.ll       |  34 ++++
 .../DeadStoreElimination/MSSA/noop-stores.ll  | 171 ++++++++++++++++++
 .../DeadStoreElimination/MSSA/simple-todo.ll  | 119 ------------
 4 files changed, 233 insertions(+), 119 deletions(-)
 create mode 100644 llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll

diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index c9be31c194420..36d8b4b06cf45 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -82,6 +82,7 @@ STATISTIC(NumFastStores, "Number of stores deleted");
 STATISTIC(NumFastOther, "Number of other instrs removed");
 STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
 STATISTIC(NumModifiedStores, "Number of stores modified");
+STATISTIC(NumNoopStores, "Number of noop stores deleted");
 
 DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
               "Controls which MemoryDefs are eliminated.");
@@ -1821,6 +1822,21 @@ struct DSEState {
   }
 };
 
+/// \returns true if \p KillingDef stores to the pointer \p Load read from and
+/// both share a defining access; the caller checks \p Load is the stored value.
+static bool storeIsNoop(MemorySSA &MSSA, LoadInst *Load,
+                        MemoryDef *KillingDef) {
+  Instruction *Store = KillingDef->getMemoryInst();
+  // Operand 1 is taken as the store destination; bail if the pointers differ.
+  if (Load->getPointerOperand() != Store->getOperand(1))
+    return false;
+
+  // Look up the MemorySSA access that defines the memory state the load reads.
+  auto *LoadAccess = MSSA.getMemoryAccess(Load)->getDefiningAccess();
+  // The store is dead if load and store have the same defining access.
+  return LoadAccess == KillingDef->getDefiningAccess();
+}
+
 bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
                                   MemorySSA &MSSA, DominatorTree &DT,
                                   PostDominatorTree &PDT,
@@ -1835,6 +1851,18 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
     if (State.SkipStores.count(KillingDef))
       continue;
     Instruction *SI = KillingDef->getMemoryInst();
+
+    // Storing back a just-loaded value is a no-op; log SI before deleting it.
+    if (auto *Load = dyn_cast<LoadInst>(SI->getOperand(0))) {
+      if (storeIsNoop(MSSA, Load, KillingDef)) {
+        LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *SI
+                          << '\n');
+        State.deleteDeadInstruction(SI);
+        NumNoopStores++;
+        continue;
+      }
+    }
+
     auto MaybeSILoc = State.getLocForWriteEx(SI);
     if (!MaybeSILoc) {
       LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
index 26df903bb275d..8a4515388f2d7 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
@@ -98,3 +98,37 @@ define i32 @test15() {
   store i32 1, i32* @x
   ret i32 %x
 }
+
+; **** Noop load->store tests **************************************************
+
+; We can optimize unordered atomic loads or stores.
+define void @test_load_atomic(i32* %Q) {
+; CHECK-LABEL: @test_load_atomic(
+; CHECK-NEXT:    ret void
+;
+  %a = load atomic i32, i32* %Q unordered, align 4
+  store atomic i32 %a, i32* %Q unordered, align 4
+  ret void
+}
+
+; We can optimize unordered atomic loads or stores.
+define void @test_store_atomic(i32* %Q) {
+; CHECK-LABEL: @test_store_atomic(
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  store atomic i32 %a, i32* %Q unordered, align 4
+  ret void
+}
+
+; We can NOT optimize release atomic loads or stores.
+define void @test_store_atomic_release(i32* %Q) {
+; CHECK-LABEL: @test_store_atomic_release(
+; CHECK-NEXT:    load
+; CHECK-NEXT:    store atomic
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  store atomic i32 %a, i32* %Q release, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll
new file mode 100644
index 0000000000000..11eaf911392fa
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/noop-stores.ll
@@ -0,0 +1,171 @@
+; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+declare void @llvm.init.trampoline(i8*, i8*, i8*)
+
+; **** Noop load->store tests **************************************************
+
+; The store after a volatile load is currently kept (see the CHECK lines).
+define void @test_load_volatile(i32* %Q) {
+; CHECK-LABEL: @test_load_volatile(
+; CHECK-NEXT:    [[A:%.*]] = load volatile i32, i32* [[Q:%.*]]
+; CHECK-NEXT:    store i32 [[A]], i32* [[Q]]
+; CHECK-NEXT:    ret void
+;
+  %a = load volatile i32, i32* %Q
+  store i32 %a, i32* %Q
+  ret void
+}
+
+; We can NOT optimize volatile stores.
+define void @test_store_volatile(i32* %Q) {
+; CHECK-LABEL: @test_store_volatile(
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[Q:%.*]]
+; CHECK-NEXT:    store volatile i32 [[A]]
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  store volatile i32 %a, i32* %Q
+  ret void
+}
+
+; PR2599 - load -> store to same address.
+define void @test12({ i32, i32 }* %x) nounwind  {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[X:%.*]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 0, [[TMP8]]
+; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    ret void
+;
+  %tmp4 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 0
+  %tmp5 = load i32, i32* %tmp4, align 4
+  %tmp7 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 1
+  %tmp8 = load i32, i32* %tmp7, align 4
+  %tmp17 = sub i32 0, %tmp8
+  store i32 %tmp5, i32* %tmp4, align 4
+  store i32 %tmp17, i32* %tmp7, align 4
+  ret void
+}
+
+; Remove redundant store if loaded value is in another block.
+define i32 @test26(i1 %c, i32* %p) {
+; CHECK-LABEL: @test26(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %v = load i32, i32* %p, align 4
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  br label %bb3
+bb2:
+  store i32 %v, i32* %p, align 4
+  br label %bb3
+bb3:
+  ret i32 0
+}
+
+; Remove redundant store if loaded value is in another block.
+define i32 @test27(i1 %c, i32* %p) {
+; CHECK-LABEL: @test27(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %v = load i32, i32* %p, align 4
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  br label %bb3
+bb2:
+  br label %bb3
+bb3:
+  store i32 %v, i32* %p, align 4
+  ret i32 0
+}
+
+declare void @unknown_func()
+
+; Remove redundant store, which is in the same loop as the load.
+define i32 @test33(i1 %c, i32* %p, i32 %i) {
+; CHECK-LABEL: @test33(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    call void @unknown_func()
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %bb1
+bb1:
+  %v = load i32, i32* %p, align 4
+  br label %bb2
+bb2:
+  store i32 %v, i32* %p, align 4
+  ; Might read and overwrite value at %p, but doesn't matter.
+  call void @unknown_func()
+  br i1 undef, label %bb1, label %bb3
+bb3:
+  ret i32 0
+}
+
+declare void @unkown_write(i32*)
+
+; We can't remove the "noop" store around an unknown write.
+define void @test43(i32* %Q) {
+; CHECK-LABEL: @test43(
+; CHECK-NEXT:    load
+; CHECK-NEXT:    call
+; CHECK-NEXT:    store
+; CHECK-NEXT:    ret void
+;
+  %a = load i32, i32* %Q
+  call void @unkown_write(i32* %Q)
+  store i32 %a, i32* %Q
+  ret void
+}
+
+; We CAN remove it when the unknown write comes AFTER.
+define void @test44(i32* %Q) {
+; CHECK-LABEL: @test44(
+; CHECK-NEXT:    call
+; CHECK-NEXT:    ret void
+  %a = load i32, i32* %Q
+  store i32 %a, i32* %Q
+  call void @unkown_write(i32* %Q)
+  ret void
+}
+
+define void @test45(i32* %Q) {
+; CHECK-LABEL: @test45(
+; CHECK-NEXT:    [[A:%.*]] = load
+; CHECK-NEXT:    store i32 [[A]]
+; CHECK-NEXT:    ret void
+  %a = load i32, i32* %Q
+  store i32 10, i32* %Q
+  store i32 %a, i32* %Q
+  ret void
+}
+
+
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll
index 7e212187f0304..70c055e3b370a 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll
@@ -10,16 +10,6 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) n
 declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 declare void @llvm.init.trampoline(i8*, i8*, i8*)
 
-define void @test5(i32* %Q) {
-; CHECK-LABEL: @test5(
-; CHECK-NEXT:    [[A:%.*]] = load volatile i32, i32* [[Q:%.*]]
-; CHECK-NEXT:    ret void
-;
-  %a = load volatile i32, i32* %Q
-  store i32 %a, i32* %Q
-  ret void
-}
-
 ; Do not delete stores that are only partially killed.
 define i32 @test8() {
 ; CHECK-LABEL: @test8(
@@ -80,25 +70,6 @@ define void @test11() {
   ret void
 }
 
-; PR2599 - load -> store to same address.
-define void @test12({ i32, i32 }* %x) nounwind  {
-; CHECK-LABEL: @test12(
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[X:%.*]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 0, [[TMP8]]
-; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP7]], align 4
-; CHECK-NEXT:    ret void
-;
-  %tmp4 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 0
-  %tmp5 = load i32, i32* %tmp4, align 4
-  %tmp7 = getelementptr { i32, i32 }, { i32, i32 }* %x, i32 0, i32 1
-  %tmp8 = load i32, i32* %tmp7, align 4
-  %tmp17 = sub i32 0, %tmp8
-  store i32 %tmp5, i32* %tmp4, align 4
-  store i32 %tmp17, i32* %tmp7, align 4
-  ret void
-}
-
 
 declare noalias i8* @malloc(i32)
 declare noalias i8* @calloc(i32, i32)
@@ -143,54 +114,6 @@ define void @test22(i1 %i, i32 %k, i32 %m) nounwind {
   ret void
 }
 
-; Remove redundant store if loaded value is in another block.
-define i32 @test26(i1 %c, i32* %p) {
-; CHECK-LABEL: @test26(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
-; CHECK:       bb1:
-; CHECK-NEXT:    br label [[BB3:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    br label [[BB3]]
-; CHECK:       bb3:
-; CHECK-NEXT:    ret i32 0
-;
-entry:
-  %v = load i32, i32* %p, align 4
-  br i1 %c, label %bb1, label %bb2
-bb1:
-  br label %bb3
-bb2:
-  store i32 %v, i32* %p, align 4
-  br label %bb3
-bb3:
-  ret i32 0
-}
-
-; Remove redundant store if loaded value is in another block.
-define i32 @test27(i1 %c, i32* %p) {
-; CHECK-LABEL: @test27(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
-; CHECK:       bb1:
-; CHECK-NEXT:    br label [[BB3:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    br label [[BB3]]
-; CHECK:       bb3:
-; CHECK-NEXT:    ret i32 0
-;
-entry:
-  %v = load i32, i32* %p, align 4
-  br i1 %c, label %bb1, label %bb2
-bb1:
-  br label %bb3
-bb2:
-  br label %bb3
-bb3:
-  store i32 %v, i32* %p, align 4
-  ret i32 0
-}
-
 declare void @unknown_func()
 
 ; Remove redundant store if loaded value is in another block inside a loop.
@@ -213,48 +136,6 @@ bb2:
   ret i32 0
 }
 
-; Remove redundant store, which is in the lame loop as the load.
-define i32 @test33(i1 %c, i32* %p, i32 %i) {
-; CHECK-LABEL: @test33(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[BB1:%.*]]
-; CHECK:       bb1:
-; CHECK-NEXT:    br label [[BB2:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    call void @unknown_func()
-; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB3:%.*]]
-; CHECK:       bb3:
-; CHECK-NEXT:    ret i32 0
-;
-entry:
-  br label %bb1
-bb1:
-  %v = load i32, i32* %p, align 4
-  br label %bb2
-bb2:
-  store i32 %v, i32* %p, align 4
-  ; Might read and overwrite value at %p, but doesn't matter.
-  call void @unknown_func()
-  br i1 undef, label %bb1, label %bb3
-bb3:
-  ret i32 0
-}
-
-define void @test43(i32* %P, i32* noalias %Q) {
-; CHECK-LABEL: @test43(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i32 50331649, i32* [[P:%.*]]
-; CHECK-NEXT:    store i32 2, i32* [[Q:%.*]]
-; CHECK-NEXT:    ret void
-;
-entry:
-  store i32 1, i32* %P
-  %P2 = bitcast i32* %P to i8*
-  store i32 2, i32* %Q
-  store i8 3, i8* %P2
-  ret void
-}
-
 define void @test43a(i32* %P, i32* noalias %Q) {
 ; CHECK-LABEL: @test43a(
 ; CHECK-NEXT:  entry:

From 7873376bb36b4f9646fbc26d6da88e2edbf796e4 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitneyt@ca.ibm.com>
Date: Sat, 30 May 2020 18:20:23 +0000
Subject: [PATCH 640/770] [LoopUnroll] Fix build failure for allyesconfig.

Differential Revision: https://reviews.llvm.org/D80477.
---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 06e9f7ff2f533..861fc7d46ffeb 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -818,10 +818,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
         // For loops exiting from non latch exiting block, we limit the
         // supported loops to have a single exiting block.
         NewIDom = TermBlocks.back();
-        for (BasicBlock *Iter : TermBlocks) {
-          Instruction *Term = Iter->getTerminator();
+        for (unsigned i = 0, e = TermBlocks.size(); i != e; ++i) {
+          Instruction *Term = TermBlocks[i]->getTerminator();
           if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
-            NewIDom = Iter;
+            NewIDom = DT->findNearestCommonDominator(TermBlocks[i], Latches[i]);
             break;
           }
         }

From 44119626dedfebe245fe6ce26487949201299d38 Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Wed, 27 May 2020 15:01:17 +0100
Subject: [PATCH 641/770] [clang-tidy] RenamerClangTidyChecks ignore builtin
 and command line macros

Summary: Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=42635 | readability-identifier-naming option MacroDefinitionCase should ignore macros passed as parameters. ]]

Reviewers: aaron.ballman, alexfh, gribozavr2, hokein

Reviewed By: aaron.ballman

Subscribers: xazax.hun, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80631
---
 .../clang-tidy/utils/RenamerClangTidyCheck.cpp            | 8 ++++++++
 .../clang-tidy/checkers/readability-identifier-naming.cpp | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
index dd05b3a45c0d2..3301ba6343c7b 100644
--- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
@@ -73,6 +73,14 @@ class RenamerClangTidyCheckPPCallbacks : public PPCallbacks {
   /// MacroDefined calls checkMacro for macros in the main file
   void MacroDefined(const Token &MacroNameTok,
                     const MacroDirective *MD) override {
+    if (MD->getMacroInfo()->isBuiltinMacro())
+      return;
+    if (PP->getSourceManager().isWrittenInBuiltinFile(
+            MacroNameTok.getLocation()))
+      return;
+    if (PP->getSourceManager().isWrittenInCommandLineFile(
+            MacroNameTok.getLocation()))
+      return;
     Check->checkMacro(PP->getSourceManager(), MacroNameTok, MD->getMacroInfo());
   }
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp
index 7983bb30ca649..1bb435e02eb55 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp
@@ -80,7 +80,7 @@
 // RUN:     {key: readability-identifier-naming.LocalPointerPrefix, value: 'l_'}, \
 // RUN:     {key: readability-identifier-naming.LocalConstantPointerCase, value: CamelCase}, \
 // RUN:     {key: readability-identifier-naming.LocalConstantPointerPrefix, value: 'lc_'}, \
-// RUN:   ]}' -- -fno-delayed-template-parsing \
+// RUN:   ]}' -- -fno-delayed-template-parsing -Dbad_macro \
 // RUN:   -I%S/Inputs/readability-identifier-naming \
 // RUN:   -isystem %S/Inputs/readability-identifier-naming/system
 

From dfa82f8af446380d033a1d568273e82318ecce0e Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Sat, 30 May 2020 12:58:41 -0700
Subject: [PATCH 642/770] [Tests] Convert last statepoint lowering tests to
 bundle format

---
 llvm/test/CodeGen/X86/statepoint-invoke.ll  | 46 ++++++++++-----------
 llvm/test/CodeGen/X86/statepoint-live-in.ll | 34 +++++++--------
 2 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/llvm/test/CodeGen/X86/statepoint-invoke.ll b/llvm/test/CodeGen/X86/statepoint-invoke.ll
index e0edac6e4e6e6..9dc305d8ca726 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke.ll
+++ b/llvm/test/CodeGen/X86/statepoint-invoke.ll
@@ -34,12 +34,12 @@ define i64 addrspace(1)* @test_basic(i64 addrspace(1)* %obj,
                                      i64 addrspace(1)* %obj1)
 gc "statepoint-example" personality i32 ()* @"personality_function" {
 entry:
-  %0 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1)
+  %0 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 0, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) ["deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
           to label %invoke_safepoint_normal_dest unwind label %exceptional_return
 
 invoke_safepoint_normal_dest:
-  %obj.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %0, i32 13, i32 13)
-  %obj1.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %0, i32 14, i32 14)
+  %obj.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %0, i32 8, i32 8)
+  %obj1.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %0, i32 9, i32 9)
   br label %normal_return
 
 normal_return:
@@ -48,8 +48,8 @@ normal_return:
 exceptional_return:
   %landing_pad = landingpad token
           cleanup
-  %obj.relocated1 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 13, i32 13)
-  %obj1.relocated1 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 14, i32 14)
+  %obj.relocated1 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 8, i32 8)
+  %obj1.relocated1 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 9, i32 9)
   ret i64 addrspace(1)* %obj1.relocated1
 }
 ; CHECK-LABEL: GCC_except_table{{[0-9]+}}:
@@ -82,7 +82,7 @@ define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj,
                                       i64 addrspace(1)* %obj1)
   gc "statepoint-example" personality i32 ()* @personality_function {
 entry:
-  %0 = invoke token (i64, i32, i64 addrspace(1)* (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64p1i64f(i64 0, i32 0, i64 addrspace(1)* (i64 addrspace(1)*)* @some_other_call, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1)
+  %0 = invoke token (i64, i32, i64 addrspace(1)* (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64p1i64f(i64 0, i32 0, i64 addrspace(1)* (i64 addrspace(1)*)* @some_other_call, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 0, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1)
           to label %normal_return unwind label %exceptional_return
 
 normal_return:
@@ -92,7 +92,7 @@ normal_return:
 exceptional_return:
   %landing_pad = landingpad token
           cleanup
-  %obj.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 13, i32 13)
+  %obj.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 8, i32 8)
   ret i64 addrspace(1)* %obj.relocated
 }
 ; CHECK-LABEL: GCC_except_table{{[0-9]+}}:
@@ -158,21 +158,21 @@ entry:
   br i1 %cond, label %left, label %right
 
 left:
-  %sp1 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %val1, i64 addrspace(1)* %val2)
+  %sp1 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 0, i64 addrspace(1)* %val1, i64 addrspace(1)* %val2)
            to label %left.relocs unwind label %exceptional_return.left
 
 left.relocs:
-  %val1.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13)
-  %val2.relocated_left = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14)
+  %val1.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 8, i32 8)
+  %val2.relocated_left = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 9, i32 9)
   br label %normal_return
 
 right:
-  %sp2 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3)
+  %sp2 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 0, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3)
            to label %right.relocs unwind label %exceptional_return.right
 
 right.relocs:
-  %val2.relocated_right = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 13, i32 13)
-  %val3.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 14, i32 14)
+  %val2.relocated_right = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 8, i32 8)
+  %val3.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 9, i32 9)
   br label %normal_return
 
 normal_return:
@@ -184,13 +184,13 @@ normal_return:
 exceptional_return.left:
   %landing_pad = landingpad token
           cleanup
-  %val.relocated2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 13, i32 13)
+  %val.relocated2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 8, i32 8)
   ret i64 addrspace(1)* %val.relocated2
 
 exceptional_return.right:
   %landing_pad1 = landingpad token
           cleanup
-  %val.relocated3 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad1, i32 13, i32 13)
+  %val.relocated3 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad1, i32 8, i32 8)
   ret i64 addrspace(1)* %val.relocated3
 }
 
@@ -214,19 +214,19 @@ define i64 addrspace(1)* @test_null_undef(i64 addrspace(1)* %val1)
 ; CHECK-NEXT:    jmp .LBB3_1
        gc "statepoint-example" personality i32 ()* @"personality_function" {
 entry:
-  %sp1 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* null, i64 addrspace(1)* undef)
+  %sp1 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 0, i64 addrspace(1)* null, i64 addrspace(1)* undef)
            to label %normal_return unwind label %exceptional_return
 
 normal_return:
-  %null.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13)
-  %undef.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14)
+  %null.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 8, i32 8)
+  %undef.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 9, i32 9)
   ret i64 addrspace(1)* %null.relocated
 
 exceptional_return:
   %landing_pad = landingpad token
           cleanup
-  %null.relocated2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 13, i32 13)
-  %undef.relocated2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 14, i32 14)
+  %null.relocated2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 8, i32 8)
+  %undef.relocated2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 9, i32 9)
   ret i64 addrspace(1)* %null.relocated2
 }
 
@@ -256,18 +256,18 @@ entry:
   %a = alloca i32
   %aa = addrspacecast i32* %a to i32 addrspace(1)*
   %c = inttoptr i64 15 to i64 addrspace(1)*
-  %sp = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %aa, i64 addrspace(1)* %c)
+  %sp = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 0, i32 addrspace(1)* %aa, i64 addrspace(1)* %c)
            to label %normal_return unwind label %exceptional_return
 
 normal_return:
-  %aa.rel = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %sp, i32 13, i32 13)
+  %aa.rel = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %sp, i32 8, i32 8)
   %aa.converted = bitcast i32 addrspace(1)* %aa.rel to i64 addrspace(1)*
   ret i64 addrspace(1)* %aa.converted
 
 exceptional_return:
   %landing_pad = landingpad token
           cleanup
-  %aa.rel2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 14, i32 14)
+  %aa.rel2 = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %landing_pad, i32 9, i32 9)
   ret i64 addrspace(1)* %aa.rel2
 }
 
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index c736394f662b7..c0c2b09ceffd3 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -45,8 +45,8 @@ define void @test2(i32 %a, i32 %b) gc "statepoint-example" {
 entry:
 ; Because the first call clobbers esi, we have to move the values into
 ; new registers.  Note that they stay in the registers for both calls.
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 2, i32 %a, i32 %b)
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 2, i32 %b, i32 %a)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a, i32 %b)]
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %b, i32 %a)]
   ret void
 }
 
@@ -61,7 +61,7 @@ define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    retq
 entry:
 ; We directly reference the argument slot
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)]
   ret void
 }
 
@@ -79,7 +79,7 @@ define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
   ret void
 }
 
@@ -99,8 +99,8 @@ define  i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-ex
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    retq
 entry:
-  %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p, i32 addrspace(1)* %p)
-  %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token,  i32 9, i32 9)
+  %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0, i32 addrspace(1)* %p, i32 addrspace(1)* %p) ["deopt"(i32 %a)]
+  %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token,  i32 8, i32 8)
   ret i32 addrspace(1)* %p2
 }
 
@@ -123,8 +123,8 @@ define void @test6(i32 %a) gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
 entry:
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 0) ["deopt"(i32 %a)]
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a)]
   ret void
 }
 
@@ -228,7 +228,7 @@ entry:
   %x64 = zext i32 %x to i64
   %y64 = zext i32 %y to i64
   %z64 = zext i32 %z to i64
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)]
   ret void
 }
 
@@ -323,7 +323,7 @@ entry:
   %x64 = zext i32 %x to i64
   %y64 = zext i32 %y to i64
   %z64 = zext i32 %z to i64
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i8 %a8, i8 %b8, i8 %c8, i8 %d8, i16 %e16, i16 %f16, i16 %g16, i16 %h16, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i8 %a8, i8 %b8, i8 %c8, i8 %d8, i16 %e16, i16 %f16, i16 %g16, i16 %h16, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)]
   ret void
 }
 
@@ -340,7 +340,7 @@ define void @test9(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    retq
 
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
   ret void
 }
 
@@ -392,8 +392,8 @@ define void @test10(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32
 ; CHECK-NEXT:    retq
 
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
-  %statepoint_token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
+  %statepoint_token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 0) ["deopt"(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)]
   ret void
 }
 
@@ -554,7 +554,7 @@ entry:
   %x64 = zext i32 %x to i64
   %y64 = zext i32 %y to i64
   %z64 = zext i32 %z to i64
-  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 26, i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 0) ["deopt"(i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)]
   %addab = add i64 %a64, %b64
   %addc = add i64 %addab, %c64
   %addd = add i64 %addc, %d64
@@ -595,7 +595,7 @@ define void @addr_func() gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 3, void ()* @bar, void ()* @bar, void ()* @bar)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 0) ["deopt"(void ()* @bar, void ()* @bar, void ()* @bar)]
   ret void
 }
 
@@ -612,7 +612,7 @@ define void @addr_global() gc "statepoint-example" {
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 3, i32* @G, i32* @G, i32* @G)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 0) ["deopt"(i32* @G, i32* @G, i32* @G)]
   ret void
 }
 
@@ -629,7 +629,7 @@ define void @addr_alloca(i32 %v) gc "statepoint-example" {
 entry:
   %a = alloca i32
   store i32 %v, i32* %a
-  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 3, i32* %a, i32* %a, i32* %a)
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i64 0, i64 0) ["deopt"(i32* %a, i32* %a, i32* %a)]
   ret void
 }
 

From 0fee91a187d98ad68d70fb42dad4451fce172a23 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitneyt@ca.ibm.com>
Date: Sat, 30 May 2020 20:24:10 +0000
Subject: [PATCH 643/770] [LoopUnroll] Add a test case for rG7873376bb36b.

rG7873376bb36b fixes a build failure for allyesconfig.

The problem happened when the single exiting block doesn't dominate the
loop latch: the immediate dominator of the exit block should then not be
the exiting block after unrolling. As the exiting blocks of different
unrolled iterations can branch to the exit block, and the ith exiting
block doesn't dominate the (i+1)th exiting block, the immediate
dominator of the exit block should not the nearest common dominator of
the exiting block and the loop latch of the same iteration.

Differential Revision: https://reviews.llvm.org/D80477
---
 .../Transforms/LoopUnroll/nonlatchcondbr.ll   | 98 ++++++++++++++++++-
 1 file changed, 94 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll b/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
index 547b05d1e186d..351f3faf7ce81 100644
--- a/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
+++ b/llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -1,9 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-unroll -S | FileCheck %s
-; RUN: opt < %s -passes='require<opt-remark-emit>,unroll' -S | FileCheck %s
+; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-count=4 -S | FileCheck %s
+; RUN: opt < %s -passes='require<opt-remark-emit>,unroll' -unroll-runtime -unroll-count=4 -S | FileCheck %s
 
-define void @foo(i32* noalias %A) {
-; CHECK-LABEL: @foo(
+; Check that loop unroll pass correctly handle loops with
+; single exiting block not the loop header or latch.
+
+define void @test1(i32* noalias %A) {
+; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
@@ -66,4 +69,91 @@ for.end:
   ret void
 }
 
+; Check that loop unroll pass correctly handle loops with
+; (1) exiting block not dominating the loop latch; and
+; (2) exiting terminator instructions cannot be simplified to unconditional.
+
+define void @test2(i32* noalias %A) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[FOR_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[TMP0]])
+; CHECK-NEXT:    br label [[FOR_HEADER:%.*]]
+; CHECK:       for.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[FOR_PREHEADER]] ], [ [[DOTPRE_3:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE_3:%.*]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[FOR_PREHEADER]] ], [ [[INC_3:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]] ]
+; CHECK-NEXT:    call void @bar(i32 [[TMP1]])
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[CMP:%.*]] = call i1 @foo(i64 [[I]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.body.for.body_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC]]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE]])
+; CHECK-NEXT:    [[INC_1:%.*]] = add nuw nsw i64 [[INC]], 1
+; CHECK-NEXT:    br i1 true, label [[FOR_BODY_1:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_1:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body.1:
+; CHECK-NEXT:    [[CMP_1:%.*]] = call i1 @foo(i64 [[INC]])
+; CHECK-NEXT:    br i1 [[CMP_1]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_1]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.body.for.body_crit_edge.1:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_1]]
+; CHECK-NEXT:    [[DOTPRE_1:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_1]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_1]])
+; CHECK-NEXT:    [[INC_2:%.*]] = add nuw nsw i64 [[INC_1]], 1
+; CHECK-NEXT:    br i1 true, label [[FOR_BODY_2:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_2:%.*]]
+; CHECK:       for.body.2:
+; CHECK-NEXT:    [[CMP_2:%.*]] = call i1 @foo(i64 [[INC_1]])
+; CHECK-NEXT:    br i1 [[CMP_2]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_2]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.body.for.body_crit_edge.2:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_2]]
+; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
+; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
+; CHECK-NEXT:    [[INC_3]] = add nsw i64 [[INC_2]], 1
+; CHECK-NEXT:    br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]]
+; CHECK:       for.body.3:
+; CHECK-NEXT:    [[CMP_3:%.*]] = call i1 @foo(i64 [[INC_2]])
+; CHECK-NEXT:    br i1 [[CMP_3]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.body.for.body_crit_edge.3:
+; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
+; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
+; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !0
+;
+entry:
+  br i1 true, label %for.preheader, label %for.end
+
+for.preheader:
+  %0 = load i32, i32* %A, align 4
+  call void @bar(i32 %0)
+  br label %for.header
+
+for.header:
+  %1 = phi i32 [ %0, %for.preheader ], [ %.pre, %for.body.for.body_crit_edge ]
+  %i = phi i64 [ 0, %for.preheader ], [ %inc, %for.body.for.body_crit_edge ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i
+  call void @bar(i32 %1)
+  %inc = add nsw i64 %i, 1
+  br i1 true, label %for.body, label %for.body.for.body_crit_edge
+
+for.body:
+  %cmp = call i1 @foo(i64 %i)
+  br i1 %cmp, label %for.body.for.body_crit_edge, label %for.end
+
+for.body.for.body_crit_edge:
+  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %A, i64 %inc
+  %.pre = load i32, i32* %arrayidx.phi.trans.insert, align 4
+  br label %for.header
+
+for.end:
+  ret void
+}
+
 declare void @bar(i32)
+declare i1 @foo(i64)

From 3eb430d59847b9e0091199cb63a50ab0494711dd Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 00:31:21 -0700
Subject: [PATCH 644/770] [X86] Factor constant pool comment printing out of
 the switch in X86AsmPrinter::emitInstruction. NFC

Pull the verbose asm check out of the cases and move it up to
the call of the new function.
---
 llvm/lib/Target/X86/X86MCInstLower.cpp | 492 ++++++++++++-------------
 1 file changed, 244 insertions(+), 248 deletions(-)

diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 723806186543f..43cfeaab42e9b 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1966,233 +1966,9 @@ static unsigned getRegisterWidth(const MCOperandInfo &Info) {
   llvm_unreachable("Unknown register class!");
 }
 
-void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
-  X86MCInstLower MCInstLowering(*MF, *this);
-  const X86RegisterInfo *RI =
-      MF->getSubtarget<X86Subtarget>().getRegisterInfo();
-
-  // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
-  // are compressed from EVEX encoding to VEX encoding.
-  if (TM.Options.MCOptions.ShowMCEncoding) {
-    if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
-      OutStreamer->AddComment("EVEX TO VEX Compression ", false);
-  }
-
+static void addConstantComments(const MachineInstr *MI,
+                                MCStreamer &OutStreamer) {
   switch (MI->getOpcode()) {
-  case TargetOpcode::DBG_VALUE:
-    llvm_unreachable("Should be handled target independently");
-
-  // Emit nothing here but a comment if we can.
-  case X86::Int_MemBarrier:
-    OutStreamer->emitRawComment("MEMBARRIER");
-    return;
-
-  case X86::EH_RETURN:
-  case X86::EH_RETURN64: {
-    // Lower these as normal, but add some comments.
-    Register Reg = MI->getOperand(0).getReg();
-    OutStreamer->AddComment(StringRef("eh_return, addr: %") +
-                            X86ATTInstPrinter::getRegisterName(Reg));
-    break;
-  }
-  case X86::CLEANUPRET: {
-    // Lower these as normal, but add some comments.
-    OutStreamer->AddComment("CLEANUPRET");
-    break;
-  }
-
-  case X86::CATCHRET: {
-    // Lower these as normal, but add some comments.
-    OutStreamer->AddComment("CATCHRET");
-    break;
-  }
-
-  case X86::ENDBR32:
-  case X86::ENDBR64: {
-    // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
-    // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
-    // non-empty. If MI is the initial ENDBR, place the
-    // __patchable_function_entries label after ENDBR.
-    if (CurrentPatchableFunctionEntrySym &&
-        CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
-        MI == &MF->front().front()) {
-      MCInst Inst;
-      MCInstLowering.Lower(MI, Inst);
-      EmitAndCountInstruction(Inst);
-      CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
-      OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
-      return;
-    }
-    break;
-  }
-
-  case X86::TAILJMPr:
-  case X86::TAILJMPm:
-  case X86::TAILJMPd:
-  case X86::TAILJMPd_CC:
-  case X86::TAILJMPr64:
-  case X86::TAILJMPm64:
-  case X86::TAILJMPd64:
-  case X86::TAILJMPd64_CC:
-  case X86::TAILJMPr64_REX:
-  case X86::TAILJMPm64_REX:
-    // Lower these as normal, but add some comments.
-    OutStreamer->AddComment("TAILCALL");
-    break;
-
-  case X86::TLS_addr32:
-  case X86::TLS_addr64:
-  case X86::TLS_base_addr32:
-  case X86::TLS_base_addr64:
-    return LowerTlsAddr(MCInstLowering, *MI);
-
-  case X86::MOVPC32r: {
-    // This is a pseudo op for a two instruction sequence with a label, which
-    // looks like:
-    //     call "L1$pb"
-    // "L1$pb":
-    //     popl %esi
-
-    // Emit the call.
-    MCSymbol *PICBase = MF->getPICBaseSymbol();
-    // FIXME: We would like an efficient form for this, so we don't have to do a
-    // lot of extra uniquing.
-    EmitAndCountInstruction(
-        MCInstBuilder(X86::CALLpcrel32)
-            .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
-
-    const X86FrameLowering *FrameLowering =
-        MF->getSubtarget<X86Subtarget>().getFrameLowering();
-    bool hasFP = FrameLowering->hasFP(*MF);
-
-    // TODO: This is needed only if we require precise CFA.
-    bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
-                               !OutStreamer->getDwarfFrameInfos().back().End;
-
-    int stackGrowth = -RI->getSlotSize();
-
-    if (HasActiveDwarfFrame && !hasFP) {
-      OutStreamer->emitCFIAdjustCfaOffset(-stackGrowth);
-    }
-
-    // Emit the label.
-    OutStreamer->emitLabel(PICBase);
-
-    // popl $reg
-    EmitAndCountInstruction(
-        MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
-
-    if (HasActiveDwarfFrame && !hasFP) {
-      OutStreamer->emitCFIAdjustCfaOffset(stackGrowth);
-    }
-    return;
-  }
-
-  case X86::ADD32ri: {
-    // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
-    if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
-      break;
-
-    // Okay, we have something like:
-    //  EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
-
-    // For this, we want to print something like:
-    //   MYGLOBAL + (. - PICBASE)
-    // However, we can't generate a ".", so just emit a new label here and refer
-    // to it.
-    MCSymbol *DotSym = OutContext.createTempSymbol();
-    OutStreamer->emitLabel(DotSym);
-
-    // Now that we have emitted the label, lower the complex operand expression.
-    MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
-
-    const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
-    const MCExpr *PICBase =
-        MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
-    DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
-
-    DotExpr = MCBinaryExpr::createAdd(
-        MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
-
-    EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
-                                .addReg(MI->getOperand(0).getReg())
-                                .addReg(MI->getOperand(1).getReg())
-                                .addExpr(DotExpr));
-    return;
-  }
-  case TargetOpcode::STATEPOINT:
-    return LowerSTATEPOINT(*MI, MCInstLowering);
-
-  case TargetOpcode::FAULTING_OP:
-    return LowerFAULTING_OP(*MI, MCInstLowering);
-
-  case TargetOpcode::FENTRY_CALL:
-    return LowerFENTRY_CALL(*MI, MCInstLowering);
-
-  case TargetOpcode::PATCHABLE_OP:
-    return LowerPATCHABLE_OP(*MI, MCInstLowering);
-
-  case TargetOpcode::STACKMAP:
-    return LowerSTACKMAP(*MI);
-
-  case TargetOpcode::PATCHPOINT:
-    return LowerPATCHPOINT(*MI, MCInstLowering);
-
-  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
-    return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
-
-  case TargetOpcode::PATCHABLE_RET:
-    return LowerPATCHABLE_RET(*MI, MCInstLowering);
-
-  case TargetOpcode::PATCHABLE_TAIL_CALL:
-    return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
-
-  case TargetOpcode::PATCHABLE_EVENT_CALL:
-    return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
-
-  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
-    return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
-
-  case X86::MORESTACK_RET:
-    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
-    return;
-
-  case X86::MORESTACK_RET_RESTORE_R10:
-    // Return, then restore R10.
-    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
-    EmitAndCountInstruction(
-        MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
-    return;
-
-  case X86::SEH_PushReg:
-  case X86::SEH_SaveReg:
-  case X86::SEH_SaveXMM:
-  case X86::SEH_StackAlloc:
-  case X86::SEH_StackAlign:
-  case X86::SEH_SetFrame:
-  case X86::SEH_PushFrame:
-  case X86::SEH_EndPrologue:
-    EmitSEHInstruction(MI);
-    return;
-
-  case X86::SEH_Epilogue: {
-    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
-    MachineBasicBlock::const_iterator MBBI(MI);
-    // Check if preceded by a call and emit nop if so.
-    for (MBBI = PrevCrossBBInst(MBBI);
-         MBBI != MachineBasicBlock::const_iterator();
-         MBBI = PrevCrossBBInst(MBBI)) {
-      // Conservatively assume that pseudo instructions don't emit code and keep
-      // looking for a call. We may emit an unnecessary nop in some cases.
-      if (!MBBI->isPseudo()) {
-        if (MBBI->isCall())
-          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
-        break;
-      }
-    }
-    return;
-  }
-
   // Lower PSHUFB and VPERMILP normally but add a comment if we can find
   // a constant shuffle mask. We won't be able to do this at the MC layer
   // because the mask isn't an immediate.
@@ -2208,8 +1984,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::VPSHUFBZrm:
   case X86::VPSHUFBZrmk:
   case X86::VPSHUFBZrmkz: {
-    if (!OutStreamer->isVerboseAsm())
-      break;
     unsigned SrcIdx, MaskIdx;
     switch (MI->getOpcode()) {
     default: llvm_unreachable("Invalid opcode");
@@ -2239,7 +2013,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       SmallVector<int, 64> Mask;
       DecodePSHUFBMask(C, Width, Mask);
       if (!Mask.empty())
-        OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+        OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
     }
     break;
   }
@@ -2266,8 +2040,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::VPERMILPDZrm:
   case X86::VPERMILPDZrmk:
   case X86::VPERMILPDZrmkz: {
-    if (!OutStreamer->isVerboseAsm())
-      break;
     unsigned SrcIdx, MaskIdx;
     unsigned ElSize;
     switch (MI->getOpcode()) {
@@ -2311,7 +2083,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       SmallVector<int, 16> Mask;
       DecodeVPERMILPMask(C, ElSize, Width, Mask);
       if (!Mask.empty())
-        OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+        OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
     }
     break;
   }
@@ -2320,8 +2092,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::VPERMIL2PSrm:
   case X86::VPERMIL2PDYrm:
   case X86::VPERMIL2PSYrm: {
-    if (!OutStreamer->isVerboseAsm())
-      break;
     assert(MI->getNumOperands() >= 8 &&
            "We should always have at least 8 operands!");
 
@@ -2342,14 +2112,12 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       SmallVector<int, 16> Mask;
       DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
       if (!Mask.empty())
-        OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+        OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
     }
     break;
   }
 
   case X86::VPPERMrrm: {
-    if (!OutStreamer->isVerboseAsm())
-      break;
     assert(MI->getNumOperands() >= 7 &&
            "We should always have at least 7 operands!");
 
@@ -2359,14 +2127,12 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       SmallVector<int, 16> Mask;
       DecodeVPPERMMask(C, Width, Mask);
       if (!Mask.empty())
-        OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+        OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
     }
     break;
   }
 
   case X86::MMX_MOVQ64rm: {
-    if (!OutStreamer->isVerboseAsm())
-      break;
     if (MI->getNumOperands() <= 4)
       break;
     if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
@@ -2376,7 +2142,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
       CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
       if (auto *CF = dyn_cast<ConstantFP>(C)) {
         CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
-        OutStreamer->AddComment(CS.str());
+        OutStreamer.AddComment(CS.str());
       }
     }
     break;
@@ -2427,8 +2193,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::VBROADCASTI64X2Z128rm:
   case X86::VBROADCASTI64X2rm:
   case X86::VBROADCASTI64X4rm:
-    if (!OutStreamer->isVerboseAsm())
-      break;
     if (MI->getNumOperands() <= 4)
       break;
     if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
@@ -2473,7 +2237,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
           }
         }
         CS << "]";
-        OutStreamer->AddComment(CS.str());
+        OutStreamer.AddComment(CS.str());
       } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
         CS << "<";
         for (int l = 0; l != NumLanes; ++l) {
@@ -2485,10 +2249,11 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
           }
         }
         CS << ">";
-        OutStreamer->AddComment(CS.str());
+        OutStreamer.AddComment(CS.str());
       }
     }
     break;
+
   case X86::MOVDDUPrm:
   case X86::VMOVDDUPrm:
   case X86::VMOVDDUPZ128rm:
@@ -2520,8 +2285,6 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::VPBROADCASTWZ128rm:
   case X86::VPBROADCASTWZ256rm:
   case X86::VPBROADCASTWZrm:
-    if (!OutStreamer->isVerboseAsm())
-      break;
     if (MI->getNumOperands() <= 4)
       break;
     if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
@@ -2572,8 +2335,241 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
         printConstant(C, CS);
       }
       CS << "]";
-      OutStreamer->AddComment(CS.str());
+      OutStreamer.AddComment(CS.str());
+    }
+  }
+}
+
+void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
+  X86MCInstLower MCInstLowering(*MF, *this);
+  const X86RegisterInfo *RI =
+      MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
+  // are compressed from EVEX encoding to VEX encoding.
+  if (TM.Options.MCOptions.ShowMCEncoding) {
+    if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
+      OutStreamer->AddComment("EVEX TO VEX Compression ", false);
+  }
+
+  // Add comments for values loaded from constant pool.
+  if (OutStreamer->isVerboseAsm())
+    addConstantComments(MI, *OutStreamer);
+
+  switch (MI->getOpcode()) {
+  case TargetOpcode::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
+
+  // Emit nothing here but a comment if we can.
+  case X86::Int_MemBarrier:
+    OutStreamer->emitRawComment("MEMBARRIER");
+    return;
+
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    // Lower these as normal, but add some comments.
+    Register Reg = MI->getOperand(0).getReg();
+    OutStreamer->AddComment(StringRef("eh_return, addr: %") +
+                            X86ATTInstPrinter::getRegisterName(Reg));
+    break;
+  }
+  case X86::CLEANUPRET: {
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("CLEANUPRET");
+    break;
+  }
+
+  case X86::CATCHRET: {
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("CATCHRET");
+    break;
+  }
+
+  case X86::ENDBR32:
+  case X86::ENDBR64: {
+    // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
+    // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
+    // non-empty. If MI is the initial ENDBR, place the
+    // __patchable_function_entries label after ENDBR.
+    if (CurrentPatchableFunctionEntrySym &&
+        CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
+        MI == &MF->front().front()) {
+      MCInst Inst;
+      MCInstLowering.Lower(MI, Inst);
+      EmitAndCountInstruction(Inst);
+      CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
+      OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+      return;
+    }
+    break;
+  }
+
+  case X86::TAILJMPr:
+  case X86::TAILJMPm:
+  case X86::TAILJMPd:
+  case X86::TAILJMPd_CC:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
+  case X86::TAILJMPd64:
+  case X86::TAILJMPd64_CC:
+  case X86::TAILJMPr64_REX:
+  case X86::TAILJMPm64_REX:
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("TAILCALL");
+    break;
+
+  case X86::TLS_addr32:
+  case X86::TLS_addr64:
+  case X86::TLS_base_addr32:
+  case X86::TLS_base_addr64:
+    return LowerTlsAddr(MCInstLowering, *MI);
+
+  case X86::MOVPC32r: {
+    // This is a pseudo op for a two instruction sequence with a label, which
+    // looks like:
+    //     call "L1$pb"
+    // "L1$pb":
+    //     popl %esi
+
+    // Emit the call.
+    MCSymbol *PICBase = MF->getPICBaseSymbol();
+    // FIXME: We would like an efficient form for this, so we don't have to do a
+    // lot of extra uniquing.
+    EmitAndCountInstruction(
+        MCInstBuilder(X86::CALLpcrel32)
+            .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+    const X86FrameLowering *FrameLowering =
+        MF->getSubtarget<X86Subtarget>().getFrameLowering();
+    bool hasFP = FrameLowering->hasFP(*MF);
+
+    // TODO: This is needed only if we require precise CFA.
+    bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+                               !OutStreamer->getDwarfFrameInfos().back().End;
+
+    int stackGrowth = -RI->getSlotSize();
+
+    if (HasActiveDwarfFrame && !hasFP) {
+      OutStreamer->emitCFIAdjustCfaOffset(-stackGrowth);
+    }
+
+    // Emit the label.
+    OutStreamer->emitLabel(PICBase);
+
+    // popl $reg
+    EmitAndCountInstruction(
+        MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
+
+    if (HasActiveDwarfFrame && !hasFP) {
+      OutStreamer->emitCFIAdjustCfaOffset(stackGrowth);
+    }
+    return;
+  }
+
+  case X86::ADD32ri: {
+    // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+    if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+      break;
+
+    // Okay, we have something like:
+    //  EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+    // For this, we want to print something like:
+    //   MYGLOBAL + (. - PICBASE)
+    // However, we can't generate a ".", so just emit a new label here and refer
+    // to it.
+    MCSymbol *DotSym = OutContext.createTempSymbol();
+    OutStreamer->emitLabel(DotSym);
+
+    // Now that we have emitted the label, lower the complex operand expression.
+    MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+    const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+    const MCExpr *PICBase =
+        MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+    DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
+
+    DotExpr = MCBinaryExpr::createAdd(
+        MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
+
+    EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(MI->getOperand(1).getReg())
+                                .addExpr(DotExpr));
+    return;
+  }
+  case TargetOpcode::STATEPOINT:
+    return LowerSTATEPOINT(*MI, MCInstLowering);
+
+  case TargetOpcode::FAULTING_OP:
+    return LowerFAULTING_OP(*MI, MCInstLowering);
+
+  case TargetOpcode::FENTRY_CALL:
+    return LowerFENTRY_CALL(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_OP:
+    return LowerPATCHABLE_OP(*MI, MCInstLowering);
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(*MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+    return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_RET:
+    return LowerPATCHABLE_RET(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_TAIL_CALL:
+    return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_EVENT_CALL:
+    return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+    return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
+
+  case X86::MORESTACK_RET:
+    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+    return;
+
+  case X86::MORESTACK_RET_RESTORE_R10:
+    // Return, then restore R10.
+    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+    EmitAndCountInstruction(
+        MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
+    return;
+
+  case X86::SEH_PushReg:
+  case X86::SEH_SaveReg:
+  case X86::SEH_SaveXMM:
+  case X86::SEH_StackAlloc:
+  case X86::SEH_StackAlign:
+  case X86::SEH_SetFrame:
+  case X86::SEH_PushFrame:
+  case X86::SEH_EndPrologue:
+    EmitSEHInstruction(MI);
+    return;
+
+  case X86::SEH_Epilogue: {
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    MachineBasicBlock::const_iterator MBBI(MI);
+    // Check if preceded by a call and emit nop if so.
+    for (MBBI = PrevCrossBBInst(MBBI);
+         MBBI != MachineBasicBlock::const_iterator();
+         MBBI = PrevCrossBBInst(MBBI)) {
+      // Conservatively assume that pseudo instructions don't emit code and keep
+      // looking for a call. We may emit an unnecessary nop in some cases.
+      if (!MBBI->isPseudo()) {
+        if (MBBI->isCall())
+          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+        break;
+      }
     }
+    return;
+  }
   }
 
   MCInst TmpInst;

From 16976cb92535b0620f46720fe3be283da904026c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 12:53:56 -0700
Subject: [PATCH 645/770] [X86] Minor cleanups to addConstantComments in
 X86MCInstLower.cpp. NFCI

-Replace some ifs that should be impossible with asserts.
-Use X86::AddrDisp and X86::AddrNumOperands to make code more readable
-Use X86II::isKMasked/isKMergeMasked to do some operand skipping to remove or simplify switches
---
 llvm/lib/Target/X86/X86MCInstLower.cpp | 87 +++++++++++++-------------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 43cfeaab42e9b..1e5596b2a90a1 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1984,28 +1984,19 @@ static void addConstantComments(const MachineInstr *MI,
   case X86::VPSHUFBZrm:
   case X86::VPSHUFBZrmk:
   case X86::VPSHUFBZrmkz: {
-    unsigned SrcIdx, MaskIdx;
-    switch (MI->getOpcode()) {
-    default: llvm_unreachable("Invalid opcode");
-    case X86::PSHUFBrm:
-    case X86::VPSHUFBrm:
-    case X86::VPSHUFBYrm:
-    case X86::VPSHUFBZ128rm:
-    case X86::VPSHUFBZ256rm:
-    case X86::VPSHUFBZrm:
-      SrcIdx = 1; MaskIdx = 5; break;
-    case X86::VPSHUFBZ128rmkz:
-    case X86::VPSHUFBZ256rmkz:
-    case X86::VPSHUFBZrmkz:
-      SrcIdx = 2; MaskIdx = 6; break;
-    case X86::VPSHUFBZ128rmk:
-    case X86::VPSHUFBZ256rmk:
-    case X86::VPSHUFBZrmk:
-      SrcIdx = 3; MaskIdx = 7; break;
+    unsigned SrcIdx = 1;
+    if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+      // Skip mask operand.
+      ++SrcIdx;
+      if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+        // Skip passthru operand.
+        ++SrcIdx;
+      }
     }
+    unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
 
-    assert(MI->getNumOperands() >= 6 &&
-           "We should always have at least 6 operands!");
+    assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
@@ -2040,7 +2031,6 @@ static void addConstantComments(const MachineInstr *MI,
   case X86::VPERMILPDZrm:
   case X86::VPERMILPDZrmk:
   case X86::VPERMILPDZrmkz: {
-    unsigned SrcIdx, MaskIdx;
     unsigned ElSize;
     switch (MI->getOpcode()) {
     default: llvm_unreachable("Invalid opcode");
@@ -2049,33 +2039,42 @@ static void addConstantComments(const MachineInstr *MI,
     case X86::VPERMILPSZ128rm:
     case X86::VPERMILPSZ256rm:
     case X86::VPERMILPSZrm:
-      SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
     case X86::VPERMILPSZ128rmkz:
     case X86::VPERMILPSZ256rmkz:
     case X86::VPERMILPSZrmkz:
-      SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
     case X86::VPERMILPSZ128rmk:
     case X86::VPERMILPSZ256rmk:
     case X86::VPERMILPSZrmk:
-      SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
+      ElSize = 32;
+      break;
     case X86::VPERMILPDrm:
     case X86::VPERMILPDYrm:
     case X86::VPERMILPDZ128rm:
     case X86::VPERMILPDZ256rm:
     case X86::VPERMILPDZrm:
-      SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
     case X86::VPERMILPDZ128rmkz:
     case X86::VPERMILPDZ256rmkz:
     case X86::VPERMILPDZrmkz:
-      SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
     case X86::VPERMILPDZ128rmk:
     case X86::VPERMILPDZ256rmk:
     case X86::VPERMILPDZrmk:
-      SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
+      ElSize = 64;
+      break;
+    }
+
+    unsigned SrcIdx = 1;
+    if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+      // Skip mask operand.
+      ++SrcIdx;
+      if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+        // Skip passthru operand.
+        ++SrcIdx;
+      }
     }
+    unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
 
-    assert(MI->getNumOperands() >= 6 &&
-           "We should always have at least 6 operands!");
+    assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
@@ -2092,8 +2091,8 @@ static void addConstantComments(const MachineInstr *MI,
   case X86::VPERMIL2PSrm:
   case X86::VPERMIL2PDYrm:
   case X86::VPERMIL2PSYrm: {
-    assert(MI->getNumOperands() >= 8 &&
-           "We should always have at least 8 operands!");
+    assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands + 1) &&
+           "Unexpected number of operands!");
 
     const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
     if (!CtrlOp.isImm())
@@ -2106,7 +2105,7 @@ static void addConstantComments(const MachineInstr *MI,
     case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
     }
 
-    const MachineOperand &MaskOp = MI->getOperand(6);
+    const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
       unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
@@ -2118,10 +2117,10 @@ static void addConstantComments(const MachineInstr *MI,
   }
 
   case X86::VPPERMrrm: {
-    assert(MI->getNumOperands() >= 7 &&
-           "We should always have at least 7 operands!");
+    assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
 
-    const MachineOperand &MaskOp = MI->getOperand(6);
+    const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
       unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
@@ -2133,9 +2132,9 @@ static void addConstantComments(const MachineInstr *MI,
   }
 
   case X86::MMX_MOVQ64rm: {
-    if (MI->getNumOperands() <= 4)
-      break;
-    if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+    assert(MI->getNumOperands() == (1 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
+    if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
       std::string Comment;
       raw_string_ostream CS(Comment);
       const MachineOperand &DstOp = MI->getOperand(0);
@@ -2193,9 +2192,9 @@ static void addConstantComments(const MachineInstr *MI,
   case X86::VBROADCASTI64X2Z128rm:
   case X86::VBROADCASTI64X2rm:
   case X86::VBROADCASTI64X4rm:
-    if (MI->getNumOperands() <= 4)
-      break;
-    if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+    assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
+    if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
       int NumLanes = 1;
       // Override NumLanes for the broadcast instructions.
       switch (MI->getOpcode()) {
@@ -2285,9 +2284,9 @@ static void addConstantComments(const MachineInstr *MI,
   case X86::VPBROADCASTWZ128rm:
   case X86::VPBROADCASTWZ256rm:
   case X86::VPBROADCASTWZrm:
-    if (MI->getNumOperands() <= 4)
-      break;
-    if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+    assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
+    if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
       int NumElts;
       switch (MI->getOpcode()) {
       default: llvm_unreachable("Invalid opcode");

From fc532c1a0dc66bf23cac60f5363f180c6cbfefbc Mon Sep 17 00:00:00 2001
From: Dan Liew <dan@su-root.co.uk>
Date: Sat, 30 May 2020 15:17:41 -0700
Subject: [PATCH 646/770] Remove some non-determinism from the
 `Darwin/duplicate_os_log_reports.cpp` test.

The test read from an uninitialized buffer which could cause the output
to be unpredictable.

The test is currently disabled so this won't actually change anything
until the test is re-enabled.
---
 .../test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp b/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp
index a8314d804331d..dd5a257e39855 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/duplicate_os_log_reports.cpp
@@ -16,6 +16,7 @@
 // RUN: FileCheck %s -input-file=%t.process_syslog_output.txt
 #include <cassert>
 #include <cstdio>
+#include <cstring>
 #include <sanitizer/asan_interface.h>
 
 const int kBufferSize = 512;
@@ -37,6 +38,7 @@ void readOne() {
 
 int main() {
   buffer = static_cast<char *>(malloc(kBufferSize));
+  memset(static_cast<void *>(buffer), static_cast<int>('.'), kBufferSize);
   assert(buffer);
   // Deliberately poison `buffer` so that we have a deterministic way
   // triggering two ASan reports in a row in the no halt_on_error mode (e.g. Two

From 1b6d29e06b07e518025b6f06445ad3275d6f5684 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sat, 30 May 2020 15:41:09 -0700
Subject: [PATCH 647/770] [Driver] Fix BooleanFFlag identifiers to use 'f'
 'fno_' prefixes instead of suffixes

---
 clang/include/clang/Driver/Options.td | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index e88e6cf8a1301..729cbfb6ad4aa 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3316,11 +3316,9 @@ def Z_reserved_lib_cckext : Flag<["-"], "Z-reserved-lib-cckext">,
     Flags<[LinkerInput, NoArgumentUnused, Unsupported]>, Group<reserved_lib_Group>;
 
 // Ignored options
-// FIXME: multiclasess produce suffixes, not prefixes. This is fine for now
-// since it is only used in ignored options.
 multiclass BooleanFFlag<string name> {
-  def _f : Flag<["-"], "f"#name>;
-  def _fno : Flag<["-"], "fno-"#name>;
+  def f#NAME : Flag<["-"], "f"#name>;
+  def fno_#NAME : Flag<["-"], "fno-"#name>;
 }
 
 defm : BooleanFFlag<"keep-inline-functions">, Group<clang_ignored_gcc_optimization_f_Group>;

From 07e8a780d81bb58a0c7bd4da6cc0b9beaec3c788 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 15:51:56 -0700
Subject: [PATCH 648/770] [X86] Add pseudo instructions to use MULX with a
 single destination when the low result isn't used.

The instruction is defined to only produce high result if both
destinations are the same. We can exploit this to avoid
unnecessarily clobbering a register.

In order to hide this from register allocation we use a pseudo
instruction and expand the result during MCInst creation.

Differential Revision: https://reviews.llvm.org/D80500
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp   | 26 +++++++++++++-----
 llvm/lib/Target/X86/X86InstrArithmetic.td | 10 +++++++
 llvm/lib/Target/X86/X86MCInstLower.cpp    | 20 ++++++++++++++
 llvm/test/CodeGen/X86/atomic-unordered.ll | 32 +++++++++++------------
 llvm/test/CodeGen/X86/i128-mul.ll         |  4 +--
 llvm/test/CodeGen/X86/pr35636.ll          | 12 ++++-----
 6 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index a5fa98ec8d926..efdea78e1db98 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4759,20 +4759,25 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     unsigned LoReg, HiReg;
     bool IsSigned = Opcode == ISD::SMUL_LOHI;
     bool UseMULX = !IsSigned && Subtarget->hasBMI2();
+    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
     switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i32:
-      Opc  = UseMULX ? X86::MULX32rr :
+      Opc  = UseMULXHi ? X86::MULX32Hrr :
+             UseMULX ? X86::MULX32rr :
              IsSigned ? X86::IMUL32r : X86::MUL32r;
-      MOpc = UseMULX ? X86::MULX32rm :
+      MOpc = UseMULXHi ? X86::MULX32Hrm :
+             UseMULX ? X86::MULX32rm :
              IsSigned ? X86::IMUL32m : X86::MUL32m;
       LoReg = UseMULX ? X86::EDX : X86::EAX;
       HiReg = X86::EDX;
       break;
     case MVT::i64:
-      Opc  = UseMULX ? X86::MULX64rr :
+      Opc  = UseMULXHi ? X86::MULX64Hrr :
+             UseMULX ? X86::MULX64rr :
              IsSigned ? X86::IMUL64r : X86::MUL64r;
-      MOpc = UseMULX ? X86::MULX64rm :
+      MOpc = UseMULXHi ? X86::MULX64Hrm :
+             UseMULX ? X86::MULX64rm :
              IsSigned ? X86::IMUL64m : X86::MUL64m;
       LoReg = UseMULX ? X86::RDX : X86::RAX;
       HiReg = X86::RDX;
@@ -4796,7 +4801,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       MachineSDNode *CNode = nullptr;
       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                         InFlag };
-      if (UseMULX) {
+      if (UseMULXHi) {
+        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        Chain = SDValue(CNode, 1);
+      } else if (UseMULX) {
         SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
         ResHi = SDValue(CNode, 0);
@@ -4815,7 +4825,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     } else {
       SDValue Ops[] = { N1, InFlag };
-      if (UseMULX) {
+      if (UseMULXHi) {
+        SDVTList VTs = CurDAG->getVTList(NVT);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+      } else if (UseMULX) {
         SDVTList VTs = CurDAG->getVTList(NVT, NVT);
         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
         ResHi = SDValue(CNode, 0);
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 57ad893402aa0..8bb3b755135b7 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -1313,7 +1313,17 @@ let hasSideEffects = 0 in {
   let mayLoad = 1 in
   def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+
              []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
+
+  // Pseudo instructions to be used when the low result isn't used. The
+  // instruction is defined to keep the high if both destinations are the same.
+  def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
+                    []>, Sched<[sched]>;
+
+  let mayLoad = 1 in
+  def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
+                    []>, Sched<[sched.Folded]>;
 }
 }
 
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 1e5596b2a90a1..1d0df17e1ceb8 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -509,6 +509,26 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
            "LEA has segment specified!");
     break;
 
+  case X86::MULX32Hrr:
+  case X86::MULX32Hrm:
+  case X86::MULX64Hrr:
+  case X86::MULX64Hrm: {
+    // Turn into regular MULX by duplicating the destination.
+    unsigned NewOpc;
+    switch (OutMI.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break;
+    case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break;
+    case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break;
+    case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break;
+    }
+    OutMI.setOpcode(NewOpc);
+    // Same register for both dests: MULX then produces only the high half.
+    unsigned DestReg = OutMI.getOperand(0).getReg();
+    OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg));
+    break;
+  }
+
   // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
   // if one of the registers is extended, but other isn't.
   case X86::VMOVZPQILo2PQIrr:
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index b321820cf506a..16fde4074ea0e 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -839,14 +839,14 @@ define i64 @load_fold_udiv1(i64* %p) {
 ; CHECK-O3-CUR:       # %bb.0:
 ; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
 ; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT:    mulxq %rax, %rcx, %rax
+; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
 ; CHECK-O3-CUR-NEXT:    shrq $3, %rax
 ; CHECK-O3-CUR-NEXT:    retq
 ;
 ; CHECK-O3-EX-LABEL: load_fold_udiv1:
 ; CHECK-O3-EX:       # %bb.0:
 ; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rcx, %rax
+; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
 ; CHECK-O3-EX-NEXT:    shrq $3, %rax
 ; CHECK-O3-EX-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
@@ -1034,9 +1034,9 @@ define i64 @load_fold_urem1(i64* %p) {
 ; CHECK-O3-NEXT:    movq (%rdi), %rax
 ; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
 ; CHECK-O3-NEXT:    movq %rax, %rdx
-; CHECK-O3-NEXT:    mulxq %rcx, %rcx, %rdx
-; CHECK-O3-NEXT:    shrq $3, %rdx
-; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rcx
+; CHECK-O3-NEXT:    mulxq %rcx, %rcx, %rcx
+; CHECK-O3-NEXT:    shrq $3, %rcx
+; CHECK-O3-NEXT:    leaq (%rcx,%rcx,4), %rcx
 ; CHECK-O3-NEXT:    leaq (%rcx,%rcx,2), %rcx
 ; CHECK-O3-NEXT:    subq %rcx, %rax
 ; CHECK-O3-NEXT:    retq
@@ -1693,7 +1693,7 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movq (%rdi), %rdx
 ; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O0-NEXT:    mulxq %rax, %rcx, %rax
+; CHECK-O0-NEXT:    mulxq %rax, %rax, %rax
 ; CHECK-O0-NEXT:    shrq $3, %rax
 ; CHECK-O0-NEXT:    movq %rax, (%rdi)
 ; CHECK-O0-NEXT:    retq
@@ -1702,17 +1702,17 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
 ; CHECK-O3-CUR:       # %bb.0:
 ; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
 ; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rcx
-; CHECK-O3-CUR-NEXT:    shrq $3, %rcx
-; CHECK-O3-CUR-NEXT:    movq %rcx, (%rdi)
+; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
+; CHECK-O3-CUR-NEXT:    shrq $3, %rax
+; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
 ; CHECK-O3-CUR-NEXT:    retq
 ;
 ; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
 ; CHECK-O3-EX:       # %bb.0:
 ; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rcx
-; CHECK-O3-EX-NEXT:    shrq $3, %rcx
-; CHECK-O3-EX-NEXT:    movq %rcx, (%rdi)
+; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
+; CHECK-O3-EX-NEXT:    shrq $3, %rax
+; CHECK-O3-EX-NEXT:    movq %rax, (%rdi)
 ; CHECK-O3-EX-NEXT:    retq
   %prev = load atomic i64, i64* %p unordered, align 8
   %val = udiv i64 %prev, 15
@@ -1840,7 +1840,7 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
 ; CHECK-O0-NEXT:    movq (%rdi), %rax
 ; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
 ; CHECK-O0-NEXT:    movq %rax, %rdx
-; CHECK-O0-NEXT:    mulxq %rcx, %rdx, %rcx
+; CHECK-O0-NEXT:    mulxq %rcx, %rcx, %rcx
 ; CHECK-O0-NEXT:    shrq $3, %rcx
 ; CHECK-O0-NEXT:    leaq (%rcx,%rcx,4), %rcx
 ; CHECK-O0-NEXT:    leaq (%rcx,%rcx,2), %rcx
@@ -1852,9 +1852,9 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
 ; CHECK-O3:       # %bb.0:
 ; CHECK-O3-NEXT:    movq (%rdi), %rdx
 ; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-NEXT:    mulxq %rax, %rax, %rcx
-; CHECK-O3-NEXT:    shrq $3, %rcx
-; CHECK-O3-NEXT:    leaq (%rcx,%rcx,4), %rax
+; CHECK-O3-NEXT:    mulxq %rax, %rax, %rax
+; CHECK-O3-NEXT:    shrq $3, %rax
+; CHECK-O3-NEXT:    leaq (%rax,%rax,4), %rax
 ; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
 ; CHECK-O3-NEXT:    subq %rax, %rdx
 ; CHECK-O3-NEXT:    movq %rdx, (%rdi)
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index 45834f2eeecd3..118fd94342f51 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -54,7 +54,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    mulxl %esi, %edx, %ebx
+; X86-BMI-NEXT:    mulxl %esi, %ebx, %ebx
 ; X86-BMI-NEXT:    movl %ecx, %edx
 ; X86-BMI-NEXT:    mulxl %esi, %esi, %ebp
 ; X86-BMI-NEXT:    addl %ebx, %esi
@@ -85,7 +85,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X64-BMI-LABEL: foo:
 ; X64-BMI:       # %bb.0:
 ; X64-BMI-NEXT:    movq %rdi, %rdx
-; X64-BMI-NEXT:    mulxq %rsi, %rcx, %rax
+; X64-BMI-NEXT:    mulxq %rsi, %rax, %rax
 ; X64-BMI-NEXT:    retq
   %tmp0 = zext i64 %x to i128
   %tmp1 = zext i64 %y to i128
diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll
index 07fb37f4b62a8..ed4ef292c6605 100644
--- a/llvm/test/CodeGen/X86/pr35636.ll
+++ b/llvm/test/CodeGen/X86/pr35636.ll
@@ -7,9 +7,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; HSW:       # %bb.0: # %bb
 ; HSW-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
 ; HSW-NEXT:    movq %rdi, %rdx
-; HSW-NEXT:    mulxq %rax, %rax, %rcx
-; HSW-NEXT:    shrq $42, %rcx
-; HSW-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; HSW-NEXT:    mulxq %rax, %rax, %rax
+; HSW-NEXT:    shrq $42, %rax
+; HSW-NEXT:    imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
 ; HSW-NEXT:    shrq $20, %rax
 ; HSW-NEXT:    leal (%rax,%rax,4), %eax
 ; HSW-NEXT:    addl $5, %eax
@@ -24,9 +24,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; ZN:       # %bb.0: # %bb
 ; ZN-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
 ; ZN-NEXT:    movq %rdi, %rdx
-; ZN-NEXT:    mulxq %rax, %rax, %rcx
-; ZN-NEXT:    shrq $42, %rcx
-; ZN-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; ZN-NEXT:    mulxq %rax, %rax, %rax
+; ZN-NEXT:    shrq $42, %rax
+; ZN-NEXT:    imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
 ; ZN-NEXT:    shrq $20, %rax
 ; ZN-NEXT:    leal 5(%rax,%rax,4), %eax
 ; ZN-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF

From a23d1e9aff4d8cb752e227b3e16f887cf49c15d4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sat, 30 May 2020 17:25:18 -0700
Subject: [PATCH 649/770] [llvm-objdump] Simplify reportError() and prepend
 outs().flush()

As noticed by dblaikie.

I don't know what code paths using reportError can cause stdout output
to be interleaved with stderr, so no test is added now.

Also drop an unneeded use of errs().flush() in reportWarning().
I requested this in D64165.
---
 llvm/tools/llvm-objdump/llvm-objdump.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 6eebf98744aae..d62839a8686cd 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -420,10 +420,10 @@ void reportWarning(Twine Message, StringRef File) {
   outs().flush();
   WithColor::warning(errs(), ToolName)
       << "'" << File << "': " << Message << "\n";
-  errs().flush();
 }
 
 LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Twine Message) {
+  outs().flush();
   WithColor::error(errs(), ToolName) << "'" << File << "': " << Message << "\n";
   exit(1);
 }
@@ -432,6 +432,7 @@ LLVM_ATTRIBUTE_NORETURN void reportError(Error E, StringRef FileName,
                                          StringRef ArchiveName,
                                          StringRef ArchitectureName) {
   assert(E);
+  outs().flush();
   WithColor::error(errs(), ToolName);
   if (ArchiveName != "")
     errs() << ArchiveName << "(" << FileName << ")";
@@ -439,11 +440,8 @@ LLVM_ATTRIBUTE_NORETURN void reportError(Error E, StringRef FileName,
     errs() << "'" << FileName << "'";
   if (!ArchitectureName.empty())
     errs() << " (for architecture " << ArchitectureName << ")";
-  std::string Buf;
-  raw_string_ostream OS(Buf);
-  logAllUnhandledErrors(std::move(E), OS);
-  OS.flush();
-  errs() << ": " << Buf;
+  errs() << ": ";
+  logAllUnhandledErrors(std::move(E), errs());
   exit(1);
 }
 

From 439d27d79f58282b618881142244bbdcb1f28345 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sat, 30 May 2020 18:00:14 -0700
Subject: [PATCH 650/770] [llvm-objdump] Move llvm:: to llvm::objdump:: and
 qualifying definitions with objdump::

Or adding `static`.

Qualifying definitions with `objdump::` conforms to the coding standards
https://llvm.org/docs/CodingStandards.html#use-namespace-qualifiers-to-implement-previously-declared-functions
---
 llvm/tools/llvm-objdump/COFFDump.cpp     |  1 +
 llvm/tools/llvm-objdump/llvm-objdump.cpp | 48 +++++++++++++-----------
 llvm/tools/llvm-objdump/llvm-objdump.h   |  4 +-
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/llvm/tools/llvm-objdump/COFFDump.cpp b/llvm/tools/llvm-objdump/COFFDump.cpp
index c829b496f6c3f..873a8ab64e7d8 100644
--- a/llvm/tools/llvm-objdump/COFFDump.cpp
+++ b/llvm/tools/llvm-objdump/COFFDump.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
+using namespace llvm::objdump;
 using namespace llvm::object;
 using namespace llvm::Win64EH;
 
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index d62839a8686cd..2416ffcb13791 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -388,7 +388,8 @@ static FilterResult checkSectionFilter(object::SectionRef S) {
 
 namespace llvm {
 
-SectionFilter ToolSectionFilter(object::ObjectFile const &O, uint64_t *Idx) {
+SectionFilter objdump::ToolSectionFilter(object::ObjectFile const &O,
+                                         uint64_t *Idx) {
   // Start at UINT64_MAX so that the first index returned after an increment is
   // zero (after the unsigned wrap).
   if (Idx)
@@ -403,8 +404,8 @@ SectionFilter ToolSectionFilter(object::ObjectFile const &O, uint64_t *Idx) {
       O);
 }
 
-std::string getFileNameForError(const object::Archive::Child &C,
-                                unsigned Index) {
+std::string objdump::getFileNameForError(const object::Archive::Child &C,
+                                         unsigned Index) {
   Expected<StringRef> NameOrErr = C.getName();
   if (NameOrErr)
     return std::string(NameOrErr.get());
@@ -414,7 +415,7 @@ std::string getFileNameForError(const object::Archive::Child &C,
   return "<file index: " + std::to_string(Index) + ">";
 }
 
-void reportWarning(Twine Message, StringRef File) {
+void objdump::reportWarning(Twine Message, StringRef File) {
   // Output order between errs() and outs() matters especially for archive
   // files where the output is per member object.
   outs().flush();
@@ -422,15 +423,16 @@ void reportWarning(Twine Message, StringRef File) {
       << "'" << File << "': " << Message << "\n";
 }
 
-LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Twine Message) {
+LLVM_ATTRIBUTE_NORETURN void objdump::reportError(StringRef File,
+                                                  Twine Message) {
   outs().flush();
   WithColor::error(errs(), ToolName) << "'" << File << "': " << Message << "\n";
   exit(1);
 }
 
-LLVM_ATTRIBUTE_NORETURN void reportError(Error E, StringRef FileName,
-                                         StringRef ArchiveName,
-                                         StringRef ArchitectureName) {
+LLVM_ATTRIBUTE_NORETURN void objdump::reportError(Error E, StringRef FileName,
+                                                  StringRef ArchiveName,
+                                                  StringRef ArchitectureName) {
   assert(E);
   outs().flush();
   WithColor::error(errs(), ToolName);
@@ -495,7 +497,7 @@ static const Target *getTarget(const ObjectFile *Obj) {
   return TheTarget;
 }
 
-bool isRelocAddressLess(RelocationRef A, RelocationRef B) {
+bool objdump::isRelocAddressLess(RelocationRef A, RelocationRef B) {
   return A.getOffset() < B.getOffset();
 }
 
@@ -1149,7 +1151,8 @@ static void dumpELFData(uint64_t SectionAddr, uint64_t Index, uint64_t End,
   }
 }
 
-SymbolInfoTy createSymbolInfo(const ObjectFile *Obj, const SymbolRef &Symbol) {
+SymbolInfoTy objdump::createSymbolInfo(const ObjectFile *Obj,
+                                       const SymbolRef &Symbol) {
   const StringRef FileName = Obj->getFileName();
   const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName);
   const StringRef Name = unwrapOrError(Symbol.getName(), FileName);
@@ -1169,8 +1172,9 @@ SymbolInfoTy createSymbolInfo(const ObjectFile *Obj, const SymbolRef &Symbol) {
                                      : (uint8_t)ELF::STT_NOTYPE);
 }
 
-SymbolInfoTy createDummySymbolInfo(const ObjectFile *Obj, const uint64_t Addr,
-                                   StringRef &Name, uint8_t Type) {
+static SymbolInfoTy createDummySymbolInfo(const ObjectFile *Obj,
+                                          const uint64_t Addr, StringRef &Name,
+                                          uint8_t Type) {
   if (Obj->isXCOFF() && SymbolDescription)
     return SymbolInfoTy(Addr, Name, None, None, false);
   else
@@ -1713,7 +1717,7 @@ static void disassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
                     SP, InlineRelocs);
 }
 
-void printRelocations(const ObjectFile *Obj) {
+void objdump::printRelocations(const ObjectFile *Obj) {
   StringRef Fmt = Obj->getBytesInAddress() > 4 ? "%016" PRIx64 :
                                                  "%08" PRIx64;
   // Regular objdump doesn't print relocations in non-relocatable object
@@ -1767,7 +1771,7 @@ void printRelocations(const ObjectFile *Obj) {
   }
 }
 
-void printDynamicRelocations(const ObjectFile *Obj) {
+void objdump::printDynamicRelocations(const ObjectFile *Obj) {
   // For the moment, this option is for ELF only
   if (!Obj->isELF())
     return;
@@ -1819,7 +1823,7 @@ static size_t getMaxSectionNameWidth(const ObjectFile *Obj) {
   return MaxWidth;
 }
 
-void printSectionHeaders(const ObjectFile *Obj) {
+void objdump::printSectionHeaders(const ObjectFile *Obj) {
   size_t NameWidth = getMaxSectionNameWidth(Obj);
   size_t AddressWidth = 2 * Obj->getBytesInAddress();
   bool HasLMAColumn = shouldDisplayLMA(Obj);
@@ -1864,7 +1868,7 @@ void printSectionHeaders(const ObjectFile *Obj) {
   outs() << "\n";
 }
 
-void printSectionContents(const ObjectFile *Obj) {
+void objdump::printSectionContents(const ObjectFile *Obj) {
   for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
     StringRef Name = unwrapOrError(Section.getName(), Obj->getFileName());
     uint64_t BaseAddr = Section.getAddress();
@@ -1908,8 +1912,8 @@ void printSectionContents(const ObjectFile *Obj) {
   }
 }
 
-void printSymbolTable(const ObjectFile *O, StringRef ArchiveName,
-                      StringRef ArchitectureName, bool DumpDynamic) {
+void objdump::printSymbolTable(const ObjectFile *O, StringRef ArchiveName,
+                               StringRef ArchitectureName, bool DumpDynamic) {
   if (O->isCOFF() && !DumpDynamic) {
     outs() << "SYMBOL TABLE:\n";
     printCOFFSymbolTable(cast<const COFFObjectFile>(O));
@@ -1939,9 +1943,9 @@ void printSymbolTable(const ObjectFile *O, StringRef ArchiveName,
     printSymbol(O, *I, FileName, ArchiveName, ArchitectureName, DumpDynamic);
 }
 
-void printSymbol(const ObjectFile *O, const SymbolRef &Symbol,
-                 StringRef FileName, StringRef ArchiveName,
-                 StringRef ArchitectureName, bool DumpDynamic) {
+void objdump::printSymbol(const ObjectFile *O, const SymbolRef &Symbol,
+                          StringRef FileName, StringRef ArchiveName,
+                          StringRef ArchitectureName, bool DumpDynamic) {
   const MachOObjectFile *MachO = dyn_cast<const MachOObjectFile>(O);
   uint64_t Address = unwrapOrError(Symbol.getAddress(), FileName, ArchiveName,
                                    ArchitectureName);
@@ -2089,7 +2093,7 @@ static void printUnwindInfo(const ObjectFile *O) {
 
 /// Dump the raw contents of the __clangast section so the output can be piped
 /// into llvm-bcanalyzer.
-void printRawClangAST(const ObjectFile *Obj) {
+static void printRawClangAST(const ObjectFile *Obj) {
   if (outs().is_displayed()) {
     WithColor::error(errs(), ToolName)
         << "The -raw-clang-ast option will dump the raw binary contents of "
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.h b/llvm/tools/llvm-objdump/llvm-objdump.h
index 8e6c1059207ba..390fc62d09f81 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.h
+++ b/llvm/tools/llvm-objdump/llvm-objdump.h
@@ -51,8 +51,6 @@ extern cl::opt<bool> UnwindInfo;
 
 extern StringSet<> FoundSectionSet;
 
-} // namespace objdump
-
 typedef std::function<bool(llvm::object::SectionRef const &)> FilterPredicate;
 
 /// A filtered iterator for SectionRefs that skips sections based on some given
@@ -118,7 +116,6 @@ SectionFilter ToolSectionFilter(llvm::object::ObjectFile const &O,
                                 uint64_t *Idx = nullptr);
 
 bool isRelocAddressLess(object::RelocationRef A, object::RelocationRef B);
-void printRawClangAST(const object::ObjectFile *O);
 void printRelocations(const object::ObjectFile *O);
 void printDynamicRelocations(const object::ObjectFile *O);
 void printSectionHeaders(const object::ObjectFile *O);
@@ -147,6 +144,7 @@ std::string getFileNameForError(const object::Archive::Child &C,
 SymbolInfoTy createSymbolInfo(const object::ObjectFile *Obj,
                               const object::SymbolRef &Symbol);
 
+} // namespace objdump
 } // end namespace llvm
 
 #endif

From d04eb253c710aec30707e404cfc9dc672082d3a2 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sat, 30 May 2020 18:03:43 -0700
Subject: [PATCH 651/770] [llvm-objdump] Delete unneeded namespace llvm {}

---
 llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 2416ffcb13791..70ab0a1643eeb 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -386,8 +386,6 @@ static FilterResult checkSectionFilter(object::SectionRef S) {
           /*IncrementIndex=*/true};
 }
 
-namespace llvm {
-
 SectionFilter objdump::ToolSectionFilter(object::ObjectFile const &O,
                                          uint64_t *Idx) {
   // Start at UINT64_MAX so that the first index returned after an increment is
@@ -2422,7 +2420,6 @@ static void dumpInput(StringRef file) {
   else
     reportError(errorCodeToError(object_error::invalid_file_type), file);
 }
-} // namespace llvm
 
 int main(int argc, char **argv) {
   using namespace llvm;

From ce1fadca608ffaf214732b843e084a75e55fcb50 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sat, 30 May 2020 19:33:50 -0700
Subject: [PATCH 652/770] [ELF][docs] Update supported targets

PowerPC, PowerPC64 and x86-32 have production quality.
Mention Hexagon, RISC-V and SPARC V9.
---
 lld/docs/index.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lld/docs/index.rst b/lld/docs/index.rst
index 5a3f1a211b7bc..b820d57e3d354 100644
--- a/lld/docs/index.rst
+++ b/lld/docs/index.rst
@@ -33,10 +33,10 @@ Features
   machine, you can expect that LLD runs more than twice as fast as the GNU
   gold linker. Your mileage may vary, though.
 
-- It supports various CPUs/ABIs including x86-64, x86, x32, AArch64,
-  ARM, MIPS 32/64 big/little-endian, PowerPC, PowerPC 64 and AMDGPU.
-  Among these, x86-64, AArch64, and ARM (>= v6) are production quality.
-  MIPS seems decent too. x86 should be OK but is not well tested yet.
+- It supports various CPUs/ABIs including AArch64, AMDGPU, ARM, Hexagon, MIPS
+  32/64 big/little-endian, PowerPC, PowerPC64, RISC-V, SPARC V9, x86-32 and
+  x86-64. Among these, AArch64, ARM (>= v6), PowerPC, PowerPC64, x86-32 and
+  x86-64 have production quality. MIPS seems decent too.
 
 - It is always a cross-linker, meaning that it always supports all the
   above targets however it was built. In fact, we don't provide a

From 8857822452c758805e8bb33ecc877d8d0cce1660 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 16:12:44 -0700
Subject: [PATCH 653/770] [X86] Move MMX_SET0 pattern into the instruction
 definition. NFC

---
 llvm/lib/Target/X86/X86InstrMMX.td | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 83eddaa05f4ae..415e0389145a0 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -24,8 +24,9 @@
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-zeros value if folding it would be beneficial.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isPseudo = 1, SchedRW = [WriteZero] in {
-def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
+    isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasMMX] in {
+def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "",
+                 [(set VR64:$dst, (x86mmx (MMX_X86movw2d (i32 0))))]>;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -172,11 +173,6 @@ def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                           (x86mmx (MMX_X86movw2d (loadi32 addr:$src))))]>,
                         Sched<[WriteVecLoad]>;
 
-let Predicates = [HasMMX] in {
-  def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
-            (MMX_SET0)>;
-}
-
 let mayStore = 1 in
 def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
                         "movd\t{$src, $dst|$dst, $src}", []>,

From efc5857b0b121ffd0b74fcd7aa8c48419a3fe4fc Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 16:27:26 -0700
Subject: [PATCH 654/770] [X86] Autogenerate complete checks. NFC

---
 llvm/test/CodeGen/X86/pr23246.ll | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll
index 8b6745de109fa..45587b8c69cd4 100644
--- a/llvm/test/CodeGen/X86/pr23246.ll
+++ b/llvm/test/CodeGen/X86/pr23246.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple x86_64-unknown-unknown -mattr=mmx | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
@@ -5,11 +6,12 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 ; PR23246
 ; We're really only interested in doing something sane with the shuffle.
 
-; CHECK-LABEL: test:
-; CHECK:      movq2dq %mm0, %xmm0
-; CHECK-NEXT: pshufd {{.*}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: retq
 define <2 x i64> @test(x86_mmx %a) #0 {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq2dq %mm0, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT:    retq
 entry:
   %b = bitcast x86_mmx %a to <1 x i64>
   %s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>

From 1ecf39d607acdb04c2bb5155e5f7265db2484511 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 17:03:53 -0700
Subject: [PATCH 655/770] [X86] Fix a place where we created MOVQ2DQ with a
 DstVT other than v2i64.

The type profile and isel pattern have this type declared as
being MVT::v2i64. But isel skips the explicit type check due to
the type profile.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0b114b34186d1..6ebd46893f957 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30002,10 +30002,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
 
     if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
+      // FIXME: Use v4f32 for SSE1?
+      assert(Subtarget.hasSSE2() && "Requires SSE2");
       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
              "Unexpected type action!");
       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
-      SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
+      SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
+                                N->getOperand(0));
+      Res = DAG.getBitcast(WideVT, Res);
       Results.push_back(Res);
       return;
     }

From af1accdd860d4e1768a1f56a8651ae4d13445e14 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 17:04:56 -0700
Subject: [PATCH 656/770] [X86] Teach computeKnownBitsForTargetNode that the
 upper half of X86ISD::MOVQ2DQ is all zero.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 6 ++++++
 llvm/test/CodeGen/X86/mmx-cvt.ll        | 2 --
 llvm/test/CodeGen/X86/vec_insert-7.ll   | 7 +------
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6ebd46893f957..fa1b194afc1a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33402,6 +33402,12 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case X86ISD::MOVQ2DQ: {
+    // Move from MMX to XMM. Upper half of XMM should be 0.
+    if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
+      Known.setAllZero();
+    break;
+  }
   }
 
   // Handle target shuffles.
diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll
index 339df30892a61..803b3d9367205 100644
--- a/llvm/test/CodeGen/X86/mmx-cvt.ll
+++ b/llvm/test/CodeGen/X86/mmx-cvt.ll
@@ -298,7 +298,6 @@ define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
 ; X86-NEXT:    movq (%eax), %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
 ; X86-NEXT:    movq2dq %mm0, %xmm0
-; X86-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    retl
 ;
@@ -307,7 +306,6 @@ define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
 ; X64-NEXT:    movq (%rdi), %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
 ; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
   %2 = bitcast <1 x i64>* %0 to x86_mmx*
diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll
index 52d6e7ca7a2ce..e4b9806ab7e67 100644
--- a/llvm/test/CodeGen/X86/vec_insert-7.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -8,12 +8,7 @@
 define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
 ; X32-LABEL: mmx_movzl:
 ; X32:       ## %bb.0:
-; X32-NEXT:    movq2dq %mm0, %xmm0
-; X32-NEXT:    movl $32, %eax
-; X32-NEXT:    pinsrd $0, %eax, %xmm0
-; X32-NEXT:    pxor %xmm1, %xmm1
-; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; X32-NEXT:    movdq2q %xmm1, %mm0
+; X32-NEXT:    movq LCPI0_0, %mm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: mmx_movzl:

From a4dd45b7d09d8c12b87eaa0e6d1a92ce2b0defe0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 18:40:17 -0700
Subject: [PATCH 657/770] [DAGCombiner] Move debug message and statistic update
 into CommitTargetLoweringOpt.

This code was repeated in two callers of CommitTargetLoweringOpt.
But CommitTargetLoweringOpt is also called from TargetLowering.
We should print a message for those calls too. So sink the
repeated code into CommitTargetLoweringOpt to catch those calls.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 43bcf2e118882..5fb7aa0e98d46 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1060,6 +1060,12 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
 
 void DAGCombiner::
 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+  // Replace the old value with the new one.
+  ++NodesCombined;
+  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
+             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
+             dbgs() << '\n');
+
   // Replace all uses.  If any nodes become isomorphic to other nodes and
   // are deleted, make sure to remove them from our worklist.
   WorklistRemover DeadNodes(*this);
@@ -1089,12 +1095,6 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
   // Revisit the node.
   AddToWorklist(Op.getNode());
 
-  // Replace the old value with the new one.
-  ++NodesCombined;
-  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
-             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
-             dbgs() << '\n');
-
   CommitTargetLoweringOpt(TLO);
   return true;
 }
@@ -1114,12 +1114,6 @@ bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
   // Revisit the node.
   AddToWorklist(Op.getNode());
 
-  // Replace the old value with the new one.
-  ++NodesCombined;
-  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
-             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
-             dbgs() << '\n');
-
   CommitTargetLoweringOpt(TLO);
   return true;
 }

From 7c3b8077cc3feed2de3de6f3efb0627d619d1434 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 18:51:20 -0700
Subject: [PATCH 658/770] [X86] Add DAG combine to turn (v2i64
 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ. Remove unneeded isel
 patterns.

We already had a DAG combine for (mmx (bitconvert (i64 (extractelement v2i64))))
to MOVDQ2Q.

Remove patterns for MMX_MOVQ2DQrr/MMX_MOVDQ2Qrr that use
scalar_to_vector/extractelement involving i64 scalar type with
v2i64 and x86mmx.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  5 ++++
 llvm/lib/Target/X86/X86InstrMMX.td      | 31 ++++++-------------------
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fa1b194afc1a9..5eba9f3875a9e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47463,6 +47463,11 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
         VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
                         DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
 
+  // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
+  if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
+      Src.getOperand(0).getValueType() == MVT::x86mmx)
+    return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 415e0389145a0..92c3561ac21a5 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -232,20 +232,21 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (x86mmx VR64:$src), addr:$dst)]>;
 
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
+
 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
 def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
                              (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
                              [(set VR64:$dst,
-                               (x86mmx (bitconvert
-                               (i64 (extractelt (v2i64 VR128:$src),
-                                     (iPTR 0))))))]>;
+                               (x86mmx (MMX_X86movdq2q VR128:$src)))]>;
 
 def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
                               (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
                               [(set VR128:$dst,
-                                (v2i64
-                                  (scalar_to_vector
-                                    (i64 (bitconvert (x86mmx VR64:$src))))))]>;
+                                (v2i64 (MMX_X86movq2dq VR64:$src)))]>;
 
 let isCodeGenOnly = 1, hasSideEffects = 1 in {
 def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
@@ -550,24 +551,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                                 (int_x86_mmx_pmovmskb VR64:$src))]>,
                           Sched<[WriteMMXMOVMSK]>;
 
-// MMX to XMM for vector types
-def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
-                            [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
-          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-
-// Low word of XMM to MMX.
-def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
-                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
-
-def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
-          (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
-
-def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
-                             (i64 (bitconvert (x86mmx VR64:$src)))))),
-          (MMX_MOVQ2DQrr VR64:$src)>;
-
 // Misc.
 let SchedRW = [SchedWriteShuffle.MMX] in {
 let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in

From dbda87186ec1b28a98d7a91a651b5a47c6f06d40 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sat, 30 May 2020 20:24:51 -0700
Subject: [PATCH 659/770] [X86] Remove unneeded bitconverts from isel patterns.
 NFC

The types already match so TableGen is removing the bitconvert.
---
 llvm/lib/Target/X86/X86InstrMMX.td | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 92c3561ac21a5..49940204c25a4 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -44,8 +44,7 @@ let Constraints = "$src1 = $dst" in {
     def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
                  (ins VR64:$src1, OType:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                 [(set VR64:$dst, (IntId VR64:$src1,
-                                   (bitconvert (load_mmx addr:$src2))))]>,
+                 [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 
@@ -61,8 +60,7 @@ let Constraints = "$src1 = $dst" in {
     def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
                                   (ins VR64:$src1, i64mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                  [(set VR64:$dst, (IntId VR64:$src1,
-                                    (bitconvert (load_mmx addr:$src2))))]>,
+                  [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
     def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
                                    (ins VR64:$src1, i32u8imm:$src2),
@@ -82,8 +80,7 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
 
   def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR64:$dst,
-                   (IntId64 (bitconvert (load_mmx addr:$src))))]>,
+                 [(set VR64:$dst, (IntId64 (load_mmx addr:$src)))]>,
                  Sched<[sched.Folded]>;
 }
 
@@ -102,8 +99,7 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
        (ins VR64:$src1, i64mem:$src2),
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR64:$dst,
-         (IntId64 VR64:$src1,
-          (bitconvert (load_mmx addr:$src2))))]>,
+         (IntId64 VR64:$src1, (load_mmx addr:$src2)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 }
@@ -119,8 +115,8 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
   def rmi  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
       (ins VR64:$src1, i64mem:$src2, u8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-      [(set VR64:$dst, (IntId VR64:$src1,
-                       (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>,
+      [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2),
+                                          (i8 timm:$src3)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 

From 234eba90f4f346a4b0d260cdd61a9aae647b2b48 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang@gmail.com>
Date: Sat, 30 May 2020 20:45:27 -0700
Subject: [PATCH 660/770] AMDGPU: Add setTruncStoreAction for vector i64 types
 made legal recently

Reviewers:
  rampitec, arsenm

Differential Revision:
  https://reviews.llvm.org/D80853
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  6 +++
 llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll | 50 +++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 452ff785ec064..5d97b9f43e7c8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -220,6 +220,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
   setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
 
+  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
+  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
+  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
+  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
+
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
 
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
new file mode 100644
index 0000000000000..627ba9e0f7170
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}trunc_store_v4i64_v4i8:
+; GCN: global_store_dword v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off
+define amdgpu_kernel void @trunc_store_v4i64_v4i8(< 4 x i8> addrspace(1)* %out, <4 x i64> %in) {
+entry:
+  %trunc = trunc <4 x i64> %in to < 4 x i8>
+  store <4 x i8> %trunc, <4 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_store_v8i64_v8i8:
+; GCN: global_store_dwordx2 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+define amdgpu_kernel void @trunc_store_v8i64_v8i8(< 8 x i8> addrspace(1)* %out, <8 x i64> %in) {
+entry:
+  %trunc = trunc <8 x i64> %in to < 8 x i8>
+  store <8 x i8> %trunc, <8 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_store_v8i64_v8i16:
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+define amdgpu_kernel void @trunc_store_v8i64_v8i16(< 8 x i16> addrspace(1)* %out, <8 x i64> %in) {
+entry:
+  %trunc = trunc <8 x i64> %in to < 8 x i16>
+  store <8 x i16> %trunc, <8 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_store_v8i64_v8i32:
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+define amdgpu_kernel void @trunc_store_v8i64_v8i32(< 8 x i32> addrspace(1)* %out, <8 x i64> %in) {
+entry:
+  %trunc = trunc <8 x i64> %in to <8 x i32>
+  store <8 x i32> %trunc, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}trunc_store_v16i64_v16i32:
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:48
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:32
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) {
+entry:
+  %trunc = trunc <16 x i64> %in to <16 x i32>
+  store <16 x i32> %trunc, <16 x i32> addrspace(1)* %out
+  ret void
+}

From d4751f35560321dfb38cd77b924e715b9ebf9203 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Fri, 29 May 2020 16:52:43 +0100
Subject: [PATCH 661/770] [AMDGPU] Precommit tests for D80813

---
 llvm/test/CodeGen/AMDGPU/llvm.sin.ll | 70 ++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
index c5736cd1d2324..685f4eb35dc26 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -51,6 +51,20 @@ define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x)
   ret void
 }
 
+; FUNC-LABEL: {{^}}fmf_sin_3x_f32:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; SICIVI: v_fract_f32
+; GFX9-NOT: v_fract_f32
+; GCN: v_sin_f32
+; GCN-NOT: v_sin_f32
+define amdgpu_kernel void @fmf_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul reassoc float 3.0, %x
+  %sin = call reassoc float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}safe_sin_2x_f32:
 ; GCN: v_add_f32
 ; GCN: v_mul_f32
@@ -80,6 +94,62 @@ define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x)
   ret void
 }
 
+; FUNC-LABEL: {{^}}fmf_sin_2x_f32:
+; GCN: v_add_f32
+; GCN: v_mul_f32
+; SICIVI: v_fract_f32
+; GFX9-NOT: v_fract_f32
+; GCN: v_sin_f32
+; GCN-NOT: v_sin_f32
+define amdgpu_kernel void @fmf_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul reassoc float 2.0, %x
+  %sin = call reassoc float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}safe_sin_cancel_f32:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; SICIVI: v_fract_f32
+; GFX9-NOT: v_fract_f32
+; GCN: v_sin_f32
+; GCN-NOT: v_sin_f32
+define amdgpu_kernel void @safe_sin_cancel_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul float 0x401921FB60000000, %x
+  %sin = call float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_sin_cancel_f32:
+; GCN-NOT: v_add_f32
+; GCN-NOT: v_mul_f32
+; SICIVI: v_fract_f32
+; GFX9-NOT: v_fract_f32
+; GCN: v_sin_f32
+; GCN-NOT: v_sin_f32
+define amdgpu_kernel void @unsafe_sin_cancel_f32(float addrspace(1)* %out, float %x) #2 {
+  %y = fmul float 0x401921FB60000000, %x
+  %sin = call float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmf_sin_cancel_f32:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; SICIVI: v_fract_f32
+; GFX9-NOT: v_fract_f32
+; GCN: v_sin_f32
+; GCN-NOT: v_sin_f32
+define amdgpu_kernel void @fmf_sin_cancel_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul reassoc float 0x401921FB60000000, %x
+  %sin = call reassoc float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}sin_v4f32:
 ; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}

From 2768edfff19a170faca35a8c63163c8bb1b67382 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Fri, 29 May 2020 17:02:13 +0100
Subject: [PATCH 662/770] [AMDGPU] Propagate fast-math flags when lowering FSIN
 and FCOS

Differential Revision: https://reviews.llvm.org/D80813
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 ++++++++------
 llvm/test/CodeGen/AMDGPU/llvm.sin.ll      | 10 ++++++----
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5d97b9f43e7c8..bbd3737d2ef0e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8288,22 +8288,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
   SDValue Arg = Op.getOperand(0);
   SDValue TrigVal;
 
-  // TODO: Should this propagate fast-math-flags?
+  // Propagate fast-math flags so that the multiply we introduce can be folded
+  // if Arg is already the result of a multiply by constant.
+  auto Flags = Op->getFlags();
 
   SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
 
   if (Subtarget->hasTrigReducedRange()) {
-    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
-    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
+    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
+    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
   } else {
-    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
+    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
   }
 
   switch (Op.getOpcode()) {
   case ISD::FCOS:
-    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
+    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
   case ISD::FSIN:
-    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
+    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
   default:
     llvm_unreachable("Wrong trig opcode");
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
index 685f4eb35dc26..7f033a6c43a73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -52,7 +52,8 @@ define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x)
 }
 
 ; FUNC-LABEL: {{^}}fmf_sin_3x_f32:
-; GCN: v_mul_f32
+; GCN-NOT: v_add_f32
+; GCN: 0x3ef47644
 ; GCN: v_mul_f32
 ; SICIVI: v_fract_f32
 ; GFX9-NOT: v_fract_f32
@@ -95,7 +96,8 @@ define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x)
 }
 
 ; FUNC-LABEL: {{^}}fmf_sin_2x_f32:
-; GCN: v_add_f32
+; GCN-NOT: v_add_f32
+; GCN: 0x3ea2f983
 ; GCN: v_mul_f32
 ; SICIVI: v_fract_f32
 ; GFX9-NOT: v_fract_f32
@@ -137,8 +139,8 @@ define amdgpu_kernel void @unsafe_sin_cancel_f32(float addrspace(1)* %out, float
 }
 
 ; FUNC-LABEL: {{^}}fmf_sin_cancel_f32:
-; GCN: v_mul_f32
-; GCN: v_mul_f32
+; GCN-NOT: v_add_f32
+; GCN-NOT: v_mul_f32
 ; SICIVI: v_fract_f32
 ; GFX9-NOT: v_fract_f32
 ; GCN: v_sin_f32

From af3abbf7bd2213003a133c361c212ac6efb1bd2b Mon Sep 17 00:00:00 2001
From: Kang Zhang <shkzhang@cn.ibm.com>
Date: Sun, 31 May 2020 08:05:27 +0000
Subject: [PATCH 663/770] [NFC][PowerPC] Add a new case to test
 phi-node-elimination pass

---
 llvm/test/CodeGen/PowerPC/phi-eliminate.mir | 295 ++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/phi-eliminate.mir

diff --git a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
new file mode 100644
index 0000000000000..8b49b038e6c8a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
@@ -0,0 +1,295 @@
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - \
+# RUN:   -run-pass=livevars,phi-node-elimination | FileCheck %s
+
+--- |
+  define void @phi_eliminate(i32 %0, i32 %1, i8* %2) {
+    %scevgep3 = getelementptr i8, i8* %2, i64 undef
+    call void @llvm.set.loop.iterations.i64(i64 undef)
+    br label %4
+
+  4:                                                ; preds = %4, %3
+    %5 = phi i32 [ %8, %4 ], [ %0, %3 ]
+    %6 = phi i8* [ %scevgep3, %3 ], [ %7, %4 ]
+    %7 = getelementptr i8, i8* %6, i64 -1
+    %8 = sdiv i32 %5, %1
+    %9 = mul nsw i32 %8, %1
+    %10 = sub nsw i32 %5, %9
+    %11 = icmp ult i32 %10, 10
+    %12 = trunc i32 %10 to i8
+    %13 = select i1 %11, i8 48, i8 55
+    %14 = add i8 %13, %12
+    store i8 %14, i8* %7, align 1
+    %15 = call i1 @llvm.loop.decrement.i64(i64 1)
+    br i1 %15, label %4, label %16
+
+  16:                                               ; preds = %4
+    ret void
+  }
+
+  declare void @llvm.set.loop.iterations.i64(i64)
+
+  declare i1 @llvm.loop.decrement.i64(i64)
+
+  declare void @llvm.stackprotector(i8*, i8**)
+...
+---
+name:            phi_eliminate
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+  - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 1, class: gprc, preferred-register: '' }
+  - { id: 2, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 3, class: g8rc, preferred-register: '' }
+  - { id: 4, class: gprc, preferred-register: '' }
+  - { id: 5, class: g8rc, preferred-register: '' }
+  - { id: 6, class: g8rc, preferred-register: '' }
+  - { id: 7, class: g8rc, preferred-register: '' }
+  - { id: 8, class: gprc, preferred-register: '' }
+  - { id: 9, class: gprc, preferred-register: '' }
+  - { id: 10, class: g8rc, preferred-register: '' }
+  - { id: 11, class: gprc, preferred-register: '' }
+  - { id: 12, class: gprc, preferred-register: '' }
+  - { id: 13, class: crrc, preferred-register: '' }
+  - { id: 14, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 15, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 16, class: gprc, preferred-register: '' }
+  - { id: 17, class: gprc, preferred-register: '' }
+  - { id: 18, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 19, class: gprc, preferred-register: '' }
+  - { id: 20, class: gprc, preferred-register: '' }
+  - { id: 21, class: gprc, preferred-register: '' }
+  - { id: 22, class: crrc, preferred-register: '' }
+  - { id: 23, class: gprc, preferred-register: '' }
+  - { id: 24, class: gprc, preferred-register: '' }
+  - { id: 25, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 26, class: gprc, preferred-register: '' }
+  - { id: 27, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 28, class: gprc, preferred-register: '' }
+  - { id: 29, class: gprc, preferred-register: '' }
+  - { id: 30, class: gprc, preferred-register: '' }
+  - { id: 31, class: crrc, preferred-register: '' }
+  - { id: 32, class: gprc, preferred-register: '' }
+  - { id: 33, class: gprc, preferred-register: '' }
+  - { id: 34, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 35, class: gprc, preferred-register: '' }
+  - { id: 36, class: gprc, preferred-register: '' }
+  - { id: 37, class: gprc, preferred-register: '' }
+  - { id: 38, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 39, class: g8rc, preferred-register: '' }
+  - { id: 40, class: gprc, preferred-register: '' }
+  - { id: 41, class: gprc, preferred-register: '' }
+  - { id: 42, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 43, class: gprc, preferred-register: '' }
+  - { id: 44, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 45, class: gprc, preferred-register: '' }
+  - { id: 46, class: gprc, preferred-register: '' }
+  - { id: 47, class: crrc, preferred-register: '' }
+  - { id: 48, class: gprc, preferred-register: '' }
+  - { id: 49, class: gprc, preferred-register: '' }
+  - { id: 50, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 51, class: g8rc, preferred-register: '' }
+  - { id: 52, class: gprc, preferred-register: '' }
+  - { id: 53, class: gprc, preferred-register: '' }
+  - { id: 54, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 55, class: gprc, preferred-register: '' }
+  - { id: 56, class: gprc, preferred-register: '' }
+liveins:
+  - { reg: '$x3', virtual-reg: '%5' }
+  - { reg: '$x4', virtual-reg: '%6' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: phi_eliminate
+  ; CHECK: bb.0 (%ir-block.3):
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $x3, $x4
+  ; CHECK:   %6:g8rc = COPY killed $x4
+  ; CHECK:   %5:g8rc = COPY killed $x3
+  ; CHECK:   %9:gprc = COPY killed %6.sub_32
+  ; CHECK:   %8:gprc = COPY killed %5.sub_32
+  ; CHECK:   MTCTR8loop undef %10:g8rc, implicit-def dead $ctr8
+  ; CHECK:   %14:gprc_and_gprc_nor0 = LI 55
+  ; CHECK:   %15:gprc_and_gprc_nor0 = LI 48
+
+  ; CHECK: bb.1 (%ir-block.4):
+  ; CHECK:   successors: %bb.2(0x40000000), %bb.7(0x40000000)
+  ; CHECK:   %19:gprc = DIVW %8, %9
+  ; CHECK:   BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8
+
+  ; CHECK: bb.7:
+  ; CHECK:   successors: %bb.5(0x80000000)
+  ; CHECK:   %61:gprc = COPY killed %8
+  ; CHECK:   %62:g8rc_and_g8rc_nox0 = IMPLICIT_DEF
+  ; CHECK:   %63:gprc = COPY killed %19
+  ; CHECK:   B %bb.5
+
+  ; CHECK: bb.2 (%ir-block.4):
+  ; CHECK:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; CHECK:   %20:gprc = nsw MULLW %19, %9
+  ; CHECK:   %21:gprc = SUBF killed %20, killed %8
+  ; CHECK:   %22:crrc = CMPLWI %21, 10
+  ; CHECK:   %23:gprc = ISEL %15, %14, killed %22.sub_lt
+  ; CHECK:   %24:gprc = ADD4 killed %23, killed %21
+  ; CHECK:   %25:g8rc_and_g8rc_nox0 = STBU killed %24, -1, undef %0:g8rc_and_g8rc_nox0 :: (store 1 into %ir.7)
+  ; CHECK:   %26:gprc = DIVW %19, %9
+  ; CHECK:   %57:gprc = COPY killed %26
+  ; CHECK:   %58:gprc = COPY %19
+  ; CHECK:   %59:g8rc_and_g8rc_nox0 = COPY killed %25
+  ; CHECK:   %60:gprc = COPY killed %19
+  ; CHECK:   BDZ8 %bb.4, implicit-def $ctr8, implicit $ctr8
+  ; CHECK:   B %bb.3
+
+  ; CHECK: bb.3 (%ir-block.4):
+  ; CHECK:   successors: %bb.3(0x7c000000), %bb.4(0x04000000)
+  ; CHECK:   %38:g8rc_and_g8rc_nox0 = COPY killed %59
+  ; CHECK:   %37:gprc = COPY killed %57
+  ; CHECK:   %36:gprc = COPY killed %58
+  ; CHECK:   %35:gprc = COPY %57
+  ; CHECK:   %27:g8rc_and_g8rc_nox0 = COPY killed %38
+  ; CHECK:   %56:gprc = COPY %35
+  ; CHECK:   %28:gprc = DIVW %56, %9
+  ; CHECK:   %29:gprc = nsw MULLW killed %37, %9
+  ; CHECK:   %30:gprc = SUBF killed %29, killed %36
+  ; CHECK:   %31:crrc = CMPLWI %30, 10
+  ; CHECK:   %32:gprc = ISEL %15, %14, killed %31.sub_lt
+  ; CHECK:   %33:gprc = ADD4 killed %32, killed %30
+  ; CHECK:   %34:g8rc_and_g8rc_nox0 = STBU killed %33, -1, killed %27 :: (store unknown-size into %ir.7, align 1)
+  ; CHECK:   %57:gprc = COPY killed %28
+  ; CHECK:   %58:gprc = COPY killed %35
+  ; CHECK:   %59:g8rc_and_g8rc_nox0 = COPY killed %34
+  ; CHECK:   %60:gprc = COPY killed %56
+  ; CHECK:   BDNZ8 %bb.3, implicit-def $ctr8, implicit $ctr8
+  ; CHECK:   B %bb.4
+
+  ; CHECK: bb.4:
+  ; CHECK:   successors: %bb.5(0x80000000)
+  ; CHECK:   %44:g8rc_and_g8rc_nox0 = COPY killed %59
+  ; CHECK:   %43:gprc = COPY killed %57
+  ; CHECK:   %41:gprc = COPY killed %60
+  ; CHECK:   %39:g8rc = COPY killed %44
+  ; CHECK:   %61:gprc = COPY killed %41
+  ; CHECK:   %62:g8rc_and_g8rc_nox0 = COPY killed %39
+  ; CHECK:   %63:gprc = COPY killed %43
+
+  ; CHECK: bb.5:
+  ; CHECK:   successors: %bb.6(0x80000000)
+  ; CHECK:   %55:gprc = COPY killed %63
+  ; CHECK:   %54:g8rc_and_g8rc_nox0 = COPY killed %62
+  ; CHECK:   %53:gprc = COPY killed %61
+  ; CHECK:   %45:gprc = nsw MULLW killed %55, killed %9
+  ; CHECK:   %46:gprc = SUBF killed %45, killed %53
+  ; CHECK:   %47:crrc = CMPLWI %46, 10
+  ; CHECK:   %48:gprc = ISEL killed %15, killed %14, killed %47.sub_lt
+  ; CHECK:   %49:gprc = ADD4 killed %48, killed %46
+  ; CHECK:   dead %50:g8rc_and_g8rc_nox0 = STBU killed %49, -1, killed %54 :: (store unknown-size into %ir.7, align 1)
+  ; CHECK:   B %bb.6
+
+  ; CHECK: bb.6 (%ir-block.16):
+  ; CHECK:   BLR8 implicit $lr8, implicit $rm
+
+  bb.0 (%ir-block.3):
+    successors: %bb.1(0x80000000)
+    liveins: $x3, $x4
+
+    %6:g8rc = COPY killed $x4
+    %5:g8rc = COPY killed $x3
+    %9:gprc = COPY killed %6.sub_32
+    %8:gprc = COPY killed %5.sub_32
+    MTCTR8loop undef %10:g8rc, implicit-def dead $ctr8
+    %14:gprc_and_gprc_nor0 = LI 55
+    %15:gprc_and_gprc_nor0 = LI 48
+
+  bb.1 (%ir-block.4):
+    successors: %bb.2(0x40000000), %bb.5(0x40000000)
+
+    %19:gprc = DIVW %8, %9
+    BDZ8 %bb.5, implicit-def $ctr8, implicit $ctr8
+    B %bb.2
+
+  bb.2 (%ir-block.4):
+    successors: %bb.3(0x40000000), %bb.4(0x40000000)
+
+    %20:gprc = nsw MULLW %19, %9
+    %21:gprc = SUBF killed %20, killed %8
+    %22:crrc = CMPLWI %21, 10
+    %23:gprc = ISEL %15, %14, killed %22.sub_lt
+    %24:gprc = ADD4 killed %23, killed %21
+    %25:g8rc_and_g8rc_nox0 = STBU killed %24, -1, undef %0:g8rc_and_g8rc_nox0 :: (store 1 into %ir.7)
+    %26:gprc = DIVW %19, %9
+    BDZ8 %bb.4, implicit-def $ctr8, implicit $ctr8
+    B %bb.3
+
+  bb.3 (%ir-block.4):
+    successors: %bb.3(0x7c000000), %bb.4(0x04000000)
+
+    %35:gprc = PHI %26, %bb.2, %28, %bb.3
+    %36:gprc = PHI %19, %bb.2, %35, %bb.3
+    %37:gprc = PHI %26, %bb.2, %28, %bb.3
+    %38:g8rc_and_g8rc_nox0 = PHI %25, %bb.2, %34, %bb.3
+    %27:g8rc_and_g8rc_nox0 = COPY killed %38
+    %56:gprc = COPY %35
+    %28:gprc = DIVW %56, %9
+    %29:gprc = nsw MULLW killed %37, %9
+    %30:gprc = SUBF killed %29, killed %36
+    %31:crrc = CMPLWI %30, 10
+    %32:gprc = ISEL %15, %14, killed %31.sub_lt
+    %33:gprc = ADD4 killed %32, killed %30
+    %34:g8rc_and_g8rc_nox0 = STBU killed %33, -1, killed %27 :: (store unknown-size into %ir.7, align 1)
+    BDNZ8 %bb.3, implicit-def $ctr8, implicit $ctr8
+    B %bb.4
+
+  bb.4:
+    successors: %bb.5(0x80000000)
+
+    %41:gprc = PHI %19, %bb.2, %56, %bb.3
+    %43:gprc = PHI %26, %bb.2, %28, %bb.3
+    %44:g8rc_and_g8rc_nox0 = PHI %25, %bb.2, %34, %bb.3
+    %39:g8rc = COPY killed %44
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+
+    %53:gprc = PHI %8, %bb.1, %41, %bb.4
+    %54:g8rc_and_g8rc_nox0 = PHI undef %0:g8rc_and_g8rc_nox0, %bb.1, %39, %bb.4
+    %55:gprc = PHI %19, %bb.1, %43, %bb.4
+    %45:gprc = nsw MULLW killed %55, killed %9
+    %46:gprc = SUBF killed %45, killed %53
+    %47:crrc = CMPLWI %46, 10
+    %48:gprc = ISEL killed %15, killed %14, killed %47.sub_lt
+    %49:gprc = ADD4 killed %48, killed %46
+    dead %50:g8rc_and_g8rc_nox0 = STBU killed %49, -1, killed %54 :: (store unknown-size into %ir.7, align 1)
+    B %bb.6
+
+  bb.6 (%ir-block.16):
+    BLR8 implicit $lr8, implicit $rm
+
+...

From bfdf9ef009ab335981747f09a2c6b9a41c0462b4 Mon Sep 17 00:00:00 2001
From: Kang Zhang <shkzhang@cn.ibm.com>
Date: Sun, 31 May 2020 09:24:21 +0000
Subject: [PATCH 664/770] Revert "[NFC][PowerPC] Add a new case to test
 phi-node-elimination pass" This case will fail on some machines which
 enable expensive-checks.

This reverts commit af3abbf7bd2213003a133c361c212ac6efb1bd2b.
---
 llvm/test/CodeGen/PowerPC/phi-eliminate.mir | 295 --------------------
 1 file changed, 295 deletions(-)
 delete mode 100644 llvm/test/CodeGen/PowerPC/phi-eliminate.mir

diff --git a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
deleted file mode 100644
index 8b49b038e6c8a..0000000000000
--- a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
+++ /dev/null
@@ -1,295 +0,0 @@
-# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - \
-# RUN:   -run-pass=livevars,phi-node-elimination | FileCheck %s
-
---- |
-  define void @phi_eliminate(i32 %0, i32 %1, i8* %2) {
-    %scevgep3 = getelementptr i8, i8* %2, i64 undef
-    call void @llvm.set.loop.iterations.i64(i64 undef)
-    br label %4
-
-  4:                                                ; preds = %4, %3
-    %5 = phi i32 [ %8, %4 ], [ %0, %3 ]
-    %6 = phi i8* [ %scevgep3, %3 ], [ %7, %4 ]
-    %7 = getelementptr i8, i8* %6, i64 -1
-    %8 = sdiv i32 %5, %1
-    %9 = mul nsw i32 %8, %1
-    %10 = sub nsw i32 %5, %9
-    %11 = icmp ult i32 %10, 10
-    %12 = trunc i32 %10 to i8
-    %13 = select i1 %11, i8 48, i8 55
-    %14 = add i8 %13, %12
-    store i8 %14, i8* %7, align 1
-    %15 = call i1 @llvm.loop.decrement.i64(i64 1)
-    br i1 %15, label %4, label %16
-
-  16:                                               ; preds = %4
-    ret void
-  }
-
-  declare void @llvm.set.loop.iterations.i64(i64)
-
-  declare i1 @llvm.loop.decrement.i64(i64)
-
-  declare void @llvm.stackprotector(i8*, i8**)
-...
----
-name:            phi_eliminate
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-registers:
-  - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 1, class: gprc, preferred-register: '' }
-  - { id: 2, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 3, class: g8rc, preferred-register: '' }
-  - { id: 4, class: gprc, preferred-register: '' }
-  - { id: 5, class: g8rc, preferred-register: '' }
-  - { id: 6, class: g8rc, preferred-register: '' }
-  - { id: 7, class: g8rc, preferred-register: '' }
-  - { id: 8, class: gprc, preferred-register: '' }
-  - { id: 9, class: gprc, preferred-register: '' }
-  - { id: 10, class: g8rc, preferred-register: '' }
-  - { id: 11, class: gprc, preferred-register: '' }
-  - { id: 12, class: gprc, preferred-register: '' }
-  - { id: 13, class: crrc, preferred-register: '' }
-  - { id: 14, class: gprc_and_gprc_nor0, preferred-register: '' }
-  - { id: 15, class: gprc_and_gprc_nor0, preferred-register: '' }
-  - { id: 16, class: gprc, preferred-register: '' }
-  - { id: 17, class: gprc, preferred-register: '' }
-  - { id: 18, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 19, class: gprc, preferred-register: '' }
-  - { id: 20, class: gprc, preferred-register: '' }
-  - { id: 21, class: gprc, preferred-register: '' }
-  - { id: 22, class: crrc, preferred-register: '' }
-  - { id: 23, class: gprc, preferred-register: '' }
-  - { id: 24, class: gprc, preferred-register: '' }
-  - { id: 25, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 26, class: gprc, preferred-register: '' }
-  - { id: 27, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 28, class: gprc, preferred-register: '' }
-  - { id: 29, class: gprc, preferred-register: '' }
-  - { id: 30, class: gprc, preferred-register: '' }
-  - { id: 31, class: crrc, preferred-register: '' }
-  - { id: 32, class: gprc, preferred-register: '' }
-  - { id: 33, class: gprc, preferred-register: '' }
-  - { id: 34, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 35, class: gprc, preferred-register: '' }
-  - { id: 36, class: gprc, preferred-register: '' }
-  - { id: 37, class: gprc, preferred-register: '' }
-  - { id: 38, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 39, class: g8rc, preferred-register: '' }
-  - { id: 40, class: gprc, preferred-register: '' }
-  - { id: 41, class: gprc, preferred-register: '' }
-  - { id: 42, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 43, class: gprc, preferred-register: '' }
-  - { id: 44, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 45, class: gprc, preferred-register: '' }
-  - { id: 46, class: gprc, preferred-register: '' }
-  - { id: 47, class: crrc, preferred-register: '' }
-  - { id: 48, class: gprc, preferred-register: '' }
-  - { id: 49, class: gprc, preferred-register: '' }
-  - { id: 50, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 51, class: g8rc, preferred-register: '' }
-  - { id: 52, class: gprc, preferred-register: '' }
-  - { id: 53, class: gprc, preferred-register: '' }
-  - { id: 54, class: g8rc_and_g8rc_nox0, preferred-register: '' }
-  - { id: 55, class: gprc, preferred-register: '' }
-  - { id: 56, class: gprc, preferred-register: '' }
-liveins:
-  - { reg: '$x3', virtual-reg: '%5' }
-  - { reg: '$x4', virtual-reg: '%6' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    1
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      []
-stack:           []
-callSites:       []
-constants:       []
-machineFunctionInfo: {}
-body:             |
-  ; CHECK-LABEL: name: phi_eliminate
-  ; CHECK: bb.0 (%ir-block.3):
-  ; CHECK:   successors: %bb.1(0x80000000)
-  ; CHECK:   liveins: $x3, $x4
-  ; CHECK:   %6:g8rc = COPY killed $x4
-  ; CHECK:   %5:g8rc = COPY killed $x3
-  ; CHECK:   %9:gprc = COPY killed %6.sub_32
-  ; CHECK:   %8:gprc = COPY killed %5.sub_32
-  ; CHECK:   MTCTR8loop undef %10:g8rc, implicit-def dead $ctr8
-  ; CHECK:   %14:gprc_and_gprc_nor0 = LI 55
-  ; CHECK:   %15:gprc_and_gprc_nor0 = LI 48
-
-  ; CHECK: bb.1 (%ir-block.4):
-  ; CHECK:   successors: %bb.2(0x40000000), %bb.7(0x40000000)
-  ; CHECK:   %19:gprc = DIVW %8, %9
-  ; CHECK:   BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8
-
-  ; CHECK: bb.7:
-  ; CHECK:   successors: %bb.5(0x80000000)
-  ; CHECK:   %61:gprc = COPY killed %8
-  ; CHECK:   %62:g8rc_and_g8rc_nox0 = IMPLICIT_DEF
-  ; CHECK:   %63:gprc = COPY killed %19
-  ; CHECK:   B %bb.5
-
-  ; CHECK: bb.2 (%ir-block.4):
-  ; CHECK:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
-  ; CHECK:   %20:gprc = nsw MULLW %19, %9
-  ; CHECK:   %21:gprc = SUBF killed %20, killed %8
-  ; CHECK:   %22:crrc = CMPLWI %21, 10
-  ; CHECK:   %23:gprc = ISEL %15, %14, killed %22.sub_lt
-  ; CHECK:   %24:gprc = ADD4 killed %23, killed %21
-  ; CHECK:   %25:g8rc_and_g8rc_nox0 = STBU killed %24, -1, undef %0:g8rc_and_g8rc_nox0 :: (store 1 into %ir.7)
-  ; CHECK:   %26:gprc = DIVW %19, %9
-  ; CHECK:   %57:gprc = COPY killed %26
-  ; CHECK:   %58:gprc = COPY %19
-  ; CHECK:   %59:g8rc_and_g8rc_nox0 = COPY killed %25
-  ; CHECK:   %60:gprc = COPY killed %19
-  ; CHECK:   BDZ8 %bb.4, implicit-def $ctr8, implicit $ctr8
-  ; CHECK:   B %bb.3
-
-  ; CHECK: bb.3 (%ir-block.4):
-  ; CHECK:   successors: %bb.3(0x7c000000), %bb.4(0x04000000)
-  ; CHECK:   %38:g8rc_and_g8rc_nox0 = COPY killed %59
-  ; CHECK:   %37:gprc = COPY killed %57
-  ; CHECK:   %36:gprc = COPY killed %58
-  ; CHECK:   %35:gprc = COPY %57
-  ; CHECK:   %27:g8rc_and_g8rc_nox0 = COPY killed %38
-  ; CHECK:   %56:gprc = COPY %35
-  ; CHECK:   %28:gprc = DIVW %56, %9
-  ; CHECK:   %29:gprc = nsw MULLW killed %37, %9
-  ; CHECK:   %30:gprc = SUBF killed %29, killed %36
-  ; CHECK:   %31:crrc = CMPLWI %30, 10
-  ; CHECK:   %32:gprc = ISEL %15, %14, killed %31.sub_lt
-  ; CHECK:   %33:gprc = ADD4 killed %32, killed %30
-  ; CHECK:   %34:g8rc_and_g8rc_nox0 = STBU killed %33, -1, killed %27 :: (store unknown-size into %ir.7, align 1)
-  ; CHECK:   %57:gprc = COPY killed %28
-  ; CHECK:   %58:gprc = COPY killed %35
-  ; CHECK:   %59:g8rc_and_g8rc_nox0 = COPY killed %34
-  ; CHECK:   %60:gprc = COPY killed %56
-  ; CHECK:   BDNZ8 %bb.3, implicit-def $ctr8, implicit $ctr8
-  ; CHECK:   B %bb.4
-
-  ; CHECK: bb.4:
-  ; CHECK:   successors: %bb.5(0x80000000)
-  ; CHECK:   %44:g8rc_and_g8rc_nox0 = COPY killed %59
-  ; CHECK:   %43:gprc = COPY killed %57
-  ; CHECK:   %41:gprc = COPY killed %60
-  ; CHECK:   %39:g8rc = COPY killed %44
-  ; CHECK:   %61:gprc = COPY killed %41
-  ; CHECK:   %62:g8rc_and_g8rc_nox0 = COPY killed %39
-  ; CHECK:   %63:gprc = COPY killed %43
-
-  ; CHECK: bb.5:
-  ; CHECK:   successors: %bb.6(0x80000000)
-  ; CHECK:   %55:gprc = COPY killed %63
-  ; CHECK:   %54:g8rc_and_g8rc_nox0 = COPY killed %62
-  ; CHECK:   %53:gprc = COPY killed %61
-  ; CHECK:   %45:gprc = nsw MULLW killed %55, killed %9
-  ; CHECK:   %46:gprc = SUBF killed %45, killed %53
-  ; CHECK:   %47:crrc = CMPLWI %46, 10
-  ; CHECK:   %48:gprc = ISEL killed %15, killed %14, killed %47.sub_lt
-  ; CHECK:   %49:gprc = ADD4 killed %48, killed %46
-  ; CHECK:   dead %50:g8rc_and_g8rc_nox0 = STBU killed %49, -1, killed %54 :: (store unknown-size into %ir.7, align 1)
-  ; CHECK:   B %bb.6
-
-  ; CHECK: bb.6 (%ir-block.16):
-  ; CHECK:   BLR8 implicit $lr8, implicit $rm
-
-  bb.0 (%ir-block.3):
-    successors: %bb.1(0x80000000)
-    liveins: $x3, $x4
-
-    %6:g8rc = COPY killed $x4
-    %5:g8rc = COPY killed $x3
-    %9:gprc = COPY killed %6.sub_32
-    %8:gprc = COPY killed %5.sub_32
-    MTCTR8loop undef %10:g8rc, implicit-def dead $ctr8
-    %14:gprc_and_gprc_nor0 = LI 55
-    %15:gprc_and_gprc_nor0 = LI 48
-
-  bb.1 (%ir-block.4):
-    successors: %bb.2(0x40000000), %bb.5(0x40000000)
-
-    %19:gprc = DIVW %8, %9
-    BDZ8 %bb.5, implicit-def $ctr8, implicit $ctr8
-    B %bb.2
-
-  bb.2 (%ir-block.4):
-    successors: %bb.3(0x40000000), %bb.4(0x40000000)
-
-    %20:gprc = nsw MULLW %19, %9
-    %21:gprc = SUBF killed %20, killed %8
-    %22:crrc = CMPLWI %21, 10
-    %23:gprc = ISEL %15, %14, killed %22.sub_lt
-    %24:gprc = ADD4 killed %23, killed %21
-    %25:g8rc_and_g8rc_nox0 = STBU killed %24, -1, undef %0:g8rc_and_g8rc_nox0 :: (store 1 into %ir.7)
-    %26:gprc = DIVW %19, %9
-    BDZ8 %bb.4, implicit-def $ctr8, implicit $ctr8
-    B %bb.3
-
-  bb.3 (%ir-block.4):
-    successors: %bb.3(0x7c000000), %bb.4(0x04000000)
-
-    %35:gprc = PHI %26, %bb.2, %28, %bb.3
-    %36:gprc = PHI %19, %bb.2, %35, %bb.3
-    %37:gprc = PHI %26, %bb.2, %28, %bb.3
-    %38:g8rc_and_g8rc_nox0 = PHI %25, %bb.2, %34, %bb.3
-    %27:g8rc_and_g8rc_nox0 = COPY killed %38
-    %56:gprc = COPY %35
-    %28:gprc = DIVW %56, %9
-    %29:gprc = nsw MULLW killed %37, %9
-    %30:gprc = SUBF killed %29, killed %36
-    %31:crrc = CMPLWI %30, 10
-    %32:gprc = ISEL %15, %14, killed %31.sub_lt
-    %33:gprc = ADD4 killed %32, killed %30
-    %34:g8rc_and_g8rc_nox0 = STBU killed %33, -1, killed %27 :: (store unknown-size into %ir.7, align 1)
-    BDNZ8 %bb.3, implicit-def $ctr8, implicit $ctr8
-    B %bb.4
-
-  bb.4:
-    successors: %bb.5(0x80000000)
-
-    %41:gprc = PHI %19, %bb.2, %56, %bb.3
-    %43:gprc = PHI %26, %bb.2, %28, %bb.3
-    %44:g8rc_and_g8rc_nox0 = PHI %25, %bb.2, %34, %bb.3
-    %39:g8rc = COPY killed %44
-
-  bb.5:
-    successors: %bb.6(0x80000000)
-
-    %53:gprc = PHI %8, %bb.1, %41, %bb.4
-    %54:g8rc_and_g8rc_nox0 = PHI undef %0:g8rc_and_g8rc_nox0, %bb.1, %39, %bb.4
-    %55:gprc = PHI %19, %bb.1, %43, %bb.4
-    %45:gprc = nsw MULLW killed %55, killed %9
-    %46:gprc = SUBF killed %45, killed %53
-    %47:crrc = CMPLWI %46, 10
-    %48:gprc = ISEL killed %15, killed %14, killed %47.sub_lt
-    %49:gprc = ADD4 killed %48, killed %46
-    dead %50:g8rc_and_g8rc_nox0 = STBU killed %49, -1, killed %54 :: (store unknown-size into %ir.7, align 1)
-    B %bb.6
-
-  bb.6 (%ir-block.16):
-    BLR8 implicit $lr8, implicit $rm
-
-...

From ec25a71eb7fc72440149784951d62453301cc960 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 31 May 2020 11:04:35 +0100
Subject: [PATCH 665/770] [ScheduleDAG] Avoid unnecessary recomputation of
 topological order.

In some cases ScheduleDAGRRList has to add new nodes to resolve problems
with interfering physical registers. When new nodes are added, it
completely re-computes the topological order, which can take a long
time, but is unnecessary. We only add nodes one by one, and initially
they do not have any predecessors. So we can just insert them at the end
of the vector. Later we add predecessors, but the helper function
properly updates the topological order much more efficiently. With this
change, the compile time for the program below drops from 300s to 30s on
my machine.

    define i11129 @test1() {
      %L1 = load i11129, i11129* undef
      %B30 = ashr i11129 %L1, %L1
      store i11129 %B30, i11129* undef
      ret i11129 %L1
    }

This should be generally beneficial, as we can skip a large amount of
work. Theoretically there are some scenarios where we might not save
much, e.g. when we add a dependency between the first and last node.
Then we would have to shift all nodes. But we still do not have to spend
the time re-computing the initial order.

Reviewers: MatzeB, atrick, efriedma, niravd, paquette

Reviewed By: paquette

Differential Revision: https://reviews.llvm.org/D59722
---
 llvm/include/llvm/CodeGen/ScheduleDAG.h             | 4 ++++
 llvm/lib/CodeGen/ScheduleDAG.cpp                    | 8 ++++++++
 llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 4 ++--
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h
index e004f3bf2cc1c..4c8d047727ceb 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAG.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h
@@ -724,6 +724,10 @@ class TargetRegisterInfo;
   public:
     ScheduleDAGTopologicalSort(std::vector<SUnit> &SUnits, SUnit *ExitSU);
 
+    /// Add a SUnit without predecessors to the end of the topological order. It
+    /// also must be the first new node added to the DAG.
+    void AddSUnitWithoutPredecessors(const SUnit *SU);
+
     /// Creates the initial topological ordering from the DAG to be scheduled.
     void InitDAGTopologicalSorting();
 
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index dc3a11670a166..60f8eec1b9bc7 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -713,6 +713,14 @@ bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
   return false;
 }
 
+void ScheduleDAGTopologicalSort::AddSUnitWithoutPredecessors(const SUnit *SU) {
+  assert(SU->NodeNum == Index2Node.size() && "Node cannot be added at the end");
+  assert(SU->NumPreds == 0 && "Can only add SU's with no predecessors");
+  Node2Index.push_back(Index2Node.size());
+  Index2Node.push_back(SU->NodeNum);
+  Visited.resize(Node2Index.size());
+}
+
 bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
                                              const SUnit *TargetSU) {
   FixOrder();
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index ff806bdb822c2..72e68a5045c69 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -279,7 +279,7 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes {
     SUnit *NewNode = newSUnit(N);
     // Update the topological ordering.
     if (NewNode->NodeNum >= NumSUnits)
-      Topo.MarkDirty();
+      Topo.AddSUnitWithoutPredecessors(NewNode);
     return NewNode;
   }
 
@@ -289,7 +289,7 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes {
     SUnit *NewNode = Clone(N);
     // Update the topological ordering.
     if (NewNode->NodeNum >= NumSUnits)
-      Topo.MarkDirty();
+      Topo.AddSUnitWithoutPredecessors(NewNode);
     return NewNode;
   }
 

From 45ebe38ffc40bb7221fc587bfb4481cf7f53ebbc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 31 May 2020 11:43:27 +0100
Subject: [PATCH 666/770] [X86][AVX] Pad small shuffle inputs in
 combineX86ShufflesRecursively

As detailed on PR45974 and D79987, getFauxShuffleMask is creating nodes on the fly to create shuffles with inputs the same size as the result, causing problems for hasOneUse() checks in later simplification stages.

Currently only combineX86ShufflesRecursively benefits from these widened inputs so I've begun moving the functionality there, and out of getFauxShuffleMask. This allows us to remove the widening from VBROADCAST and *EXTEND* faux shuffle cases.

This just leaves the INSERT_SUBVECTOR case in getFauxShuffleMask still creating nodes, which will require more extensive refactoring.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 44 ++++++++-----------------
 1 file changed, 13 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5eba9f3875a9e..bcaf6298de332 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7662,20 +7662,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
   }
   case X86ISD::VBROADCAST: {
     SDValue Src = N.getOperand(0);
-    MVT SrcVT = Src.getSimpleValueType();
-    if (!SrcVT.isVector())
+    if (!Src.getSimpleValueType().isVector())
       return false;
-
-    if (NumSizeInBits != SrcVT.getSizeInBits()) {
-      assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
-             "Illegal broadcast type");
-      SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
-                               NumSizeInBits / SrcVT.getScalarSizeInBits());
-      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
-                        DAG.getUNDEF(SrcVT), Src,
-                        DAG.getIntPtrConstant(0, SDLoc(N)));
-    }
-
     Ops.push_back(Src);
     Mask.append(NumElts, 0);
     return true;
@@ -7692,22 +7680,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
         (SrcVT.getScalarSizeInBits() % 8) != 0)
       return false;
 
-    unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
     bool IsAnyExtend =
         (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
-    DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
-                         Mask);
-
-    if (NumSizeInBits != SrcVT.getSizeInBits()) {
-      assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
-             "Illegal zero-extension type");
-      SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
-                               NumSizeInBits / NumSrcBitsPerElt);
-      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
-                        DAG.getUNDEF(SrcVT), Src,
-                        DAG.getIntPtrConstant(0, SDLoc(N)));
-    }
-
+    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+                         IsAnyExtend, Mask);
     Ops.push_back(Src);
     return true;
   }
@@ -35054,7 +35030,8 @@ static SDValue combineX86ShufflesRecursively(
 
   assert(Root.getSimpleValueType().isVector() &&
          "Shuffles operate on vector types!");
-  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+  unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
+  assert(VT.getSizeInBits() == RootSizeInBits &&
          "Can only combine shuffles of the same vector register size.");
 
   // Extract target shuffle mask and resolve sentinels and inputs.
@@ -35068,12 +35045,17 @@ static SDValue combineX86ShufflesRecursively(
                               OpZero, DAG, Depth, false))
     return SDValue();
 
-  // Shuffle inputs must be the same size as the result.
-  if (llvm::any_of(OpInputs, [VT](SDValue Op) {
-        return VT.getSizeInBits() != Op.getValueSizeInBits();
+  // Shuffle inputs must be the same size as the result, bail on any larger
+  // inputs and widen any smaller inputs.
+  if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
+        return Op.getValueSizeInBits() > RootSizeInBits;
       }))
     return SDValue();
 
+  for (SDValue &Op : OpInputs)
+    if (Op.getValueSizeInBits() < RootSizeInBits)
+      Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op), RootSizeInBits);
+
   SmallVector<int, 64> Mask;
   SmallVector<SDValue, 16> Ops;
 

From d33ba1aa0b505e3f4c55b382f171e8cbef6a1843 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 31 May 2020 13:19:18 +0100
Subject: [PATCH 667/770] [X86][AVX] getFauxShuffleMask - don't widen shuffle
 inputs from INSERT_SUBVECTOR(X,SHUFFLE(Y,Z))

Don't create nodes on the fly when decoding INSERT_SUBVECTOR as faux shuffles.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bcaf6298de332..95c9312cd772c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7438,9 +7438,9 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                                 SubMask, DAG, Depth + 1, ResolveKnownElts))
       return false;
 
-    // Shuffle inputs must be the same size as the subvector.
+    // Subvector shuffle inputs must not be larger than the subvector.
     if (llvm::any_of(SubInputs, [SubVT](SDValue Op) {
-          return SubVT.getSizeInBits() != Op.getValueSizeInBits();
+          return SubVT.getSizeInBits() > Op.getValueSizeInBits();
         }))
       return false;
 
@@ -7460,14 +7460,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
       }
     }
     Ops.push_back(Src);
-    for (SDValue &SubInput : SubInputs) {
-      EVT SubSVT = SubInput.getValueType().getScalarType();
-      EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
-                                   NumSizeInBits / SubSVT.getSizeInBits());
-      Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
-                                DAG.getUNDEF(AltVT), SubInput,
-                                DAG.getIntPtrConstant(0, SDLoc(N))));
-    }
+    Ops.append(SubInputs.begin(), SubInputs.end());
     for (int i = 0; i != (int)NumElts; ++i)
       Mask.push_back(i);
     for (int i = 0; i != (int)NumSubElts; ++i) {

From 129c501aa9199c2c5a69c7a6de8ec9873e3d41a4 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 31 May 2020 08:41:09 -0400
Subject: [PATCH 668/770] [PhaseOrdering] add scalarization test for PR42174;
 NFC

Motivating test for vector-combine enhancement in D80885.
Make sure that vectorization and canonicalization are
working together as expected.
---
 .../PhaseOrdering/X86/scalarization.ll        | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
new file mode 100644
index 0000000000000..3b341f6a5b7a5
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3                   -S < %s  | FileCheck %s
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+
+target triple = "x86_64--"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR42174 - https://bugs.llvm.org/show_bug.cgi?id=42174
+; This test should match the IR produced by clang after running -mem2reg.
+; All math before the final 'add' should be scalarized.
+
+define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, i32 %p, i32 %j, i32 %u) {
+; CHECK-LABEL: @square(
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[K:%.*]], 2
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[DIV]], i32 0
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[P:%.*]], 6234
+; CHECK-NEXT:    [[SPLATINSERT2:%.*]] = insertelement <4 x i32> undef, i32 [[MUL]], i32 0
+; CHECK-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[H:%.*]], 75
+; CHECK-NEXT:    [[SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[MUL5]], i32 0
+; CHECK-NEXT:    [[DIV9:%.*]] = sdiv i32 [[J:%.*]], 3452
+; CHECK-NEXT:    [[SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[DIV9]], i32 0
+; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
+; CHECK-NEXT:    [[SPLATINSERT14:%.*]] = insertelement <4 x i32> undef, i32 [[MUL13]], i32 0
+; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
+; CHECK-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[DIV17]], i32 0
+; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
+; CHECK-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[MUL21]], i32 0
+; CHECK-NEXT:    [[SPLATINSERT25:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SPLATINSERT25]], <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[SPLATINSERT18]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[SPLATINSERT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[SPLATINSERT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[SPLATINSERT14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[SPLATINSERT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[SPLATINSERT10]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[SPLATINSERT22]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 317425, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP10]], [[NUM:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[ADD29]]
+;
+  %add = add <4 x i32> %num, <i32 1, i32 1, i32 1, i32 1>
+  %div = sdiv i32 %k, 2
+  %splatinsert = insertelement <4 x i32> undef, i32 %div, i32 0
+  %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add1 = add <4 x i32> %add, %splat
+  %mul = mul nsw i32 %p, 6234
+  %splatinsert2 = insertelement <4 x i32> undef, i32 %mul, i32 0
+  %splat3 = shufflevector <4 x i32> %splatinsert2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add4 = add <4 x i32> %add1, %splat3
+  %mul5 = mul nsw i32 75, %h
+  %splatinsert6 = insertelement <4 x i32> undef, i32 %mul5, i32 0
+  %splat7 = shufflevector <4 x i32> %splatinsert6, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add8 = add <4 x i32> %add4, %splat7
+  %div9 = sdiv i32 %j, 3452
+  %splatinsert10 = insertelement <4 x i32> undef, i32 %div9, i32 0
+  %splat11 = shufflevector <4 x i32> %splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add12 = add <4 x i32> %add8, %splat11
+  %mul13 = mul nsw i32 53, %w
+  %splatinsert14 = insertelement <4 x i32> undef, i32 %mul13, i32 0
+  %splat15 = shufflevector <4 x i32> %splatinsert14, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add16 = add <4 x i32> %add12, %splat15
+  %div17 = sdiv i32 %x, 820
+  %splatinsert18 = insertelement <4 x i32> undef, i32 %div17, i32 0
+  %splat19 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add20 = add <4 x i32> %add16, %splat19
+  %mul21 = mul nsw i32 4, %u
+  %splatinsert22 = insertelement <4 x i32> undef, i32 %mul21, i32 0
+  %splat23 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add24 = add <4 x i32> %add20, %splat23
+  %splatinsert25 = insertelement <4 x i32> undef, i32 %y, i32 0
+  %splat26 = shufflevector <4 x i32> %splatinsert25, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add27 = add <4 x i32> %add24, %splat26
+  %add28 = add <4 x i32> %add27, <i32 25, i32 25, i32 25, i32 25>
+  %add29 = add <4 x i32> %add28, <i32 317400, i32 317400, i32 317400, i32 317400>
+  ret <4 x i32> %add29
+}
+

From 15b281d7805dde85af532b954e27e3fc8bf2611d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 31 May 2020 13:46:46 +0100
Subject: [PATCH 669/770] [X86][AVX] Add test case described in D79987

---
 llvm/test/CodeGen/X86/oddshuffles.ll | 69 ++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 03f6b52665169..e182008eadc9e 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1977,6 +1977,75 @@ define void @splat3_256(<32 x i8> %a0, <96 x i8> *%a1) {
   ret void
 }
 
+; D79987
+define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
+; SSE2-LABEL: splat_v3i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,0,1]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    xorps %xmm3, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: splat_v3i32:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    xorps %xmm3, %xmm3
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: splat_v3i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
+; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: splat_v3i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-SLOW-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
+; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: splat_v3i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-FAST-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
+; AVX2-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT:    retq
+;
+; XOP-LABEL: splat_v3i32:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOP-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
+; XOP-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; XOP-NEXT:    retq
+  %1 = load <3 x i32>, <3 x i32>* %ptr, align 1
+  %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %3 = shufflevector <16 x i32> <i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> %2, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32 > %3
+}
+
 define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
 ; SSE2-LABEL: wrongorder:
 ; SSE2:       # %bb.0:

From f046326847076b50017b3d32db62c3511c478888 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 31 May 2020 13:50:40 +0100
Subject: [PATCH 670/770] [X86] getFauxShuffleMask/getTargetShuffleInputs -
 make SelectionDAG const (PR45974).

Try to prevent future node creation issues (as detailed in PR45974) by making the SelectionDAG reference const, so it can still be used for analysis, but not node creation.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 95c9312cd772c..89559ad9acbda 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7271,7 +7271,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
 // TODO: Use DemandedElts variant.
 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
                                    SmallVectorImpl<int> &Mask,
-                                   SelectionDAG &DAG, unsigned Depth,
+                                   const SelectionDAG &DAG, unsigned Depth,
                                    bool ResolveKnownElts);
 
 // Attempt to decode ops that could be represented as a shuffle mask.
@@ -7280,7 +7280,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                                SmallVectorImpl<int> &Mask,
                                SmallVectorImpl<SDValue> &Ops,
-                               SelectionDAG &DAG, unsigned Depth,
+                               const SelectionDAG &DAG, unsigned Depth,
                                bool ResolveKnownElts) {
   Mask.clear();
   Ops.clear();
@@ -7734,7 +7734,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                    SmallVectorImpl<SDValue> &Inputs,
                                    SmallVectorImpl<int> &Mask,
                                    APInt &KnownUndef, APInt &KnownZero,
-                                   SelectionDAG &DAG, unsigned Depth,
+                                   const SelectionDAG &DAG, unsigned Depth,
                                    bool ResolveKnownElts) {
   EVT VT = Op.getValueType();
   if (!VT.isSimple() || !VT.isVector())
@@ -7755,7 +7755,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
 
 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
                                    SmallVectorImpl<int> &Mask,
-                                   SelectionDAG &DAG, unsigned Depth = 0,
+                                   const SelectionDAG &DAG, unsigned Depth = 0,
                                    bool ResolveKnownElts = true) {
   EVT VT = Op.getValueType();
   if (!VT.isSimple() || !VT.isVector())

From e31f2a894a7bec0a64553d615ef40fa36134844e Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 31 May 2020 09:05:48 -0400
Subject: [PATCH 671/770] [VectorCombine] add tests for scalarizing
 binop-with-constant; NFC

Goes with proposal in D80885.

This is adapted from the InstCombine tests that were added for
D50992.

But these should be adjusted further to provide more interesting
scenarios for x86-specific codegen. E.g., vector types/sizes will
have different costs depending on ISA attributes.

We also need to add tests that include a load of the scalar
variable and add tests that include extra uses of the insert
to further exercise the cost model.
---
 .../X86/insert-binop-with-constant.ll         | 643 ++++++++++++++++++
 1 file changed, 643 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll

diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
new file mode 100644
index 0000000000000..7b8dc44ebc243
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
@@ -0,0 +1,643 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
+
+define <2 x i64> @add_constant(i64 %x) {
+; CHECK-LABEL: @add_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 undef>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = add <2 x i64> %ins, <i64 42, i64 undef>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @add_constant_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @add_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 -42>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = add <2 x i64> %ins, <i64 42, i64 -42>
+  ret <2 x i64> %bo
+}
+
+; IR flags are not required, but they should propagate.
+
+define <4 x i32> @sub_constant_op0(i32 %x) {
+; CHECK-LABEL: @sub_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = sub nuw nsw <4 x i32> <i32 undef, i32 -42, i32 undef, i32 undef>, [[INS]]
+; CHECK-NEXT:    ret <4 x i32> [[BO]]
+;
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 1
+  %bo = sub nsw nuw <4 x i32> <i32 undef, i32 -42, i32 undef, i32 undef>, %ins
+  ret <4 x i32> %bo
+}
+
+define <4 x i32> @sub_constant_op0_not_undef_lane(i32 %x) {
+; CHECK-LABEL: @sub_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = sub nuw <4 x i32> <i32 1, i32 42, i32 42, i32 -42>, [[INS]]
+; CHECK-NEXT:    ret <4 x i32> [[BO]]
+;
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 1
+  %bo = sub nuw <4 x i32> <i32 1, i32 42, i32 42, i32 -42>, %ins
+  ret <4 x i32> %bo
+}
+
+define <8 x i16> @sub_constant_op1(i16 %x) {
+; CHECK-LABEL: @sub_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <8 x i16> undef, i16 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = sub nuw <8 x i16> [[INS]], <i16 42, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
+; CHECK-NEXT:    ret <8 x i16> [[BO]]
+;
+  %ins = insertelement <8 x i16> undef, i16 %x, i32 0
+  %bo = sub nuw <8 x i16> %ins, <i16 42, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
+  ret <8 x i16> %bo
+}
+
+define <8 x i16> @sub_constant_op1_not_undef_lane(i16 %x) {
+; CHECK-LABEL: @sub_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <8 x i16> undef, i16 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = sub nuw <8 x i16> [[INS]], <i16 42, i16 -42, i16 0, i16 1, i16 -2, i16 3, i16 -4, i16 5>
+; CHECK-NEXT:    ret <8 x i16> [[BO]]
+;
+  %ins = insertelement <8 x i16> undef, i16 %x, i32 0
+  %bo = sub nuw <8 x i16> %ins, <i16 42, i16 -42, i16 0, i16 1, i16 -2, i16 3, i16 -4, i16 5>
+  ret <8 x i16> %bo
+}
+
+define <16 x i8> @mul_constant(i8 %x) {
+; CHECK-LABEL: @mul_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 2
+; CHECK-NEXT:    [[BO:%.*]] = mul <16 x i8> [[INS]], <i8 undef, i8 undef, i8 -42, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
+; CHECK-NEXT:    ret <16 x i8> [[BO]]
+;
+  %ins = insertelement <16 x i8> undef, i8 %x, i32 2
+  %bo = mul <16 x i8> %ins, <i8 undef, i8 undef, i8 -42, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
+  ret <16 x i8> %bo
+}
+
+define <3 x i64> @mul_constant_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @mul_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <3 x i64> undef, i64 [[X:%.*]], i32 2
+; CHECK-NEXT:    [[BO:%.*]] = mul <3 x i64> [[INS]], <i64 42, i64 undef, i64 -42>
+; CHECK-NEXT:    ret <3 x i64> [[BO]]
+;
+  %ins = insertelement <3 x i64> undef, i64 %x, i32 2
+  %bo = mul <3 x i64> %ins, <i64 42, i64 undef, i64 -42>
+  ret <3 x i64> %bo
+}
+
+define <2 x i64> @shl_constant_op0(i64 %x) {
+; CHECK-LABEL: @shl_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i64> <i64 undef, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = shl <2 x i64> <i64 undef, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @shl_constant_op0_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @shl_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i64> <i64 5, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = shl <2 x i64> <i64 5, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @shl_constant_op1(i64 %x) {
+; CHECK-LABEL: @shl_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i64> [[INS]], <i64 5, i64 undef>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = shl nuw <2 x i64> %ins, <i64 5, i64 undef>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @shl_constant_op1_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @shl_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = shl nuw <2 x i64> %ins, <i64 5, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @ashr_constant_op0(i64 %x) {
+; CHECK-LABEL: @ashr_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = ashr exact <2 x i64> <i64 undef, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = ashr exact <2 x i64> <i64 undef, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @ashr_constant_op0_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @ashr_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = ashr exact <2 x i64> <i64 5, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = ashr exact <2 x i64> <i64 5, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @ashr_constant_op1(i64 %x) {
+; CHECK-LABEL: @ashr_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = ashr <2 x i64> [[INS]], <i64 5, i64 undef>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = ashr <2 x i64> %ins, <i64 5, i64 undef>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @ashr_constant_op1_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @ashr_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = ashr <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = ashr <2 x i64> %ins, <i64 5, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @lshr_constant_op0(i64 %x) {
+; CHECK-LABEL: @lshr_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = lshr <2 x i64> <i64 5, i64 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = lshr <2 x i64> <i64 5, i64 undef>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @lshr_constant_op0_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @lshr_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = lshr <2 x i64> <i64 5, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = lshr <2 x i64> <i64 5, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @lshr_constant_op1(i64 %x) {
+; CHECK-LABEL: @lshr_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = lshr exact <2 x i64> [[INS]], <i64 undef, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = lshr exact <2 x i64> %ins, <i64 undef, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @lshr_constant_op1_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @lshr_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = lshr exact <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = lshr exact <2 x i64> %ins, <i64 5, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @urem_constant_op0(i64 %x) {
+; CHECK-LABEL: @urem_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = urem <2 x i64> <i64 5, i64 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = urem <2 x i64> <i64 5, i64 undef>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @urem_constant_op0_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @urem_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = urem <2 x i64> <i64 5, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = urem <2 x i64> <i64 5, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @urem_constant_op1(i64 %x) {
+; CHECK-LABEL: @urem_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = urem <2 x i64> [[INS]], <i64 undef, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = urem <2 x i64> %ins, <i64 undef, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @urem_constant_op1_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @urem_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = urem <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = urem <2 x i64> %ins, <i64 5, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @srem_constant_op0(i64 %x) {
+; CHECK-LABEL: @srem_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = srem <2 x i64> <i64 5, i64 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = srem <2 x i64> <i64 5, i64 undef>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @srem_constant_op0_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @srem_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = srem <2 x i64> <i64 5, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = srem <2 x i64> <i64 5, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @srem_constant_op1(i64 %x) {
+; CHECK-LABEL: @srem_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = srem <2 x i64> [[INS]], <i64 undef, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = srem <2 x i64> %ins, <i64 undef, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @srem_constant_op1_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @srem_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = srem <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = srem <2 x i64> %ins, <i64 5, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @udiv_constant_op0(i64 %x) {
+; CHECK-LABEL: @udiv_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = udiv exact <2 x i64> <i64 5, i64 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = udiv exact <2 x i64> <i64 5, i64 undef>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @udiv_constant_op0_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @udiv_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = udiv exact <2 x i64> <i64 5, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = udiv exact <2 x i64> <i64 5, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @udiv_constant_op1(i64 %x) {
+; CHECK-LABEL: @udiv_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i64> [[INS]], <i64 undef, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = udiv <2 x i64> %ins, <i64 undef, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @udiv_constant_op1_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @udiv_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = udiv <2 x i64> %ins, <i64 5, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @sdiv_constant_op0(i64 %x) {
+; CHECK-LABEL: @sdiv_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = sdiv <2 x i64> <i64 5, i64 undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = sdiv <2 x i64> <i64 5, i64 undef>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @sdiv_constant_op0_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @sdiv_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = sdiv <2 x i64> <i64 5, i64 2>, [[INS]]
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = sdiv <2 x i64> <i64 5, i64 2>, %ins
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @sdiv_constant_op1(i64 %x) {
+; CHECK-LABEL: @sdiv_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = sdiv exact <2 x i64> [[INS]], <i64 undef, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = sdiv exact <2 x i64> %ins, <i64 undef, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @sdiv_constant_op1_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @sdiv_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = sdiv exact <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = sdiv exact <2 x i64> %ins, <i64 5, i64 2>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @and_constant(i64 %x) {
+; CHECK-LABEL: @and_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = and <2 x i64> [[INS]], <i64 42, i64 undef>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = and <2 x i64> %ins, <i64 42, i64 undef>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @and_constant_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @and_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = and <2 x i64> [[INS]], <i64 42, i64 -42>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = and <2 x i64> %ins, <i64 42, i64 -42>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @or_constant(i64 %x) {
+; CHECK-LABEL: @or_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = or <2 x i64> [[INS]], <i64 undef, i64 -42>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = or <2 x i64> %ins, <i64 undef, i64 -42>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @or_constant_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @or_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = or <2 x i64> [[INS]], <i64 42, i64 -42>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
+  %bo = or <2 x i64> %ins, <i64 42, i64 -42>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @xor_constant(i64 %x) {
+; CHECK-LABEL: @xor_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = xor <2 x i64> [[INS]], <i64 42, i64 undef>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = xor <2 x i64> %ins, <i64 42, i64 undef>
+  ret <2 x i64> %bo
+}
+
+define <2 x i64> @xor_constant_not_undef_lane(i64 %x) {
+; CHECK-LABEL: @xor_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = xor <2 x i64> [[INS]], <i64 42, i64 -42>
+; CHECK-NEXT:    ret <2 x i64> [[BO]]
+;
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bo = xor <2 x i64> %ins, <i64 42, i64 -42>
+  ret <2 x i64> %bo
+}
+
+define <2 x double> @fadd_constant(double %x) {
+; CHECK-LABEL: @fadd_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fadd <2 x double> [[INS]], <double 4.200000e+01, double undef>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = fadd <2 x double> %ins, <double 42.0, double undef>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fadd_constant_not_undef_lane(double %x) {
+; CHECK-LABEL: @fadd_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fadd <2 x double> [[INS]], <double 4.200000e+01, double -4.200000e+01>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 1
+  %bo = fadd <2 x double> %ins, <double 42.0, double -42.0>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fsub_constant_op0(double %x) {
+; CHECK-LABEL: @fsub_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fsub fast <2 x double> <double 4.200000e+01, double undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = fsub fast <2 x double> <double 42.0, double undef>, %ins
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fsub_constant_op0_not_undef_lane(double %x) {
+; CHECK-LABEL: @fsub_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fsub nsz <2 x double> <double 4.200000e+01, double -4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 1
+  %bo = fsub nsz <2 x double> <double 42.0, double -42.0>, %ins
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fsub_constant_op1(double %x) {
+; CHECK-LABEL: @fsub_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fsub <2 x double> [[INS]], <double undef, double 4.200000e+01>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 1
+  %bo = fsub <2 x double> %ins, <double undef, double 42.0>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fsub_constant_op1_not_undef_lane(double %x) {
+; CHECK-LABEL: @fsub_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fsub <2 x double> [[INS]], <double 4.200000e+01, double -4.200000e+01>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = fsub <2 x double> %ins, <double 42.0, double -42.0>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fmul_constant(double %x) {
+; CHECK-LABEL: @fmul_constant(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fmul reassoc <2 x double> [[INS]], <double 4.200000e+01, double undef>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = fmul reassoc <2 x double> %ins, <double 42.0, double undef>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fmul_constant_not_undef_lane(double %x) {
+; CHECK-LABEL: @fmul_constant_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fmul <2 x double> [[INS]], <double 4.200000e+01, double -4.200000e+01>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 1
+  %bo = fmul <2 x double> %ins, <double 42.0, double -42.0>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fdiv_constant_op0(double %x) {
+; CHECK-LABEL: @fdiv_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = fdiv nnan <2 x double> <double undef, double 4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 1
+  %bo = fdiv nnan <2 x double> <double undef, double 42.0>, %ins
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fdiv_constant_op0_not_undef_lane(double %x) {
+; CHECK-LABEL: @fdiv_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fdiv ninf <2 x double> <double 4.200000e+01, double -4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = fdiv ninf <2 x double> <double 42.0, double -42.0>, %ins
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fdiv_constant_op1(double %x) {
+; CHECK-LABEL: @fdiv_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x double> [[INS]], <double 4.200000e+01, double undef>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = fdiv <2 x double> %ins, <double 42.0, double undef>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @fdiv_constant_op1_not_undef_lane(double %x) {
+; CHECK-LABEL: @fdiv_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x double> [[INS]], <double 4.200000e+01, double -4.200000e+01>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = fdiv <2 x double> %ins, <double 42.0, double -42.0>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @frem_constant_op0(double %x) {
+; CHECK-LABEL: @frem_constant_op0(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = frem fast <2 x double> <double 4.200000e+01, double undef>, [[INS]]
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = frem fast <2 x double> <double 42.0, double undef>, %ins
+  ret <2 x double> %bo
+}
+
+define <2 x double> @frem_constant_op0_not_undef_lane(double %x) {
+; CHECK-LABEL: @frem_constant_op0_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = frem <2 x double> <double 4.200000e+01, double -4.200000e+01>, [[INS]]
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 1
+  %bo = frem <2 x double> <double 42.0, double -42.0>, %ins
+  ret <2 x double> %bo
+}
+
+define <2 x double> @frem_constant_op1(double %x) {
+; CHECK-LABEL: @frem_constant_op1(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[BO:%.*]] = frem ninf <2 x double> [[INS]], <double undef, double 4.200000e+01>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 1
+  %bo = frem ninf <2 x double> %ins, <double undef, double 42.0>
+  ret <2 x double> %bo
+}
+
+define <2 x double> @frem_constant_op1_not_undef_lane(double %x) {
+; CHECK-LABEL: @frem_constant_op1_not_undef_lane(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BO:%.*]] = frem nnan <2 x double> [[INS]], <double 4.200000e+01, double -4.200000e+01>
+; CHECK-NEXT:    ret <2 x double> [[BO]]
+;
+  %ins = insertelement <2 x double> undef, double %x, i32 0
+  %bo = frem nnan <2 x double> %ins, <double 42.0, double -42.0>
+  ret <2 x double> %bo
+}

From 4a2673d79fdbae57a800ec578ee3d58a6890a4f9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 31 May 2020 14:20:00 +0100
Subject: [PATCH 672/770] [X86][AVX] Add SimplifyMultipleUseDemandedBits
 VBROADCAST handling to SimplifyDemandedVectorElts.

As suggested on D79987.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  6 ++++++
 llvm/test/CodeGen/X86/oddshuffles.ll    | 14 ++++++--------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 89559ad9acbda..7edce21290330 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36830,6 +36830,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
                                    Depth + 1))
       return true;
+    // Aggressively peek through src to get at the demanded elt.
+    // TODO - we should do this for all target/faux shuffles ops.
+    APInt SrcBits = APInt::getAllOnesValue(SrcVT.getScalarSizeInBits());
+    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(Src, SrcBits, SrcElts,
+                                                         TLO.DAG, Depth + 1))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
     break;
   }
   case X86ISD::VPERMV: {
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index e182008eadc9e..910c40d673835 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2014,18 +2014,16 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
 ;
 ; AVX2-SLOW-LABEL: splat_v3i32:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-SLOW-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
-; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %ymm1
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-SLOW-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vbroadcastss %xmm1, %ymm1
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: splat_v3i32:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-FAST-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
+; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero

From 91b45fb527af8f1f83b51e7238642d3af011bad4 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 31 May 2020 10:10:28 -0400
Subject: [PATCH 673/770] [PhaseOrdering] add test for hoisting/CSE (PR46115);
 NFC

---
 .../test/Transforms/PhaseOrdering/X86/vdiv.ll | 214 ++++++++++++++++++
 1 file changed, 214 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
new file mode 100644
index 0000000000000..79cedac27a66e
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S                                        | FileCheck %s
+; RUN: opt < %s -passes='default<O3>' -aa-pipeline=default -S | FileCheck %s
+
+; Test that IR is optimal after vectorization/unrolling/CSE/canonicalization.
+; In particular, there should be no fdivs inside loops because that is expensive.
+
+; TODO: There is a CSE opportunity to reduce the hoisted fdivs after vectorization/unrolling.
+; PR46115 - https://bugs.llvm.org/PR46115
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.15.0"
+
+define void @vdiv(double* %x, double* %y, double %a, i32 %N) #0 {
+; CHECK-LABEL: @vdiv(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP6]], [[X]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N_VEC]], -4
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], 12
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
+; CHECK:       vector.ph.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], 9223372036854775804
+; CHECK-NEXT:    [[TMP4:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[TMP8]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP9]], align 8, !tbaa !3, !alias.scope !7
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>*
+; CHECK-NEXT:    store <4 x double> [[TMP10]], <4 x double>* [[TMP12]], align 8, !tbaa !3, !alias.scope !10, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x double>, <4 x double>* [[TMP14]], align 8, !tbaa !3, !alias.scope !7
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; CHECK-NEXT:    store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8, !tbaa !3, !alias.scope !10, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_NEXT_1]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x double>, <4 x double>* [[TMP19]], align 8, !tbaa !3, !alias.scope !7
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_NEXT_1]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast double* [[TMP21]] to <4 x double>*
+; CHECK-NEXT:    store <4 x double> [[TMP20]], <4 x double>* [[TMP22]], align 8, !tbaa !3, !alias.scope !10, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_NEXT_2]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x double>, <4 x double>* [[TMP24]], align 8, !tbaa !3, !alias.scope !7
+; CHECK-NEXT:    [[TMP25:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_NEXT_2]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
+; CHECK-NEXT:    store <4 x double> [[TMP25]], <4 x double>* [[TMP27]], align 8, !tbaa !3, !alias.scope !10, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT_3]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[NITER_NSUB_3]] = add i64 [[NITER]], -4
+; CHECK-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0
+; CHECK-NEXT:    br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop !12
+; CHECK:       middle.block.unr-lcssa:
+; CHECK-NEXT:    [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL_PREHEADER:%.*]]
+; CHECK:       vector.body.epil.preheader:
+; CHECK-NEXT:    [[TMP28:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    br label [[VECTOR_BODY_EPIL:%.*]]
+; CHECK:       vector.body.epil:
+; CHECK-NEXT:    [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_UNR]], [[VECTOR_BODY_EPIL_PREHEADER]] ], [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ]
+; CHECK-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ [[XTRAITER]], [[VECTOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_EPIL]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, <4 x double>* [[TMP30]], align 8, !tbaa !3, !alias.scope !7
+; CHECK-NEXT:    [[TMP31:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_EPIL]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_EPIL]]
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; CHECK-NEXT:    store <4 x double> [[TMP31]], <4 x double>* [[TMP33]], align 8, !tbaa !3, !alias.scope !10, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 4
+; CHECK-NEXT:    [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
+; CHECK-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
+; CHECK-NEXT:    br i1 [[EPIL_ITER_CMP]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop !14
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
+; CHECK-NEXT:    [[TMP35:%.*]] = add nsw i64 [[TMP34]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[XTRAITER8:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3
+; CHECK-NEXT:    [[LCMP_MOD9:%.*]] = icmp eq i64 [[XTRAITER8]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD9]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]]
+; CHECK:       for.body.prol.preheader:
+; CHECK-NEXT:    [[TMP36:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    br label [[FOR_BODY_PROL:%.*]]
+; CHECK:       for.body.prol:
+; CHECK-NEXT:    [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ]
+; CHECK-NEXT:    [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_BODY_PROL]] ], [ [[XTRAITER8]], [[FOR_BODY_PROL_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_PROL]]
+; CHECK-NEXT:    [[T0_PROL:%.*]] = load double, double* [[ARRAYIDX_PROL]], align 8, !tbaa !3
+; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast double [[T0_PROL]], [[TMP36]]
+; CHECK-NEXT:    [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_PROL]]
+; CHECK-NEXT:    store double [[TMP37]], double* [[ARRAYIDX2_PROL]], align 8, !tbaa !3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
+; CHECK-NEXT:    [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1
+; CHECK-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0
+; CHECK-NEXT:    br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop !16
+; CHECK:       for.body.prol.loopexit:
+; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp ult i64 [[TMP35]], 3
+; CHECK-NEXT:    br i1 [[TMP38]], label [[FOR_END]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
+; CHECK:       for.body.preheader.new:
+; CHECK-NEXT:    [[TMP39:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP40:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP41:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP42:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[ARRAYIDX]], align 8, !tbaa !3
+; CHECK-NEXT:    [[TMP43:%.*]] = fmul fast double [[T0]], [[TMP39]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[TMP43]], double* [[ARRAYIDX2]], align 8, !tbaa !3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[T0_1:%.*]] = load double, double* [[ARRAYIDX_1]], align 8, !tbaa !3
+; CHECK-NEXT:    [[TMP44:%.*]] = fmul fast double [[T0_1]], [[TMP40]]
+; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    store double [[TMP44]], double* [[ARRAYIDX2_1]], align 8, !tbaa !3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT_1]]
+; CHECK-NEXT:    [[T0_2:%.*]] = load double, double* [[ARRAYIDX_2]], align 8, !tbaa !3
+; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast double [[T0_2]], [[TMP41]]
+; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT_1]]
+; CHECK-NEXT:    store double [[TMP45]], double* [[ARRAYIDX2_2]], align 8, !tbaa !3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT_2]]
+; CHECK-NEXT:    [[T0_3:%.*]] = load double, double* [[ARRAYIDX_3]], align 8, !tbaa !3
+; CHECK-NEXT:    [[TMP46:%.*]] = fmul fast double [[T0_3]], [[TMP42]]
+; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT_2]]
+; CHECK-NEXT:    store double [[TMP46]], double* [[ARRAYIDX2_3]], align 8, !tbaa !3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %div = fdiv fast double 1.0, %a
+  br label %for.cond
+
+for.cond:
+  %n.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %n.0, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  br label %for.end
+
+for.body:
+  %idxprom = sext i32 %n.0 to i64
+  %arrayidx = getelementptr inbounds double, double* %y, i64 %idxprom
+  %t0 = load double, double* %arrayidx, align 8, !tbaa !3
+  %mul = fmul fast double %t0, %div
+  %idxprom1 = sext i32 %n.0 to i64
+  %arrayidx2 = getelementptr inbounds double, double* %x, i64 %idxprom1
+  store double %mul, double* %arrayidx2, align 8, !tbaa !3
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i32 %n.0, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git 45ebe38ffc40bb7221fc587bfb4481cf7f53ebbc)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"double", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+

From bfdc2552664d6f0bb332a9c6a115877020f3c1df Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 31 May 2020 10:46:11 -0400
Subject: [PATCH 674/770] [utils] change update_test_checks.py use of 'TMP'
 value names

As discussed in PR45951:
https://bugs.llvm.org/show_bug.cgi?id=45951

There's a potential name collision between update_test_checks.py and -instnamer
and/or manually-generated IR test files because all of them try to use the
variable name that should never be used: "tmp".

This patch proposes to reduce the odds of collision and adds a warning if we
detect the problem. This will cause regression test churn when regenerating
CHECK lines on existing files.

Differential Revision: https://reviews.llvm.org/D80584
---
 llvm/utils/UpdateTestChecks/common.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index a3fca4905f3dd..a2e9787253c59 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -218,10 +218,12 @@ def build_function_body_dictionary(function_re, scrubber, scrubber_args, raw_too
 # spaces, commas, paren, or end of the string
 IR_VALUE_RE = re.compile(r'(\s+)%([\w.-]+?)([,\s\(\)]|\Z)')
 
+NAMELESS_PREFIX = "NAMELESS"
+
 # Create a FileCheck variable name based on an IR name.
 def get_value_name(var):
   if var.isdigit():
-    var = 'TMP' + var
+    var = NAMELESS_PREFIX + var
   var = var.replace('.', '_')
   var = var.replace('-', '_')
   return var.upper()
@@ -243,6 +245,8 @@ def genericize_check_lines(lines, is_analyze, vars_seen):
   # into defs, and variables we have seen into uses.
   def transform_line_vars(match):
     var = match.group(2)
+    if NAMELESS_PREFIX.lower() in var.lower():
+      warn("Change IR value name '%s' to prevent possible conflict with scripted FileCheck name." % (var,))
     if var in vars_seen:
       rv = get_value_use(var)
     else:

From 95f65a7c6cebba7dbcd955bc02235f5d3581ff44 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 30 May 2020 16:03:16 -0400
Subject: [PATCH 675/770] AArch64/GlobalISel: Fix incorrect ptrmask usage for
 alignment

I inverted the mask when I ported to the new form of G_PTRMASK in
8bc03d2168241f7b12265e9cd7e4eb7655709f34.

I don't think this really broke anything, since G_VASTART isn't
handled for types with an alignment higher than the stack alignment.
---
 llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp        | 2 +-
 llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 510572e6d4121..6e53ec2bb46eb 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -243,7 +243,7 @@ MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res,
   LLT PtrTy = Res.getLLTTy(*getMRI());
   LLT MaskTy = LLT::scalar(PtrTy.getSizeInBits());
   Register MaskReg = getMRI()->createGenericVirtualRegister(MaskTy);
-  buildConstant(MaskReg, maskTrailingOnes<uint64_t>(NumBits));
+  buildConstant(MaskReg, maskTrailingZeros<uint64_t>(NumBits));
   return buildPtrMask(Res, Op0, MaskReg);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
index a0cc566771189..a8aac0210b181 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
@@ -23,8 +23,8 @@ body: |
     ; CHECK: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load 8)
     ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
     ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[LOAD2]], [[C1]](s64)
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY [[C1]](s64)
-    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[PTR_ADD2]], [[COPY1]](s64)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK: [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[PTR_ADD2]], [[C2]](s64)
     ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTRMASK]], [[C]](s64)
     ; CHECK: G_STORE [[PTR_ADD3]](p0), [[COPY]](p0) :: (store 8)
     %0:_(p0) = COPY $x0

From f23ddbe3c3ae5f40b99ba272afc3d16b800ba8b9 Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Sat, 30 May 2020 22:03:50 +0100
Subject: [PATCH 676/770] clang-tidy and clang-query wont crash with invalid
 command line options

Summary: Motivated by [[ https://bugs.llvm.org/show_bug.cgi?id=46141 | clang-tidy crashed for unknown command line argument. ]]

Reviewers: aaron.ballman, alexfh

Reviewed By: aaron.ballman

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80879
---
 clang-tools-extra/clang-query/tool/ClangQuery.cpp | 14 +++++++++++---
 .../clang-tidy/tool/ClangTidyMain.cpp             | 15 +++++++++++----
 .../test/clang-query/invalid-command-line.cpp     |  4 ++++
 .../infrastructure/invalid-command-line.cpp       |  4 ++++
 4 files changed, 30 insertions(+), 7 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-query/invalid-command-line.cpp
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp

diff --git a/clang-tools-extra/clang-query/tool/ClangQuery.cpp b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
index 5cfa0acf9120a..0c471def2e140 100644
--- a/clang-tools-extra/clang-query/tool/ClangQuery.cpp
+++ b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/WithColor.h"
 #include <fstream>
 #include <string>
 
@@ -86,7 +87,14 @@ bool runCommandsInFile(const char *ExeName, std::string const &FileName,
 int main(int argc, const char **argv) {
   llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
 
-  CommonOptionsParser OptionsParser(argc, argv, ClangQueryCategory);
+  llvm::Expected<CommonOptionsParser> OptionsParser =
+      CommonOptionsParser::create(argc, argv, ClangQueryCategory,
+                                  llvm::cl::OneOrMore);
+
+  if (!OptionsParser) {
+    llvm::WithColor::error() << llvm::toString(OptionsParser.takeError());
+    return 1;
+  }
 
   if (!Commands.empty() && !CommandFiles.empty()) {
     llvm::errs() << argv[0] << ": cannot specify both -c and -f\n";
@@ -99,8 +107,8 @@ int main(int argc, const char **argv) {
     return 1;
   }
 
-  ClangTool Tool(OptionsParser.getCompilations(),
-                 OptionsParser.getSourcePathList());
+  ClangTool Tool(OptionsParser->getCompilations(),
+                 OptionsParser->getSourcePathList());
   std::vector<std::unique_ptr<ASTUnit>> ASTs;
   int Status = Tool.buildASTs(ASTs);
   int ASTStatus = 0;
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index 665d100268344..aca16b0d6d819 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
 
 using namespace clang::ast_matchers;
 using namespace clang::driver;
@@ -333,8 +334,14 @@ getVfsFromFile(const std::string &OverlayFile,
 
 int clangTidyMain(int argc, const char **argv) {
   llvm::InitLLVM X(argc, argv);
-  CommonOptionsParser OptionsParser(argc, argv, ClangTidyCategory,
-                                    cl::ZeroOrMore);
+  llvm::Expected<CommonOptionsParser> OptionsParser =
+      CommonOptionsParser::create(argc, argv, ClangTidyCategory,
+                                  cl::ZeroOrMore);
+  if (!OptionsParser) {
+    llvm::WithColor::error() << llvm::toString(OptionsParser.takeError());
+    return 1;
+  }
+
   llvm::IntrusiveRefCntPtr<vfs::OverlayFileSystem> BaseFS(
       new vfs::OverlayFileSystem(vfs::getRealFileSystem()));
 
@@ -365,7 +372,7 @@ int clangTidyMain(int argc, const char **argv) {
   SmallString<256> ProfilePrefix = MakeAbsolute(StoreCheckProfile);
 
   StringRef FileName("dummy");
-  auto PathList = OptionsParser.getSourcePathList();
+  auto PathList = OptionsParser->getSourcePathList();
   if (!PathList.empty()) {
     FileName = PathList.front();
   }
@@ -433,7 +440,7 @@ int clangTidyMain(int argc, const char **argv) {
   ClangTidyContext Context(std::move(OwningOptionsProvider),
                            AllowEnablingAnalyzerAlphaCheckers);
   std::vector<ClangTidyError> Errors =
-      runClangTidy(Context, OptionsParser.getCompilations(), PathList, BaseFS,
+      runClangTidy(Context, OptionsParser->getCompilations(), PathList, BaseFS,
                    EnableCheckProfile, ProfilePrefix);
   bool FoundErrors = llvm::find_if(Errors, [](const ClangTidyError &E) {
                        return E.DiagLevel == ClangTidyError::Error;
diff --git a/clang-tools-extra/test/clang-query/invalid-command-line.cpp b/clang-tools-extra/test/clang-query/invalid-command-line.cpp
new file mode 100644
index 0000000000000..10ab43198b4e2
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/invalid-command-line.cpp
@@ -0,0 +1,4 @@
+// RUN: not clang-query --invalid-arg 2>&1 | FileCheck %s
+
+// CHECK: error: [CommonOptionsParser]: clang-query: Unknown command line argument '--invalid-arg'.  Try: 'clang-query --help'
+// CHECK-NEXT: clang-query: Did you mean '--extra-arg'?
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp
new file mode 100644
index 0000000000000..90b3f5200059c
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp
@@ -0,0 +1,4 @@
+// RUN: not clang-tidy --invalid-arg 2>&1 | FileCheck %s
+
+// CHECK: error: [CommonOptionsParser]: clang-tidy: Unknown command line argument '--invalid-arg'.  Try: 'clang-tidy --help'
+// CHECK-NEXT: clang-tidy: Did you mean '--extra-arg'?

From dfbfdc96f9e15be40c938cde9b159afd028bf4a2 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 31 May 2020 11:06:32 -0400
Subject: [PATCH 677/770] [utils] update expected strings in tests; NFC

The script was changed with:
https://github.com/llvm/llvm-project/commit/bfdc2552664d6f0bb332a9c6a115877020f3c1df
---
 .../Inputs/mangled_names.c.expected           | 20 +++++++++----------
 .../Inputs/mangled_names.c.funcsig.expected   | 20 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected
index d6ba7ae09b620..6ea154286c152 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected
+++ b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected
@@ -8,10 +8,10 @@
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
+// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
 // CHECK-NEXT:    ret i64 [[ADD]]
 //
 long test(long a, int b) {
@@ -27,12 +27,12 @@ long test(long a, int b) {
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[C:%.*]], i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[TMP2]] to i64
+// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
+// CHECK-NEXT:    [[NAMELESS2:%.*]] = load i32, i32* [[C_ADDR]], align 4
+// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[NAMELESS2]] to i64
 // CHECK-NEXT:    [[ADD2:%.*]] = add nsw i64 [[ADD]], [[CONV1]]
 // CHECK-NEXT:    ret i64 [[ADD2]]
 //
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected
index 005b2f2427473..dbe1296182aa6 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected
+++ b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected
@@ -9,10 +9,10 @@
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
+// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
 // CHECK-NEXT:    ret i64 [[ADD]]
 //
 long test(long a, int b) {
@@ -29,12 +29,12 @@ long test(long a, int b) {
 // CHECK-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
-// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[TMP2]] to i64
+// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
+// CHECK-NEXT:    [[NAMELESS2:%.*]] = load i32, i32* [[C_ADDR]], align 4
+// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[NAMELESS2]] to i64
 // CHECK-NEXT:    [[ADD2:%.*]] = add nsw i64 [[ADD]], [[CONV1]]
 // CHECK-NEXT:    ret i64 [[ADD2]]
 //

From f4b0ebb89b3086a2bdd8c7dd1f5d142fa09ca728 Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Sun, 31 May 2020 16:40:09 +0100
Subject: [PATCH 678/770] Revert "clang-tidy and clang-query wont crash with
 invalid command line options"

This reverts commit f23ddbe3c3ae5f40b99ba272afc3d16b800ba8b9.
---
 clang-tools-extra/clang-query/tool/ClangQuery.cpp | 14 +++-----------
 .../clang-tidy/tool/ClangTidyMain.cpp             | 15 ++++-----------
 .../test/clang-query/invalid-command-line.cpp     |  4 ----
 .../infrastructure/invalid-command-line.cpp       |  4 ----
 4 files changed, 7 insertions(+), 30 deletions(-)
 delete mode 100644 clang-tools-extra/test/clang-query/invalid-command-line.cpp
 delete mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp

diff --git a/clang-tools-extra/clang-query/tool/ClangQuery.cpp b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
index 0c471def2e140..5cfa0acf9120a 100644
--- a/clang-tools-extra/clang-query/tool/ClangQuery.cpp
+++ b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
@@ -35,7 +35,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Signals.h"
-#include "llvm/Support/WithColor.h"
 #include <fstream>
 #include <string>
 
@@ -87,14 +86,7 @@ bool runCommandsInFile(const char *ExeName, std::string const &FileName,
 int main(int argc, const char **argv) {
   llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
 
-  llvm::Expected<CommonOptionsParser> OptionsParser =
-      CommonOptionsParser::create(argc, argv, ClangQueryCategory,
-                                  llvm::cl::OneOrMore);
-
-  if (!OptionsParser) {
-    llvm::WithColor::error() << llvm::toString(OptionsParser.takeError());
-    return 1;
-  }
+  CommonOptionsParser OptionsParser(argc, argv, ClangQueryCategory);
 
   if (!Commands.empty() && !CommandFiles.empty()) {
     llvm::errs() << argv[0] << ": cannot specify both -c and -f\n";
@@ -107,8 +99,8 @@ int main(int argc, const char **argv) {
     return 1;
   }
 
-  ClangTool Tool(OptionsParser->getCompilations(),
-                 OptionsParser->getSourcePathList());
+  ClangTool Tool(OptionsParser.getCompilations(),
+                 OptionsParser.getSourcePathList());
   std::vector<std::unique_ptr<ASTUnit>> ASTs;
   int Status = Tool.buildASTs(ASTs);
   int ASTStatus = 0;
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index aca16b0d6d819..665d100268344 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -23,7 +23,6 @@
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/WithColor.h"
 
 using namespace clang::ast_matchers;
 using namespace clang::driver;
@@ -334,14 +333,8 @@ getVfsFromFile(const std::string &OverlayFile,
 
 int clangTidyMain(int argc, const char **argv) {
   llvm::InitLLVM X(argc, argv);
-  llvm::Expected<CommonOptionsParser> OptionsParser =
-      CommonOptionsParser::create(argc, argv, ClangTidyCategory,
-                                  cl::ZeroOrMore);
-  if (!OptionsParser) {
-    llvm::WithColor::error() << llvm::toString(OptionsParser.takeError());
-    return 1;
-  }
-
+  CommonOptionsParser OptionsParser(argc, argv, ClangTidyCategory,
+                                    cl::ZeroOrMore);
   llvm::IntrusiveRefCntPtr<vfs::OverlayFileSystem> BaseFS(
       new vfs::OverlayFileSystem(vfs::getRealFileSystem()));
 
@@ -372,7 +365,7 @@ int clangTidyMain(int argc, const char **argv) {
   SmallString<256> ProfilePrefix = MakeAbsolute(StoreCheckProfile);
 
   StringRef FileName("dummy");
-  auto PathList = OptionsParser->getSourcePathList();
+  auto PathList = OptionsParser.getSourcePathList();
   if (!PathList.empty()) {
     FileName = PathList.front();
   }
@@ -440,7 +433,7 @@ int clangTidyMain(int argc, const char **argv) {
   ClangTidyContext Context(std::move(OwningOptionsProvider),
                            AllowEnablingAnalyzerAlphaCheckers);
   std::vector<ClangTidyError> Errors =
-      runClangTidy(Context, OptionsParser->getCompilations(), PathList, BaseFS,
+      runClangTidy(Context, OptionsParser.getCompilations(), PathList, BaseFS,
                    EnableCheckProfile, ProfilePrefix);
   bool FoundErrors = llvm::find_if(Errors, [](const ClangTidyError &E) {
                        return E.DiagLevel == ClangTidyError::Error;
diff --git a/clang-tools-extra/test/clang-query/invalid-command-line.cpp b/clang-tools-extra/test/clang-query/invalid-command-line.cpp
deleted file mode 100644
index 10ab43198b4e2..0000000000000
--- a/clang-tools-extra/test/clang-query/invalid-command-line.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: not clang-query --invalid-arg 2>&1 | FileCheck %s
-
-// CHECK: error: [CommonOptionsParser]: clang-query: Unknown command line argument '--invalid-arg'.  Try: 'clang-query --help'
-// CHECK-NEXT: clang-query: Did you mean '--extra-arg'?
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp
deleted file mode 100644
index 90b3f5200059c..0000000000000
--- a/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: not clang-tidy --invalid-arg 2>&1 | FileCheck %s
-
-// CHECK: error: [CommonOptionsParser]: clang-tidy: Unknown command line argument '--invalid-arg'.  Try: 'clang-tidy --help'
-// CHECK-NEXT: clang-tidy: Did you mean '--extra-arg'?

From 5952125691571de9bd817551fb1baabe270e73f9 Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Sun, 31 May 2020 17:41:29 +0100
Subject: [PATCH 679/770] clang-tidy and clang-query wont crash with invalid
 command line options

Motivated by [[ https://bugs.llvm.org/show_bug.cgi?id=46141 | clang-tidy crashed for unknown command line argument. ]]

Reviewed By: aaron.ballman, thakis

Differential Revision: https://reviews.llvm.org/D80879
---
 clang-tools-extra/clang-query/tool/ClangQuery.cpp | 14 +++++++++++---
 .../clang-tidy/tool/ClangTidyMain.cpp             | 15 +++++++++++----
 .../test/clang-query/invalid-command-line.cpp     |  4 ++++
 .../infrastructure/invalid-command-line.cpp       |  4 ++++
 4 files changed, 30 insertions(+), 7 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-query/invalid-command-line.cpp
 create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp

diff --git a/clang-tools-extra/clang-query/tool/ClangQuery.cpp b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
index 5cfa0acf9120a..0c471def2e140 100644
--- a/clang-tools-extra/clang-query/tool/ClangQuery.cpp
+++ b/clang-tools-extra/clang-query/tool/ClangQuery.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/WithColor.h"
 #include <fstream>
 #include <string>
 
@@ -86,7 +87,14 @@ bool runCommandsInFile(const char *ExeName, std::string const &FileName,
 int main(int argc, const char **argv) {
   llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
 
-  CommonOptionsParser OptionsParser(argc, argv, ClangQueryCategory);
+  llvm::Expected<CommonOptionsParser> OptionsParser =
+      CommonOptionsParser::create(argc, argv, ClangQueryCategory,
+                                  llvm::cl::OneOrMore);
+
+  if (!OptionsParser) {
+    llvm::WithColor::error() << llvm::toString(OptionsParser.takeError());
+    return 1;
+  }
 
   if (!Commands.empty() && !CommandFiles.empty()) {
     llvm::errs() << argv[0] << ": cannot specify both -c and -f\n";
@@ -99,8 +107,8 @@ int main(int argc, const char **argv) {
     return 1;
   }
 
-  ClangTool Tool(OptionsParser.getCompilations(),
-                 OptionsParser.getSourcePathList());
+  ClangTool Tool(OptionsParser->getCompilations(),
+                 OptionsParser->getSourcePathList());
   std::vector<std::unique_ptr<ASTUnit>> ASTs;
   int Status = Tool.buildASTs(ASTs);
   int ASTStatus = 0;
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index 665d100268344..aca16b0d6d819 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
 
 using namespace clang::ast_matchers;
 using namespace clang::driver;
@@ -333,8 +334,14 @@ getVfsFromFile(const std::string &OverlayFile,
 
 int clangTidyMain(int argc, const char **argv) {
   llvm::InitLLVM X(argc, argv);
-  CommonOptionsParser OptionsParser(argc, argv, ClangTidyCategory,
-                                    cl::ZeroOrMore);
+  llvm::Expected<CommonOptionsParser> OptionsParser =
+      CommonOptionsParser::create(argc, argv, ClangTidyCategory,
+                                  cl::ZeroOrMore);
+  if (!OptionsParser) {
+    llvm::WithColor::error() << llvm::toString(OptionsParser.takeError());
+    return 1;
+  }
+
   llvm::IntrusiveRefCntPtr<vfs::OverlayFileSystem> BaseFS(
       new vfs::OverlayFileSystem(vfs::getRealFileSystem()));
 
@@ -365,7 +372,7 @@ int clangTidyMain(int argc, const char **argv) {
   SmallString<256> ProfilePrefix = MakeAbsolute(StoreCheckProfile);
 
   StringRef FileName("dummy");
-  auto PathList = OptionsParser.getSourcePathList();
+  auto PathList = OptionsParser->getSourcePathList();
   if (!PathList.empty()) {
     FileName = PathList.front();
   }
@@ -433,7 +440,7 @@ int clangTidyMain(int argc, const char **argv) {
   ClangTidyContext Context(std::move(OwningOptionsProvider),
                            AllowEnablingAnalyzerAlphaCheckers);
   std::vector<ClangTidyError> Errors =
-      runClangTidy(Context, OptionsParser.getCompilations(), PathList, BaseFS,
+      runClangTidy(Context, OptionsParser->getCompilations(), PathList, BaseFS,
                    EnableCheckProfile, ProfilePrefix);
   bool FoundErrors = llvm::find_if(Errors, [](const ClangTidyError &E) {
                        return E.DiagLevel == ClangTidyError::Error;
diff --git a/clang-tools-extra/test/clang-query/invalid-command-line.cpp b/clang-tools-extra/test/clang-query/invalid-command-line.cpp
new file mode 100644
index 0000000000000..901aad8c1f237
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/invalid-command-line.cpp
@@ -0,0 +1,4 @@
+// RUN: not clang-query --invalid-arg 2>&1 | FileCheck %s
+
+// CHECK: error: [CommonOptionsParser]: clang-query{{(\.exe)?}}: Unknown command line argument '--invalid-arg'.  Try: 'clang-query{{(\.exe)?}} --help'
+// CHECK-NEXT: clang-query{{(\.exe)?}}: Did you mean '--extra-arg'?
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp
new file mode 100644
index 0000000000000..be84a08818957
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/invalid-command-line.cpp
@@ -0,0 +1,4 @@
+// RUN: not clang-tidy --invalid-arg 2>&1 | FileCheck %s
+
+// CHECK: error: [CommonOptionsParser]: clang-tidy{{(\.exe)?}}: Unknown command line argument '--invalid-arg'.  Try: 'clang-tidy{{(\.exe)?}} --help'
+// CHECK-NEXT: clang-tidy{{(\.exe)?}}: Did you mean '--extra-arg'?

From 0cf5ef176b5222b6ee8825a2e4ec843dd7152b46 Mon Sep 17 00:00:00 2001
From: Chris Lattner <clattner@nondot.org>
Date: Sun, 31 May 2020 11:43:54 -0700
Subject: [PATCH 680/770] Change some extraneous /// comments to // comments
 inside methods. NFC.

---
 mlir/lib/Transforms/DialectConversion.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Transforms/DialectConversion.cpp b/mlir/lib/Transforms/DialectConversion.cpp
index 4dfb9b1de84ae..871252e65e5d9 100644
--- a/mlir/lib/Transforms/DialectConversion.cpp
+++ b/mlir/lib/Transforms/DialectConversion.cpp
@@ -877,8 +877,8 @@ void ConversionPatternRewriterImpl::replaceOp(Operation *op,
   // Record the requested operation replacement.
   replacements.emplace_back(op, newValues);
 
-  /// Mark this operation as recursively ignored so that we don't need to
-  /// convert any nested operations.
+  // Mark this operation as recursively ignored so that we don't need to
+  // convert any nested operations.
   markNestedOpsIgnored(op);
 }
 
@@ -1647,13 +1647,13 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter,
   // Legalize the given operation.
   if (failed(opLegalizer.legalize(op, rewriter))) {
     // Handle the case of a failed conversion for each of the different modes.
-    /// Full conversions expect all operations to be converted.
+    // Full conversions expect all operations to be converted.
     if (mode == OpConversionMode::Full)
       return op->emitError()
              << "failed to legalize operation '" << op->getName() << "'";
-    /// Partial conversions allow conversions to fail iff the operation was not
-    /// explicitly marked as illegal. If the user provided a nonlegalizableOps
-    /// set, non-legalizable ops are included.
+    // Partial conversions allow conversions to fail iff the operation was not
+    // explicitly marked as illegal. If the user provided a nonlegalizableOps
+    // set, non-legalizable ops are included.
     if (mode == OpConversionMode::Partial) {
       if (opLegalizer.isIllegal(op))
         return op->emitError()
@@ -1663,9 +1663,9 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter,
         trackedOps->insert(op);
     }
   } else {
-    /// Analysis conversions don't fail if any operations fail to legalize,
-    /// they are only interested in the operations that were successfully
-    /// legalized.
+    // Analysis conversions don't fail if any operations fail to legalize,
+    // they are only interested in the operations that were successfully
+    // legalized.
     if (mode == OpConversionMode::Analysis)
       trackedOps->insert(op);
 
@@ -1684,7 +1684,7 @@ OperationConverter::convertOperations(ArrayRef<Operation *> ops,
     return success();
   ConversionTarget &target = opLegalizer.getTarget();
 
-  /// Compute the set of operations and blocks to convert.
+  // Compute the set of operations and blocks to convert.
   std::vector<Operation *> toConvert;
   for (auto *op : ops) {
     toConvert.emplace_back(op);

From 8f2f613a6ecc75d592e9bd379b20b95790c00827 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 31 May 2020 19:58:15 +0100
Subject: [PATCH 681/770] [X86][AVX] combineX86ShufflesRecursively -
 peekThroughOneUseBitcasts subvector before widening.

This matches what we do for the full sized vector ops at the start of combineX86ShufflesRecursively, and helps getFauxShuffleMask extract more INSERT_SUBVECTOR patterns.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp    |   3 +-
 llvm/test/CodeGen/X86/vector-reduce-mul.ll | 109 +++++++--------------
 2 files changed, 36 insertions(+), 76 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7edce21290330..1cbfd41dcbc32 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35047,7 +35047,8 @@ static SDValue combineX86ShufflesRecursively(
 
   for (SDValue &Op : OpInputs)
     if (Op.getValueSizeInBits() < RootSizeInBits)
-      Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op), RootSizeInBits);
+      Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
+                          SDLoc(Op), RootSizeInBits);
 
   SmallVector<int, 64> Mask;
   SmallVector<SDValue, 16> Ops;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 89ed59d455e49..09d1472c39e98 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -1970,18 +1970,13 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
@@ -2058,18 +2053,13 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQ-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovd %xmm0, %eax
@@ -2090,18 +2080,13 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQVL-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
@@ -2254,18 +2239,13 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpsrld $16, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
@@ -2295,18 +2275,13 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BW-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
@@ -2336,13 +2311,10 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BWVL-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BWVL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
@@ -2637,18 +2609,13 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
@@ -2681,18 +2648,13 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BW-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
@@ -2725,13 +2687,10 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BWVL-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
 ; AVX512BWVL-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BWVL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512BWVL-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1

From 22e50833e9564f6be75fcbbabe9d75ca745e778d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 31 May 2020 20:19:24 +0100
Subject: [PATCH 682/770] [X86][AVX] Reduce unary target shuffles width if the
 upper elements aren't demanded.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  5 +++
 .../X86/avx512-intrinsics-fast-isel.ll        | 32 +++++++++----------
 llvm/test/CodeGen/X86/vector-reduce-mul.ll    | 27 ++++++----------
 .../X86/vector-shuffle-combining-avx.ll       |  3 +-
 4 files changed, 31 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1cbfd41dcbc32..86825ce8a446c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36908,6 +36908,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
                                                TLO.DAG, DL, ExtSizeInBits));
     }
+      // Target unary shuffles by immediate:
+    case X86ISD::PSHUFD:
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFHW:
+    case X86ISD::VPERMILPI:
       // Byte shifts by immediate.
     case X86ISD::VSHLDQ:
     case X86ISD::VSRLDQ:
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index fee195ae121fd..295b5271ed0b4 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -7742,7 +7742,7 @@ define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
 ; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -7755,7 +7755,7 @@ define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
 ; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
@@ -7781,7 +7781,7 @@ define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
 ; X86-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -7794,7 +7794,7 @@ define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
 ; X64-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
@@ -7867,7 +7867,7 @@ define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
 ; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -7880,7 +7880,7 @@ define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
 ; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
@@ -7906,7 +7906,7 @@ define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
 ; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -7919,7 +7919,7 @@ define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
 ; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
@@ -7996,7 +7996,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -8012,7 +8012,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
@@ -8043,7 +8043,7 @@ define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -8058,7 +8058,7 @@ define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
@@ -8146,7 +8146,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -8162,7 +8162,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
@@ -8194,7 +8194,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
-; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
@@ -8210,7 +8210,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
-; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    vmovq %xmm0, %rax
 ; X64-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 09d1472c39e98..e6f9bb597a225 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -1969,8 +1969,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2052,8 +2051,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQ-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2079,8 +2077,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512DQVL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQVL-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512DQVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2238,8 +2235,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
@@ -2274,8 +2270,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512BW-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2311,8 +2306,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BWVL-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2608,8 +2602,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2647,8 +2640,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512BW-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
-; AVX512BW-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2687,8 +2679,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BWVL-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
 ; AVX512BWVL-NEXT:    vpmullw %xmm0, %xmm2, %xmm0
-; AVX512BWVL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 50a250ba1adf0..6ffbe095c39ba 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -145,8 +145,7 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
 define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
 ; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; CHECK-NEXT:    vmovapd %xmm0, %xmm0
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)

From 8abe830093f65a0fc6ba398ee1786d4d96607fdf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Sun, 31 May 2020 12:39:14 -0700
Subject: [PATCH 683/770] [X86] Rewrite how X86PartialReduction finds
 candidates to consider optimizing.

Previously we walked the users of any vector binop looking for
more binops with the same opcode or phis that eventually ended up
in a reduction. While this is simple it also means visiting the
same nodes many times since we'll do a forward walk for each
BinaryOperator in the chain. It was also far more general than what
we have tests for or expect to see.

This patch replaces the algorithm with a new method that starts at
extract elements looking for a horizontal reduction. Once we find
a reduction we walk backwards through phis and adds to
collect leaves that we can consider for rewriting.

We only consider single use adds and phis, except for a special
case where the Add is used by a phi that forms a loop back to the
Add. That loop may pass through other single use Adds, which
supports unrolled loops.

Ultimately, I want to narrow the Adds, Phis, and final reduction
based on the partial reduction we're doing. I still haven't
figured out exactly what that looks like yet. But restricting
the types of graphs we expect to handle seemed like a good first
step. As does having all the leaves and the reduction at once.

Differential Revision: https://reviews.llvm.org/D79971
---
 llvm/lib/Target/X86/X86PartialReduction.cpp | 367 ++++++++++----------
 llvm/test/CodeGen/X86/madd.ll               |   4 +-
 llvm/test/CodeGen/X86/sad.ll                |   2 +-
 3 files changed, 190 insertions(+), 183 deletions(-)

diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index 16108bd1928f6..65caeab1d1cf2 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -49,11 +49,8 @@ class X86PartialReduction : public FunctionPass {
   }
 
 private:
-  bool tryMAddPattern(BinaryOperator *BO);
-  bool tryMAddReplacement(Value *Op, BinaryOperator *Add);
-
-  bool trySADPattern(BinaryOperator *BO);
-  bool trySADReplacement(Value *Op, BinaryOperator *Add);
+  bool tryMAddReplacement(Instruction *Op);
+  bool trySADReplacement(Instruction *Op);
 };
 }
 
@@ -66,139 +63,24 @@ char X86PartialReduction::ID = 0;
 INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
                 "X86 Partial Reduction", false, false)
 
-static bool isVectorReductionOp(const BinaryOperator &BO) {
-  if (!BO.getType()->isVectorTy())
+bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
+  if (!ST->hasSSE2())
     return false;
 
-  unsigned Opcode = BO.getOpcode();
-
-  switch (Opcode) {
-  case Instruction::Add:
-  case Instruction::Mul:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor:
-    break;
-  case Instruction::FAdd:
-  case Instruction::FMul:
-    if (auto *FPOp = dyn_cast<FPMathOperator>(&BO))
-      if (FPOp->getFastMathFlags().isFast())
-        break;
-    LLVM_FALLTHROUGH;
-  default:
+  // Need at least 8 elements.
+  if (cast<VectorType>(Op->getType())->getNumElements() < 8)
     return false;
-  }
 
-  unsigned ElemNum = cast<VectorType>(BO.getType())->getNumElements();
-  // Ensure the reduction size is a power of 2.
-  if (!isPowerOf2_32(ElemNum))
+  // Element type should be i32.
+  if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
     return false;
 
-  unsigned ElemNumToReduce = ElemNum;
-
-  // Do DFS search on the def-use chain from the given instruction. We only
-  // allow four kinds of operations during the search until we reach the
-  // instruction that extracts the first element from the vector:
-  //
-  //   1. The reduction operation of the same opcode as the given instruction.
-  //
-  //   2. PHI node.
-  //
-  //   3. ShuffleVector instruction together with a reduction operation that
-  //      does a partial reduction.
-  //
-  //   4. ExtractElement that extracts the first element from the vector, and we
-  //      stop searching the def-use chain here.
-  //
-  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
-  // from 1-3 to the stack to continue the DFS. The given instruction is not
-  // a reduction operation if we meet any other instructions other than those
-  // listed above.
-
-  SmallVector<const User *, 16> UsersToVisit{&BO};
-  SmallPtrSet<const User *, 16> Visited;
-  bool ReduxExtracted = false;
-
-  while (!UsersToVisit.empty()) {
-    auto User = UsersToVisit.back();
-    UsersToVisit.pop_back();
-    if (!Visited.insert(User).second)
-      continue;
-
-    for (const auto *U : User->users()) {
-      auto *Inst = dyn_cast<Instruction>(U);
-      if (!Inst)
-        return false;
-
-      if (Inst->getOpcode() == Opcode || isa<PHINode>(U)) {
-        if (auto *FPOp = dyn_cast<FPMathOperator>(Inst))
-          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
-            return false;
-        UsersToVisit.push_back(U);
-      } else if (auto *ShufInst = dyn_cast<ShuffleVectorInst>(U)) {
-        // Detect the following pattern: A ShuffleVector instruction together
-        // with a reduction that do partial reduction on the first and second
-        // ElemNumToReduce / 2 elements, and store the result in
-        // ElemNumToReduce / 2 elements in another vector.
-
-        unsigned ResultElements = ShufInst->getType()->getNumElements();
-        if (ResultElements < ElemNum)
-          return false;
-
-        if (ElemNumToReduce == 1)
-          return false;
-        if (!isa<UndefValue>(U->getOperand(1)))
-          return false;
-        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
-          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
-            return false;
-        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
-          if (ShufInst->getMaskValue(i) != -1)
-            return false;
-
-        // There is only one user of this ShuffleVector instruction, which
-        // must be a reduction operation.
-        if (!U->hasOneUse())
-          return false;
-
-        auto *U2 = dyn_cast<BinaryOperator>(*U->user_begin());
-        if (!U2 || U2->getOpcode() != Opcode)
-          return false;
-
-        // Check operands of the reduction operation.
-        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
-            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
-          UsersToVisit.push_back(U2);
-          ElemNumToReduce /= 2;
-        } else
-          return false;
-      } else if (isa<ExtractElementInst>(U)) {
-        // At this moment we should have reduced all elements in the vector.
-        if (ElemNumToReduce != 1)
-          return false;
-
-        auto *Val = dyn_cast<ConstantInt>(U->getOperand(1));
-        if (!Val || !Val->isZero())
-          return false;
-
-        ReduxExtracted = true;
-      } else
-        return false;
-    }
-  }
-  return ReduxExtracted;
-}
-
-bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
-  BasicBlock *BB = Add->getParent();
-
-  auto *BO = dyn_cast<BinaryOperator>(Op);
-  if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() ||
-      BO->getParent() != BB)
+  auto *Mul = dyn_cast<BinaryOperator>(Op);
+  if (!Mul || Mul->getOpcode() != Instruction::Mul)
     return false;
 
-  Value *LHS = BO->getOperand(0);
-  Value *RHS = BO->getOperand(1);
+  Value *LHS = Mul->getOperand(0);
+  Value *RHS = Mul->getOperand(1);
 
   // LHS and RHS should be only used once or if they are the same then only
   // used twice. Only check this when SSE4.1 is enabled and we have zext/sext
@@ -219,7 +101,7 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
   auto CanShrinkOp = [&](Value *Op) {
     auto IsFreeTruncation = [&](Value *Op) {
       if (auto *Cast = dyn_cast<CastInst>(Op)) {
-        if (Cast->getParent() == BB &&
+        if (Cast->getParent() == Mul->getParent() &&
             (Cast->getOpcode() == Instruction::SExt ||
              Cast->getOpcode() == Instruction::ZExt) &&
             Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16)
@@ -232,16 +114,16 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
     // If the operation can be freely truncated and has enough sign bits we
     // can shrink.
     if (IsFreeTruncation(Op) &&
-        ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+        ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
       return true;
 
     // SelectionDAG has limited support for truncating through an add or sub if
     // the inputs are freely truncatable.
     if (auto *BO = dyn_cast<BinaryOperator>(Op)) {
-      if (BO->getParent() == BB &&
+      if (BO->getParent() == Mul->getParent() &&
           IsFreeTruncation(BO->getOperand(0)) &&
           IsFreeTruncation(BO->getOperand(1)) &&
-          ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+          ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
         return true;
     }
 
@@ -252,7 +134,7 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
   if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS))
     return false;
 
-  IRBuilder<> Builder(Add);
+  IRBuilder<> Builder(Mul);
 
   auto *MulTy = cast<VectorType>(Op->getType());
   unsigned NumElts = MulTy->getNumElements();
@@ -266,8 +148,11 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
     EvenMask[i] = i * 2;
     OddMask[i] = i * 2 + 1;
   }
-  Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask);
-  Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask);
+  // Creating a new mul so the replaceAllUsesWith below doesn't replace the
+  // uses in the shuffles we're creating.
+  Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1));
+  Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask);
+  Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask);
   Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
 
   // Concatenate zeroes to extend back to the original type.
@@ -276,34 +161,21 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
   Value *Zero = Constant::getNullValue(MAdd->getType());
   Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
 
-  // Replaces the use of mul in the original Add with the pmaddwd and zeroes.
-  Add->replaceUsesOfWith(BO, Concat);
-  Add->setHasNoSignedWrap(false);
-  Add->setHasNoUnsignedWrap(false);
+  Mul->replaceAllUsesWith(Concat);
+  Mul->eraseFromParent();
 
   return true;
 }
 
-// Try to replace operans of this add with pmaddwd patterns.
-bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) {
+bool X86PartialReduction::trySADReplacement(Instruction *Op) {
   if (!ST->hasSSE2())
     return false;
 
-  // Need at least 8 elements.
-  if (cast<VectorType>(BO->getType())->getNumElements() < 8)
-    return false;
-
-  // Element type should be i32.
-  if (!cast<VectorType>(BO->getType())->getElementType()->isIntegerTy(32))
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
+  if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
     return false;
 
-  bool Changed = false;
-  Changed |= tryMAddReplacement(BO->getOperand(0), BO);
-  Changed |= tryMAddReplacement(BO->getOperand(1), BO);
-  return Changed;
-}
-
-bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
   // Operand should be a select.
   auto *SI = dyn_cast<SelectInst>(Op);
   if (!SI)
@@ -337,7 +209,7 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
   if (!Op0 || !Op1)
     return false;
 
-  IRBuilder<> Builder(Add);
+  IRBuilder<> Builder(SI);
 
   auto *OpTy = cast<VectorType>(Op->getType());
   unsigned NumElts = OpTy->getNumElements();
@@ -355,7 +227,7 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
     IntrinsicNumElts = 16;
   }
 
-  Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID);
+  Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID);
 
   if (NumElts < 16) {
     // Pad input with zeroes.
@@ -419,27 +291,155 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
     Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
   }
 
-  // Replaces the uses of Op in Add with the new sequence.
-  Add->replaceUsesOfWith(Op, Ops[0]);
-  Add->setHasNoSignedWrap(false);
-  Add->setHasNoUnsignedWrap(false);
+  SI->replaceAllUsesWith(Ops[0]);
+  SI->eraseFromParent();
 
   return true;
 }
 
-bool X86PartialReduction::trySADPattern(BinaryOperator *BO) {
-  if (!ST->hasSSE2())
-    return false;
+// Walk backwards from the ExtractElementInst and determine if it is the end of
+// a horizontal reduction. Return the input to the reduction if we find one.
+static Value *matchAddReduction(const ExtractElementInst &EE) {
+  // Make sure we're extracting index 0.
+  auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand());
+  if (!Index || !Index->isNullValue())
+    return nullptr;
 
-  // TODO: There's nothing special about i32, any integer type above i16 should
-  // work just as well.
-  if (!cast<VectorType>(BO->getType())->getElementType()->isIntegerTy(32))
+  const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand());
+  if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse())
+    return nullptr;
+
+  unsigned NumElems = cast<VectorType>(BO->getType())->getNumElements();
+  // Ensure the reduction size is a power of 2.
+  if (!isPowerOf2_32(NumElems))
+    return nullptr;
+
+  const Value *Op = BO;
+  unsigned Stages = Log2_32(NumElems);
+  for (unsigned i = 0; i != Stages; ++i) {
+    const auto *BO = dyn_cast<BinaryOperator>(Op);
+    if (!BO || BO->getOpcode() != Instruction::Add)
+      return nullptr;
+
+    // If this isn't the first add, then it should only have 2 users, the
+    // shuffle and another add which we checked in the previous iteration.
+    if (i != 0 && !BO->hasNUses(2))
+      return nullptr;
+
+    Value *LHS = BO->getOperand(0);
+    Value *RHS = BO->getOperand(1);
+
+    auto *Shuffle = dyn_cast<ShuffleVectorInst>(LHS);
+    if (Shuffle) {
+      Op = RHS;
+    } else {
+      Shuffle = dyn_cast<ShuffleVectorInst>(RHS);
+      Op = LHS;
+    }
+
+    // The first operand of the shuffle should be the same as the other operand
+    // of the bin op.
+    if (!Shuffle || Shuffle->getOperand(0) != Op)
+      return nullptr;
+
+    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+    unsigned MaskEnd = 1 << i;
+    for (unsigned Index = 0; Index < MaskEnd; ++Index)
+      if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index))
+        return nullptr;
+  }
+
+  return const_cast<Value *>(Op);
+}
+
+// See if this BO is reachable from this Phi by walking forward through single
+// use BinaryOperators with the same opcode. If we get back then we know we've
+// found a loop and it is safe to step through this Add to find more leaves.
+static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) {
+  // The PHI itself should only have one use.
+  if (!Phi->hasOneUse())
     return false;
 
-  bool Changed = false;
-  Changed |= trySADReplacement(BO->getOperand(0), BO);
-  Changed |= trySADReplacement(BO->getOperand(1), BO);
-  return Changed;
+  Instruction *U = cast<Instruction>(*Phi->user_begin());
+  if (U == BO)
+    return true;
+
+  while (U->hasOneUse() && U->getOpcode() == BO->getOpcode())
+    U = cast<Instruction>(*U->user_begin());
+
+  return U == BO;
+}
+
+// Collect the leaves of the tree of adds that feeds the horizontal reduction
+// into Leaves. Root is the Value consumed by the horizontal reduction. We look
+// through single use phis, single use adds, or adds that are also used by a
+// phi that forms a loop with the add.
+static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
+  SmallPtrSet<Value *, 8> Visited;
+  SmallVector<Value *, 8> Worklist;
+  Worklist.push_back(Root);
+
+  while (!Worklist.empty()) {
+    Value *V = Worklist.pop_back_val();
+    if (!Visited.insert(V).second)
+      continue;
+
+    if (auto *PN = dyn_cast<PHINode>(V)) {
+      // PHI node should have single use unless it is the root node, then it
+      // has 2 uses.
+      if (!PN->hasNUses(PN == Root ? 2 : 1))
+        break;
+
+      // Push incoming values to the worklist.
+      for (Value *InV : PN->incoming_values())
+        Worklist.push_back(InV);
+
+      continue;
+    }
+
+    if (auto *BO = dyn_cast<BinaryOperator>(V)) {
+      if (BO->getOpcode() == Instruction::Add) {
+        // Simple case. Single use, just push its operands to the worklist.
+        if (BO->hasNUses(BO == Root ? 2 : 1)) {
+          for (Value *Op : BO->operands())
+            Worklist.push_back(Op);
+          continue;
+        }
+
+        // If there is an additional use, make sure it is an unvisited phi
+        // user of this add (not of Root) that gets us back to this node.
+        if (BO->hasNUses(BO == Root ? 3 : 2)) {
+          PHINode *PN = nullptr;
+          for (auto *U : BO->users())
+            if (auto *P = dyn_cast<PHINode>(U))
+              if (!Visited.count(P))
+                PN = P;
+
+          // If we didn't find a 2-input PHI then this isn't a case we can
+          // handle.
+          if (!PN || PN->getNumIncomingValues() != 2)
+            continue;
+
+          // Walk forward from this phi to see if it reaches back to this add.
+          if (!isReachableFromPHI(PN, BO))
+            continue;
+
+          // The phi forms a loop with this Add, push its operands.
+          for (Value *Op : BO->operands())
+            Worklist.push_back(Op);
+        }
+      }
+    }
+
+    // Not an add or phi, make it a leaf.
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      if (!V->hasNUses(I == Root ? 2 : 1))
+        continue;
+
+      // Add this as a leaf.
+      Leaves.push_back(I);
+    }
+  }
 }
 
 bool X86PartialReduction::runOnFunction(Function &F) {
@@ -458,22 +458,29 @@ bool X86PartialReduction::runOnFunction(Function &F) {
   bool MadeChange = false;
   for (auto &BB : F) {
     for (auto &I : BB) {
-      auto *BO = dyn_cast<BinaryOperator>(&I);
-      if (!BO)
+      auto *EE = dyn_cast<ExtractElementInst>(&I);
+      if (!EE)
         continue;
 
-      if (!isVectorReductionOp(*BO))
+      // First find a reduction tree.
+      // FIXME: Do we need to handle other opcodes than Add?
+      Value *Root = matchAddReduction(*EE);
+      if (!Root)
         continue;
 
-      if (BO->getOpcode() == Instruction::Add) {
-        if (tryMAddPattern(BO)) {
+      SmallVector<Instruction *, 8> Leaves;
+      collectLeaves(Root, Leaves);
+
+      for (Instruction *I : Leaves) {
+        if (tryMAddReplacement(I)) {
           MadeChange = true;
           continue;
         }
-        if (trySADPattern(BO)) {
+
+        // Don't do SAD matching on the root node. SelectionDAG already
+        // has support for that and currently generates better code.
+        if (I != Root && trySADReplacement(I))
           MadeChange = true;
-          continue;
-        }
       }
     }
   }
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index d6d04d9b12841..6109bd25c69e2 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2657,9 +2657,9 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX-LABEL: madd_double_reduction:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu (%rdx), %xmm1
 ; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -2720,9 +2720,9 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu (%rdx), %xmm1
 ; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu (%r8), %xmm2
 ; AVX-NEXT:    vpmaddwd (%r9), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 006dd3d5ff178..f55a58048e227 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -1061,9 +1061,9 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX-LABEL: sad_double_reduction:
 ; AVX:       # %bb.0: # %bb
 ; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu (%rdx), %xmm1
 ; AVX-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0

From 403d5a5e351956e950fdb8bba07f804fb7d52742 Mon Sep 17 00:00:00 2001
From: Hubert Tong <hubert.reinterpretcast@gmail.com>
Date: Sun, 31 May 2020 16:33:42 -0400
Subject: [PATCH 684/770] [test][compiler-rt] Avoid LD_PRELOAD for "outer"
 dynamic linkers

Summary:
This patch moves the setting of `LD_PRELOAD` "inwards" to avoid issues
where the built library needs to be loaded with the dynamic linker that
was configured with the build (and cannot, for example, be loaded by the
dynamic linker associated with the `env` utility).

Reviewed By: vitalybuka, nemanjai, jsji

Differential Revision: https://reviews.llvm.org/D79695
---
 .../asan/TestCases/Linux/preinstalled_signal.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp b/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp
index 2b50944c6f2f6..71929fdd9b37f 100644
--- a/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp
+++ b/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp
@@ -1,16 +1,16 @@
 // RUN: %clangxx -std=c++11 %s -o %t
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s
 
 // RUN: %clangxx -std=c++11 -DTEST_INSTALL_SIG_HANDLER %s -o %t
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HANDLER
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=handle_segv=0 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HANDLER
+// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s
 
 // RUN: %clangxx -std=c++11 -DTEST_INSTALL_SIG_ACTION %s -o %t
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-ACTION
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s
-// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=handle_segv=0 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-ACTION
+// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s
 
 // REQUIRES: asan-dynamic-runtime
 

From c15d5d12c625df52bf82828a6af5ef2dfb6b4533 Mon Sep 17 00:00:00 2001
From: Hubert Tong <hubert.reinterpretcast@gmail.com>
Date: Sun, 31 May 2020 16:38:10 -0400
Subject: [PATCH 685/770] [Driver] NFC: Use Twine temp to replace std::string
 local

This patch replaces a `std::string` local used for a concatenation with
a `Twine` where the string was being passed into a call.
---
 clang/lib/Driver/ToolChains/Gnu.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9a340142a2428..ac9eb46dacb51 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -449,10 +449,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
       CmdArgs.push_back("-export-dynamic");
 
     if (!Args.hasArg(options::OPT_shared) && !IsStaticPIE) {
-      const std::string Loader =
-          D.DyldPrefix + ToolChain.getDynamicLinker(Args);
       CmdArgs.push_back("-dynamic-linker");
-      CmdArgs.push_back(Args.MakeArgString(Loader));
+      CmdArgs.push_back(Args.MakeArgString(Twine(D.DyldPrefix) +
+                                           ToolChain.getDynamicLinker(Args)));
     }
   }
 

From 77e1181df446b54391acad08512b540e174cf6e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kirst=C3=B3f=20Umann?= <dkszelethus@gmail.com>
Date: Sun, 31 May 2020 21:22:35 +0200
Subject: [PATCH 686/770] [analyzer] Add dumps to CheckerRegistry

---
 .../StaticAnalyzer/Frontend/CheckerRegistry.h | 10 +++
 .../Frontend/CheckerRegistry.cpp              | 61 +++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h
index 4e98ba2e10d23..c3494d0ebeefd 100644
--- a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h
+++ b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h
@@ -13,6 +13,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstddef>
 #include <vector>
 
@@ -133,6 +134,9 @@ class CheckerRegistry {
               DevelopmentStatus == "released") &&
              "Invalid development status!");
     }
+
+    LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); }
+    LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const;
   };
 
   using CmdLineOptionList = llvm::SmallVector<CmdLineOption, 0>;
@@ -189,6 +193,9 @@ class CheckerRegistry {
 
     // Used for lower_bound.
     explicit CheckerInfo(StringRef FullName) : FullName(FullName) {}
+
+    LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); }
+    LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const;
   };
 
   using StateFromCmdLine = CheckerInfo::StateFromCmdLine;
@@ -206,6 +213,9 @@ class CheckerRegistry {
     }
 
     explicit PackageInfo(StringRef FullName) : FullName(FullName) {}
+
+    LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); }
+    LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const;
   };
 
   using PackageInfoList = llvm::SmallVector<PackageInfo, 0>;
diff --git a/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp b/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp
index 62ac1ed252dd1..f4d5db1e7a4b0 100644
--- a/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp
+++ b/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp
@@ -27,6 +27,10 @@ using namespace clang;
 using namespace ento;
 using llvm::sys::DynamicLibrary;
 
+//===----------------------------------------------------------------------===//
+// Utilities.
+//===----------------------------------------------------------------------===//
+
 using RegisterCheckersFn = void (*)(CheckerRegistry &);
 
 static bool isCompatibleAPIVersion(const char *VersionString) {
@@ -86,6 +90,63 @@ static bool isInPackage(const CheckerRegistry::CheckerInfo &Checker,
   return false;
 }
 
+//===----------------------------------------------------------------------===//
+// Methods of CmdLineOption, PackageInfo and CheckerInfo.
+//===----------------------------------------------------------------------===//
+
+LLVM_DUMP_METHOD void
+CheckerRegistry::CmdLineOption::dumpToStream(llvm::raw_ostream &Out) const {
+  // The description can be just checked in Checkers.inc, the point here is to
+  // debug whether we succeeded in parsing it. The default value is quoted.
+  Out << OptionName << " (" << OptionType << ", "
+      << (IsHidden ? "hidden, " : "") << DevelopmentStatus << ") default: \""
+      << DefaultValStr << '"';
+}
+
+static StringRef toString(CheckerRegistry::StateFromCmdLine Kind) {
+  switch (Kind) {
+  case CheckerRegistry::StateFromCmdLine::State_Disabled:
+    return "Disabled";
+  case CheckerRegistry::StateFromCmdLine::State_Enabled:
+    return "Enabled";
+  case CheckerRegistry::StateFromCmdLine::State_Unspecified:
+    return "Unspecified";
+  }
+  llvm_unreachable("Unhandled CheckerRegistry::StateFromCmdLine enum");
+}
+
+LLVM_DUMP_METHOD void
+CheckerRegistry::CheckerInfo::dumpToStream(llvm::raw_ostream &Out) const {
+  // The description can be just checked in Checkers.inc, the point here is to
+  // debug whether we succeeded in parsing it. Same with documentation uri.
+  Out << FullName << " (" << toString(State) << (IsHidden ? ", hidden" : "")
+      << ")\n";
+  Out << "  Options:\n";
+  for (const CmdLineOption &Option : CmdLineOptions) {
+    Out << "    ";
+    Option.dumpToStream(Out);
+    Out << '\n';
+  }
+  Out << "  Dependencies:\n";
+  for (const CheckerInfo *Dependency : Dependencies)
+    Out << "    " << Dependency->FullName << '\n';
+}
+
+// Debug printer for a package: its full name, then each of its command line
+// options on a separate, indented line.
+LLVM_DUMP_METHOD void
+CheckerRegistry::PackageInfo::dumpToStream(llvm::raw_ostream &Out) const {
+  Out << FullName << "\n  Options:\n";
+  for (const CmdLineOption &Opt : CmdLineOptions) {
+    Opt.dumpToStream(Out << "    ");
+    Out << '\n';
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Methods of CheckerRegistry.
+//===----------------------------------------------------------------------===//
+
 CheckerRegistry::CheckerInfoListRange
 CheckerRegistry::getMutableCheckersForCmdLineArg(StringRef CmdLineArg) {
   auto It = binaryFind(Checkers, CmdLineArg);

From 92448fd23daf966fe368eb8523d9c5a31797d5d8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Sun, 31 May 2020 17:15:14 -0700
Subject: [PATCH 687/770] [Driver] Simplify Linux::addProfileRTLibs

---
 clang/lib/Driver/ToolChains/Linux.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 99c23e71eb0cc..8188c972f4466 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -867,13 +867,9 @@ SanitizerMask Linux::getSupportedSanitizers() const {
 
 void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
                              llvm::opt::ArgStringList &CmdArgs) const {
-  bool Profile = needsProfileRT(Args);
-  if (!Profile && !needsGCovInstrumentation(Args))
-    return;
-
   // Add linker option -u__llvm_profile_runtime to cause runtime
   // initialization module to be linked in.
-  if (Profile)
+  if (needsProfileRT(Args))
     CmdArgs.push_back(Args.MakeArgString(
         Twine("-u", llvm::getInstrProfRuntimeHookVarName())));
   ToolChain::addProfileRTLibs(Args, CmdArgs);

From a8ca0ec267050f9ded865a729d50c2c0eb078b7e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 31 May 2020 19:58:55 -0400
Subject: [PATCH 688/770] AMDGPU/GlobalISel: Add stub reg-bank aware combiner
 pass

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   2 +
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |   5 +
 .../Target/AMDGPU/AMDGPURegBankCombiner.cpp   | 153 ++++++++++++++++++
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   3 +
 4 files changed, 163 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 66ba6e1d1e69e..88c79665be60d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -33,6 +33,8 @@ void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &);
 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone);
 void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &);
 FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone);
+void initializeAMDGPURegBankCombinerPass(PassRegistry &);
 
 // R600 Passes
 FunctionPass *createR600VectorRegMerger();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 981aca5ab0a78..faaf9168d0dd8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -62,3 +62,8 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
    uchar_to_float, cvt_f32_ubyteN]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
 }
+
+def AMDGPURegBankCombinerHelper : GICombinerHelper<
+  "AMDGPUGenRegBankCombinerHelper", []> {
+  let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
new file mode 100644
index 0000000000000..18c58c6ff5db7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -0,0 +1,153 @@
+//=== lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// after register banks are known.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-regbank-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPURegBankCombinerInfo : public CombinerInfo {
+  GISelKnownBits *KB;
+  MachineDominatorTree *MDT;
+
+public:
+  AMDGPUGenRegBankCombinerHelper Generated;
+
+  AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+                                  const AMDGPULegalizerInfo *LI,
+                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
+      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+        KB(KB), MDT(MDT) {
+    if (!Generated.parseCommandLineOption())
+      report_fatal_error("Invalid rule identifier");
+  }
+
+  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+               MachineIRBuilder &B) const override;
+};
+
+// Offers MI to the tablegen-generated rule set; the result is whatever
+// tryCombineAll reports for this instruction.
+bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer,
+                                        MachineInstr &MI,
+                                        MachineIRBuilder &B) const {
+  CombinerHelper Helper(Observer, B, KB, MDT);
+
+  const bool Changed = Generated.tryCombineAll(Observer, MI, B, Helper);
+  return Changed;
+}
+
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPURegBankCombiner : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPURegBankCombiner(bool IsOptNone = false);
+
+  StringRef getPassName() const override {
+    return "AMDGPURegBankCombiner";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+  bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetPassConfig>();
+  AU.setPreservesCFG();
+  getSelectionDAGFallbackAnalysisUsage(AU);
+  AU.addRequired<GISelKnownBitsAnalysis>();
+  AU.addPreserved<GISelKnownBitsAnalysis>();
+  if (!IsOptNone) {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+  }
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
+  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+  initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+// Runs the combiner over MF and returns Combiner::combineMachineInstrs'
+// result; functions flagged FailedISel are skipped and reported unchanged.
+bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
+  const MachineFunctionProperties &Props = MF.getProperties();
+  if (Props.hasProperty(MachineFunctionProperties::Property::FailedISel))
+    return false;
+
+  const Function &Fn = MF.getFunction();
+  const bool EnableOpt =
+      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(Fn);
+  const auto *LegalInfo = static_cast<const AMDGPULegalizerInfo *>(
+      MF.getSubtarget<GCNSubtarget>().getLegalizerInfo());
+  // getAnalysisUsage only requires the dominator tree when !IsOptNone.
+  MachineDominatorTree *DomTree =
+      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+  GISelKnownBits *KnownBits = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+  AMDGPURegBankCombinerInfo Info(EnableOpt, Fn.hasOptSize(), Fn.hasMinSize(),
+                                 LegalInfo, KnownBits, DomTree);
+  Combiner C(Info, &getAnalysis<TargetPassConfig>());
+  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPURegBankCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE,
+                      "Combine AMDGPU machine instrs after regbankselect",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE,
+                    "Combine AMDGPU machine instrs after regbankselect", false,
+                    false)
+
+namespace llvm {
+FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) {
+  return new AMDGPURegBankCombiner(IsOptNone);
+}
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index e82c98d4b5fdc..1f6b37cf25fe2 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -19,6 +19,8 @@ tablegen(LLVM AMDGPUGenPreLegalizeGICombiner.inc -gen-global-isel-combiner
               -combiners="AMDGPUPreLegalizerCombinerHelper")
 tablegen(LLVM AMDGPUGenPostLegalizeGICombiner.inc -gen-global-isel-combiner
               -combiners="AMDGPUPostLegalizerCombinerHelper")
+tablegen(LLVM AMDGPUGenRegBankGICombiner.inc -gen-global-isel-combiner
+              -combiners="AMDGPURegBankCombinerHelper")
 
 set(LLVM_TARGET_DEFINITIONS R600.td)
 tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer)
@@ -67,6 +69,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPreLegalizerCombiner.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPUPropagateAttributes.cpp
+  AMDGPURegBankCombiner.cpp
   AMDGPURegisterBankInfo.cpp
   AMDGPURewriteOutArguments.cpp
   AMDGPUSubtarget.cpp

From 216bad9a64ebfac51d36210738d2b9aa3de69511 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Sun, 31 May 2020 22:04:35 -0400
Subject: [PATCH 689/770] [gn build] (semi-manually) port a8ca0ec2670

---
 .../gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn      | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index f44a40f6a64c7..8f554673f752d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -42,6 +42,15 @@ tablegen("AMDGPUGenPostLegalizeGICombiner") {
   td_file = "AMDGPUGISel.td"
 }
 
+tablegen("AMDGPUGenRegBankGICombiner") {
+  visibility = [ ":LLVMAMDGPUCodeGen" ]
+  args = [
+    "-gen-global-isel-combiner",
+    "-combiners=AMDGPURegBankCombinerHelper",
+  ]
+  td_file = "AMDGPUGISel.td"
+}
+
 tablegen("AMDGPUGenMCPseudoLowering") {
   visibility = [ ":LLVMAMDGPUCodeGen" ]
   args = [ "-gen-pseudo-lowering" ]
@@ -81,6 +90,7 @@ static_library("LLVMAMDGPUCodeGen") {
     ":AMDGPUGenMCPseudoLowering",
     ":AMDGPUGenPostLegalizeGICombiner",
     ":AMDGPUGenPreLegalizeGICombiner",
+    ":AMDGPUGenRegBankGICombiner",
     ":AMDGPUGenRegisterBank",
     ":R600GenCallingConv",
     ":R600GenDAGISel",
@@ -141,6 +151,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUPrintfRuntimeBinding.cpp",
     "AMDGPUPromoteAlloca.cpp",
     "AMDGPUPropagateAttributes.cpp",
+    "AMDGPURegBankCombiner.cpp",
     "AMDGPURegisterBankInfo.cpp",
     "AMDGPURewriteOutArguments.cpp",
     "AMDGPUSubtarget.cpp",

From 3101601b54fbb8062c179e804974b9fb4e2b7c19 Mon Sep 17 00:00:00 2001
From: Li Rong Yi <esme.yi@ibm.com>
Date: Mon, 1 Jun 2020 02:29:18 +0000
Subject: [PATCH 690/770] [PowerPC] Exploit vabsd on P9

Summary: Exploit vabsd* for absolute difference of vectors on P9,
for example:
void foo (char *restrict p, char *restrict q, char *restrict t)
{
  for (int i = 0; i < 16; i++)
     t[i] = abs (p[i] - q[i]);
}
this case should be matched to the HW instruction vabsdub.

Reviewed By: steven.zhang

Differential Revision: https://reviews.llvm.org/D80271
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 18 +++++++++
 llvm/test/CodeGen/PowerPC/vec_absd.ll       | 42 +++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7e0cbbff2515c..0ede11ee285a2 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -16053,6 +16053,24 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
   SDLoc dl(N);
   SDValue Op0 = N->getOperand(0);
 
+  // fold (truncate (abs (sub (zext a), (zext b)))) -> (vabsd a, b)
+  if (Subtarget.hasP9Altivec() && Op0.getOpcode() == ISD::ABS) {
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+      return SDValue();
+    SDValue Sub = Op0.getOperand(0);
+    auto IsZExtFromVT = [VT](SDValue V) {
+      return V.getOpcode() == ISD::ZERO_EXTEND &&
+             V.getOperand(0).getValueType() == VT;
+    }; // a and b must have type VT so VABSD operands match its result.
+    if (Sub.getOpcode() == ISD::SUB && IsZExtFromVT(Sub.getOperand(0)) &&
+        IsZExtFromVT(Sub.getOperand(1)))
+      return DCI.DAG.getNode(PPCISD::VABSD, dl, VT,
+                             Sub.getOperand(0).getOperand(0),
+                             Sub.getOperand(1).getOperand(0),
+                             DCI.DAG.getTargetConstant(0, dl, MVT::i32));
+  }
+
   // Looking for a truncate of i128 to i64.
   if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
     return SDValue();
diff --git a/llvm/test/CodeGen/PowerPC/vec_absd.ll b/llvm/test/CodeGen/PowerPC/vec_absd.ll
index 268587bb2eaf6..1ae3ee5d2590a 100644
--- a/llvm/test/CodeGen/PowerPC/vec_absd.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_absd.ll
@@ -38,3 +38,45 @@ entry:
 ; CHECK: vabsduw 2, 2, 3
 ; CHECK: blr
 }
+
+define <16 x i8> @test_vabsdub(<16 x i8> %0, <16 x i8> %1) {
+entry:
+  %2 = zext <16 x i8> %0 to <16 x i32>
+  %3 = zext <16 x i8> %1 to <16 x i32>
+  %4 = sub nsw <16 x i32> %2, %3
+  %5 = icmp slt <16 x i32> %4, zeroinitializer
+  %6 = sub nsw <16 x i32> zeroinitializer, %4
+  %7 = select <16 x i1> %5, <16 x i32> %6, <16 x i32> %4
+  %8 = trunc <16 x i32> %7 to <16 x i8>
+  ret <16 x i8> %8
+; CHECK-LABEL: @test_vabsdub
+; CHECK: vabsdub 2, 2, 3
+; CHECK: blr
+}
+
+define <8 x i16> @test_vabsduh(<8 x i16> %0, <8 x i16> %1) {
+entry:
+  %2 = zext <8 x i16> %0 to <8 x i32>
+  %3 = zext <8 x i16> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = icmp slt <8 x i32> %4, zeroinitializer
+  %6 = sub nsw <8 x i32> zeroinitializer, %4
+  %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  ret <8 x i16> %8
+; CHECK-LABEL: @test_vabsduh
+; CHECK: vabsduh 2, 2, 3
+; CHECK: blr
+}
+
+define <4 x i32> @test_vabsduw(<4 x i32> %0, <4 x i32> %1) {
+entry:
+  %2 = sub nsw <4 x i32> %0, %1
+  %3 = icmp slt <4 x i32> %2, zeroinitializer
+  %4 = sub nsw <4 x i32> zeroinitializer, %2
+  %5 = select <4 x i1> %3, <4 x i32> %4, <4 x i32> %2
+  ret <4 x i32> %5
+; CHECK-LABEL: @test_vabsduw
+; CHECK: vabsduw 2, 2, 3
+; CHECK: blr
+}

From 2a24d350dbeacb131af91e8c438fed2bd81698c0 Mon Sep 17 00:00:00 2001
From: Chen Zheng <czhengsz@cn.ibm.com>
Date: Tue, 26 May 2020 22:39:37 -0400
Subject: [PATCH 691/770] [MachineCombine] add a hook for resource length limit

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 3 +++
 llvm/lib/CodeGen/MachineCombiner.cpp        | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 8c6d845215948..709030b620768 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1093,6 +1093,9 @@ class TargetInstrInfo : public MCInstrInfo {
                       SmallVectorImpl<MachineInstr *> &DelInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
 
+  /// The limit on resource length extension we accept in MachineCombiner Pass.
+  virtual int getExtendResourceLenLimit() const { return 0; }
+
   /// This is an architecture-specific helper function of reassociateOps.
   /// Set special operand attributes for new instructions after reassociation.
   virtual void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index 73895bdf834f5..34087d0491bdd 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -406,12 +406,14 @@ bool MachineCombiner::preservesResourceLen(
                     << ResLenBeforeCombine
                     << " and after: " << ResLenAfterCombine << "\n";);
   LLVM_DEBUG(
-      ResLenAfterCombine <= ResLenBeforeCombine
+      ResLenAfterCombine <=
+      ResLenBeforeCombine + TII->getExtendResourceLenLimit()
           ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
           : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                       "Length\n");
 
-  return ResLenAfterCombine <= ResLenBeforeCombine;
+  return ResLenAfterCombine <=
+         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
 }
 
 /// \returns true when new instruction sequence should be generated

From af38074874c605f9e598ae3f7e5d4befa3fe92bb Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Thu, 28 May 2020 17:50:31 +0200
Subject: [PATCH 692/770] Fix strict aliasing warning in msan.cpp

Use internal_memcpy instead.

Differential Revision: https://reviews.llvm.org/D80732
---
 compiler-rt/lib/msan/msan.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp
index 7095ee1bf20f2..8c789901adced 100644
--- a/compiler-rt/lib/msan/msan.cpp
+++ b/compiler-rt/lib/msan/msan.cpp
@@ -617,34 +617,41 @@ u32 __msan_get_umr_origin() {
 }
 
 u16 __sanitizer_unaligned_load16(const uu16 *p) {
-  *(uu16 *)&__msan_retval_tls[0] = *(uu16 *)MEM_TO_SHADOW((uptr)p);
+  internal_memcpy(&__msan_retval_tls[0], (void *)MEM_TO_SHADOW((uptr)p),
+                  sizeof(uu16));
   if (__msan_get_track_origins())
     __msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
   return *p;
 }
 u32 __sanitizer_unaligned_load32(const uu32 *p) {
-  *(uu32 *)&__msan_retval_tls[0] = *(uu32 *)MEM_TO_SHADOW((uptr)p);
+  internal_memcpy(&__msan_retval_tls[0], (void *)MEM_TO_SHADOW((uptr)p),
+                  sizeof(uu32));
   if (__msan_get_track_origins())
     __msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
   return *p;
 }
 u64 __sanitizer_unaligned_load64(const uu64 *p) {
-  __msan_retval_tls[0] = *(uu64 *)MEM_TO_SHADOW((uptr)p);
+  internal_memcpy(&__msan_retval_tls[0], (void *)MEM_TO_SHADOW((uptr)p),
+                  sizeof(uu64));
   if (__msan_get_track_origins())
     __msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
   return *p;
 }
 void __sanitizer_unaligned_store16(uu16 *p, u16 x) {
-  u16 s = *(uu16 *)&__msan_param_tls[1];
-  *(uu16 *)MEM_TO_SHADOW((uptr)p) = s;
+  static_assert(sizeof(uu16) == sizeof(u16), "incompatible types");
+  u16 s;
+  internal_memcpy(&s, &__msan_param_tls[1], sizeof(uu16));
+  internal_memcpy((void *)MEM_TO_SHADOW((uptr)p), &s, sizeof(uu16));
   if (s && __msan_get_track_origins())
     if (uu32 o = __msan_param_origin_tls[2])
       SetOriginIfPoisoned((uptr)p, (uptr)&s, sizeof(s), o);
   *p = x;
 }
 void __sanitizer_unaligned_store32(uu32 *p, u32 x) {
-  u32 s = *(uu32 *)&__msan_param_tls[1];
-  *(uu32 *)MEM_TO_SHADOW((uptr)p) = s;
+  static_assert(sizeof(uu32) == sizeof(u32), "incompatible types");
+  u32 s;
+  internal_memcpy(&s, &__msan_param_tls[1], sizeof(uu32));
+  internal_memcpy((void *)MEM_TO_SHADOW((uptr)p), &s, sizeof(uu32));
   if (s && __msan_get_track_origins())
     if (uu32 o = __msan_param_origin_tls[2])
       SetOriginIfPoisoned((uptr)p, (uptr)&s, sizeof(s), o);

From 11efb0837c897c709ae162eb5ebabb460fc537ff Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Mon, 1 Jun 2020 07:49:19 +0200
Subject: [PATCH 693/770] Improve SmallPtrSetImpl::count implementation

Relying on the find method implies a roundtrip to the iterator world, which is
not costless because iterator creation involves a few checks to ensure the
iterator is in a valid position (through the SmallPtrSetIteratorImpl::AdvanceIfNotValid
method). It turns out that the result of SmallPtrSetImpl::find_imp is either
valid or the EndPointer, so there's no need to go through that abstraction,
and the compiler cannot guess it.

Differential Revision: https://reviews.llvm.org/D80708
---
 llvm/include/llvm/ADT/SmallPtrSet.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h
index 05073cf17f92a..0ab05cfe611aa 100644
--- a/llvm/include/llvm/ADT/SmallPtrSet.h
+++ b/llvm/include/llvm/ADT/SmallPtrSet.h
@@ -372,7 +372,9 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase {
     return erase_imp(PtrTraits::getAsVoidPointer(Ptr));
   }
   /// count - Return 1 if the specified pointer is in the set, 0 otherwise.
-  size_type count(ConstPtrType Ptr) const { return find(Ptr) != end() ? 1 : 0; }
+  size_type count(ConstPtrType Ptr) const {
+    return find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)) != EndPointer();
+  }
   iterator find(ConstPtrType Ptr) const {
     return makeIterator(find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)));
   }

From b6d23f2efc64c226d30094bcc4258e0b63029da8 Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Mon, 1 Jun 2020 07:52:01 +0100
Subject: [PATCH 694/770] [ASTMatchers] Force c++ unittests to specify correct
 language standard

Force the unittests on c++ code for matchers to specify the correct standard.

Reviewed By: gribozavr2

Differential Revision: https://reviews.llvm.org/D80884
---
 clang/unittests/ASTMatchers/ASTMatchersTest.h |   8 +-
 .../ASTMatchers/ASTMatchersTraversalTest.cpp  | 237 +++++++++++-------
 2 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/clang/unittests/ASTMatchers/ASTMatchersTest.h b/clang/unittests/ASTMatchers/ASTMatchersTest.h
index 6af039e720586..8bf23a5aca19f 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTest.h
+++ b/clang/unittests/ASTMatchers/ASTMatchersTest.h
@@ -153,7 +153,7 @@ matchesConditionally(const std::string &Code, const T &AMatcher,
   }
 
   for (auto Mode : LangModes) {
-    std::string LangModeArg;
+    StringRef LangModeArg;
     switch (Mode) {
     case LanguageMode::Cxx11:
       LangModeArg = "-std=c++11";
@@ -171,8 +171,10 @@ matchesConditionally(const std::string &Code, const T &AMatcher,
       llvm_unreachable("Invalid language mode");
     }
 
-    auto Result =
-        matchesConditionally(Code, AMatcher, ExpectMatch, LangModeArg);
+    auto Result = matchesConditionally(Code, AMatcher, ExpectMatch,
+                                       {LangModeArg, "-Werror=c++14-extensions",
+                                        "-Werror=c++17-extensions",
+                                        "-Werror=c++20-extensions"});
     if (!Result)
       return Result;
   }
diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
index 6bd8fcf664988..93b0eefa676b7 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -950,10 +950,14 @@ TEST(TemplateTypeParmDecl, VarTemplatePartialSpecializationDecl) {
       "template<typename U>\n"
       "template<typename U2>\n"
       "int Struct<U>::field<U2*> = 123;\n";
-  EXPECT_TRUE(matches(input, templateTypeParmDecl(hasName("T"))));
-  EXPECT_TRUE(matches(input, templateTypeParmDecl(hasName("T2"))));
-  EXPECT_TRUE(matches(input, templateTypeParmDecl(hasName("U"))));
-  EXPECT_TRUE(matches(input, templateTypeParmDecl(hasName("U2"))));
+  EXPECT_TRUE(
+      matches(input, templateTypeParmDecl(hasName("T")), LanguageMode::Cxx14));
+  EXPECT_TRUE(
+      matches(input, templateTypeParmDecl(hasName("T2")), LanguageMode::Cxx14));
+  EXPECT_TRUE(
+      matches(input, templateTypeParmDecl(hasName("U")), LanguageMode::Cxx14));
+  EXPECT_TRUE(
+      matches(input, templateTypeParmDecl(hasName("U2")), LanguageMode::Cxx14));
 }
 
 TEST(TemplateTypeParmDecl, ClassTemplatePartialSpecializationDecl) {
@@ -2061,113 +2065,146 @@ void func14() {
 
 )cpp";
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func1"))),
-                                hasReturnValue(integerLiteral(equals(42)))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       returnStmt(forFunction(functionDecl(hasName("func1"))),
+                                  hasReturnValue(integerLiteral(equals(42))))),
+              LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     integerLiteral(equals(42),
-                                    hasParent(returnStmt(forFunction(
-                                        functionDecl(hasName("func1")))))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       integerLiteral(equals(42),
+                                      hasParent(returnStmt(forFunction(
+                                          functionDecl(hasName("func1"))))))),
+              LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
       Code,
       traverse(TK_IgnoreUnlessSpelledInSource,
                returnStmt(forFunction(functionDecl(hasName("func2"))),
                           hasReturnValue(cxxTemporaryObjectExpr(
-                              hasArgument(0, integerLiteral(equals(42)))))))));
+                              hasArgument(0, integerLiteral(equals(42))))))),
+      LanguageMode::Cxx2a));
   EXPECT_TRUE(matches(
       Code,
-      traverse(TK_IgnoreUnlessSpelledInSource,
-               integerLiteral(
-                   equals(42),
-                   hasParent(cxxTemporaryObjectExpr(hasParent(returnStmt(
-                       forFunction(functionDecl(hasName("func2")))))))))));
+      traverse(
+          TK_IgnoreUnlessSpelledInSource,
+          integerLiteral(equals(42),
+                         hasParent(cxxTemporaryObjectExpr(hasParent(returnStmt(
+                             forFunction(functionDecl(hasName("func2"))))))))),
+      LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func3"))),
-                                hasReturnValue(
-                                    cxxFunctionalCastExpr(hasSourceExpression(
-                                        integerLiteral(equals(42)))))))));
+      Code,
+      traverse(
+          TK_IgnoreUnlessSpelledInSource,
+          returnStmt(forFunction(functionDecl(hasName("func3"))),
+                     hasReturnValue(cxxFunctionalCastExpr(
+                         hasSourceExpression(integerLiteral(equals(42))))))),
+      LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
       Code,
-      traverse(TK_IgnoreUnlessSpelledInSource,
-               integerLiteral(
-                   equals(42),
-                   hasParent(cxxFunctionalCastExpr(hasParent(returnStmt(
-                       forFunction(functionDecl(hasName("func3")))))))))));
+      traverse(
+          TK_IgnoreUnlessSpelledInSource,
+          integerLiteral(equals(42),
+                         hasParent(cxxFunctionalCastExpr(hasParent(returnStmt(
+                             forFunction(functionDecl(hasName("func3"))))))))),
+      LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func4"))),
-                                hasReturnValue(cxxTemporaryObjectExpr())))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       returnStmt(forFunction(functionDecl(hasName("func4"))),
+                                  hasReturnValue(cxxTemporaryObjectExpr()))),
+              LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func5"))),
-                                hasReturnValue(cxxTemporaryObjectExpr())))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       returnStmt(forFunction(functionDecl(hasName("func5"))),
+                                  hasReturnValue(cxxTemporaryObjectExpr()))),
+              LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func6"))),
-                                hasReturnValue(cxxTemporaryObjectExpr())))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       returnStmt(forFunction(functionDecl(hasName("func6"))),
+                                  hasReturnValue(cxxTemporaryObjectExpr()))),
+              LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func7"))),
-                                hasReturnValue(cxxTemporaryObjectExpr())))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       returnStmt(forFunction(functionDecl(hasName("func7"))),
+                                  hasReturnValue(cxxTemporaryObjectExpr()))),
+              LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func8"))),
-                                hasReturnValue(cxxFunctionalCastExpr(
-                                    hasSourceExpression(initListExpr())))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       returnStmt(forFunction(functionDecl(hasName("func8"))),
+                                  hasReturnValue(cxxFunctionalCastExpr(
+                                      hasSourceExpression(initListExpr()))))),
+              LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func9"))),
-                                hasReturnValue(cxxFunctionalCastExpr(
-                                    hasSourceExpression(initListExpr())))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       returnStmt(forFunction(functionDecl(hasName("func9"))),
+                                  hasReturnValue(cxxFunctionalCastExpr(
+                                      hasSourceExpression(initListExpr()))))),
+              LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func10"))),
-                                hasReturnValue(
-                                    declRefExpr(to(varDecl(hasName("a")))))))));
+      Code,
+      traverse(
+          TK_IgnoreUnlessSpelledInSource,
+          returnStmt(forFunction(functionDecl(hasName("func10"))),
+                     hasReturnValue(declRefExpr(to(varDecl(hasName("a"))))))),
+      LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     declRefExpr(to(varDecl(hasName("a"))),
-                                 hasParent(returnStmt(forFunction(
-                                     functionDecl(hasName("func10")))))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       declRefExpr(to(varDecl(hasName("a"))),
+                                   hasParent(returnStmt(forFunction(
+                                       functionDecl(hasName("func10"))))))),
+              LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func11"))),
-                                hasReturnValue(
-                                    declRefExpr(to(varDecl(hasName("b")))))))));
+      Code,
+      traverse(
+          TK_IgnoreUnlessSpelledInSource,
+          returnStmt(forFunction(functionDecl(hasName("func11"))),
+                     hasReturnValue(declRefExpr(to(varDecl(hasName("b"))))))),
+      LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     declRefExpr(to(varDecl(hasName("b"))),
-                                 hasParent(returnStmt(forFunction(
-                                     functionDecl(hasName("func11")))))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       declRefExpr(to(varDecl(hasName("b"))),
+                                   hasParent(returnStmt(forFunction(
+                                       functionDecl(hasName("func11"))))))),
+              LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     returnStmt(forFunction(functionDecl(hasName("func12"))),
-                                hasReturnValue(
-                                    declRefExpr(to(varDecl(hasName("c")))))))));
+      Code,
+      traverse(
+          TK_IgnoreUnlessSpelledInSource,
+          returnStmt(forFunction(functionDecl(hasName("func12"))),
+                     hasReturnValue(declRefExpr(to(varDecl(hasName("c"))))))),
+      LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     declRefExpr(to(varDecl(hasName("c"))),
-                                 hasParent(returnStmt(forFunction(
-                                     functionDecl(hasName("func12")))))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       declRefExpr(to(varDecl(hasName("c"))),
+                                   hasParent(returnStmt(forFunction(
+                                       functionDecl(hasName("func12"))))))),
+              LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
       Code,
@@ -2178,13 +2215,16 @@ void func14() {
                      has(declRefExpr(to(varDecl(hasName("a"))))),
                      has(varDecl(hasName("b"), hasInitializer(declRefExpr(to(
                                                    varDecl(hasName("c"))))))),
-                     has(parmVarDecl(hasName("d")))))));
+                     has(parmVarDecl(hasName("d"))))),
+      LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     declRefExpr(to(varDecl(hasName("a"))),
-                                 hasParent(lambdaExpr(forFunction(
-                                     functionDecl(hasName("func13")))))))));
+  EXPECT_TRUE(
+      matches(Code,
+              traverse(TK_IgnoreUnlessSpelledInSource,
+                       declRefExpr(to(varDecl(hasName("a"))),
+                                   hasParent(lambdaExpr(forFunction(
+                                       functionDecl(hasName("func13"))))))),
+              LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
       Code,
@@ -2192,18 +2232,21 @@ void func14() {
                varDecl(hasName("b"),
                        hasInitializer(declRefExpr(to(varDecl(hasName("c"))))),
                        hasParent(lambdaExpr(
-                           forFunction(functionDecl(hasName("func13")))))))));
+                           forFunction(functionDecl(hasName("func13"))))))),
+      LanguageMode::Cxx2a));
 
   EXPECT_TRUE(matches(
-      Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                     lambdaExpr(
-                         forFunction(functionDecl(hasName("func14"))),
-                         has(templateTypeParmDecl(hasName("TemplateType")))))));
+      Code,
+      traverse(TK_IgnoreUnlessSpelledInSource,
+               lambdaExpr(forFunction(functionDecl(hasName("func14"))),
+                          has(templateTypeParmDecl(hasName("TemplateType"))))),
+      LanguageMode::Cxx2a));
 
-  EXPECT_TRUE(
-      matches(Code, traverse(TK_IgnoreUnlessSpelledInSource,
-                             functionDecl(hasName("func14"),
-                                          hasDescendant(floatLiteral())))));
+  EXPECT_TRUE(matches(
+      Code,
+      traverse(TK_IgnoreUnlessSpelledInSource,
+               functionDecl(hasName("func14"), hasDescendant(floatLiteral()))),
+      LanguageMode::Cxx2a));
 }
 
 TEST(IgnoringImpCasts, MatchesImpCasts) {

From 40a3fcb05c83c41862038277aa667c956e7cac82 Mon Sep 17 00:00:00 2001
From: Djordje Todorovic <djordje.todorovic@syrmia.com>
Date: Wed, 27 May 2020 13:58:21 +0200
Subject: [PATCH 695/770] [DebugInfo][CallSites] Remove decl subprograms from
 'retainedTypes:'

After D70350, the 'retainedTypes:' field is no longer used for call site
debug info for extern calls, so it is safe to delete the declaration
subprograms from it in the IR representation.
We are also adding a test to ensure the subprogram isn't stored within
the 'retainedTypes:' of the corresponding DICompileUnit.

Differential Revision: https://reviews.llvm.org/D80369
---
 clang/lib/CodeGen/CGDebugInfo.cpp              |  2 +-
 clang/test/CodeGen/debug-info-extern-call.c    |  6 ++++++
 clang/test/Modules/DebugInfoTransitiveImport.m |  4 ++--
 clang/test/Modules/ModuleDebugInfo.cpp         | 18 +++++++++---------
 clang/test/Modules/ModuleDebugInfo.m           | 15 +++------------
 5 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 31f8df2430176..5be8e77c0b497 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -3900,7 +3900,7 @@ void CGDebugInfo::EmitFunctionDecl(GlobalDecl GD, SourceLocation Loc,
   if (IsDeclForCallSite)
     Fn->setSubprogram(SP);
 
-  DBuilder.retainType(SP);
+  DBuilder.finalizeSubprogram(SP);
 }
 
 void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
diff --git a/clang/test/CodeGen/debug-info-extern-call.c b/clang/test/CodeGen/debug-info-extern-call.c
index 072e578b58986..7ba115ad2ec9e 100644
--- a/clang/test/CodeGen/debug-info-extern-call.c
+++ b/clang/test/CodeGen/debug-info-extern-call.c
@@ -1,6 +1,10 @@
 // When entry values are emitted, expect a subprogram for extern decls so that
 // the dwarf generator can describe call site parameters at extern call sites.
 //
+// The initial implementation relied on the 'retainedTypes:' from the
+// corresponding DICompileUnit, so we also ensure that we do not store the
+// extern declaration subprogram into the 'retainedTypes:'.
+//
 // RUN: %clang -g -O2 -target x86_64-none-linux-gnu -S -emit-llvm %s -o - \
 // RUN:   | FileCheck %s -check-prefix=DECLS-FOR-EXTERN
 
@@ -17,6 +21,8 @@
 // RUN: %clang -g -O2 -target x86_64-none-linux-gnu -gsce -S -emit-llvm %s -o - \
 // RUN:   | FileCheck %s -check-prefix=NO-DECLS-FOR-EXTERN
 
+// DECLS-FOR-EXTERN-NOT: !DICompileUnit({{.*}}retainedTypes: ![[RETTYPES:[0-9]+]]
+// DECLS-FOR-EXTERN-NOT: ![[RETTYPES]] = !{
 // DECLS-FOR-EXTERN: !DISubprogram(name: "fn1"
 // DECLS-FOR-EXTERN-NOT: !DISubprogram(name: "memcmp"
 // DECLS-FOR-EXTERN-NOT: !DISubprogram(name: "__some_reserved_name"
diff --git a/clang/test/Modules/DebugInfoTransitiveImport.m b/clang/test/Modules/DebugInfoTransitiveImport.m
index 08dfecfb78997..bd763e81cf851 100644
--- a/clang/test/Modules/DebugInfoTransitiveImport.m
+++ b/clang/test/Modules/DebugInfoTransitiveImport.m
@@ -12,10 +12,10 @@
 
 // Definition of left:
 // CHECK: !DICompileUnit({{.*}}dwoId:
-// CHECK: ![[LEFT:[0-9]+]] = !DIFile({{.*}}diamond_left.h
 // CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration,
-// CHECK-SAME:              entity: ![[MODULE:.*]], file: ![[LEFT]], line: 3)
+// CHECK-SAME:              entity: ![[MODULE:.*]], file: ![[LEFT:.*]], line: 3)
 // CHECK: ![[MODULE]] = !DIModule(scope: null, name: "diamond_top"
+// CHECK: ![[LEFT]] = !DIFile({{.*}}diamond_left.h
 
 // Skeleton for top:
 // CHECK: !DICompileUnit({{.*}}splitDebugFilename: {{.*}}diamond_top{{.*}}dwoId:
diff --git a/clang/test/Modules/ModuleDebugInfo.cpp b/clang/test/Modules/ModuleDebugInfo.cpp
index e6e99ed4e5379..3121719e55a66 100644
--- a/clang/test/Modules/ModuleDebugInfo.cpp
+++ b/clang/test/Modules/ModuleDebugInfo.cpp
@@ -51,15 +51,6 @@
 // CHECK-SAME:             )
 // CHECK: !DIEnumerator(name: "e5", value: 5, isUnsigned: true)
 
-// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "B",
-// no mangled name here yet.
-
-// This type is anchored by a function parameter.
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A<void>"
-// CHECK-SAME:             elements:
-// CHECK-SAME:             templateParams:
-// CHECK-SAME:             identifier: "_ZTSN8DebugCXX1AIJvEEE")
-
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Struct"
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX6StructE")
 
@@ -85,6 +76,12 @@
 // CHECK-SAME:             templateParams:
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIlNS_6traitsIlEEEE")
 
+// This type is anchored by a function parameter.
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A<void>"
+// CHECK-SAME:             elements:
+// CHECK-SAME:             templateParams:
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX1AIJvEEE")
+
 // CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "FloatInstantiation"
 // no mangled name here yet.
 
@@ -93,6 +90,9 @@
 // CHECK-SAME:             flags: DIFlagFwdDecl
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIfNS_6traitsIfEEEE")
 
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "B",
+// no mangled name here yet.
+
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Virtual"
 // CHECK-SAME:             elements:
 // CHECK-SAME:             identifier: "_ZTS7Virtual")
diff --git a/clang/test/Modules/ModuleDebugInfo.m b/clang/test/Modules/ModuleDebugInfo.m
index 9289abe080807..ed576e441e5da 100644
--- a/clang/test/Modules/ModuleDebugInfo.m
+++ b/clang/test/Modules/ModuleDebugInfo.m
@@ -36,19 +36,13 @@
 // CHECK-NOT:              name:
 // CHECK-SAME:             elements:
 
-// CHECK: !DISubprogram(name: "+[ObjCClass classMethod]",
-// CHECK-SAME:          scope: ![[MODULE]],
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "FwdDecl",
+// CHECK-SAME:             scope: ![[MODULE]],
 
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClass",
 // CHECK-SAME:             scope: ![[MODULE]],
 // CHECK-SAME:             elements
 
-// The forward declaration should not be in the module scope.
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "OpaqueData", file
-
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "FwdDecl",
-// CHECK-SAME:             scope: ![[MODULE]],
-
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClassWithPrivateIVars",
 // CHECK-SAME:             scope: ![[MODULE]],
 // CHECK-SAME:             elements
@@ -85,11 +79,8 @@
 // The output order is sublty different for module vs. pch,
 // so these are checked separately:
 //
-// CHECK2: !DISubprogram(name: "+[ObjCClass classMethod]"
-// CHECK2: !DISubprogram(name: "-[ObjCClass instanceMethodWithInt:]"
+// CHECK2: !DICompositeType(tag: DW_TAG_structure_type, name: "FwdDecl",
 // CHECK2: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClass",
 // CHECK2: !DIObjCProperty(name: "property",
 // CHECK2: !DIDerivedType(tag: DW_TAG_member, name: "ivar"
-// CHECK2: !DISubprogram(name: "-[Category(Category) categoryMethod]"
-// CHECK2: !DICompositeType(tag: DW_TAG_structure_type, name: "FwdDecl",
 // CHECK2: !DIDerivedType(tag: DW_TAG_typedef, name: "InnerEnum"

From dace8224f38a31636a02fe9c2af742222831f70c Mon Sep 17 00:00:00 2001
From: Tim Northover <t.p.northover@gmail.com>
Date: Wed, 22 Apr 2020 14:02:48 +0100
Subject: [PATCH 696/770] AArch64: materialize large stack offset into xzr
 correctly.

When a stack offset was too big to materialize in a single instruction, we were
trying to do it in stages:

    adds xD, sp, #imm
    adds xD, xD, #imm

Unfortunately, if xD is xzr then the second instruction doesn't exist and
wouldn't do what was needed if it did. Instead we can use a temporary register
for all but the last addition.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 17 ++++++++++-----
 llvm/test/CodeGen/AArch64/large-stack-cmp.ll | 23 ++++++++++++++++++++
 2 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/large-stack-cmp.ll

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index fd07c32e5496f..801f162937ede 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3332,6 +3332,10 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
 
   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+  Register TmpReg = DestReg;
+  if (TmpReg == AArch64::XZR)
+    TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
+        &AArch64::GPR64RegClass);
   do {
     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
     unsigned LocalShiftSize = 0;
@@ -3341,7 +3345,11 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
     }
     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
            "Encoding cannot handle value that big");
-    auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+
+    Offset -= ThisVal << LocalShiftSize;
+    if (Offset == 0)
+      TmpReg = DestReg;
+    auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
                    .addReg(SrcReg)
                    .addImm(Sign * (int)ThisVal);
     if (ShiftSize)
@@ -3362,8 +3370,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
               .addImm(Imm)
               .setMIFlag(Flag);
-        assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
-                                      "emit a single SEH directive");
+        assert(Offset == 0 && "Expected remaining offset to be zero to "
+                              "emit a single SEH directive");
       } else if (DestReg == AArch64::SP) {
         if (HasWinCFI)
           *HasWinCFI = true;
@@ -3376,8 +3384,7 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
         *HasWinCFI = true;
     }
 
-    SrcReg = DestReg;
-    Offset -= ThisVal << LocalShiftSize;
+    SrcReg = TmpReg;
   } while (Offset);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/large-stack-cmp.ll b/llvm/test/CodeGen/AArch64/large-stack-cmp.ll
new file mode 100644
index 0000000000000..68a76b79df930
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/large-stack-cmp.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-apple-ios %s -o - | FileCheck %s
+
+define void @foo() {
+; CHECK-LABEL: foo:
+; CHECK: adds [[TMP:x[0-9]+]], sp,
+; CHECK: cmn [[TMP]],
+
+%var = alloca i32, i32 12
+  %var2 = alloca i32, i32 1030
+  %tst = icmp eq i32* %var, null
+  br i1 %tst, label %true, label %false
+
+true:
+  call void @bar()
+  ret void
+
+false:
+  call void @baz()
+  ret void
+}
+
+declare void @bar()
+declare void @baz()

From e75efcc3c1af068a145ea83deb0435b195a44162 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Thu, 21 May 2020 13:57:41 +0300
Subject: [PATCH 697/770] [llvm-readobj] - Improve error reporting for hash
 tables.

This improves the following points for broken hash tables:

1) Use reportUniqueWarning to prevent duplication when
   --hash-table and --elf-hash-histogram are used together.

2) Dump the nbucket and nchain fields. It is often possible
   to dump them even when the table itself goes past the EOF etc.

Differential revision: https://reviews.llvm.org/D80373
---
 .../tools/llvm-readobj/ELF/hash-table.test    | 15 ++++++
 llvm/tools/llvm-readobj/ELFDumper.cpp         | 54 ++++++++++++-------
 2 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-table.test b/llvm/test/tools/llvm-readobj/ELF/hash-table.test
index a1018a9daefed..b247c597eeb11 100644
--- a/llvm/test/tools/llvm-readobj/ELF/hash-table.test
+++ b/llvm/test/tools/llvm-readobj/ELF/hash-table.test
@@ -157,6 +157,8 @@ ProgramHeaders:
 # RUN:   FileCheck %s --check-prefix=ERR2 -DFILE=%t5.2.o --implicit-check-not="warning:"
 
 # ERR2:      HashTable {
+# ERR2:       Num Buckets: 94
+# ERR2:       Num Chains: 1
 # ERR2-NEXT:  warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 94, nchain = 1{{$}}
 # ERR2-NEXT: }
 
@@ -188,6 +190,8 @@ ProgramHeaders:
 
 # ERR3:      warning: '[[FILE]]': hash table nchain (94) differs from symbol count derived from SHT_DYNSYM section header (1)
 # ERR3:      HashTable {
+# ERR3-NEXT:  Num Buckets: 1
+# ERR3-NEXT:  Num Chains: 94
 # ERR3-NEXT:  warning: '[[FILE]]': the hash table at offset 0x54 goes past the end of the file (0x1d4), nbucket = 1, nchain = 94{{$}}
 # ERR3-NEXT: }
 
@@ -219,3 +223,14 @@ ProgramHeaders:
     Sections:
       - Section: .hash
       - Section: .dynamic
+
+## Show we do not duplicate warnings when printing both the hash table and the hash histogram.
+## Note that --elf-hash-histogram is only implemented for llvm-readelf currently.
+# RUN: yaml2obj --docnum=3 %s -o %t4.o
+# RUN: llvm-readelf --hash-table --elf-hash-histogram %t4.o 2>&1 \
+# RUN:   | FileCheck %s --check-prefix=SINGLE-WARN -DFILE=%t4.o --implicit-check-not="warning:"
+
+# SINGLE-WARN:      warning: '[[FILE]]': hash table nchain (0) differs from symbol count derived from SHT_DYNSYM section header (1)
+# SINGLE-WARN-NEXT: HashTable {
+# SINGLE-WARN-NEXT: warning: '[[FILE]]': the hash table at offset 0x2b1 goes past the end of the file (0x2b8)
+# SINGLE-WARN-NEXT: }
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index dc080c8dd49cb..4b87067ce557f 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -2642,28 +2642,30 @@ template <class ELFT> void ELFDumper<ELFT>::printNeededLibraries() {
 }
 
 template <class ELFT>
-static bool checkHashTable(const ELFFile<ELFT> *Obj,
-                           const typename ELFT::Hash *H, StringRef FileName) {
-  auto WarnAndReturn = [&](uint64_t Off, const Twine &Msg = "") {
-    reportWarning(createError("the hash table at offset 0x" +
-                              Twine::utohexstr(Off) +
-                              " goes past the end of the file (0x" +
-                              Twine::utohexstr(Obj->getBufSize()) + ")" + Msg),
-                  FileName);
-    return false;
+static Error checkHashTable(const ELFFile<ELFT> *Obj,
+                            const typename ELFT::Hash *H,
+                            bool *IsHeaderValid = nullptr) {
+  auto MakeError = [&](uint64_t Off, const Twine &Msg = "") {
+    return createError("the hash table at offset 0x" + Twine::utohexstr(Off) +
+                       " goes past the end of the file (0x" +
+                       Twine::utohexstr(Obj->getBufSize()) + ")" + Msg);
   };
 
   // Each SHT_HASH section starts from two 32-bit fields: nbucket and nchain.
   const unsigned HeaderSize = 2 * sizeof(typename ELFT::Word);
   const uint64_t SecOffset = (const uint8_t *)H - Obj->base();
+
+  if (IsHeaderValid)
+    *IsHeaderValid = Obj->getBufSize() - SecOffset >= HeaderSize;
+
   if (Obj->getBufSize() - SecOffset < HeaderSize)
-    return WarnAndReturn(SecOffset);
+    return MakeError(SecOffset);
 
   if (Obj->getBufSize() - SecOffset - HeaderSize <
       ((uint64_t)H->nbucket + H->nchain) * sizeof(typename ELFT::Word))
-    return WarnAndReturn(SecOffset, ", nbucket = " + Twine(H->nbucket) +
-                                        ", nchain = " + Twine(H->nchain));
-  return true;
+    return MakeError(SecOffset, ", nbucket = " + Twine(H->nbucket) +
+                                    ", nchain = " + Twine(H->nchain));
+  return Error::success();
 }
 
 template <class ELFT>
@@ -2690,11 +2692,21 @@ static Error checkGNUHashTable(const ELFFile<ELFT> *Obj,
 
 template <typename ELFT> void ELFDumper<ELFT>::printHashTable() {
   DictScope D(W, "HashTable");
-  if (!HashTable ||
-      !checkHashTable(ObjF->getELFFile(), HashTable, ObjF->getFileName()))
+  if (!HashTable)
+    return;
+
+  bool IsHeaderValid;
+  Error Err = checkHashTable(ObjF->getELFFile(), HashTable, &IsHeaderValid);
+  if (IsHeaderValid) {
+    W.printNumber("Num Buckets", HashTable->nbucket);
+    W.printNumber("Num Chains", HashTable->nchain);
+  }
+
+  if (Err) {
+    reportUniqueWarning(std::move(Err));
     return;
-  W.printNumber("Num Buckets", HashTable->nbucket);
-  W.printNumber("Num Chains", HashTable->nchain);
+  }
+
   W.printList("Buckets", HashTable->buckets());
   W.printList("Chains", HashTable->chains());
 }
@@ -4026,7 +4038,9 @@ template <class ELFT> void GNUStyle<ELFT>::printHashSymbols(const ELFO *Obj) {
 
   if (const Elf_Hash *SysVHash = this->dumper()->getHashTable()) {
     OS << "\n Symbol table of .hash for image:\n";
-    if (checkHashTable(Obj, SysVHash, this->FileName))
+    if (Error E = checkHashTable<ELFT>(Obj, SysVHash))
+      this->reportUniqueWarning(std::move(E));
+    else
       PrintHashTable(SysVHash);
   }
 
@@ -4689,7 +4703,9 @@ template <class ELFT>
 void GNUStyle<ELFT>::printHashHistograms(const ELFFile<ELFT> *Obj) {
   // Print histogram for the .hash section.
   if (const Elf_Hash *HashTable = this->dumper()->getHashTable())
-    if (checkHashTable(Obj, HashTable, this->FileName))
+    if (Error E = checkHashTable<ELFT>(Obj, HashTable))
+      this->reportUniqueWarning(std::move(E));
+    else
       printHashHistogram(*HashTable);
 
   // Print histogram for the .gnu.hash section.

From 85c308804966e52a14bc98874a6cc6e42cbb1d40 Mon Sep 17 00:00:00 2001
From: Ehud Katz <ehudkatz@gmail.com>
Date: Mon, 1 Jun 2020 12:50:35 +0300
Subject: [PATCH 698/770] [StructurizeCFG] Fix region nodes ordering

This is a reimplementation of the `orderNodes` function, as the old
implementation didn't take all cases into account.
The new implementation uses SCCs instead of Loops in order to account
for irreducible loops.

Fixes PR41509.

Differential Revision: https://reviews.llvm.org/D79037
---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 174 +++++++-----
 .../StructurizeCFG/interleaved-loop-order.ll  | 262 ++++++++++++++++++
 .../StructurizeCFG/nested-loop-subregion.ll   |  55 ++++
 3 files changed, 419 insertions(+), 72 deletions(-)
 create mode 100644 llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll
 create mode 100644 llvm/test/Transforms/StructurizeCFG/nested-loop-subregion.ll

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 84d8eebd97202..ef59f2412f68d 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -8,13 +8,12 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
@@ -90,6 +89,59 @@ using BBPredicates = DenseMap<BasicBlock *, Value *>;
 using PredMap = DenseMap<BasicBlock *, BBPredicates>;
 using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
 
+// A traits type that is intended to be used in graph algorithms. The graph
+// traits start at an entry node and traverse the RegionNodes that are in
+// the Nodes set.
+struct SubGraphTraits {
+  using NodeRef = std::pair<RegionNode *, SmallDenseSet<RegionNode *> *>;
+  using BaseSuccIterator = GraphTraits<RegionNode *>::ChildIteratorType;
+
+  // This wraps a set of Nodes into the iterator, so we know which edges to
+  // filter out.
+  class WrappedSuccIterator
+      : public iterator_adaptor_base<
+            WrappedSuccIterator, BaseSuccIterator,
+            typename std::iterator_traits<BaseSuccIterator>::iterator_category,
+            NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
+    SmallDenseSet<RegionNode *> *Nodes;
+
+  public:
+    WrappedSuccIterator(BaseSuccIterator It, SmallDenseSet<RegionNode *> *Nodes)
+        : iterator_adaptor_base(It), Nodes(Nodes) {}
+
+    NodeRef operator*() const { return {*I, Nodes}; }
+  };
+
+  static bool filterAll(const NodeRef &N) { return true; }
+  static bool filterSet(const NodeRef &N) { return N.second->count(N.first); }
+
+  using ChildIteratorType =
+      filter_iterator<WrappedSuccIterator, bool (*)(const NodeRef &)>;
+
+  static NodeRef getEntryNode(Region *R) {
+    return {GraphTraits<Region *>::getEntryNode(R), nullptr};
+  }
+
+  static NodeRef getEntryNode(NodeRef N) { return N; }
+
+  static iterator_range<ChildIteratorType> children(const NodeRef &N) {
+    auto *filter = N.second ? &filterSet : &filterAll;
+    return make_filter_range(
+        make_range<WrappedSuccIterator>(
+            {GraphTraits<RegionNode *>::child_begin(N.first), N.second},
+            {GraphTraits<RegionNode *>::child_end(N.first), N.second}),
+        filter);
+  }
+
+  static ChildIteratorType child_begin(const NodeRef &N) {
+    return children(N).begin();
+  }
+
+  static ChildIteratorType child_end(const NodeRef &N) {
+    return children(N).end();
+  }
+};
+
 /// Finds the nearest common dominator of a set of BasicBlocks.
 ///
 /// For every BB you add to the set, you can specify whether we "remember" the
@@ -194,7 +246,6 @@ class StructurizeCFG : public RegionPass {
 
   LegacyDivergenceAnalysis *DA;
   DominatorTree *DT;
-  LoopInfo *LI;
 
   SmallVector<RegionNode *, 8> Order;
   BBSet Visited;
@@ -214,9 +265,6 @@ class StructurizeCFG : public RegionPass {
 
   void orderNodes();
 
-  Loop *getAdjustedLoop(RegionNode *RN);
-  unsigned getAdjustedLoopDepth(RegionNode *RN);
-
   void analyzeLoops(RegionNode *N);
 
   Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
@@ -282,7 +330,6 @@ class StructurizeCFG : public RegionPass {
       AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<LoopInfoWrapperPass>();
 
     AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
@@ -314,75 +361,60 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
   return false;
 }
 
-/// Use the exit block to determine the loop if RN is a SubRegion.
-Loop *StructurizeCFG::getAdjustedLoop(RegionNode *RN) {
-  if (RN->isSubRegion()) {
-    Region *SubRegion = RN->getNodeAs<Region>();
-    return LI->getLoopFor(SubRegion->getExit());
-  }
-
-  return LI->getLoopFor(RN->getEntry());
-}
-
-/// Use the exit block to determine the loop depth if RN is a SubRegion.
-unsigned StructurizeCFG::getAdjustedLoopDepth(RegionNode *RN) {
-  if (RN->isSubRegion()) {
-    Region *SubR = RN->getNodeAs<Region>();
-    return LI->getLoopDepth(SubR->getExit());
-  }
-
-  return LI->getLoopDepth(RN->getEntry());
-}
-
-/// Build up the general order of nodes
+/// Build up the general order of nodes, by performing a topological sort of the
+/// parent region's nodes, while ensuring that there is no outer cycle node
+/// between any two inner cycle nodes.
 void StructurizeCFG::orderNodes() {
-  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
-  SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
-
-  // The reverse post-order traversal of the list gives us an ordering close
-  // to what we want.  The only problem with it is that sometimes backedges
-  // for outer loops will be visited before backedges for inner loops.
-  for (RegionNode *RN : RPOT) {
-    Loop *Loop = getAdjustedLoop(RN);
-    ++LoopBlocks[Loop];
-  }
-
-  unsigned CurrentLoopDepth = 0;
-  Loop *CurrentLoop = nullptr;
-  for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    RegionNode *RN = cast<RegionNode>(*I);
-    unsigned LoopDepth = getAdjustedLoopDepth(RN);
-
-    if (is_contained(Order, *I))
-      continue;
-
-    if (LoopDepth < CurrentLoopDepth) {
-      // Make sure we have visited all blocks in this loop before moving back to
-      // the outer loop.
+  Order.resize(std::distance(GraphTraits<Region *>::nodes_begin(ParentRegion),
+                             GraphTraits<Region *>::nodes_end(ParentRegion)));
+  if (Order.empty())
+    return;
 
-      auto LoopI = I;
-      while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
-        LoopI++;
-        if (getAdjustedLoop(cast<RegionNode>(*LoopI)) == CurrentLoop) {
-          --BlockCount;
-          Order.push_back(*LoopI);
-        }
+  SmallDenseSet<RegionNode *> Nodes;
+  auto EntryNode = SubGraphTraits::getEntryNode(ParentRegion);
+
+  // A list of range indices of SCCs in Order, to be processed.
+  SmallVector<std::pair<unsigned, unsigned>, 8> WorkList;
+  unsigned I = 0, E = Order.size();
+  while (true) {
+    // Run through all the SCCs in the subgraph starting with Entry.
+    for (auto SCCI =
+             scc_iterator<SubGraphTraits::NodeRef, SubGraphTraits>::begin(
+                 EntryNode);
+         !SCCI.isAtEnd(); ++SCCI) {
+      auto &SCC = *SCCI;
+
+      // An SCC of size up to 2 can be reduced to an entry (the last node) and
+      // a possible additional node. Therefore, it is already in order, and
+      // there is no need to add it to the work-list.
+      unsigned Size = SCC.size();
+      if (Size > 2)
+        WorkList.emplace_back(I, I + Size);
+
+      // Add the SCC nodes to the Order array.
+      for (auto &N : SCC) {
+        assert(I < E && "SCC size mismatch!");
+        Order[I++] = N.first;
       }
     }
+    assert(I == E && "SCC size mismatch!");
 
-    CurrentLoop = getAdjustedLoop(RN);
-    if (CurrentLoop)
-      LoopBlocks[CurrentLoop]--;
+    // If there are no more SCCs to order, then we are done.
+    if (WorkList.empty())
+      break;
 
-    CurrentLoopDepth = LoopDepth;
-    Order.push_back(*I);
-  }
+    std::tie(I, E) = WorkList.pop_back_val();
+
+    // Collect the set of nodes in the SCC's subgraph. These are only the
+    // possible child nodes; we do not add the entry (last node), otherwise we
+    // would get the exact same SCC all over again.
+    Nodes.clear();
+    Nodes.insert(Order.begin() + I, Order.begin() + E - 1);
 
-  // This pass originally used a post-order traversal and then operated on
-  // the list in reverse. Now that we are using a reverse post-order traversal
-  // rather than re-working the whole pass to operate on the list in order,
-  // we just reverse the list and continue to operate on it in reverse.
-  std::reverse(Order.begin(), Order.end());
+    // Update the entry node.
+    EntryNode.first = Order[E - 1];
+    EntryNode.second = &Nodes;
+  }
 }
 
 /// Determine the end of the loops
@@ -490,8 +522,7 @@ void StructurizeCFG::collectInfos() {
   for (RegionNode *RN : reverse(Order)) {
     LLVM_DEBUG(dbgs() << "Visiting: "
                       << (RN->isSubRegion() ? "SubRegion with entry: " : "")
-                      << RN->getEntry()->getName() << " Loop Depth: "
-                      << LI->getLoopDepth(RN->getEntry()) << "\n");
+                      << RN->getEntry()->getName() << "\n");
 
     // Analyze all the conditions leading to a node
     gatherPredicates(RN);
@@ -1013,7 +1044,6 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   ParentRegion = R;
 
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
   orderNodes();
   collectInfos();
diff --git a/llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll b/llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll
new file mode 100644
index 0000000000000..05713c890fb9d
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/interleaved-loop-order.ll
@@ -0,0 +1,262 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
+
+; This test has an outer loop containing an inner loop,
+; for which there is an interleaved post-order traversal.
+;
+; This used to produce incorrect code.
+; For example, %outer.loop.body used to branch to %inner.loop.end
+; (instead of %inner.loop.header).
+
+define i1 @test_nested(i32 %x, i1 %b1, i1 %b2, i1 %b3) {
+; CHECK-LABEL: @test_nested(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B3_INV:%.*]] = xor i1 [[B3:%.*]], true
+; CHECK-NEXT:    br label [[OUTER_LOOP_HEADER:%.*]]
+; CHECK:       Flow12:
+; CHECK-NEXT:    br i1 [[TMP3:%.*]], label [[EXIT_TRUE:%.*]], label [[FLOW13:%.*]]
+; CHECK:       exit.true:
+; CHECK-NEXT:    br label [[FLOW13]]
+; CHECK:       Flow13:
+; CHECK-NEXT:    br i1 [[TMP2:%.*]], label [[NEWDEFAULT:%.*]], label [[FLOW14:%.*]]
+; CHECK:       NewDefault:
+; CHECK-NEXT:    br label [[EXIT_FALSE:%.*]]
+; CHECK:       Flow14:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[EXIT_FALSE]] ], [ true, [[FLOW13]] ]
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit.false:
+; CHECK-NEXT:    br label [[FLOW14]]
+; CHECK:       outer.loop.header:
+; CHECK-NEXT:    br i1 [[B1:%.*]], label [[OUTER_LOOP_BODY:%.*]], label [[FLOW3:%.*]]
+; CHECK:       outer.loop.body:
+; CHECK-NEXT:    br label [[INNER_LOOP_HEADER:%.*]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ [[TMP16:%.*]], [[FLOW11:%.*]] ], [ true, [[OUTER_LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP2]] = phi i1 [ [[TMP12:%.*]], [[FLOW11]] ], [ false, [[OUTER_LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP3]] = phi i1 [ false, [[FLOW11]] ], [ true, [[OUTER_LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FLOW12:%.*]], label [[OUTER_LOOP_HEADER]]
+; CHECK:       inner.loop.header:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP8:%.*]], [[FLOW4:%.*]] ], [ false, [[OUTER_LOOP_BODY]] ]
+; CHECK-NEXT:    br i1 [[B2:%.*]], label [[INNER_LOOP_BODY:%.*]], label [[FLOW4]]
+; CHECK:       Flow6:
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ false, [[INNER_LOOP_LATCH:%.*]] ], [ true, [[LEAFBLOCK:%.*]] ]
+; CHECK-NEXT:    br label [[FLOW5:%.*]]
+; CHECK:       Flow7:
+; CHECK-NEXT:    br i1 [[TMP10:%.*]], label [[INNER_LOOP_END:%.*]], label [[FLOW8:%.*]]
+; CHECK:       inner.loop.end:
+; CHECK-NEXT:    br label [[FLOW8]]
+; CHECK:       inner.loop.body:
+; CHECK-NEXT:    br i1 [[B3_INV]], label [[INNER_LOOP_BODY_ELSE:%.*]], label [[FLOW:%.*]]
+; CHECK:       inner.loop.body.else:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ false, [[INNER_LOOP_BODY_ELSE]] ], [ true, [[INNER_LOOP_BODY]] ]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[INNER_LOOP_BODY_THEN:%.*]], label [[INNER_LOOP_COND:%.*]]
+; CHECK:       inner.loop.body.then:
+; CHECK-NEXT:    br label [[INNER_LOOP_COND]]
+; CHECK:       Flow4:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i1 [ [[TMP17:%.*]], [[FLOW5]] ], [ true, [[INNER_LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP18:%.*]], [[FLOW5]] ], [ [[TMP4]], [[INNER_LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ [[TMP19:%.*]], [[FLOW5]] ], [ false, [[INNER_LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP10]] = phi i1 [ false, [[FLOW5]] ], [ true, [[INNER_LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FLOW7:%.*]], label [[INNER_LOOP_HEADER]]
+; CHECK:       inner.loop.cond:
+; CHECK-NEXT:    br label [[NODEBLOCK:%.*]]
+; CHECK:       NodeBlock:
+; CHECK-NEXT:    [[PIVOT:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    br i1 [[PIVOT]], label [[LEAFBLOCK]], label [[FLOW5]]
+; CHECK:       Flow8:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ true, [[INNER_LOOP_END]] ], [ false, [[FLOW7]] ]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[LEAFBLOCK1:%.*]], label [[FLOW9:%.*]]
+; CHECK:       LeafBlock1:
+; CHECK-NEXT:    [[SWITCHLEAF2:%.*]] = icmp eq i32 [[X]], 1
+; CHECK-NEXT:    br i1 [[SWITCHLEAF2]], label [[INNER_LOOP_BREAK:%.*]], label [[FLOW10:%.*]]
+; CHECK:       LeafBlock:
+; CHECK-NEXT:    [[SWITCHLEAF:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[SWITCHLEAF]], label [[INNER_LOOP_LATCH]], label [[FLOW6:%.*]]
+; CHECK:       Flow9:
+; CHECK-NEXT:    [[TMP12]] = phi i1 [ [[TMP14:%.*]], [[FLOW10]] ], [ [[TMP8]], [[FLOW8]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i1 [ [[TMP15:%.*]], [[FLOW10]] ], [ [[TMP11]], [[FLOW8]] ]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[OUTER_LOOP_CLEANUP:%.*]], label [[FLOW11]]
+; CHECK:       inner.loop.break:
+; CHECK-NEXT:    br label [[FLOW10]]
+; CHECK:       Flow10:
+; CHECK-NEXT:    [[TMP14]] = phi i1 [ false, [[INNER_LOOP_BREAK]] ], [ true, [[LEAFBLOCK1]] ]
+; CHECK-NEXT:    [[TMP15]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP11]], [[LEAFBLOCK1]] ]
+; CHECK-NEXT:    br label [[FLOW9]]
+; CHECK:       outer.loop.cleanup:
+; CHECK-NEXT:    br label [[OUTER_LOOP_LATCH:%.*]]
+; CHECK:       Flow11:
+; CHECK-NEXT:    [[TMP16]] = phi i1 [ false, [[OUTER_LOOP_LATCH]] ], [ true, [[FLOW9]] ]
+; CHECK-NEXT:    br label [[FLOW3]]
+; CHECK:       outer.loop.latch:
+; CHECK-NEXT:    br label [[FLOW11]]
+; CHECK:       Flow5:
+; CHECK-NEXT:    [[TMP17]] = phi i1 [ [[TMP5]], [[FLOW6]] ], [ true, [[NODEBLOCK]] ]
+; CHECK-NEXT:    [[TMP18]] = phi i1 [ [[TMP5]], [[FLOW6]] ], [ [[TMP4]], [[NODEBLOCK]] ]
+; CHECK-NEXT:    [[TMP19]] = phi i1 [ false, [[FLOW6]] ], [ true, [[NODEBLOCK]] ]
+; CHECK-NEXT:    br label [[FLOW4]]
+; CHECK:       inner.loop.latch:
+; CHECK-NEXT:    br label [[FLOW6]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[TMP0]]
+;
+entry:
+  br label %outer.loop.header
+
+exit.true:                                       ; preds = %outer.loop.header
+  br label %exit
+
+exit.false:                                      ; preds = %inner.loop.cond
+  br label %exit
+
+outer.loop.header:                               ; preds = %outer.loop.latch, %entry
+  br i1 %b1, label %outer.loop.body, label %exit.true
+
+outer.loop.body:                                 ; preds = %outer.loop.header
+  br label %inner.loop.header
+
+inner.loop.header:                               ; preds = %inner.loop.latch, %outer.loop.body
+  br i1 %b2, label %inner.loop.body, label %inner.loop.end
+
+inner.loop.end:                                  ; preds = %inner.loop.header
+  br label %outer.loop.cleanup
+
+inner.loop.body:                                 ; preds = %inner.loop.header
+  br i1 %b3, label %inner.loop.body.then, label %inner.loop.body.else
+
+inner.loop.body.else:                            ; preds = %inner.loop.body
+  br label %inner.loop.cond
+
+inner.loop.body.then:                            ; preds = %inner.loop.body
+  br label %inner.loop.cond
+
+inner.loop.cond:                                 ; preds = %inner.loop.body.then, %inner.loop.body.else
+  switch i32 %x, label %exit.false [
+  i32 0, label %inner.loop.latch
+  i32 1, label %inner.loop.break
+  ]
+
+inner.loop.break:                                ; preds = %inner.loop.cond
+  br label %outer.loop.cleanup
+
+outer.loop.cleanup:                              ; preds = %inner.loop.break, %inner.loop.end
+  br label %outer.loop.latch
+
+outer.loop.latch:                                ; preds = %outer.loop.cleanup
+  br label %outer.loop.header
+
+inner.loop.latch:                                ; preds = %inner.loop.cond
+  br label %inner.loop.header
+
+exit:                                            ; preds = %exit.false, %exit.true
+  %r = phi i1 [ true, %exit.true ], [ false, %exit.false ]
+  ret i1 %r
+}
+
+; This test checks sibling loops that by default have an
+; interleaved post-order traversal.
+
+define void @test_siblings(i1 %b1, i1 %b2, i1 %b3, i1 %b4, i1 %b5, i1 %b6, i1 %b7, i1 %b8, i1 %b9) {
+; CHECK-LABEL: @test_siblings(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B9_INV:%.*]] = xor i1 [[B9:%.*]], true
+; CHECK-NEXT:    [[B6_INV:%.*]] = xor i1 [[B6:%.*]], true
+; CHECK-NEXT:    [[B2_INV:%.*]] = xor i1 [[B2:%.*]], true
+; CHECK-NEXT:    [[B8_INV:%.*]] = xor i1 [[B8:%.*]], true
+; CHECK-NEXT:    [[B5_INV:%.*]] = xor i1 [[B5:%.*]], true
+; CHECK-NEXT:    [[B3_INV:%.*]] = xor i1 [[B3:%.*]], true
+; CHECK-NEXT:    [[B4_INV:%.*]] = xor i1 [[B4:%.*]], true
+; CHECK-NEXT:    [[B1_INV:%.*]] = xor i1 [[B1:%.*]], true
+; CHECK-NEXT:    br i1 [[B1_INV]], label [[IF_ELSE:%.*]], label [[FLOW:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ [[TMP0]], [[FLOW1:%.*]] ], [ [[B2]], [[IF_ELSE]] ], [ false, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ [[TMP5:%.*]], [[FLOW1]] ], [ [[B2_INV]], [[IF_ELSE]] ], [ false, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ false, [[FLOW1]] ], [ false, [[IF_ELSE]] ], [ true, [[ENTRY]] ]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOOP1_HEADER:%.*]], label [[FLOW1]]
+; CHECK:       loop1.header:
+; CHECK-NEXT:    br i1 [[B3_INV]], label [[LOOP1_BODY:%.*]], label [[FLOW2:%.*]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i1 [ true, [[LOOP1_BODY]] ], [ [[TMP1]], [[LOOP1_HEADER]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ [[B5_INV]], [[LOOP1_BODY]] ], [ [[B3]], [[LOOP1_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOOP1_LATCH:%.*]], label [[FLOW3:%.*]]
+; CHECK:       loop1.latch:
+; CHECK-NEXT:    br label [[FLOW3]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP5]] = phi i1 [ [[TMP6:%.*]], [[FLOW3]] ], [ [[TMP1]], [[FLOW]] ]
+; CHECK-NEXT:    br i1 true, label [[FLOW4:%.*]], label [[FLOW]]
+; CHECK:       loop1.body:
+; CHECK-NEXT:    br label [[FLOW2]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP6]] = phi i1 [ false, [[LOOP1_LATCH]] ], [ [[TMP3]], [[FLOW2]] ]
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       Flow4:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i1 [ false, [[FLOW5:%.*]] ], [ [[TMP5]], [[FLOW1]] ]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOOP2_HEADER:%.*]], label [[FLOW5]]
+; CHECK:       loop2.header:
+; CHECK-NEXT:    br i1 [[B6_INV]], label [[LOOP2_BODY:%.*]], label [[FLOW6:%.*]]
+; CHECK:       Flow5:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i1 [ [[TMP11:%.*]], [[FLOW7:%.*]] ], [ false, [[FLOW4]] ]
+; CHECK-NEXT:    br i1 true, label [[FLOW8:%.*]], label [[FLOW4]]
+; CHECK:       loop2.body:
+; CHECK-NEXT:    br label [[FLOW6]]
+; CHECK:       Flow6:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ true, [[LOOP2_BODY]] ], [ false, [[LOOP2_HEADER]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ [[B7:%.*]], [[LOOP2_BODY]] ], [ [[B6]], [[LOOP2_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[LOOP2_LATCH:%.*]], label [[FLOW7]]
+; CHECK:       loop2.latch:
+; CHECK-NEXT:    br label [[FLOW7]]
+; CHECK:       Flow7:
+; CHECK-NEXT:    [[TMP11]] = phi i1 [ false, [[LOOP2_LATCH]] ], [ [[TMP9]], [[FLOW6]] ]
+; CHECK-NEXT:    br label [[FLOW5]]
+; CHECK:       Flow8:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i1 [ false, [[FLOW10:%.*]] ], [ [[TMP0]], [[FLOW5]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i1 [ false, [[FLOW10]] ], [ [[TMP8]], [[FLOW5]] ]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[LOOP3_HEADER:%.*]], label [[FLOW9:%.*]]
+; CHECK:       loop3.header:
+; CHECK-NEXT:    br label [[FLOW9]]
+; CHECK:       Flow9:
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i1 [ true, [[LOOP3_HEADER]] ], [ [[TMP12]], [[FLOW8]] ]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOOP3_LATCH:%.*]], label [[FLOW10]]
+; CHECK:       loop3.latch:
+; CHECK-NEXT:    br label [[FLOW10]]
+; CHECK:       Flow10:
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[FLOW8]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %b1, label %loop1.header, label %if.else
+
+if.else:
+  br i1 %b2, label %loop3.latch, label %loop2.header
+
+loop1.header:
+  br i1 %b3, label %loop1.latch, label %loop1.body
+
+loop1.latch:
+  br i1 %b4, label %loop1.header, label %exit
+
+loop1.body:
+  br i1 %b5, label %loop2.header, label %loop1.latch
+
+loop2.header:
+  br i1 %b6, label %loop2.latch, label %loop2.body
+
+loop2.body:
+  br i1 %b7, label %loop2.latch, label %loop3.header
+
+loop2.latch:
+  br i1 %b8, label %loop2.header, label %exit
+
+loop3.header:
+  br label %loop3.latch
+
+loop3.latch:
+  br i1 %b9, label %loop3.header, label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/StructurizeCFG/nested-loop-subregion.ll b/llvm/test/Transforms/StructurizeCFG/nested-loop-subregion.ll
new file mode 100644
index 0000000000000..70023d7ed98ae
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/nested-loop-subregion.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
+
+define void @test(i1 %b1, i1 %b2, i1 %b3, i1 %b4) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  A:
+; CHECK-NEXT:    [[B2_INV:%.*]] = xor i1 [[B2:%.*]], true
+; CHECK-NEXT:    br i1 [[B1:%.*]], label [[B:%.*]], label [[H:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[C:%.*]]
+; CHECK:       C:
+; CHECK-NEXT:    br i1 [[B2_INV]], label [[E:%.*]], label [[FLOW:%.*]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ [[B3:%.*]], [[E]] ], [ true, [[C]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ false, [[E]] ], [ true, [[C]] ]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[FLOW1:%.*]], label [[C]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    br i1 [[TMP1]], label [[D:%.*]], label [[F:%.*]]
+; CHECK:       D:
+; CHECK-NEXT:    br label [[F]]
+; CHECK:       E:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       F:
+; CHECK-NEXT:    br label [[G:%.*]]
+; CHECK:       G:
+; CHECK-NEXT:    br i1 [[B4:%.*]], label [[FLOW2:%.*]], label [[B]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    br label [[H]]
+; CHECK:       H:
+; CHECK-NEXT:    ret void
+;
+A:
+  br i1 %b1, label %B, label %H
+
+B:
+  br label %C
+
+C:
+  br i1 %b2, label %D, label %E
+
+D:
+  br label %F
+
+E:
+  br i1 %b3, label %F, label %C
+
+F:
+  br label %G
+
+G:
+  br i1 %b4, label %H, label %B
+
+H:
+  ret void
+}

From feee98645dde4be31a70cc6660d2fc4d4b9d32d8 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Mon, 1 Jun 2020 12:53:50 +0300
Subject: [PATCH 699/770] [llvm-readelf] - Add explicit braces. NFC.

Should fix the BB (http://lab.llvm.org:8011/builders/clang-ppc64le-rhel/builds/3907/steps/build%20stage%201/logs/stdio):

llvm-readobj/ELFDumper.cpp:4708:5: error: add explicit braces to avoid dangling else [-Werror,-Wdangling-else]
    else
    ^
---
 llvm/tools/llvm-readobj/ELFDumper.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 4b87067ce557f..dc413accb2871 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -4703,17 +4703,19 @@ template <class ELFT>
 void GNUStyle<ELFT>::printHashHistograms(const ELFFile<ELFT> *Obj) {
   // Print histogram for the .hash section.
   if (const Elf_Hash *HashTable = this->dumper()->getHashTable())
-    if (Error E = checkHashTable<ELFT>(Obj, HashTable))
+    if (Error E = checkHashTable<ELFT>(Obj, HashTable)) {
       this->reportUniqueWarning(std::move(E));
-    else
+    } else {
       printHashHistogram(*HashTable);
+    }
 
   // Print histogram for the .gnu.hash section.
   if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable()) {
-    if (Error E = checkGNUHashTable<ELFT>(Obj, GnuHashTable))
+    if (Error E = checkGNUHashTable<ELFT>(Obj, GnuHashTable)) {
       this->reportUniqueWarning(std::move(E));
-    else
+    } else {
       printGnuHashHistogram(*GnuHashTable);
+    }
   }
 }
 

From b21f32fcecd012fa2c2f8c61d7259079a7f1865e Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Mon, 1 Jun 2020 13:10:16 +0300
Subject: [PATCH 700/770] [llvm-readelf] - Add explicit braces again. NFC.

Partially reverts feee98645dde4be31a70cc6660d2fc4d4b9d32d8.

Add explicit braces to a different place to fix
"error: add explicit braces to avoid dangling else [-Werror,-Wdangling-else]"
---
 llvm/tools/llvm-readobj/ELFDumper.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index dc413accb2871..2c9a4b9c4900c 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -4702,20 +4702,19 @@ void GNUStyle<ELFT>::printGnuHashHistogram(const Elf_GnuHash &GnuHashTable) {
 template <class ELFT>
 void GNUStyle<ELFT>::printHashHistograms(const ELFFile<ELFT> *Obj) {
   // Print histogram for the .hash section.
-  if (const Elf_Hash *HashTable = this->dumper()->getHashTable())
-    if (Error E = checkHashTable<ELFT>(Obj, HashTable)) {
+  if (const Elf_Hash *HashTable = this->dumper()->getHashTable()) {
+    if (Error E = checkHashTable<ELFT>(Obj, HashTable))
       this->reportUniqueWarning(std::move(E));
-    } else {
+    else
       printHashHistogram(*HashTable);
-    }
+  }
 
   // Print histogram for the .gnu.hash section.
   if (const Elf_GnuHash *GnuHashTable = this->dumper()->getGnuHashTable()) {
-    if (Error E = checkGNUHashTable<ELFT>(Obj, GnuHashTable)) {
+    if (Error E = checkGNUHashTable<ELFT>(Obj, GnuHashTable))
       this->reportUniqueWarning(std::move(E));
-    } else {
+    else
       printGnuHashHistogram(*GnuHashTable);
-    }
   }
 }
 

From 937403d68430cb59ff7ccba7b2a13b3a815c12fa Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin@accesssoftek.com>
Date: Mon, 1 Jun 2020 15:35:33 +0700
Subject: [PATCH 701/770] [DebugInfo] Separate fields with commas in headers of
 .debug_pub* tables (1/3).

For most tables, we already use commas in headers. This set of patches
unifies dumping the remaining ones.

Differential Revision: https://reviews.llvm.org/D80806
---
 llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp    |  6 +++---
 llvm/test/DebugInfo/X86/gnu-public-names-gmlt.ll   |  2 +-
 llvm/test/DebugInfo/X86/gnu-public-names-tu.ll     |  4 ++--
 llvm/test/DebugInfo/X86/gnu-public-names.ll        |  2 +-
 llvm/test/DebugInfo/dwarfdump-pubnames.test        |  2 +-
 llvm/test/tools/dsymutil/ARM/obfuscated.test       | 14 +++++++-------
 .../test/tools/dsymutil/X86/basic-linking-x86.test | 12 ++++++------
 .../dsymutil/X86/basic-lto-dw4-linking-x86.test    | 10 +++++-----
 .../tools/dsymutil/X86/basic-lto-linking-x86.test  |  8 ++++----
 9 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
index 70663fa85da41..04af91c1c96c9 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
@@ -52,10 +52,10 @@ void DWARFDebugPubTable::dump(raw_ostream &OS) const {
   for (const Set &S : Sets) {
     int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(S.Format);
     OS << "length = " << format("0x%0*" PRIx64, OffsetDumpWidth, S.Length);
-    OS << " version = " << format("0x%04x", S.Version);
-    OS << " unit_offset = "
+    OS << ", version = " << format("0x%04x", S.Version);
+    OS << ", unit_offset = "
        << format("0x%0*" PRIx64, OffsetDumpWidth, S.Offset);
-    OS << " unit_size = " << format("0x%0*" PRIx64, OffsetDumpWidth, S.Size)
+    OS << ", unit_size = " << format("0x%0*" PRIx64, OffsetDumpWidth, S.Size)
        << '\n';
     OS << (GnuStyle ? "Offset     Linkage  Kind     Name\n"
                     : "Offset     Name\n");
diff --git a/llvm/test/DebugInfo/X86/gnu-public-names-gmlt.ll b/llvm/test/DebugInfo/X86/gnu-public-names-gmlt.ll
index dc1475c912e5d..f0cb77558f676 100644
--- a/llvm/test/DebugInfo/X86/gnu-public-names-gmlt.ll
+++ b/llvm/test/DebugInfo/X86/gnu-public-names-gmlt.ll
@@ -21,7 +21,7 @@
 ; GPUB-NEXT: "f3"
 
 ; GPUB: .debug_gnu_pubtypes contents:
-; GPUB-NEXT: length = 0x0000000e version = 0x0002 unit_offset = 0x00000000
+; GPUB-NEXT: length = 0x0000000e, version = 0x0002, unit_offset = 0x00000000
 ; GPUB-NEXT: Name
 
 ; NONE-NOT: .debug_pubnames contents:
diff --git a/llvm/test/DebugInfo/X86/gnu-public-names-tu.ll b/llvm/test/DebugInfo/X86/gnu-public-names-tu.ll
index 084c8028a5ea6..46bf89ced3350 100644
--- a/llvm/test/DebugInfo/X86/gnu-public-names-tu.ll
+++ b/llvm/test/DebugInfo/X86/gnu-public-names-tu.ll
@@ -17,13 +17,13 @@
 
 
 ; CHECK-LABEL: .debug_gnu_pubnames contents:
-; CHECK-NEXT: length = {{.*}} version = 0x0002 unit_offset = 0x00000000 unit_size = {{.*}}
+; CHECK-NEXT: length = {{.*}}, version = 0x0002, unit_offset = 0x00000000, unit_size = {{.*}}
 ; CHECK-NEXT: Offset     Linkage  Kind     Name
 ; CHECK-NEXT: [[CU]]     EXTERNAL TYPE     "ns"
 ; CHECK-NEXT: {{.*}}     EXTERNAL VARIABLE "b"
 
 ; CHECK-LABEL: debug_gnu_pubtypes contents:
-; CHECK-NEXT: length = {{.*}} version = 0x0002 unit_offset = 0x00000000 unit_size = {{.*}}
+; CHECK-NEXT: length = {{.*}}, version = 0x0002, unit_offset = 0x00000000, unit_size = {{.*}}
 ; CHECK-NEXT: Offset     Linkage  Kind     Name
 ; CHECK-NEXT: [[BAR]]    EXTERNAL TYPE     "bar"
 ; CHECK-NEXT: [[CU]]     EXTERNAL TYPE     "ns::foo"
diff --git a/llvm/test/DebugInfo/X86/gnu-public-names.ll b/llvm/test/DebugInfo/X86/gnu-public-names.ll
index c9ec1b59c48fd..6535fce87dd65 100644
--- a/llvm/test/DebugInfo/X86/gnu-public-names.ll
+++ b/llvm/test/DebugInfo/X86/gnu-public-names.ll
@@ -239,7 +239,7 @@
 ; CHECK: DW_AT_name {{.*}} "global_function"
 
 ; CHECK-LABEL: .debug_gnu_pubnames contents:
-; CHECK-NEXT: length = {{.*}} version = 0x0002 unit_offset = 0x00000000 unit_size = {{.*}}
+; CHECK-NEXT: length = {{.*}}, version = 0x0002, unit_offset = 0x00000000, unit_size = {{.*}}
 ; CHECK-NEXT: Offset     Linkage  Kind     Name
 ; CHECK-NEXT:  [[GLOBAL_FUNC]] EXTERNAL FUNCTION "global_function"
 ; CHECK-NEXT:  [[NS]] EXTERNAL TYPE     "ns"
diff --git a/llvm/test/DebugInfo/dwarfdump-pubnames.test b/llvm/test/DebugInfo/dwarfdump-pubnames.test
index 558ba658925d2..a1765d258f7c2 100644
--- a/llvm/test/DebugInfo/dwarfdump-pubnames.test
+++ b/llvm/test/DebugInfo/dwarfdump-pubnames.test
@@ -2,7 +2,7 @@ RUN: llvm-dwarfdump %p/Inputs/dwarfdump-pubnames.elf-x86-64 \
 RUN:   --debug-pubnames | FileCheck %s
 
 CHECK: .debug_pubnames contents:
-CHECK: length = 0x000000a1 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000141
+CHECK: length = 0x000000a1, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000141
 
 CHECK: Offset        Name
 CHECK: 0x00000098    "global_namespace_variable"
diff --git a/llvm/test/tools/dsymutil/ARM/obfuscated.test b/llvm/test/tools/dsymutil/ARM/obfuscated.test
index 9ce684cfb7e5e..f7135ed15a72c 100644
--- a/llvm/test/tools/dsymutil/ARM/obfuscated.test
+++ b/llvm/test/tools/dsymutil/ARM/obfuscated.test
@@ -119,19 +119,19 @@ CHECK:        mod_time: 0x00000000
 CHECK:          length: 0x00000000
 
 CHECK: .debug_pubnames contents:
-CHECK: length = 0x00000017 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000044
+CHECK: length = 0x00000017, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000044
 CHECK: 0x0000002e "main"
-CHECK: length = 0x00000016 version = 0x0002 unit_offset = 0x00000044 unit_size = 0x00000044
+CHECK: length = 0x00000016, version = 0x0002, unit_offset = 0x00000044, unit_size = 0x00000044
 CHECK: 0x0000002e "one"
-CHECK: length = 0x00000016 version = 0x0002 unit_offset = 0x00000088 unit_size = 0x00000044
+CHECK: length = 0x00000016, version = 0x0002, unit_offset = 0x00000088, unit_size = 0x00000044
 CHECK: 0x0000002e "two"
-CHECK: length = 0x00000018 version = 0x0002 unit_offset = 0x000000cc unit_size = 0x00000044
+CHECK: length = 0x00000018, version = 0x0002, unit_offset = 0x000000cc, unit_size = 0x00000044
 CHECK: 0x0000002e "three"
-CHECK: length = 0x00000017 version = 0x0002 unit_offset = 0x00000110 unit_size = 0x00000044
+CHECK: length = 0x00000017, version = 0x0002, unit_offset = 0x00000110, unit_size = 0x00000044
 CHECK: 0x0000002e "four"
-CHECK: length = 0x00000017 version = 0x0002 unit_offset = 0x00000154 unit_size = 0x00000044
+CHECK: length = 0x00000017, version = 0x0002, unit_offset = 0x00000154, unit_size = 0x00000044
 CHECK: 0x0000002e "five"
-CHECK: length = 0x00000016 version = 0x0002 unit_offset = 0x00000198 unit_size = 0x00000044
+CHECK: length = 0x00000016, version = 0x0002, unit_offset = 0x00000198, unit_size = 0x00000044
 CHECK: 0x0000002e "six"
 
 CHECK: .apple_names contents:
diff --git a/llvm/test/tools/dsymutil/X86/basic-linking-x86.test b/llvm/test/tools/dsymutil/X86/basic-linking-x86.test
index 3f9ee4d513624..bc99054c9a281 100644
--- a/llvm/test/tools/dsymutil/X86/basic-linking-x86.test
+++ b/llvm/test/tools/dsymutil/X86/basic-linking-x86.test
@@ -190,29 +190,29 @@ CHECK-NEXT: 0x0000000100000f9b     12      0      1   0             0  is_stmt p
 CHECK-NEXT: 0x0000000100000fa9     12      0      1   0             0  is_stmt end_sequence
 
 CHECK: .debug_pubnames contents:
-CHECK-NEXT: length = 0x00000017 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000081
+CHECK-NEXT: length = 0x00000017, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000081
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000026 "main"
-CHECK-NEXT: length = 0x00000036 version = 0x0002 unit_offset = 0x00000081 unit_size = 0x000000a5
+CHECK-NEXT: length = 0x00000036, version = 0x0002, unit_offset = 0x00000081, unit_size = 0x000000a5
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x0000002d "private_int"
 CHECK-NEXT: 0x00000042 "baz"
 CHECK-NEXT: 0x00000057 "foo"
 CHECK-NEXT: 0x00000086 "inc"
-CHECK-NEXT: length = 0x00000026 version = 0x0002 unit_offset = 0x00000126 unit_size = 0x00000096
+CHECK-NEXT: length = 0x00000026, version = 0x0002, unit_offset = 0x00000126, unit_size = 0x00000096
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000026 "val"
 CHECK-NEXT: 0x00000048 "bar"
 CHECK-NEXT: 0x00000077 "inc"
 
 CHECK: .debug_pubtypes contents:
-CHECK-NEXT: length = 0x0000001f version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000081
+CHECK-NEXT: length = 0x0000001f, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000081
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000063 "int"
 CHECK-NEXT: 0x00000079 "char"
-CHECK-NEXT: length = 0x00000016 version = 0x0002 unit_offset = 0x00000081 unit_size = 0x000000a5
+CHECK-NEXT: length = 0x00000016, version = 0x0002, unit_offset = 0x00000081, unit_size = 0x000000a5
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000026 "int"
-CHECK-NEXT: length = 0x00000016 version = 0x0002 unit_offset = 0x00000126 unit_size = 0x00000096
+CHECK-NEXT: length = 0x00000016, version = 0x0002, unit_offset = 0x00000126, unit_size = 0x00000096
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000041 "int"
diff --git a/llvm/test/tools/dsymutil/X86/basic-lto-dw4-linking-x86.test b/llvm/test/tools/dsymutil/X86/basic-lto-dw4-linking-x86.test
index 8f4d80e4959dd..dc3d04b794180 100644
--- a/llvm/test/tools/dsymutil/X86/basic-lto-dw4-linking-x86.test
+++ b/llvm/test/tools/dsymutil/X86/basic-lto-dw4-linking-x86.test
@@ -180,23 +180,23 @@ CHECK-NEXT: 0x0000000100000fb2     20      1      1   0             0  is_stmt
 CHECK-NEXT: 0x0000000100000fb4     20      1      1   0             0  is_stmt end_sequence
 
 CHECK: .debug_pubnames contents:
-CHECK-NEXT: length = 0x00000017 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000077
+CHECK-NEXT: length = 0x00000017, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000077
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x0000002a "main"
-CHECK-NEXT: length = 0x0000002e version = 0x0002 unit_offset = 0x00000077 unit_size = 0x000000a4
+CHECK-NEXT: length = 0x0000002e, version = 0x0002, unit_offset = 0x00000077, unit_size = 0x000000a4
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000031 "baz"
 CHECK-NEXT: 0x00000046 "private_int"
 CHECK-NEXT: 0x00000067 "foo"
-CHECK-NEXT: length = 0x0000001e version = 0x0002 unit_offset = 0x0000011b unit_size = 0x00000085
+CHECK-NEXT: length = 0x0000001e, version = 0x0002, unit_offset = 0x0000011b, unit_size = 0x00000085
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x0000002a "val"
 CHECK-NEXT: 0x00000050 "bar"
 
 CHECK: .debug_pubtypes contents:
-CHECK-NEXT: length = 0x00000017 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000077
+CHECK-NEXT: length = 0x00000017, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000077
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x0000006f "char"
-CHECK-NEXT: length = 0x00000016 version = 0x0002 unit_offset = 0x00000077 unit_size = 0x000000a4
+CHECK-NEXT: length = 0x00000016, version = 0x0002, unit_offset = 0x00000077, unit_size = 0x000000a4
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x0000002a "int"
diff --git a/llvm/test/tools/dsymutil/X86/basic-lto-linking-x86.test b/llvm/test/tools/dsymutil/X86/basic-lto-linking-x86.test
index ae1ba454bffd5..347b1c951d2e9 100644
--- a/llvm/test/tools/dsymutil/X86/basic-lto-linking-x86.test
+++ b/llvm/test/tools/dsymutil/X86/basic-lto-linking-x86.test
@@ -179,21 +179,21 @@ CHECK: 0x0000000100000fb2     20      0      1   0             0  is_stmt
 CHECK: 0x0000000100000fb4     20      0      1   0             0  is_stmt end_sequence
 
 CHECK: .debug_pubnames contents:
-CHECK-NEXT: length = 0x00000017 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000081
+CHECK-NEXT: length = 0x00000017, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000081
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000026 "main"
-CHECK-NEXT: length = 0x0000002e version = 0x0002 unit_offset = 0x00000081 unit_size = 0x000000b9
+CHECK-NEXT: length = 0x0000002e, version = 0x0002, unit_offset = 0x00000081, unit_size = 0x000000b9
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000026 "private_int"
 CHECK-NEXT: 0x0000003f "baz"
 CHECK-NEXT: 0x00000058 "foo"
-CHECK-NEXT: length = 0x0000001e version = 0x0002 unit_offset = 0x0000013a unit_size = 0x000000ac
+CHECK-NEXT: length = 0x0000001e, version = 0x0002, unit_offset = 0x0000013a, unit_size = 0x000000ac
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000026 "val"
 CHECK-NEXT: 0x00000045 "bar"
 
 CHECK: .debug_pubtypes contents:
-CHECK-NEXT: length = 0x0000001f version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000081
+CHECK-NEXT: length = 0x0000001f, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000081
 CHECK-NEXT: Offset     Name
 CHECK-NEXT: 0x00000063 "int"
 CHECK-NEXT: 0x00000079 "char"

From 2a7af304823d4bb60efbbfea97d35030f0732748 Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin@accesssoftek.com>
Date: Mon, 1 Jun 2020 15:56:28 +0700
Subject: [PATCH 702/770] [DebugInfo] Separate fields with commas in headers of
 compile units (2/3).

For most tables, we already use commas in headers. This set of patches
unifies dumping the remaining ones.

Differential Revision: https://reviews.llvm.org/D80806
---
 llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp        | 10 +++++-----
 llvm/test/CodeGen/X86/dwarf-headers.ll               | 12 ++++++------
 llvm/test/DebugInfo/Generic/version.ll               |  2 +-
 llvm/test/DebugInfo/X86/dwarfdump-header-64.s        |  2 +-
 llvm/test/DebugInfo/X86/dwarfdump-header.s           |  6 +++---
 llvm/test/DebugInfo/dwarfdump-zlib.test              |  2 +-
 llvm/test/MC/WebAssembly/dwarfdump.ll                |  2 +-
 llvm/test/tools/dsymutil/X86/empty-CU.test           |  2 +-
 llvm/test/tools/dsymutil/X86/generate-empty-CU.test  |  4 ++--
 .../tools/llvm-dwarfdump/X86/debug_loclists_nouse.s  |  2 +-
 llvm/test/tools/llvm-dwarfdump/X86/lookup.s          |  2 +-
 .../llvm-dwp/X86/dwos_list_from_exec_simple.test     | 10 +++++-----
 llvm/test/tools/llvm-dwp/X86/info-v5.s               |  2 +-
 llvm/test/tools/llvm-dwp/X86/merge.test              | 12 ++++++------
 llvm/test/tools/llvm-dwp/X86/simple.test             |  8 ++++----
 15 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index 3c3513e3114b7..5eb33022a9689 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -18,14 +18,14 @@ void DWARFCompileUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
   int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(getFormat());
   OS << format("0x%08" PRIx64, getOffset()) << ": Compile Unit:"
      << " length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
-     << " version = " << format("0x%04x", getVersion());
+     << ", version = " << format("0x%04x", getVersion());
   if (getVersion() >= 5)
-    OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
-  OS << " abbr_offset = "
+    OS << ", unit_type = " << dwarf::UnitTypeString(getUnitType());
+  OS << ", abbr_offset = "
      << format("0x%04" PRIx64, getAbbreviations()->getOffset())
-     << " addr_size = " << format("0x%02x", getAddressByteSize());
+     << ", addr_size = " << format("0x%02x", getAddressByteSize());
   if (getVersion() >= 5 && getUnitType() != dwarf::DW_UT_compile)
-    OS << " DWO_id = " << format("0x%016" PRIx64, *getDWOId());
+    OS << ", DWO_id = " << format("0x%016" PRIx64, *getDWOId());
   OS << " (next unit at " << format("0x%08" PRIx64, getNextUnitOffset())
      << ")\n";
 
diff --git a/llvm/test/CodeGen/X86/dwarf-headers.ll b/llvm/test/CodeGen/X86/dwarf-headers.ll
index 6159fc29f8623..2be7fc726acd5 100644
--- a/llvm/test/CodeGen/X86/dwarf-headers.ll
+++ b/llvm/test/CodeGen/X86/dwarf-headers.ll
@@ -38,7 +38,7 @@
 ; the length of the header.
 ;
 ; SINGLE-4: .debug_info contents:
-; SINGLE-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004 abbr_offset
+; SINGLE-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004, abbr_offset
 ; SINGLE-4: 0x0000000b: DW_TAG_compile_unit
 ;
 ; SINGLE-4: .debug_types contents:
@@ -48,11 +48,11 @@
 ; Verify the v4 split headers.
 ;
 ; O-4: .debug_info contents:
-; O-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004 abbr_offset
+; O-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004, abbr_offset
 ; O-4: 0x0000000b: DW_TAG_compile_unit
 ;
 ; DWO-4: .debug_info.dwo contents:
-; DWO-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004 abbr_offset
+; DWO-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004, abbr_offset
 ; DWO-4: 0x0000000b: DW_TAG_compile_unit
 ;
 ; DWO-4: .debug_types.dwo contents:
@@ -67,20 +67,20 @@
 ; SINGLE-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_type abbr_offset
 ; SINGLE-5: 0x00000018: DW_TAG_type_unit
 ; SINGLE-5-NOT: contents:
-; SINGLE-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_compile abbr_offset
+; SINGLE-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005, unit_type = DW_UT_compile, abbr_offset
 ; SINGLE-5: 0x0000000c: DW_TAG_compile_unit
 
 ; Verify the v5 split headers.
 ;
 ; O-5: .debug_info contents:
-; O-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_skeleton abbr_offset
+; O-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005, unit_type = DW_UT_skeleton, abbr_offset
 ; O-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
 ; O-5: 0x00000014: DW_TAG_skeleton_unit 
 ;
 ; DWO-5: .debug_info.dwo contents:
 ; DWO-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_type abbr_offset
 ; DWO-5: 0x00000018: DW_TAG_type_unit
-; DWO-5: 0x00000033: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_compile abbr_offset
+; DWO-5: 0x00000033: Compile Unit: {{.*}} version = 0x0005, unit_type = DW_UT_split_compile, abbr_offset
 ; DWO-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
 ; DWO-5: 0x00000047: DW_TAG_compile_unit
 
diff --git a/llvm/test/DebugInfo/Generic/version.ll b/llvm/test/DebugInfo/Generic/version.ll
index 4e1a7269ae216..99971a4cba53f 100644
--- a/llvm/test/DebugInfo/Generic/version.ll
+++ b/llvm/test/DebugInfo/Generic/version.ll
@@ -2,7 +2,7 @@
 ; RUN: llvm-dwarfdump %t | FileCheck %s
 
 ; Make sure we are generating DWARF version 3 when module flag says so.
-; CHECK: Compile Unit: length = {{.*}} version = 0x0003
+; CHECK: Compile Unit: length = {{.*}}, version = 0x0003
 
 define i32 @main() #0 !dbg !4 {
 entry:
diff --git a/llvm/test/DebugInfo/X86/dwarfdump-header-64.s b/llvm/test/DebugInfo/X86/dwarfdump-header-64.s
index 549374c100c47..cbe436a74bc61 100644
--- a/llvm/test/DebugInfo/X86/dwarfdump-header-64.s
+++ b/llvm/test/DebugInfo/X86/dwarfdump-header-64.s
@@ -76,7 +76,7 @@ CU_5_version:
 CU_5_end:
 
 # CHECK-LABEL: .debug_info contents:
-# CHECK: 0x00000000: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000001a)
+# CHECK: 0x00000000: Compile Unit: length = 0x00000016, version = 0x0005, unit_type = DW_UT_compile, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000001a)
 # CHECK: 0x0000000c: DW_TAG_compile_unit
 # CHECK-NEXT: DW_AT_producer {{.*}} "Handmade DWARF producer"
 # CHECK-NEXT: DW_AT_name {{.*}} "V5_compile_unit"
diff --git a/llvm/test/DebugInfo/X86/dwarfdump-header.s b/llvm/test/DebugInfo/X86/dwarfdump-header.s
index 41f3b2967933b..c2f425a6472f0 100644
--- a/llvm/test/DebugInfo/X86/dwarfdump-header.s
+++ b/llvm/test/DebugInfo/X86/dwarfdump-header.s
@@ -107,7 +107,7 @@ CU_4_version:
         .byte 0 # NULL
 CU_4_end:
 
-# CHECK: 0x00000000: Compile Unit: length = 0x00000015 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000019)
+# CHECK: 0x00000000: Compile Unit: length = 0x00000015, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x00000019)
 # CHECK: 0x0000000b: DW_TAG_compile_unit
 
 # DWARF v5 normal CU header.
@@ -125,7 +125,7 @@ CU_5_version:
         .byte 0 # NULL
 CU_5_end:
 
-# CHECK: 0x00000019: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000033)
+# CHECK: 0x00000019: Compile Unit: length = 0x00000016, version = 0x0005, unit_type = DW_UT_compile, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x00000033)
 # CHECK: 0x00000025: DW_TAG_compile_unit
 
         .section .debug_info.dwo,"e",@progbits
@@ -147,7 +147,7 @@ CU_split_5_version:
         .byte 0 # NULL
 CU_split_5_end:
 
-# CHECK: 0x00000000: Compile Unit: length = 0x0000001e version = 0x0005 unit_type = DW_UT_split_compile abbr_offset = 0x0000 addr_size = 0x08 DWO_id = 0x000000000000005a (next unit at 0x00000022)
+# CHECK: 0x00000000: Compile Unit: length = 0x0000001e, version = 0x0005, unit_type = DW_UT_split_compile, abbr_offset = 0x0000, addr_size = 0x08, DWO_id = 0x000000000000005a (next unit at 0x00000022)
 # CHECK: 0x00000014: DW_TAG_compile_unit
 # CHECK-NEXT: DW_AT_producer {{.*}} "Handmade DWO producer"
 # CHECK-NEXT: DW_AT_name {{.*}} "V5_dwo_compile_unit"
diff --git a/llvm/test/DebugInfo/dwarfdump-zlib.test b/llvm/test/DebugInfo/dwarfdump-zlib.test
index e60bbbfe358b7..3abb8d5c89a25 100644
--- a/llvm/test/DebugInfo/dwarfdump-zlib.test
+++ b/llvm/test/DebugInfo/dwarfdump-zlib.test
@@ -10,7 +10,7 @@ CHECK: .debug_abbrev contents
 // the decompression correctly and result is the same. This and above also checks
 // that sections names are properly shown in zlib-gnu style (without additional 'z' prefix).
 CHECK: .debug_info contents
-CHECK: 0x00000000: Compile Unit: length = 0x00000144 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000148)
+CHECK: 0x00000000: Compile Unit: length = 0x00000144, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x00000148)
 
 // Also check that relocations in the .zdebug sections are handled correctly:
 CHECK: DW_AT_ranges {{.*}} (0x00000000{{$}}
diff --git a/llvm/test/MC/WebAssembly/dwarfdump.ll b/llvm/test/MC/WebAssembly/dwarfdump.ll
index ea956002bd834..7e2c682a3104a 100644
--- a/llvm/test/MC/WebAssembly/dwarfdump.ll
+++ b/llvm/test/MC/WebAssembly/dwarfdump.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s
 
 ; CHECK: .debug_info contents:
-; CHECK-NEXT: 0x00000000: Compile Unit: length = 0x0000006e version = 0x0004 abbr_offset = 0x0000 addr_size = 0x04 (next unit at 0x00000072)
+; CHECK-NEXT: 0x00000000: Compile Unit: length = 0x0000006e, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x04 (next unit at 0x00000072)
 
 ; CHECK: 0x0000000b: DW_TAG_compile_unit
 ; CHECK-NEXT:              DW_AT_producer	("clang version 6.0.0 (trunk 315924) (llvm/trunk 315960)")
diff --git a/llvm/test/tools/dsymutil/X86/empty-CU.test b/llvm/test/tools/dsymutil/X86/empty-CU.test
index e2acefcdccfd2..f33f1a54d651f 100644
--- a/llvm/test/tools/dsymutil/X86/empty-CU.test
+++ b/llvm/test/tools/dsymutil/X86/empty-CU.test
@@ -1,6 +1,6 @@
 RUN: dsymutil --update -f %p/../Inputs/empty-CU.o -o - | llvm-dwarfdump -v - -debug-info | FileCheck %s
 
 CHECK: .debug_info contents:
-CHECK: 0x00000000: Compile Unit: length = 0x00000008 version = 0x0003 abbr_offset = 0x0000 addr_size = 0x04 (next unit at 0x0000000c)
+CHECK: 0x00000000: Compile Unit: length = 0x00000008, version = 0x0003, abbr_offset = 0x0000, addr_size = 0x04 (next unit at 0x0000000c)
 
 CHECK: 0x0000000b: DW_TAG_compile_unit [1]
diff --git a/llvm/test/tools/dsymutil/X86/generate-empty-CU.test b/llvm/test/tools/dsymutil/X86/generate-empty-CU.test
index 02914181b3356..53baa6789cf55 100644
--- a/llvm/test/tools/dsymutil/X86/generate-empty-CU.test
+++ b/llvm/test/tools/dsymutil/X86/generate-empty-CU.test
@@ -15,14 +15,14 @@ objects:
 ...
 
 .debug_info contents:
-CHECK: Compile Unit: length = 0x0000007d version = 0x0002 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000081)
+CHECK: Compile Unit: length = 0x0000007d, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x00000081)
 
 CHECK: DW_TAG_compile_unit
 CHECK:        DW_AT_name {{.*}} "basic1.c"
 CHECK:   DW_TAG_subprogram
                 DW_AT_name {{.*}} "main"
 
-CHECK: 0x00000081: Compile Unit: length = 0x00000089 version = 0x0002 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000010e)
+CHECK: 0x00000081: Compile Unit: length = 0x00000089, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000010e)
 
 CHECK: DW_TAG_compile_unit
 CHECK:        DW_AT_name {{.*}} "basic3.c"
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug_loclists_nouse.s b/llvm/test/tools/llvm-dwarfdump/X86/debug_loclists_nouse.s
index 0b9a087471517..384652516e729 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/debug_loclists_nouse.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/debug_loclists_nouse.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux -o %t.o
 # RUN: llvm-dwarfdump %t.o | FileCheck %s
 
-# CHECK: 0x00000000: Compile Unit: length = 0x00000009 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000000d)
+# CHECK: 0x00000000: Compile Unit: length = 0x00000009, version = 0x0005, unit_type = DW_UT_compile, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000000d)
 # CHECK: 0x0000000c: DW_TAG_compile_unit
 
 	.section	.debug_abbrev,"",@progbits
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/lookup.s b/llvm/test/tools/llvm-dwarfdump/X86/lookup.s
index 36a8a2906afee..dc1fb77f072b6 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/lookup.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/lookup.s
@@ -20,7 +20,7 @@
 # RUN:   | llvm-dwarfdump -lookup=0x14 - | \
 # RUN: FileCheck %s -check-prefixes=CHECK,C
 
-# CHECK: Compile Unit: length = 0x00000060 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000064)
+# CHECK: Compile Unit: length = 0x00000060, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x00000064)
 
 # CHECK: DW_TAG_compile_unit
 # CHECK:   DW_AT_name        ("foo.c")
diff --git a/llvm/test/tools/llvm-dwp/X86/dwos_list_from_exec_simple.test b/llvm/test/tools/llvm-dwp/X86/dwos_list_from_exec_simple.test
index 54c3a5789a2d6..b3de886cf2e35 100644
--- a/llvm/test/tools/llvm-dwp/X86/dwos_list_from_exec_simple.test
+++ b/llvm/test/tools/llvm-dwp/X86/dwos_list_from_exec_simple.test
@@ -61,25 +61,25 @@ CHECK: DW_TAG_subprogram
 CHECK: .debug_info.dwo contents:
 CHECK: [[AOFF:0x[0-9a-f]*]]:
 
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004
 CHECK: DW_TAG_compile_unit
 CHECK:   DW_AT_name {{.*}} "c.cpp"
 CHECK:   DW_TAG_subprogram
 CHECK:     DW_AT_name {{.*}} "c"
 
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004
 CHECK: DW_TAG_compile_unit
 CHECK:   DW_AT_name {{.*}} "e.cpp"
 CHECK:   DW_TAG_subprogram
 CHECK:     DW_AT_name {{.*}} "e"
 
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004
 CHECK: DW_TAG_compile_unit
 CHECK:   DW_AT_name {{.*}} "a.cpp"
 CHECK:   DW_TAG_subprogram
 CHECK:     DW_AT_name {{.*}} "a"
 
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004
 CHECK: DW_TAG_compile_unit
 CHECK:   DW_AT_name {{.*}} "b.cpp"
 CHECK:   DW_TAG_subprogram
@@ -87,7 +87,7 @@ CHECK:     DW_AT_name {{.*}} "b"
 CHECK:   DW_TAG_subprogram
 CHECK:     DW_AT_name {{.*}} "main"
 
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004
 CHECK: DW_TAG_compile_unit
 CHECK:   DW_AT_name {{.*}} "d.cpp"
 CHECK:   DW_TAG_subprogram
diff --git a/llvm/test/tools/llvm-dwp/X86/info-v5.s b/llvm/test/tools/llvm-dwp/X86/info-v5.s
index 21fa22793a833..e9797d49590da 100644
--- a/llvm/test/tools/llvm-dwp/X86/info-v5.s
+++ b/llvm/test/tools/llvm-dwp/X86/info-v5.s
@@ -6,7 +6,7 @@
 # RUN: llvm-dwarfdump -v %t.dwp | FileCheck %s
 
 #CHECK-DAG: .debug_info.dwo contents:
-#CHECK: 0x00000000: Compile Unit: length = 0x00000050 version = 0x0005 unit_type = DW_UT_split_compile abbr_offset = 0x0000 addr_size = 0x08 DWO_id = [[DWOID:.*]] (next unit at 0x00000054)
+#CHECK: 0x00000000: Compile Unit: length = 0x00000050, version = 0x0005, unit_type = DW_UT_split_compile, abbr_offset = 0x0000, addr_size = 0x08, DWO_id = [[DWOID:.*]] (next unit at 0x00000054)
 
 # CHECK-DAG: .debug_cu_index contents:
 # CHECK: version = 2 slots = 2
diff --git a/llvm/test/tools/llvm-dwp/X86/merge.test b/llvm/test/tools/llvm-dwp/X86/merge.test
index ff86084578f92..d981c0cc0ffa8 100644
--- a/llvm/test/tools/llvm-dwp/X86/merge.test
+++ b/llvm/test/tools/llvm-dwp/X86/merge.test
@@ -27,16 +27,16 @@ CHECK: 0x0000[[BAOFF:.*]]
 
 CHECK: .debug_info.dwo contents:
 CHECK: [[COFF:0x[0-9a-f]*]]:
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004 abbr_offset =
-CHECK:         0x[[CAOFF]] addr_size = 0x08 (next unit at [[AOFF:.*]])
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004, abbr_offset =
+CHECK:         0x[[CAOFF]], addr_size = 0x08 (next unit at [[AOFF:.*]])
 CHECK:   DW_AT_GNU_dwo_id {{.*}} ([[DWOC:.*]])
 CHECK: [[AOFF]]:
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004 abbr_offset =
-CHECK:         0x[[AAOFF]] addr_size = 0x08 (next unit at [[BOFF:.*]])
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004, abbr_offset =
+CHECK:         0x[[AAOFF]], addr_size = 0x08 (next unit at [[BOFF:.*]])
 CHECK:   DW_AT_GNU_dwo_id {{.*}} ([[DWOA:.*]])
 CHECK: [[BOFF]]:
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004 abbr_offset =
-CHECK:         0x[[BAOFF]] addr_size = 0x08 (next unit at [[XOFF:.*]])
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004, abbr_offset =
+CHECK:         0x[[BAOFF]], addr_size = 0x08 (next unit at [[XOFF:.*]])
 CHECK:   DW_AT_GNU_dwo_id {{.*}} ([[DWOB:.*]])
 
 CHECK-LABEL: .debug_cu_index
diff --git a/llvm/test/tools/llvm-dwp/X86/simple.test b/llvm/test/tools/llvm-dwp/X86/simple.test
index 788e6f5cfe4f7..c53a7de483d6f 100644
--- a/llvm/test/tools/llvm-dwp/X86/simple.test
+++ b/llvm/test/tools/llvm-dwp/X86/simple.test
@@ -29,8 +29,8 @@ CHECK: DW_TAG_formal_parameter
 
 CHECK: .debug_info.dwo contents:
 CHECK: [[AOFF:0x[0-9a-f]*]]:
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004 abbr_offset =
-CHECK:         0x[[AAOFF]] addr_size = 0x08 (next unit at [[BOFF:.*]])
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004, abbr_offset =
+CHECK:         0x[[AAOFF]], addr_size = 0x08 (next unit at [[BOFF:.*]])
 CHECK: DW_TAG_compile_unit
 CHECK:   DW_AT_name {{.*}} "a.cpp"
 CHECK:   DW_AT_GNU_dwo_id {{.*}} ([[DWOA:.*]])
@@ -41,8 +41,8 @@ NOTYP:     DW_AT_name {{.*}} "foo"
 TYPES:     DW_AT_signature {{.*}} ([[FOOSIG:.*]])
 
 CHECK: [[BOFF]]:
-CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004 abbr_offset =
-CHECK:         0x[[BAOFF]] addr_size = 0x08 (next unit at [[XOFF:.*]])
+CHECK-LABEL: Compile Unit: length = {{.*}}, version = 0x0004, abbr_offset =
+CHECK:         0x[[BAOFF]], addr_size = 0x08 (next unit at [[XOFF:.*]])
 CHECK:   DW_AT_name {{.*}} "b.cpp"
 CHECK:   DW_AT_GNU_dwo_id {{.*}} ([[DWOB:.*]])
 CHECK:   DW_TAG_structure_type

From cbec419b3ebba3fe5551540cfd3e484dfa253a3a Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin@accesssoftek.com>
Date: Mon, 1 Jun 2020 15:57:23 +0700
Subject: [PATCH 703/770] [DebugInfo] Separate fields with commas in headers of
 type units (3/3).

For most tables, we already use commas in headers. This set of patches
unifies dumping the remaining ones.

Differential Revision: https://reviews.llvm.org/D80806
---
 llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp    | 18 +++++++++---------
 llvm/test/CodeGen/X86/dwarf-headers.ll        |  8 ++++----
 llvm/test/CodeGen/X86/dwarf-split-line-1.ll   |  2 +-
 llvm/test/DebugInfo/X86/dwarfdump-header.s    |  4 ++--
 llvm/test/DebugInfo/dwarfdump-type-units.test |  4 ++--
 llvm/test/DebugInfo/typeunit-header.test      |  2 +-
 llvm/test/tools/llvm-dwp/X86/simple.test      |  8 ++++----
 llvm/test/tools/llvm-dwp/X86/type_dedup.test  | 12 ++++++------
 8 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index 1873e2285140e..c06c20bbd6510 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -24,23 +24,23 @@ void DWARFTypeUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
 
   if (DumpOpts.SummarizeTypes) {
     OS << "name = '" << Name << "'"
-       << " type_signature = " << format("0x%016" PRIx64, getTypeHash())
-       << " length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
+       << ", type_signature = " << format("0x%016" PRIx64, getTypeHash())
+       << ", length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
        << '\n';
     return;
   }
 
   OS << format("0x%08" PRIx64, getOffset()) << ": Type Unit:"
      << " length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
-     << " version = " << format("0x%04x", getVersion());
+     << ", version = " << format("0x%04x", getVersion());
   if (getVersion() >= 5)
-    OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
-  OS << " abbr_offset = "
+    OS << ", unit_type = " << dwarf::UnitTypeString(getUnitType());
+  OS << ", abbr_offset = "
      << format("0x%04" PRIx64, getAbbreviations()->getOffset())
-     << " addr_size = " << format("0x%02x", getAddressByteSize())
-     << " name = '" << Name << "'"
-     << " type_signature = " << format("0x%016" PRIx64, getTypeHash())
-     << " type_offset = " << format("0x%04" PRIx64, getTypeOffset())
+     << ", addr_size = " << format("0x%02x", getAddressByteSize())
+     << ", name = '" << Name << "'"
+     << ", type_signature = " << format("0x%016" PRIx64, getTypeHash())
+     << ", type_offset = " << format("0x%04" PRIx64, getTypeOffset())
      << " (next unit at " << format("0x%08" PRIx64, getNextUnitOffset())
      << ")\n";
 
diff --git a/llvm/test/CodeGen/X86/dwarf-headers.ll b/llvm/test/CodeGen/X86/dwarf-headers.ll
index 2be7fc726acd5..4c029de5dbfa1 100644
--- a/llvm/test/CodeGen/X86/dwarf-headers.ll
+++ b/llvm/test/CodeGen/X86/dwarf-headers.ll
@@ -42,7 +42,7 @@
 ; SINGLE-4: 0x0000000b: DW_TAG_compile_unit
 ;
 ; SINGLE-4: .debug_types contents:
-; SINGLE-4: 0x00000000: Type Unit: {{.*}} version = 0x0004 abbr_offset
+; SINGLE-4: 0x00000000: Type Unit: {{.*}} version = 0x0004, abbr_offset
 ; SINGLE-4: 0x00000017: DW_TAG_type_unit
 
 ; Verify the v4 split headers.
@@ -56,7 +56,7 @@
 ; DWO-4: 0x0000000b: DW_TAG_compile_unit
 ;
 ; DWO-4: .debug_types.dwo contents:
-; DWO-4: 0x00000000: Type Unit: {{.*}} version = 0x0004 abbr_offset
+; DWO-4: 0x00000000: Type Unit: {{.*}} version = 0x0004, abbr_offset
 ; DWO-4: 0x00000017: DW_TAG_type_unit
 
 ; Verify the v5 non-split headers. Type units come first.
@@ -64,7 +64,7 @@
 ; each new section.
 ;
 ; SINGLE-5: .debug_info contents:
-; SINGLE-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_type abbr_offset
+; SINGLE-5: 0x00000000: Type Unit: {{.*}} version = 0x0005, unit_type = DW_UT_type, abbr_offset
 ; SINGLE-5: 0x00000018: DW_TAG_type_unit
 ; SINGLE-5-NOT: contents:
 ; SINGLE-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005, unit_type = DW_UT_compile, abbr_offset
@@ -78,7 +78,7 @@
 ; O-5: 0x00000014: DW_TAG_skeleton_unit 
 ;
 ; DWO-5: .debug_info.dwo contents:
-; DWO-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_type abbr_offset
+; DWO-5: 0x00000000: Type Unit: {{.*}} version = 0x0005, unit_type = DW_UT_split_type, abbr_offset
 ; DWO-5: 0x00000018: DW_TAG_type_unit
 ; DWO-5: 0x00000033: Compile Unit: {{.*}} version = 0x0005, unit_type = DW_UT_split_compile, abbr_offset
 ; DWO-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
diff --git a/llvm/test/CodeGen/X86/dwarf-split-line-1.ll b/llvm/test/CodeGen/X86/dwarf-split-line-1.ll
index cdc3e205b59ef..d57c0b6d8a4d3 100644
--- a/llvm/test/CodeGen/X86/dwarf-split-line-1.ll
+++ b/llvm/test/CodeGen/X86/dwarf-split-line-1.ll
@@ -8,7 +8,7 @@
 
 ; CHECK-NOT: .debug_line.dwo
 ; CHECK: .debug_info.dwo contents:
-; CHECK: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_type abbr_offset
+; CHECK: 0x00000000: Type Unit: {{.*}} version = 0x0005, unit_type = DW_UT_split_type, abbr_offset
 ; CHECK: 0x00000018: DW_TAG_type_unit
 ; CHECK-NOT: DW_AT_stmt_list
 ; CHECK-NOT: DW_AT_decl_file
diff --git a/llvm/test/DebugInfo/X86/dwarfdump-header.s b/llvm/test/DebugInfo/X86/dwarfdump-header.s
index c2f425a6472f0..6b40c99736fe5 100644
--- a/llvm/test/DebugInfo/X86/dwarfdump-header.s
+++ b/llvm/test/DebugInfo/X86/dwarfdump-header.s
@@ -181,7 +181,7 @@ TU_split_5_type:
         .byte 0 # NULL
 TU_split_5_end:
 
-# CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
+# CHECK: 0x00000000: Type Unit: length = 0x00000020, version = 0x0005, unit_type = DW_UT_split_type, abbr_offset = 0x0000, addr_size = 0x08, name = 'V5_split_type_unit', type_signature = 0x8899aabbccddeeff, type_offset = 0x001d (next unit at 0x00000024)
 # CHECK: 0x00000018: DW_TAG_type_unit
 
         .section .debug_types,"",@progbits
@@ -207,7 +207,7 @@ TU_4_type:
         .byte 0 # NULL
 TU_4_end:
 
-# CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
+# CHECK: 0x00000000: Type Unit: length = 0x0000001f, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08, name = 'V4_type_unit', type_signature = 0x0011223344556677, type_offset = 0x001c (next unit at 0x00000023)
 # CHECK: 0x00000017: DW_TAG_type_unit
 
         .section .debug_line,"",@progbits
diff --git a/llvm/test/DebugInfo/dwarfdump-type-units.test b/llvm/test/DebugInfo/dwarfdump-type-units.test
index a8876a68afc1a..9f9efc5aaf0d4 100644
--- a/llvm/test/DebugInfo/dwarfdump-type-units.test
+++ b/llvm/test/DebugInfo/dwarfdump-type-units.test
@@ -18,7 +18,7 @@ CHECK: debug_types contents:
 LONG: 0x00000000: Type Unit: {{.*}} name =
 SHORT-NOT: Type Unit
 SHORT: name =
-CHECK-SAME: 'bar' type_signature = [[BAR_SIG]]
+CHECK-SAME: 'bar', type_signature = [[BAR_SIG]]
 SHORT-SAME: length =
 LONG-SAME: type_offset = 0x[[BAR_OFF:[0-9a-f]*]] (next unit at
 LONG: DW_TAG_type_unit
@@ -28,7 +28,7 @@ LONG-NEXT: DW_AT_name {{.*}}"bar"
 
 LONG: 0x00000000: Type Unit: {{.*}} name =
 SHORT: name =
-CHECK-SAME: 'foo' type_signature = [[FOO_SIG]]
+CHECK-SAME: 'foo', type_signature = [[FOO_SIG]]
 LONG-SAME: type_offset = 0x[[FOO_OFF:[0-9a-f]*]] (next unit at
 LONG: DW_TAG_type_unit
 LONG-NOT: NULL
diff --git a/llvm/test/DebugInfo/typeunit-header.test b/llvm/test/DebugInfo/typeunit-header.test
index a8d3f47144f57..e73cc6caecf4d 100644
--- a/llvm/test/DebugInfo/typeunit-header.test
+++ b/llvm/test/DebugInfo/typeunit-header.test
@@ -10,6 +10,6 @@ We make sure that llvm-dwarfdump is parsing the type unit header correctly
 and displays it. 
 
 CHECK: .debug_types contents:
-CHECK: 0x00000000: Type Unit: length = 0x00000019 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = '' type_signature = 0x0011223344556677 type_offset = 0x0019 (next unit at 0x0000001d)
+CHECK: 0x00000000: Type Unit: length = 0x00000019, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08, name = '', type_signature = 0x0011223344556677, type_offset = 0x0019 (next unit at 0x0000001d)
 CHECK: 0x00000017: DW_TAG_type_unit [1] *
 CHECK: DW_AT_visibility [DW_FORM_data1] (DW_VIS_local)
diff --git a/llvm/test/tools/llvm-dwp/X86/simple.test b/llvm/test/tools/llvm-dwp/X86/simple.test
index c53a7de483d6f..b74db32231105 100644
--- a/llvm/test/tools/llvm-dwp/X86/simple.test
+++ b/llvm/test/tools/llvm-dwp/X86/simple.test
@@ -55,14 +55,14 @@ CHECK:     DW_TAG_formal_parameter
 NOTYP-NOT: .debug_types.dwo contents:
 TYPES-LABEL: .debug_types.dwo contents:
 TYPES: [[FOOUOFF:0x[0-9a-f]*]]:
-TYPES-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset =
-TYPES:         0x[[AAOFF]] addr_size = 0x08 name = 'foo' type_signature = [[FOOSIG]] type_offset = 0x[[FOOOFF:.*]] (next unit at [[BARUOFF:.*]])
+TYPES-LABEL: Type Unit: length = 0x00000020, version = 0x0004, abbr_offset =
+TYPES:         0x[[AAOFF]], addr_size = 0x08, name = 'foo', type_signature = [[FOOSIG]], type_offset = 0x[[FOOOFF:.*]] (next unit at [[BARUOFF:.*]])
 TYPES:             DW_TAG_type_unit
 TYPES: [[FOOOFF]]:   DW_TAG_structure_type
 TYPES:                 DW_AT_name {{.*}} "foo"
 TYPES: [[BARUOFF]]:
-TYPES-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset =
-TYPES:         0x[[BAOFF]] addr_size = 0x08 name = 'bar' type_signature = [[BARSIG]] type_offset = 0x001e (next unit at [[XUOFF:.*]])
+TYPES-LABEL: Type Unit: length = 0x00000020, version = 0x0004, abbr_offset =
+TYPES:         0x[[BAOFF]], addr_size = 0x08, name = 'bar', type_signature = [[BARSIG]], type_offset = 0x001e (next unit at [[XUOFF:.*]])
 TYPES:             DW_TAG_type_unit
 TYPES: 0x00000042:   DW_TAG_structure_type
 TYPES:                 DW_AT_name {{.*}} "bar"
diff --git a/llvm/test/tools/llvm-dwp/X86/type_dedup.test b/llvm/test/tools/llvm-dwp/X86/type_dedup.test
index 666db0f64f84e..d7044d0cba5ed 100644
--- a/llvm/test/tools/llvm-dwp/X86/type_dedup.test
+++ b/llvm/test/tools/llvm-dwp/X86/type_dedup.test
@@ -18,20 +18,20 @@ b.cpp:
 
 CHECK-LABEL: .debug_types.dwo contents:
 CHECK: [[COMMONUOFF:0x[0-9a-f]*]]:
-CHECK-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset =
-CHECK:         0x0000 addr_size = 0x08 name = 'common' type_signature = [[COMMONSIG:0x[0-9a-f]*]] type_offset = 0x[[COMMONOFF:.*]] (next unit at [[AUOFF:.*]])
+CHECK-LABEL: Type Unit: length = 0x00000020, version = 0x0004, abbr_offset =
+CHECK:         0x0000, addr_size = 0x08, name = 'common', type_signature = [[COMMONSIG:0x[0-9a-f]*]], type_offset = 0x[[COMMONOFF:.*]] (next unit at [[AUOFF:.*]])
 CHECK:                DW_TAG_type_unit
 CHECK: [[COMMONOFF]]:   DW_TAG_structure_type
 CHECK:                    DW_AT_name {{.*}} "common"
 CHECK: [[AUOFF]]:
-CHECK-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset =
-CHECK:         0x0000 addr_size = 0x08 name = 'adistinct' type_signature = [[ASIG:0x[0-9a-f]*]] type_offset = 0x[[AOFF:.*]] (next unit at [[BUOFF:.*]])
+CHECK-LABEL: Type Unit: length = 0x00000020, version = 0x0004, abbr_offset =
+CHECK:         0x0000, addr_size = 0x08, name = 'adistinct', type_signature = [[ASIG:0x[0-9a-f]*]], type_offset = 0x[[AOFF:.*]] (next unit at [[BUOFF:.*]])
 CHECK:             DW_TAG_type_unit
 CHECK: 0x00000042:   DW_TAG_structure_type
 CHECK:                 DW_AT_name {{.*}} "adistinct"
 CHECK: [[BUOFF]]:
-CHECK-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset =
-CHECK:         0x{{.*}} addr_size = 0x08 name = 'bdistinct' type_signature = [[BSIG:0x[0-9a-f]*]] type_offset = 0x[[BOFF:.*]] (next unit at [[XUOFF:.*]])
+CHECK-LABEL: Type Unit: length = 0x00000020, version = 0x0004, abbr_offset =
+CHECK:         0x{{.*}}, addr_size = 0x08, name = 'bdistinct', type_signature = [[BSIG:0x[0-9a-f]*]], type_offset = 0x[[BOFF:.*]] (next unit at [[XUOFF:.*]])
 CHECK:             DW_TAG_type_unit
 CHECK: 0x00000066:   DW_TAG_structure_type
 CHECK:                 DW_AT_name {{.*}} "bdistinct"

From de82114db86bb1bd89f3596409dc8f30fa316ce5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 1 Jun 2020 11:36:27 +0100
Subject: [PATCH 704/770] MIPatternMatch.h - remove unused APFloat/APInt
 includes. NFC.

---
 llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 1fae635f48fd3..043be086ff417 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -12,8 +12,6 @@
 #ifndef LLVM_GMIR_PATTERNMATCH_H
 #define LLVM_GMIR_PATTERNMATCH_H
 
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/InstrTypes.h"

From 014648e8f27b2f57940844f0170d15b42a497d33 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 1 Jun 2020 11:39:56 +0100
Subject: [PATCH 705/770] ARMFrameLowering.h - remove unnecessary includes.
 NFC.

They are implicitly included in TargetFrameLowering.h and only ever used in TargetFrameLowering override methods.
---
 llvm/lib/Target/ARM/ARMFrameLowering.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h
index dd4c0caf1c95d..4c2c07d64f57a 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -9,9 +9,7 @@
 #ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
 #define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
 
-#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
-#include <vector>
 
 namespace llvm {
 

From 7bcde99f7779a71a389e559b7ff2ff6ec43ae004 Mon Sep 17 00:00:00 2001
From: James Henderson <james.henderson@sony.com>
Date: Wed, 20 May 2020 15:29:55 +0100
Subject: [PATCH 706/770] [llvm-dwarfdump][test] Use verbose output to check
 expected opcodes

The debug_line_invalid.test test case was previously using the
interpreted line table dumping to identify which opcodes have been
parsed. This change moves to looking for the expected opcodes
explicitly. This is probably a little clearer and also allows for
testing some cases that wouldn't be easily identifiable from the
interpreted table.

Reviewed by: MaskRay

Differential Revision: https://reviews.llvm.org/D80795
---
 .../X86/debug_line_invalid.test               | 111 +++++++++++++-----
 1 file changed, 81 insertions(+), 30 deletions(-)

diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test
index 7fa13fc9b9ad7..5386ef9b93a67 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test
+++ b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test
@@ -23,7 +23,8 @@
 # RUN:   | FileCheck %s --check-prefixes=FIRST,NONFATAL,LAST --implicit-check-not='debug_line[{{.*}}]'
 # RUN: FileCheck %s --input-file=%t-malformed.err --check-prefixes=ALL,OTHER
 # RUN: llvm-dwarfdump -debug-line %t-malformed.o -verbose 2> %t-malformed-verbose.err \
-# RUN:   | FileCheck %s --check-prefixes=FIRST,NONFATAL,LAST --implicit-check-not='debug_line[{{.*}}]'
+# RUN:   | FileCheck %s --check-prefixes=FIRST,VERBOSE,NONFATAL,LAST --implicit-check-not='debug_line[{{.*}}]' \
+# RUN:       --implicit-check-not=' DW_LNS' --implicit-check-not=' DW_LNE' --implicit-check-not='address +='
 # RUN: FileCheck %s --input-file=%t-malformed-verbose.err --check-prefixes=ALL,OTHER
 
 ## We should still produce warnings for malformed tables after the specified unit.
@@ -36,12 +37,14 @@
 # RUN:   | FileCheck %s --check-prefix=LAST --implicit-check-not='debug_line[{{.*}}]'
 # RUN: FileCheck %s --input-file=%t-malformed-off-last.err --check-prefix=ALL
 
-# FIRST:       debug_line[0x00000000]
-# FIRST:       0x000000000badbeef {{.*}} end_sequence
-# NOFIRST-NOT: debug_line[0x00000000]
-# NOFIRST-NOT: 0x000000000badbeef {{.*}} end_sequence
-# NOLATER-NOT: debug_line[{{.*}}]
-# NOLATER-NOT: end_sequence
+# FIRST:        debug_line[0x00000000]
+# VERBOSE:      DW_LNE_set_address (0x000000000badbeef)
+# VERBOSE-NEXT: DW_LNE_end_sequence
+# FIRST:        0x000000000badbeef {{.*}} end_sequence
+# NOFIRST-NOT:  debug_line[0x00000000]
+# NOFIRST-NOT:  0x000000000badbeef {{.*}} end_sequence
+# NOLATER-NOT:  debug_line[{{.*}}]
+# NOLATER-NOT:  end_sequence
 
 ## For fatal issues, the following table(s) should not be dumped:
 # FATAL:      debug_line[0x00000048]
@@ -72,7 +75,8 @@
 # NONFATAL:      prologue_length: 0x00000013
 # NONFATAL-NOT:  include_directories
 # NONFATAL-NOT:  file_names
-# NONFATAL:      0x8877665544332211 {{.*}} end_sequence
+# VERBOSE:       DW_LNE_set_address (0x8877665544332211)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## Prologue with length shorter than parsed.
 # NONFATAL:      debug_line[0x00000081]
@@ -82,7 +86,9 @@
 # NONFATAL-NEXT:       dir_index: 1
 # NONFATAL-NEXT:        mod_time: 0x00000002
 # NONFATAL-NEXT:          length: 0x00000003
-# NONFATAL:      0x1122334455667788 {{.*}} 0 end_sequence{{$}}
+# VERBOSE:       DW_LNS_negate_stmt
+# VERBOSE-NEXT:  DW_LNE_set_address (0x1122334455667788)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## Prologue with length longer than parsed.
 # NONFATAL:      debug_line[0x000000c8]
@@ -93,21 +99,26 @@
 # NONFATAL-NEXT:        mod_time: 0x00000002
 # NONFATAL-NEXT:          length: 0x00000003
 # NONFATAL-NOT:  file_names
-# NONFATAL:      0x1111222233334444 {{.*}} is_stmt end_sequence
+# VERBOSE:       DW_LNE_set_address (0x1111222233334444)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## Extended opcode with incorrect length versus expected.
 # NONFATAL:      debug_line[0x00000111]
 # NONFATAL-NEXT: Line table prologue
 # NONFATAL:      prologue_length: 0x00000030
-# NONFATAL: 0x00000000abbadaba {{.*}} end_sequence
-# NONFATAL: 0x00000000babb1e45 {{.*}} 10 is_stmt prologue_end end_sequence{{$}}
+# VERBOSE:       DW_LNE_set_address (0x00000000abbadaba)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
+# VERBOSE:       DW_LNE_set_discriminator (10)
+# VERBOSE-NEXT:  DW_LNS_set_prologue_end
+# VERBOSE-NEXT:  DW_LNE_set_address (0x00000000babb1e45)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## No end of sequence.
 # NONFATAL:      debug_line[0x0000016c]
 # NONFATAL-NEXT: Line table prologue
 # NONFATAL:      prologue_length: 0x00000030
-# NONFATAL:      0x00000000deadfade {{.*}} is_stmt
-# NONFATAL-NOT:  end_sequence
+# VERBOSE:       DW_LNE_set_address (0x00000000deadfade)
+# VERBOSE-NEXT:  DW_LNS_copy
 
 ## Very short prologue length for V5 (ends during parameters).
 # NONFATAL:      debug_line[0x000001b2]
@@ -116,8 +127,22 @@
 # NONFATAL-NEXT: include_directories[  0] = "/tmp"
 # NONFATAL-NEXT: file_names[  0]:
 # NONFATAL-NEXT:            name: "xyz"
-# NONFATAL:      0x0000000000000000 1 0 1 0 0 is_stmt end_sequence
-# NONFATAL:      0x0000babb1ebabb1e {{.*}} end_sequence
+# VERBOSE:       DW_LNE_end_sequence
+# VERBOSE:       DW_LNS_copy
+# VERBOSE:       DW_LNS_const_add_pc (0x0000000000000011)
+# VERBOSE-NEXT:  DW_LNS_copy
+# VERBOSE:       address += 2, line += 1
+# VERBOSE:       address += 7, line += 0
+# VERBOSE:       address += 6, line += 7
+# VERBOSE:       address += 7, line += -4
+# VERBOSE:       DW_LNE_end_sequence
+# VERBOSE:       DW_LNS_const_add_pc (0x0000000000000011)
+# VERBOSE-NEXT:  DW_LNS_copy
+# VERBOSE:       address += 7, line += 4
+# VERBOSE:       address += 7, line += 5
+# VERBOSE:       address += 7, line += 6
+# VERBOSE:       DW_LNE_set_address (0x0000babb1ebabb1e)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## V5 prologue ends during file table.
 # NONFATAL:      debug_line[0x000001ee]
@@ -126,8 +151,14 @@
 # NONFATAL-NEXT: file_names[  0]:
 # NONFATAL-NEXT:            name: "xyz"
 # NONFATAL-NEXT:       dir_index: 1
-# NONFATAL:      0x0000000000000000 {{.*}} epilogue_begin
-# NONFATAL:      0x00000ab4acadab4a {{.*}} end_sequence
+# VERBOSE:       DW_LNS_set_epilogue_begin
+# VERBOSE-NEXT:  DW_LNS_copy
+# VERBOSE:       address += 7, line += 4
+# VERBOSE:       address += 7, line += 5
+# VERBOSE:       address += 7, line += 6
+# VERBOSE:       DW_LNE_end_sequence
+# VERBOSE:       DW_LNE_set_address (0x00000ab4acadab4a)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## V5 prologue ends during directory table.
 # NONFATAL:      debug_line[0x0000022f]
@@ -135,16 +166,26 @@
 # NONFATAL:      include_directories[  0] = "/tmp"
 # NONFATAL-NEXT: file_names[  0]:
 # NONFATAL-NEXT:            name: "xyz"
-# NONFATAL:      0x0000000000000002 2 0 1 0 0 is_stmt{{$}}
-# NONFATAL:      0x4444333322221111 {{.*}} end_sequence
+# VERBOSE:       address += 2, line += 1
+# VERBOSE:       address += 7, line += 0
+# VERBOSE:       address += 6, line += 7
+# VERBOSE:       address += 7, line += -4
+# VERBOSE:       DW_LNE_end_sequence
+# VERBOSE:       DW_LNS_const_add_pc (0x0000000000000011)
+# VERBOSE-NEXT:  DW_LNS_copy
+# VERBOSE:       address += 7, line += 4
+# VERBOSE:       address += 7, line += 5
+# VERBOSE:       address += 7, line += 6
+# VERBOSE:       DW_LNE_set_address (0x4444333322221111)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## V5 invalid MD5 hash form when there is still data to be read.
 # NONFATAL:      debug_line[0x0000026b]
 # NONFATAL-NEXT: Line table prologue
 # NONFATAL:      include_directories[  0] = "/tmp"
 # NONFATAL-NOT:  file_names
-# NONFATAL-NOT:  is_stmt
-# NONFATAL:      0x1234123412341234 {{.*}} end_sequence
+# VERBOSE:       DW_LNE_set_address (0x1234123412341234)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## V5 invalid MD5 hash form when data beyond the prologue length has
 ## been read before the MD5 problem is identified.
@@ -152,8 +193,13 @@
 # NONFATAL-NEXT: Line table prologue
 # NONFATAL:      include_directories[  0] = "/tmp"
 # NONFATAL-NOT:  file_names
-# NONFATAL:      0x0000000000000000 {{.*}} epilogue_begin
-# NONFATAL:      0x4321432143214321 {{.*}} is_stmt end_sequence
+# VERBOSE:       DW_LNS_set_epilogue_begin
+# VERBOSE-NEXT:  DW_LNS_copy
+# VERBOSE:       address += 7, line += 4
+# VERBOSE:       address += 7, line += 5
+# VERBOSE:       address += 7, line += 6
+# VERBOSE:       DW_LNE_set_address (0x4321432143214321)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## V5 invalid directory content description has unsupported form.
 # NONFATAL:      debug_line[0x000002ec]
@@ -161,7 +207,8 @@
 # NONFATAL:      include_directories[  0] = "/foo"
 # NONFATAL-NOT:  include_directories
 # NONFATAL-NOT:  file_names
-# NONFATAL:      0xaaaabbbbccccdddd {{.*}} is_stmt end_sequence
+# VERBOSE:       DW_LNE_set_address (0xaaaabbbbccccdddd)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## Opcode base field of value zero.
 # NONFATAL:      debug_line[0x00000332]
@@ -172,15 +219,17 @@
 # NONFATAL-NEXT:       dir_index: 1
 # NONFATAL-NEXT:        mod_time: 0x00000002
 # NONFATAL-NEXT:          length: 0x00000003
-# NONFATAL:      0xffffeeeeddddcccd 1 0 1 0 0 is_stmt{{$}}
-# NONFATAL:      0xffffeeeeddddcccd 1 0 1 0 0 is_stmt end_sequence{{$}}
+# VERBOSE:       DW_LNE_set_address (0xffffeeeeddddcccc)
+# VERBOSE-NEXT:  address += 1, line += 0
+# VERBOSE:       DW_LNE_end_sequence
 
 ## V4 table with unterminated include directory table.
 # NONFATAL:      debug_line[0x00000361]
 # NONFATAL-NEXT: Line table prologue
 # NONFATAL:      include_directories[  1] = "dir1"
 # NONFATAL-NOT:  file_names
-# NONFATAL:      0xabcdef0123456789 {{.*}} is_stmt end_sequence
+# VERBOSE:       DW_LNE_set_address (0xabcdef0123456789)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 ## V4 table with unterminated file name table.
 # NONFATAL:      debug_line[0x00000390]
@@ -191,10 +240,12 @@
 # NONFATAL-NEXT:        mod_time: 0x00000002
 # NONFATAL-NEXT:          length: 0x00000003
 # NONFATAL-NOT:  file_names
-# NONFATAL:      0xababcdcdefef0909 {{.*}} is_stmt end_sequence
+# VERBOSE:       DW_LNE_set_address (0xababcdcdefef0909)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 # LAST:          debug_line[0x000003c9]
-# LAST:          0x00000000cafebabe {{.*}} end_sequence
+# VERBOSE:       DW_LNE_set_address (0x00000000cafebabe)
+# VERBOSE-NEXT:  DW_LNE_end_sequence
 
 # ALL-NOT:  warning:
 # ALL:      warning: parsing line table prologue at offset 0x00000048: unsupported version 0

From e5b8772756737e41cb1e8ee1a5a33cb3d8a25be6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 1 Jun 2020 06:49:07 -0400
Subject: [PATCH 707/770] [utils] change default nameless value to "TMP"

This is effectively reverting rGbfdc2552664d to avoid test churn
while we figure out a better way forward.

We at least salvage the warning on name conflict from that patch
though.

If we change the default string again, we may want to mass update
tests at the same time. Alternatively, we could live with the poor
naming if we change -instnamer.

This also adds a test to LLVM as suggested in the post-commit
review. There's a clang test that is also affected. That seems
like a layering violation, but I have not looked at fixing that yet.

Differential Revision: https://reviews.llvm.org/D80584
---
 .../Inputs/mangled_names.c.expected           | 20 +++++++++----------
 .../Inputs/mangled_names.c.funcsig.expected   | 20 +++++++++----------
 .../update_test_checks/Inputs/basic.ll        | 11 +++++++++-
 .../Inputs/basic.ll.expected                  | 11 +++++++++-
 .../Inputs/basic.ll.funcsig.expected          | 12 ++++++++++-
 llvm/utils/UpdateTestChecks/common.py         |  2 +-
 6 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected
index 6ea154286c152..d6ba7ae09b620 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected
+++ b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.expected
@@ -8,10 +8,10 @@
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
 // CHECK-NEXT:    ret i64 [[ADD]]
 //
 long test(long a, int b) {
@@ -27,12 +27,12 @@ long test(long a, int b) {
 // CHECK-NEXT:    store i64 [[A:%.*]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B:%.*]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[C:%.*]], i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
-// CHECK-NEXT:    [[NAMELESS2:%.*]] = load i32, i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[NAMELESS2]] to i64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[C_ADDR]], align 4
+// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[TMP2]] to i64
 // CHECK-NEXT:    [[ADD2:%.*]] = add nsw i64 [[ADD]], [[CONV1]]
 // CHECK-NEXT:    ret i64 [[ADD2]]
 //
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected
index dbe1296182aa6..005b2f2427473 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected
+++ b/clang/test/utils/update_cc_test_checks/Inputs/mangled_names.c.funcsig.expected
@@ -9,10 +9,10 @@
 // CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
 // CHECK-NEXT:    ret i64 [[ADD]]
 //
 long test(long a, int b) {
@@ -29,12 +29,12 @@ long test(long a, int b) {
 // CHECK-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
 // CHECK-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[NAMELESS0:%.*]] = load i64, i64* [[A_ADDR]], align 8
-// CHECK-NEXT:    [[NAMELESS1:%.*]] = load i32, i32* [[B_ADDR]], align 4
-// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[NAMELESS1]] to i64
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[NAMELESS0]], [[CONV]]
-// CHECK-NEXT:    [[NAMELESS2:%.*]] = load i32, i32* [[C_ADDR]], align 4
-// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[NAMELESS2]] to i64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[CONV]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[C_ADDR]], align 4
+// CHECK-NEXT:    [[CONV1:%.*]] = sext i32 [[TMP2]] to i64
 // CHECK-NEXT:    [[ADD2:%.*]] = add nsw i64 [[ADD]], [[CONV1]]
 // CHECK-NEXT:    ret i64 [[ADD2]]
 //
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll
index 7189eb4856ea5..707c5fb4a9708 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll
@@ -1,4 +1,4 @@
-; Example input for update_llc_test_checks (taken from test/Transforms/InstSimplify/add.ll)
+; Example input for update_test_checks (taken from test/Transforms/InstSimplify/add.ll)
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
 define i32 @common_sub_operand(i32 %X, i32 %Y) {
@@ -47,3 +47,12 @@ define <2 x i8> @knownnegation_commute_vec(<2 x i8> %x, <2 x i8> %y) {
   %r = add <2 x i8> %yx, %xy
   ret <2 x i8> %r
 }
+
+define i32 @nameless_value(i32 %X) {
+; CHECK-LABEL: @nameless_value(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = sub i32 42, %X
+  ret i32 %1
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.expected
index 71abc6aec3761..10e4b4b58efd0 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.expected
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; Example input for update_llc_test_checks (taken from test/Transforms/InstSimplify/add.ll)
+; Example input for update_test_checks (taken from test/Transforms/InstSimplify/add.ll)
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
 define i32 @common_sub_operand(i32 %X, i32 %Y) {
@@ -48,3 +48,12 @@ define <2 x i8> @knownnegation_commute_vec(<2 x i8> %x, <2 x i8> %y) {
   %r = add <2 x i8> %yx, %xy
   ret <2 x i8> %r
 }
+
+define i32 @nameless_value(i32 %X) {
+; CHECK-LABEL: @nameless_value(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = sub i32 42, %X
+  ret i32 %1
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.funcsig.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.funcsig.expected
index d97d1b8f77bcb..c30d105b60aef 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.funcsig.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/basic.ll.funcsig.expected
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; Example input for update_llc_test_checks (taken from test/Transforms/InstSimplify/add.ll)
+; Example input for update_test_checks (taken from test/Transforms/InstSimplify/add.ll)
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
 define i32 @common_sub_operand(i32 %X, i32 %Y) {
@@ -53,3 +53,13 @@ define <2 x i8> @knownnegation_commute_vec(<2 x i8> %x, <2 x i8> %y) {
   %r = add <2 x i8> %yx, %xy
   ret <2 x i8> %r
 }
+
+define i32 @nameless_value(i32 %X) {
+; CHECK-LABEL: define {{[^@]+}}@nameless_value
+; CHECK-SAME: (i32 [[X:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = sub i32 42, %X
+  ret i32 %1
+}
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index a2e9787253c59..d8a646853faae 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -218,7 +218,7 @@ def build_function_body_dictionary(function_re, scrubber, scrubber_args, raw_too
 # spaces, commas, paren, or end of the string
 IR_VALUE_RE = re.compile(r'(\s+)%([\w.-]+?)([,\s\(\)]|\Z)')
 
-NAMELESS_PREFIX = "NAMELESS"
+NAMELESS_PREFIX = "TMP"
 
 # Create a FileCheck variable name based on an IR name.
 def get_value_name(var):

From 2b37c5b560584f05edf5d375d4ca86fe9c5b0173 Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Mon, 1 Jun 2020 10:28:22 +0200
Subject: [PATCH 708/770] [lldb][NFC] Make ClangExpressionSourceCode's wrapping
 logic more consistent

Summary:
ClangExpressionSourceCode has different ways to wrap the user expression based on
which context the expression is executed in. For example, if we're in a C++ member
function we put the expression inside a fake member function of a fake class to make the
evaluation possible. Similar things are done for Objective-C instance/static methods.
There is also a default wrapping where we put the expression in a normal function
just to make it possible to execute it.

The way we currently define which kind of wrapping the expression needs is based on
the `wrapping_language` we keep passing to the ClangExpressionSourceCode
instance. We repurposed the language type enum for that variable to distinguish the
cases above with the following mapping:
* language = C_plus_plus -> member function wrapping
* language = ObjC -> instance/static method wrapping (`is_static` distinguishes between those two).
* language = C -> normal function wrapping
* all other cases like C_plus_plus11, Haskell etc. make our class a no-op that does mostly nothing.

That mapping is currently not documented and just confusing as the `language`
is unrelated to the expression language (and in the ClangUserExpression we even pretend
that it *is* the actual language, but luckily never used it for anything). Some of the code
in ClangExpressionSourceCode is also obviously thinking that this is the actual language of
the expression as it checks for non-existent cases such as `ObjC_plus_plus` which is
not part of the mapping.

This patch makes a new enum to describe the four cases above (with instance and static
Objective-C methods now each being their own case). It also makes that enum a member of
ClangExpressionSourceCode instead of having to pass the same value to the class repeatedly.
This also gets rid of all the switch-case checks for 'unknown' languages such as C_plus_plus11,
as these are no longer necessary.

Reviewers: labath, JDevlieghere

Reviewed By: labath

Subscribers: abidh

Differential Revision: https://reviews.llvm.org/D80793
---
 .../Clang/ClangExpressionSourceCode.cpp       | 137 +++++++-----------
 .../Clang/ClangExpressionSourceCode.h         |  34 +++--
 .../Clang/ClangUserExpression.cpp             |  35 ++---
 .../Clang/ClangUserExpression.h               |   5 +-
 4 files changed, 96 insertions(+), 115 deletions(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp
index 41d2b4adf3caf..a429963277d1a 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp
@@ -174,8 +174,8 @@ static void AddMacros(const DebugMacros *dm, CompileUnit *comp_unit,
 
 lldb_private::ClangExpressionSourceCode::ClangExpressionSourceCode(
     llvm::StringRef filename, llvm::StringRef name, llvm::StringRef prefix,
-    llvm::StringRef body, Wrapping wrap)
-    : ExpressionSourceCode(name, prefix, body, wrap) {
+    llvm::StringRef body, Wrapping wrap, WrapKind wrap_kind)
+    : ExpressionSourceCode(name, prefix, body, wrap), m_wrap_kind(wrap_kind) {
   // Use #line markers to pretend that we have a single-line source file
   // containing only the user expression. This will hide our wrapper code
   // from the user when we render diagnostics with Clang.
@@ -261,10 +261,9 @@ TokenVerifier::TokenVerifier(std::string body) {
   }
 }
 
-static void AddLocalVariableDecls(const lldb::VariableListSP &var_list_sp,
-                                  StreamString &stream,
-                                  const std::string &expr,
-                                  lldb::LanguageType wrapping_language) {
+void ClangExpressionSourceCode::AddLocalVariableDecls(
+    const lldb::VariableListSP &var_list_sp, StreamString &stream,
+    const std::string &expr) const {
   TokenVerifier tokens(expr);
 
   for (size_t i = 0; i < var_list_sp->GetSize(); i++) {
@@ -281,13 +280,12 @@ static void AddLocalVariableDecls(const lldb::VariableListSP &var_list_sp,
     if (!expr.empty() && !tokens.hasToken(var_name.GetStringRef()))
       continue;
 
-    if ((var_name == "self" || var_name == "_cmd") &&
-        (wrapping_language == lldb::eLanguageTypeObjC ||
-         wrapping_language == lldb::eLanguageTypeObjC_plus_plus))
+    const bool is_objc = m_wrap_kind == WrapKind::ObjCInstanceMethod ||
+                         m_wrap_kind == WrapKind::ObjCStaticMethod;
+    if ((var_name == "self" || var_name == "_cmd") && is_objc)
       continue;
 
-    if (var_name == "this" &&
-        wrapping_language == lldb::eLanguageTypeC_plus_plus)
+    if (var_name == "this" && m_wrap_kind == WrapKind::CppMemberFunction)
       continue;
 
     stream.Printf("using $__lldb_local_vars::%s;\n", var_name.AsCString());
@@ -295,9 +293,8 @@ static void AddLocalVariableDecls(const lldb::VariableListSP &var_list_sp,
 }
 
 bool ClangExpressionSourceCode::GetText(
-    std::string &text, lldb::LanguageType wrapping_language, bool static_method,
-    ExecutionContext &exe_ctx, bool add_locals, bool force_add_all_locals,
-    llvm::ArrayRef<std::string> modules) const {
+    std::string &text, ExecutionContext &exe_ctx, bool add_locals,
+    bool force_add_all_locals, llvm::ArrayRef<std::string> modules) const {
   const char *target_specific_defines = "typedef signed char BOOL;\n";
   std::string module_macros;
 
@@ -374,21 +371,11 @@ bool ClangExpressionSourceCode::GetText(
         lldb::VariableListSP var_list_sp =
             frame->GetInScopeVariableList(false, true);
         AddLocalVariableDecls(var_list_sp, lldb_local_var_decls,
-                              force_add_all_locals ? "" : m_body,
-                              wrapping_language);
+                              force_add_all_locals ? "" : m_body);
       }
   }
 
   if (m_wrap) {
-    switch (wrapping_language) {
-    default:
-      return false;
-    case lldb::eLanguageTypeC:
-    case lldb::eLanguageTypeC_plus_plus:
-    case lldb::eLanguageTypeObjC:
-      break;
-    }
-
     // Generate a list of @import statements that will import the specified
     // module into our expression.
     std::string module_imports;
@@ -407,22 +394,12 @@ bool ClangExpressionSourceCode::GetText(
     // First construct a tagged form of the user expression so we can find it
     // later:
     std::string tagged_body;
-    switch (wrapping_language) {
-    default:
-      tagged_body = m_body;
-      break;
-    case lldb::eLanguageTypeC:
-    case lldb::eLanguageTypeC_plus_plus:
-    case lldb::eLanguageTypeObjC:
-      tagged_body.append(m_start_marker);
-      tagged_body.append(m_body);
-      tagged_body.append(m_end_marker);
-      break;
-    }
-    switch (wrapping_language) {
-    default:
-      break;
-    case lldb::eLanguageTypeC:
+    tagged_body.append(m_start_marker);
+    tagged_body.append(m_body);
+    tagged_body.append(m_end_marker);
+
+    switch (m_wrap_kind) {
+    case WrapKind::Function:
       wrap_stream.Printf("%s"
                          "void                           \n"
                          "%s(void *$__lldb_arg)          \n"
@@ -433,7 +410,7 @@ bool ClangExpressionSourceCode::GetText(
                          module_imports.c_str(), m_name.c_str(),
                          lldb_local_var_decls.GetData(), tagged_body.c_str());
       break;
-    case lldb::eLanguageTypeC_plus_plus:
+    case WrapKind::CppMemberFunction:
       wrap_stream.Printf("%s"
                          "void                                   \n"
                          "$__lldb_class::%s(void *$__lldb_arg)   \n"
@@ -444,38 +421,38 @@ bool ClangExpressionSourceCode::GetText(
                          module_imports.c_str(), m_name.c_str(),
                          lldb_local_var_decls.GetData(), tagged_body.c_str());
       break;
-    case lldb::eLanguageTypeObjC:
-      if (static_method) {
-        wrap_stream.Printf(
-            "%s"
-            "@interface $__lldb_objc_class ($__lldb_category)        \n"
-            "+(void)%s:(void *)$__lldb_arg;                          \n"
-            "@end                                                    \n"
-            "@implementation $__lldb_objc_class ($__lldb_category)   \n"
-            "+(void)%s:(void *)$__lldb_arg                           \n"
-            "{                                                       \n"
-            "    %s;                                                 \n"
-            "%s"
-            "}                                                       \n"
-            "@end                                                    \n",
-            module_imports.c_str(), m_name.c_str(), m_name.c_str(),
-            lldb_local_var_decls.GetData(), tagged_body.c_str());
-      } else {
-        wrap_stream.Printf(
-            "%s"
-            "@interface $__lldb_objc_class ($__lldb_category)       \n"
-            "-(void)%s:(void *)$__lldb_arg;                         \n"
-            "@end                                                   \n"
-            "@implementation $__lldb_objc_class ($__lldb_category)  \n"
-            "-(void)%s:(void *)$__lldb_arg                          \n"
-            "{                                                      \n"
-            "    %s;                                                \n"
-            "%s"
-            "}                                                      \n"
-            "@end                                                   \n",
-            module_imports.c_str(), m_name.c_str(), m_name.c_str(),
-            lldb_local_var_decls.GetData(), tagged_body.c_str());
-      }
+    case WrapKind::ObjCInstanceMethod:
+      wrap_stream.Printf(
+          "%s"
+          "@interface $__lldb_objc_class ($__lldb_category)       \n"
+          "-(void)%s:(void *)$__lldb_arg;                         \n"
+          "@end                                                   \n"
+          "@implementation $__lldb_objc_class ($__lldb_category)  \n"
+          "-(void)%s:(void *)$__lldb_arg                          \n"
+          "{                                                      \n"
+          "    %s;                                                \n"
+          "%s"
+          "}                                                      \n"
+          "@end                                                   \n",
+          module_imports.c_str(), m_name.c_str(), m_name.c_str(),
+          lldb_local_var_decls.GetData(), tagged_body.c_str());
+      break;
+
+    case WrapKind::ObjCStaticMethod:
+      wrap_stream.Printf(
+          "%s"
+          "@interface $__lldb_objc_class ($__lldb_category)        \n"
+          "+(void)%s:(void *)$__lldb_arg;                          \n"
+          "@end                                                    \n"
+          "@implementation $__lldb_objc_class ($__lldb_category)   \n"
+          "+(void)%s:(void *)$__lldb_arg                           \n"
+          "{                                                       \n"
+          "    %s;                                                 \n"
+          "%s"
+          "}                                                       \n"
+          "@end                                                    \n",
+          module_imports.c_str(), m_name.c_str(), m_name.c_str(),
+          lldb_local_var_decls.GetData(), tagged_body.c_str());
       break;
     }
 
@@ -488,17 +465,7 @@ bool ClangExpressionSourceCode::GetText(
 }
 
 bool ClangExpressionSourceCode::GetOriginalBodyBounds(
-    std::string transformed_text, lldb::LanguageType wrapping_language,
-    size_t &start_loc, size_t &end_loc) {
-  switch (wrapping_language) {
-  default:
-    return false;
-  case lldb::eLanguageTypeC:
-  case lldb::eLanguageTypeC_plus_plus:
-  case lldb::eLanguageTypeObjC:
-    break;
-  }
-
+    std::string transformed_text, size_t &start_loc, size_t &end_loc) {
   start_loc = transformed_text.find(m_start_marker);
   if (start_loc == std::string::npos)
     return false;
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.h
index bb2e6346a49c5..9a54f0e3ad8d1 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.h
@@ -28,20 +28,30 @@ class ClangExpressionSourceCode : public ExpressionSourceCode {
   static const llvm::StringRef g_prefix_file_name;
   static const char *g_expression_prefix;
 
+  /// The possible ways an expression can be wrapped.
+  enum class WrapKind {
+    /// Wrapped in a non-static member function of a C++ class.
+    CppMemberFunction,
+    /// Wrapped in an instance Objective-C method.
+    ObjCInstanceMethod,
+    /// Wrapped in a static Objective-C method.
+    ObjCStaticMethod,
+    /// Wrapped in a non-member function.
+    /// Note that this is also used for static member functions of a C++ class.
+    Function
+  };
+
   static ClangExpressionSourceCode *CreateWrapped(llvm::StringRef filename,
                                                   llvm::StringRef prefix,
-                                                  llvm::StringRef body) {
+                                                  llvm::StringRef body,
+                                                  WrapKind wrap_kind) {
     return new ClangExpressionSourceCode(filename, "$__lldb_expr", prefix, body,
-                                         Wrap);
+                                         Wrap, wrap_kind);
   }
 
   /// Generates the source code that will evaluate the expression.
   ///
   /// \param text output parameter containing the source code string.
-  /// \param wrapping_language If the expression is supossed to be wrapped,
-  ///        then this is the language that should be used for that.
-  /// \param static_method True iff the expression is valuated inside a static
-  ///        Objective-C method.
   /// \param exe_ctx The execution context in which the expression will be
   ///        evaluated.
   /// \param add_locals True iff local variables should be injected into the
@@ -51,8 +61,7 @@ class ClangExpressionSourceCode : public ExpressionSourceCode {
   /// \param modules A list of (C++) modules that the expression should import.
   ///
   /// \return true iff the source code was successfully generated.
-  bool GetText(std::string &text, lldb::LanguageType wrapping_language,
-               bool static_method, ExecutionContext &exe_ctx, bool add_locals,
+  bool GetText(std::string &text, ExecutionContext &exe_ctx, bool add_locals,
                bool force_add_all_locals,
                llvm::ArrayRef<std::string> modules) const;
 
@@ -60,19 +69,24 @@ class ClangExpressionSourceCode : public ExpressionSourceCode {
   // passed to CreateWrapped. Return true if the bounds could be found.  This
   // will also work on text with FixItHints applied.
   bool GetOriginalBodyBounds(std::string transformed_text,
-                             lldb::LanguageType wrapping_language,
                              size_t &start_loc, size_t &end_loc);
 
 protected:
   ClangExpressionSourceCode(llvm::StringRef filename, llvm::StringRef name,
                             llvm::StringRef prefix, llvm::StringRef body,
-                            Wrapping wrap);
+                            Wrapping wrap, WrapKind wrap_kind);
 
 private:
+  void AddLocalVariableDecls(const lldb::VariableListSP &var_list_sp,
+                             StreamString &stream,
+                             const std::string &expr) const;
+
   /// String marking the start of the user expression.
   std::string m_start_marker;
   /// String marking the end of the user expression.
   std::string m_end_marker;
+  /// How the expression has been wrapped.
+  const WrapKind m_wrap_kind;
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
index f01357f101152..2d6de782ced2f 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
@@ -397,16 +397,20 @@ static void SetupDeclVendor(ExecutionContext &exe_ctx, Target *target,
                                "current compilation unit.");
 }
 
-void ClangUserExpression::UpdateLanguageForExpr() {
-  m_expr_lang = lldb::LanguageType::eLanguageTypeUnknown;
-  if (m_options.GetExecutionPolicy() == eExecutionPolicyTopLevel)
-    return;
+ClangExpressionSourceCode::WrapKind ClangUserExpression::GetWrapKind() const {
+  assert(m_options.GetExecutionPolicy() != eExecutionPolicyTopLevel &&
+         "Top level expressions aren't wrapped.");
+  using Kind = ClangExpressionSourceCode::WrapKind;
   if (m_in_cplusplus_method)
-    m_expr_lang = lldb::eLanguageTypeC_plus_plus;
-  else if (m_in_objectivec_method)
-    m_expr_lang = lldb::eLanguageTypeObjC;
-  else
-    m_expr_lang = lldb::eLanguageTypeC;
+    return Kind::CppMemberFunction;
+  else if (m_in_objectivec_method) {
+    if (m_in_static_method)
+      return Kind::ObjCStaticMethod;
+    return Kind::ObjCInstanceMethod;
+  }
+  // Not in any kind of 'special' function, so just wrap it in a normal C
+  // function.
+  return Kind::Function;
 }
 
 void ClangUserExpression::CreateSourceCode(
@@ -420,10 +424,9 @@ void ClangUserExpression::CreateSourceCode(
     m_transformed_text = m_expr_text;
   } else {
     m_source_code.reset(ClangExpressionSourceCode::CreateWrapped(
-        m_filename, prefix, m_expr_text));
+        m_filename, prefix, m_expr_text, GetWrapKind()));
 
-    if (!m_source_code->GetText(m_transformed_text, m_expr_lang,
-                                m_in_static_method, exe_ctx, !m_ctx_obj,
+    if (!m_source_code->GetText(m_transformed_text, exe_ctx, !m_ctx_obj,
                                 for_completion, modules_to_import)) {
       diagnostic_manager.PutString(eDiagnosticSeverityError,
                                    "couldn't construct expression body");
@@ -435,7 +438,7 @@ void ClangUserExpression::CreateSourceCode(
     std::size_t original_start;
     std::size_t original_end;
     bool found_bounds = m_source_code->GetOriginalBodyBounds(
-        m_transformed_text, m_expr_lang, original_start, original_end);
+        m_transformed_text, original_start, original_end);
     if (found_bounds)
       m_user_expression_start_pos = original_start;
   }
@@ -560,7 +563,6 @@ bool ClangUserExpression::PrepareForParsing(
            llvm::make_range(m_include_directories.begin(),
                             m_include_directories.end()));
 
-  UpdateLanguageForExpr();
   CreateSourceCode(diagnostic_manager, exe_ctx, imported_modules,
                    for_completion);
   return true;
@@ -635,9 +637,8 @@ bool ClangUserExpression::Parse(DiagnosticManager &diagnostic_manager,
         m_fixed_text = diagnostic_manager.GetFixedExpression();
         // Retrieve the original expression in case we don't have a top level
         // expression (which has no surrounding source code).
-        if (m_source_code &&
-            m_source_code->GetOriginalBodyBounds(m_fixed_text, m_expr_lang,
-                                                 fixed_start, fixed_end))
+        if (m_source_code && m_source_code->GetOriginalBodyBounds(
+                                 m_fixed_text, fixed_start, fixed_end))
           m_fixed_text =
               m_fixed_text.substr(fixed_start, fixed_end - fixed_start);
       }
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
index d3073624cfa5f..f734069655ef3 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
@@ -185,7 +185,8 @@ class ClangUserExpression : public LLVMUserExpression {
                         ExecutionContext &exe_ctx,
                         std::vector<std::string> modules_to_import,
                         bool for_completion);
-  void UpdateLanguageForExpr();
+  /// Defines how the current expression should be wrapped.
+  ClangExpressionSourceCode::WrapKind GetWrapKind() const;
   bool SetupPersistentState(DiagnosticManager &diagnostic_manager,
                                    ExecutionContext &exe_ctx);
   bool PrepareForParsing(DiagnosticManager &diagnostic_manager,
@@ -208,8 +209,6 @@ class ClangUserExpression : public LLVMUserExpression {
     lldb::TargetSP m_target_sp;
   };
 
-  /// The language type of the current expression.
-  lldb::LanguageType m_expr_lang = lldb::eLanguageTypeUnknown;
   /// The include directories that should be used when parsing the expression.
   std::vector<std::string> m_include_directories;
 

From e8bcf4ef07ccba4a707b197f1bdee05a19936c47 Mon Sep 17 00:00:00 2001
From: James Henderson <james.henderson@sony.com>
Date: Wed, 20 May 2020 15:53:44 +0100
Subject: [PATCH 709/770] [DebugInfo] Add use of truncating data extractor to
 debug line parsing

This will ensure that nothing can ever start parsing data from a future
sequence and part-read data will be returned as 0 instead.

Reviewed by: aprantl, labath

Differential Revision: https://reviews.llvm.org/D80796
---
 llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp   | 47 ++++++++++---------
 .../X86/Inputs/debug_line_malformed.s         | 30 ++++++++++++
 .../X86/debug_line_invalid.test               | 13 ++++-
 3 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index ce59d731ac586..b2d368c096cc9 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -721,14 +721,17 @@ Error DWARFDebugLine::LineTable::parse(
     ProgramLength = BytesRemaining;
   }
 
+  // Create a DataExtractor which can only see the data up to the end of the
+  // table, to prevent reading past the end.
   const uint64_t EndOffset = DebugLineOffset + ProgramLength;
+  DWARFDataExtractor TableData(DebugLineData, EndOffset);
 
   // See if we should tell the data extractor the address size.
-  if (DebugLineData.getAddressSize() == 0)
-    DebugLineData.setAddressSize(Prologue.getAddressSize());
+  if (TableData.getAddressSize() == 0)
+    TableData.setAddressSize(Prologue.getAddressSize());
   else
     assert(Prologue.getAddressSize() == 0 ||
-           Prologue.getAddressSize() == DebugLineData.getAddressSize());
+           Prologue.getAddressSize() == TableData.getAddressSize());
 
   ParsingState State(this, DebugLineOffset, RecoverableErrorHandler);
 
@@ -738,7 +741,7 @@ Error DWARFDebugLine::LineTable::parse(
       *OS << format("0x%08.08" PRIx64 ": ", *OffsetPtr);
 
     uint64_t OpcodeOffset = *OffsetPtr;
-    uint8_t Opcode = DebugLineData.getU8(OffsetPtr);
+    uint8_t Opcode = TableData.getU8(OffsetPtr);
 
     if (OS)
       *OS << format("%02.02" PRIx8 " ", Opcode);
@@ -746,7 +749,7 @@ Error DWARFDebugLine::LineTable::parse(
     if (Opcode == 0) {
       // Extended Opcodes always start with a zero opcode followed by
       // a uleb128 length so you can skip ones you don't know about
-      uint64_t Len = DebugLineData.getULEB128(OffsetPtr);
+      uint64_t Len = TableData.getULEB128(OffsetPtr);
       uint64_t ExtOffset = *OffsetPtr;
 
       // Tolerate zero-length; assume length is correct and soldier on.
@@ -756,7 +759,7 @@ Error DWARFDebugLine::LineTable::parse(
         continue;
       }
 
-      uint8_t SubOpcode = DebugLineData.getU8(OffsetPtr);
+      uint8_t SubOpcode = TableData.getU8(OffsetPtr);
       if (OS)
         *OS << LNExtendedString(SubOpcode);
       switch (SubOpcode) {
@@ -789,7 +792,7 @@ Error DWARFDebugLine::LineTable::parse(
         // Make sure the extractor knows the address size.  If not, infer it
         // from the size of the operand.
         {
-          uint8_t ExtractorAddressSize = DebugLineData.getAddressSize();
+          uint8_t ExtractorAddressSize = TableData.getAddressSize();
           uint64_t OpcodeAddressSize = Len - 1;
           if (ExtractorAddressSize != OpcodeAddressSize &&
               ExtractorAddressSize != 0)
@@ -812,13 +815,13 @@ Error DWARFDebugLine::LineTable::parse(
                 OpcodeAddressSize, ExtOffset));
             *OffsetPtr += OpcodeAddressSize;
           } else {
-            DebugLineData.setAddressSize(OpcodeAddressSize);
-            State.Row.Address.Address = DebugLineData.getRelocatedAddress(
+            TableData.setAddressSize(OpcodeAddressSize);
+            State.Row.Address.Address = TableData.getRelocatedAddress(
                 OffsetPtr, &State.Row.Address.SectionIndex);
 
             // Restore the address size if the extractor already had it.
             if (ExtractorAddressSize != 0)
-              DebugLineData.setAddressSize(ExtractorAddressSize);
+              TableData.setAddressSize(ExtractorAddressSize);
           }
 
           if (OS)
@@ -849,12 +852,12 @@ Error DWARFDebugLine::LineTable::parse(
         // the file register of the state machine.
         {
           FileNameEntry FileEntry;
-          const char *Name = DebugLineData.getCStr(OffsetPtr);
+          const char *Name = TableData.getCStr(OffsetPtr);
           FileEntry.Name =
               DWARFFormValue::createFromPValue(dwarf::DW_FORM_string, Name);
-          FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr);
-          FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr);
-          FileEntry.Length = DebugLineData.getULEB128(OffsetPtr);
+          FileEntry.DirIdx = TableData.getULEB128(OffsetPtr);
+          FileEntry.ModTime = TableData.getULEB128(OffsetPtr);
+          FileEntry.Length = TableData.getULEB128(OffsetPtr);
           Prologue.FileNames.push_back(FileEntry);
           if (OS)
             *OS << " (" << Name << ", dir=" << FileEntry.DirIdx << ", mod_time="
@@ -864,7 +867,7 @@ Error DWARFDebugLine::LineTable::parse(
         break;
 
       case DW_LNE_set_discriminator:
-        State.Row.Discriminator = DebugLineData.getULEB128(OffsetPtr);
+        State.Row.Discriminator = TableData.getULEB128(OffsetPtr);
         if (OS)
           *OS << " (" << State.Row.Discriminator << ")";
         break;
@@ -913,7 +916,7 @@ Error DWARFDebugLine::LineTable::parse(
         // result to the address register of the state machine.
         {
           uint64_t AddrOffset = State.advanceAddr(
-              DebugLineData.getULEB128(OffsetPtr), Opcode, OpcodeOffset);
+              TableData.getULEB128(OffsetPtr), Opcode, OpcodeOffset);
           if (OS)
             *OS << " (" << AddrOffset << ")";
         }
@@ -922,7 +925,7 @@ Error DWARFDebugLine::LineTable::parse(
       case DW_LNS_advance_line:
         // Takes a single signed LEB128 operand and adds that value to
         // the line register of the state machine.
-        State.Row.Line += DebugLineData.getSLEB128(OffsetPtr);
+        State.Row.Line += TableData.getSLEB128(OffsetPtr);
         if (OS)
           *OS << " (" << State.Row.Line << ")";
         break;
@@ -930,7 +933,7 @@ Error DWARFDebugLine::LineTable::parse(
       case DW_LNS_set_file:
         // Takes a single unsigned LEB128 operand and stores it in the file
         // register of the state machine.
-        State.Row.File = DebugLineData.getULEB128(OffsetPtr);
+        State.Row.File = TableData.getULEB128(OffsetPtr);
         if (OS)
           *OS << " (" << State.Row.File << ")";
         break;
@@ -938,7 +941,7 @@ Error DWARFDebugLine::LineTable::parse(
       case DW_LNS_set_column:
         // Takes a single unsigned LEB128 operand and stores it in the
         // column register of the state machine.
-        State.Row.Column = DebugLineData.getULEB128(OffsetPtr);
+        State.Row.Column = TableData.getULEB128(OffsetPtr);
         if (OS)
           *OS << " (" << State.Row.Column << ")";
         break;
@@ -986,7 +989,7 @@ Error DWARFDebugLine::LineTable::parse(
         // requires the use of DW_LNS_advance_pc. Such assemblers, however,
         // can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
         {
-          uint16_t PCOffset = DebugLineData.getRelocatedValue(2, OffsetPtr);
+          uint16_t PCOffset = TableData.getRelocatedValue(2, OffsetPtr);
           State.Row.Address.Address += PCOffset;
           if (OS)
             *OS
@@ -1009,7 +1012,7 @@ Error DWARFDebugLine::LineTable::parse(
       case DW_LNS_set_isa:
         // Takes a single unsigned LEB128 operand and stores it in the
         // column register of the state machine.
-        State.Row.Isa = DebugLineData.getULEB128(OffsetPtr);
+        State.Row.Isa = TableData.getULEB128(OffsetPtr);
         if (OS)
           *OS << " (" << (uint64_t)State.Row.Isa << ")";
         break;
@@ -1022,7 +1025,7 @@ Error DWARFDebugLine::LineTable::parse(
           assert(Opcode - 1U < Prologue.StandardOpcodeLengths.size());
           uint8_t OpcodeLength = Prologue.StandardOpcodeLengths[Opcode - 1];
           for (uint8_t I = 0; I < OpcodeLength; ++I) {
-            uint64_t Value = DebugLineData.getULEB128(OffsetPtr);
+            uint64_t Value = TableData.getULEB128(OffsetPtr);
             if (OS)
               *OS << format("Skipping ULEB128 value: 0x%16.16" PRIx64 ")\n",
                             Value);
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/Inputs/debug_line_malformed.s b/llvm/test/tools/llvm-dwarfdump/X86/Inputs/debug_line_malformed.s
index d8809d11daab6..9a40b5674575b 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/Inputs/debug_line_malformed.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/Inputs/debug_line_malformed.s
@@ -488,6 +488,36 @@
 .byte   0, 1, 1         # DW_LNE_end_sequence
 .Lunterminated_files_end:
 
+# Opcode extends past the end of the table, as claimed by the unit length field.
+.long   .Lextended_past_end_end - .Lextended_past_end_start # Length of Unit
+.Lextended_past_end_start:
+.short  4               # DWARF version number
+.long   .Lprologue_extended_past_end_end-.Lprologue_extended_past_end_start # Length of Prologue
+.Lprologue_extended_past_end_start:
+.byte   1               # Minimum Instruction Length
+.byte   1               # Maximum Operations per Instruction
+.byte   1               # Default is_stmt
+.byte   -5              # Line Base
+.byte   14              # Line Range
+.byte   13              # Opcode Base
+.byte   0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 # Standard Opcode Lengths
+.asciz "dir1"           # Include table
+.asciz "dir2"
+.byte   0
+.asciz "file1"          # File table
+.byte   0, 0, 0
+.asciz "file2"
+.byte   1, 0, 0
+.byte   0
+.Lprologue_extended_past_end_end:
+.byte   0, 9, 2         # DW_LNE_set_address
+.quad   0xfeedfeed
+.byte   1               # DW_LNS_copy
+.byte   0, 9, 2         # DW_LNE_set_address
+.long   0xf001f000      # Truncated address (should be 8 bytes)
+.byte   0xf0, 0, 1
+.Lextended_past_end_end:
+
 # Trailing good section.
 .long   .Lunit_good_end - .Lunit_good_start # Length of Unit (DWARF-32 format)
 .Lunit_good_start:
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test
index 5386ef9b93a67..5c9f6edf014c8 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test
+++ b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_invalid.test
@@ -33,7 +33,7 @@
 # RUN: FileCheck %s --input-file=%t-malformed-off-first.err --check-prefix=ALL
 
 ## Don't stop looking for the later unit if non-fatal issues are found.
-# RUN: llvm-dwarfdump -debug-line=0x3c9 %t-malformed.o 2> %t-malformed-off-last.err \
+# RUN: llvm-dwarfdump -debug-line=0x419 %t-malformed.o 2> %t-malformed-off-last.err \
 # RUN:   | FileCheck %s --check-prefix=LAST --implicit-check-not='debug_line[{{.*}}]'
 # RUN: FileCheck %s --input-file=%t-malformed-off-last.err --check-prefix=ALL
 
@@ -243,7 +243,14 @@
 # VERBOSE:       DW_LNE_set_address (0xababcdcdefef0909)
 # VERBOSE-NEXT:  DW_LNE_end_sequence
 
-# LAST:          debug_line[0x000003c9]
+## Table with extended opcode that overruns table end.
+# NONFATAL:      debug_line[0x000003c9]
+# NONFATAL-NEXT: Line table prologue
+# VERBOSE:       DW_LNE_set_address (0x00000000feedfeed)
+# VERBOSE-NEXT:  DW_LNS_copy
+# VERBOSE:       DW_LNE_set_address (0x0000000000000000)
+
+# LAST:          debug_line[0x00000419]
 # VERBOSE:       DW_LNE_set_address (0x00000000cafebabe)
 # VERBOSE-NEXT:  DW_LNE_end_sequence
 
@@ -272,4 +279,6 @@
 # ALL-NEXT: warning: include directories table was not null terminated before the end of the prologue
 # ALL-NEXT: warning: parsing line table prologue at 0x00000390 found an invalid directory or file table description at 0x000003bf
 # ALL-NEXT: warning: file names table was not null terminated before the end of the prologue
+# OTHER-NEXT: warning: unexpected line op length at offset 0x00000411 expected 0x09 found 0x01
+# OTHER-NEXT: warning: last sequence in debug line table at offset 0x000003c9 is not terminated
 # ALL-NOT:  warning:

From 54422d21700cfb532c80b22662f7b79d741b21ba Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Mon, 1 Jun 2020 14:41:08 +0200
Subject: [PATCH 710/770] Revert "[lldb] Pass -fPIC flag even when DYLIB_ONLY
 is set"

This reverts commit fd0ab3b3eb88de3fe4792c34b50084595e22d68d.

The fix here is incorrect and the actual fault was an incorrect test Makefile.

To give some more background:

The original test for D80798 compiled three source files into either one
executable or one executable + 2 shared libraries, each being one different
test setup. If both the monolithic executable and the shared libraries
were compiled in the same directory, then Make would overwrite the .o files
of one test setup with the other. As a result, while -fPIC was passed
correctly to the test setup with the shared libraries, the compiler invocations
for the monolithic executable would later overwrite these object files (and
as only the test setup with the shared library used -fPIC, it appeared as if
the shared library object files didn't receive the -fPIC flag).

Thanks to Pavel for figuring this out.
---
 lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index 5e3f478849901..ea0fa748bc361 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -657,14 +657,9 @@ endif
 $(DYLIB_OBJECTS) : CFLAGS += -DCOMPILING_LLDB_TEST_DLL
 
 ifneq "$(OS)" "Windows_NT"
-ifeq "$(DYLIB_ONLY)" ""
-CFLAGS += -fPIC
-CXXFLAGS += -fPIC
-else
 $(DYLIB_OBJECTS) : CFLAGS += -fPIC
 $(DYLIB_OBJECTS) : CXXFLAGS += -fPIC
 endif
-endif
 
 $(DYLIB_FILENAME) : $(DYLIB_OBJECTS)
 ifeq "$(OS)" "Darwin"

From 8d9070e040d0aa916b3b63c319eabdf3e4a5f9df Mon Sep 17 00:00:00 2001
From: James Henderson <james.henderson@sony.com>
Date: Fri, 22 May 2020 13:53:47 +0100
Subject: [PATCH 711/770] [Support] Add more context to DataExtractor getLEB128
 errors

Reviewed by: clayborg, dblaikie, labath

Differential Revision: https://reviews.llvm.org/D80799
---
 llvm/lib/Support/DataExtractor.cpp             |  5 ++++-
 .../X86/dwarfdump-debug-loclists-error-cases.s |  8 ++++----
 .../X86/debug_line_short_prologue.s            |  4 ++--
 llvm/unittests/Support/DataExtractorTest.cpp   | 18 ++++++++++++++++--
 4 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Support/DataExtractor.cpp b/llvm/lib/Support/DataExtractor.cpp
index aaf20ebf84254..99265c4c001bb 100644
--- a/llvm/lib/Support/DataExtractor.cpp
+++ b/llvm/lib/Support/DataExtractor.cpp
@@ -206,7 +206,10 @@ static T getLEB128(StringRef Data, uint64_t *OffsetPtr, Error *Err,
       Decoder(Bytes.data() + *OffsetPtr, &bytes_read, Bytes.end(), &error);
   if (error) {
     if (Err)
-      *Err = createStringError(errc::illegal_byte_sequence, error);
+      *Err = createStringError(errc::illegal_byte_sequence,
+                               "unable to decode LEB128 at offset 0x%8.8" PRIx64
+                               ": %s",
+                               *OffsetPtr, error);
     return T();
   }
   *OffsetPtr += bytes_read;
diff --git a/llvm/test/DebugInfo/X86/dwarfdump-debug-loclists-error-cases.s b/llvm/test/DebugInfo/X86/dwarfdump-debug-loclists-error-cases.s
index 048104986fa41..ec92651a16ec4 100644
--- a/llvm/test/DebugInfo/X86/dwarfdump-debug-loclists-error-cases.s
+++ b/llvm/test/DebugInfo/X86/dwarfdump-debug-loclists-error-cases.s
@@ -1,11 +1,11 @@
 # RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux --defsym CASE1=0 -o %t1.o
-# RUN: not llvm-dwarfdump -debug-loclists %t1.o 2>&1 | FileCheck %s --check-prefix=ULEB
+# RUN: not llvm-dwarfdump -debug-loclists %t1.o 2>&1 | FileCheck %s --check-prefix=ULEB -DOFFSET=0x0000000d
 
 # RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux --defsym CASE2=0 -o %t2.o
-# RUN: not llvm-dwarfdump -debug-loclists %t2.o 2>&1 | FileCheck %s --check-prefix=ULEB
+# RUN: not llvm-dwarfdump -debug-loclists %t2.o 2>&1 | FileCheck %s --check-prefix=ULEB -DOFFSET=0x0000000e
 
 # RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux --defsym CASE3=0 -o %t3.o
-# RUN: not llvm-dwarfdump -debug-loclists %t3.o 2>&1 | FileCheck %s --check-prefix=ULEB
+# RUN: not llvm-dwarfdump -debug-loclists %t3.o 2>&1 | FileCheck %s --check-prefix=ULEB -DOFFSET=0x0000000f
 
 # RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux --defsym CASE4=0 -o %t4.o
 # RUN: not llvm-dwarfdump -debug-loclists %t4.o 2>&1 | FileCheck %s
@@ -20,7 +20,7 @@
 # RUN: not llvm-dwarfdump -debug-loclists %t7.o 2>&1 | FileCheck %s --check-prefix=UNIMPL
 
 # CHECK: error: unexpected end of data
-# ULEB: error: malformed uleb128, extends past end
+# ULEB: error: unable to decode LEB128 at offset [[OFFSET]]: malformed uleb128, extends past end
 # UNIMPL: error: LLE of kind 47 not supported
 
 .section  .debug_loclists,"",@progbits
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug_line_short_prologue.s b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_short_prologue.s
index 0bf4bc90d85d4..20391e7ea4d3c 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/debug_line_short_prologue.s
+++ b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_short_prologue.s
@@ -17,9 +17,9 @@
 # C0-NEXT:  warning: parsing line table prologue at 0x00000000 found an invalid directory or file table description at 0x00000027
 # C0-NEXT:  warning: failed to parse entry content descriptors: unexpected end of data at offset 0x27
 # C1-NEXT:  warning: parsing line table prologue at 0x00000000 found an invalid directory or file table description at 0x0000002a
-# C1-NEXT:  warning: failed to parse entry content descriptors: malformed uleb128, extends past end
+# C1-NEXT:  warning: failed to parse entry content descriptors: unable to decode LEB128 at offset 0x0000002a: malformed uleb128, extends past end
 # C2-NEXT:  warning: parsing line table prologue at 0x00000000 found an invalid directory or file table description at 0x0000002b
-# C2-NEXT:  warning: failed to parse entry content descriptors: malformed uleb128, extends past end
+# C2-NEXT:  warning: failed to parse entry content descriptors: unable to decode LEB128 at offset 0x0000002b: malformed uleb128, extends past end
 # ALL:      include_directories[  0] = "/tmp"
 # OK:       file_names[  0]:
 # OK-NEXT:             name: "foo"
diff --git a/llvm/unittests/Support/DataExtractorTest.cpp b/llvm/unittests/Support/DataExtractorTest.cpp
index 651961586071b..cec08554e8f2c 100644
--- a/llvm/unittests/Support/DataExtractorTest.cpp
+++ b/llvm/unittests/Support/DataExtractorTest.cpp
@@ -137,11 +137,25 @@ TEST(DataExtractorTest, LEB128_error) {
 
   DataExtractor::Cursor C(0);
   EXPECT_EQ(0U, DE.getULEB128(C));
-  EXPECT_THAT_ERROR(C.takeError(), Failed());
+  EXPECT_THAT_ERROR(
+      C.takeError(),
+      FailedWithMessage("unable to decode LEB128 at offset 0x00000000: "
+                        "malformed uleb128, extends past end"));
 
   C = DataExtractor::Cursor(0);
   EXPECT_EQ(0U, DE.getSLEB128(C));
-  EXPECT_THAT_ERROR(C.takeError(), Failed());
+  EXPECT_THAT_ERROR(
+      C.takeError(),
+      FailedWithMessage("unable to decode LEB128 at offset 0x00000000: "
+                        "malformed sleb128, extends past end"));
+
+  // Show non-zero offsets are reported appropriately.
+  C = DataExtractor::Cursor(1);
+  EXPECT_EQ(0U, DE.getULEB128(C));
+  EXPECT_THAT_ERROR(
+      C.takeError(),
+      FailedWithMessage("unable to decode LEB128 at offset 0x00000001: "
+                        "malformed uleb128, extends past end"));
 }
 
 TEST(DataExtractorTest, Cursor_tell) {

From c0303e5391f65dbad3a6f1dbfa5ac9c9a83fa6c0 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 1 Jun 2020 10:14:47 -0400
Subject: [PATCH 712/770] [CodeGen] remove instnamer dependency from test file;
 NFC

This file was originally added without instnamer at:
rL283716 / fe2b9b4fbf860e3dc7da7705f548bc8d7b6ab9c1

But that was reverted and the test file reappeared with instnamer at:
rL285688 / 62f516f5906f967179610a73e4cc1d852b908bbd

I'm not seeing any difference locally from checking nameless values,
so I am trying to remove a layering violation to see whether the
change survives the build bots.
---
 clang/test/CodeGen/x86-inline-asm-v-constraint.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/clang/test/CodeGen/x86-inline-asm-v-constraint.c b/clang/test/CodeGen/x86-inline-asm-v-constraint.c
index 215cccfa443ec..b75a84d7a7bcb 100644
--- a/clang/test/CodeGen/x86-inline-asm-v-constraint.c
+++ b/clang/test/CodeGen/x86-inline-asm-v-constraint.c
@@ -1,19 +1,19 @@
-// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu x86-64 -o - |opt -instnamer -S |FileCheck %s --check-prefix SSE
-// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu skylake -D AVX -o -|opt -instnamer -S  | FileCheck %s --check-prefixes AVX,SSE
-// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu skylake-avx512 -D AVX512 -D AVX -o -|opt -instnamer -S  | FileCheck %s --check-prefixes AVX512,AVX,SSE
-// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu knl -D AVX -D AVX512 -o - |opt -instnamer -S  | FileCheck %s --check-prefixes AVX512,AVX,SSE
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu x86-64 -o - |FileCheck %s --check-prefix SSE
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu skylake -D AVX -o - | FileCheck %s --check-prefixes AVX,SSE
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu skylake-avx512 -D AVX512 -D AVX -o - | FileCheck %s --check-prefixes AVX512,AVX,SSE
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -target-cpu knl -D AVX -D AVX512 -o - | FileCheck %s --check-prefixes AVX512,AVX,SSE
 
 typedef float __m128 __attribute__ ((vector_size (16)));
 typedef float __m256 __attribute__ ((vector_size (32)));
 typedef float __m512 __attribute__ ((vector_size (64)));
 
-// SSE: call <4 x float> asm "vmovhlps $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %tmp, <4 x float> %tmp1)
+// SSE: call <4 x float> asm "vmovhlps $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %0, <4 x float> %1)
 __m128 testXMM(__m128 _xmm0, long _l) {
   __asm__("vmovhlps %1, %2, %0" :"=v"(_xmm0) : "v"(_l), "v"(_xmm0));
   return _xmm0;
 }
 
-// AVX: call <8 x float> asm "vmovsldup $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %tmp)
+// AVX: call <8 x float> asm "vmovsldup $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %0)
 __m256 testYMM(__m256 _ymm0) {
 #ifdef AVX
   __asm__("vmovsldup %1, %0" :"=v"(_ymm0) : "v"(_ymm0));
@@ -21,7 +21,7 @@ __m256 testYMM(__m256 _ymm0) {
   return _ymm0;
 }
 
-// AVX512: call <16 x float> asm "vpternlogd $$0, $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %tmp, <16 x float> %tmp1)
+// AVX512: call <16 x float> asm "vpternlogd $$0, $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %0, <16 x float> %1)
 __m512 testZMM(__m512 _zmm0, __m512 _zmm1) {
 #ifdef AVX512
   __asm__("vpternlogd $0, %1, %2, %0" :"=v"(_zmm0) : "v"(_zmm1), "v"(_zmm0));

From 8a84158e5b966236c0e090cd5c7f44dab69e59a8 Mon Sep 17 00:00:00 2001
From: Ehud Katz <ehudkatz@gmail.com>
Date: Mon, 1 Jun 2020 17:42:09 +0300
Subject: [PATCH 713/770] [StructurizeCFG] Fix an incorrect comment, NFC.

---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index ef59f2412f68d..c20e57b02c1a5 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -361,7 +361,7 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
   return false;
 }
 
-/// Build up the general order of nodes, by performing a topology sort of the
+/// Build up the general order of nodes, by performing a topological sort of the
 /// parent region's nodes, while ensuring that there is no outer cycle node
 /// between any two inner cycle nodes.
 void StructurizeCFG::orderNodes() {

From 522934da1f0c78c1de1a80d4ba14204a11f5afa8 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 1 Jun 2020 10:39:38 -0400
Subject: [PATCH 714/770] Support GCC [[gnu::attributes]] in C2x mode

GCC 10.1 introduced support for the [[]] style spelling of attributes in C
mode. Similar to how GCC supports __attribute__((foo)) as [[gnu::foo]] in
C++ mode, it now supports the same spelling in C mode as well. This patch
makes a change in Clang so that when you use the GCC attribute spelling,
the attribute is automatically available in all three spellings by default.
However, like Clang, GCC has some attributes it only recognizes in C++ mode
(specifically, abi_tag and init_priority), which this patch also honors.
---
 clang/include/clang/Basic/Attr.td         | 13 ++++++-------
 clang/test/Sema/attr-c2x.c                | 12 ++++++++++++
 clang/utils/TableGen/ClangAttrEmitter.cpp |  7 +++----
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index a691e2332ff7a..bc4a380545afe 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -257,7 +257,6 @@ class VariadicEnumArgument<string name, string type, list<string> values,
 class Spelling<string name, string variety> {
   string Name = name;
   string Variety = variety;
-  bit KnownToGCC;
 }
 
 class GNU<string name> : Spelling<name, "GNU">;
@@ -277,11 +276,11 @@ class Pragma<string namespace, string name> : Spelling<name, "Pragma"> {
   string Namespace = namespace;
 }
 
-// The GCC spelling implies GNU<name> and CXX11<"gnu", name> and also sets
-// KnownToGCC to 1. This spelling should be used for any GCC-compatible
+// The GCC spelling implies GNU<name>, CXX11<"gnu", name>, and optionally,
+// C2x<"gnu", name>. This spelling should be used for any GCC-compatible
 // attributes.
-class GCC<string name> : Spelling<name, "GCC"> {
-  let KnownToGCC = 1;
+class GCC<string name, bit allowInC = 1> : Spelling<name, "GCC"> {
+  bit AllowInC = allowInC;
 }
 
 // The Clang spelling implies GNU<name>, CXX11<"clang", name>, and optionally,
@@ -605,7 +604,7 @@ class IgnoredAttr : Attr {
 //
 
 def AbiTag : Attr {
-  let Spellings = [GCC<"abi_tag">];
+  let Spellings = [GCC<"abi_tag", /*AllowInC*/0>];
   let Args = [VariadicStringArgument<"Tags">];
   let Subjects = SubjectList<[Struct, Var, Function, Namespace], ErrorDiag>;
   let MeaningfulToClassTemplateDefinition = 1;
@@ -2113,7 +2112,7 @@ def WorkGroupSizeHint :  InheritableAttr {
 }
 
 def InitPriority : InheritableAttr {
-  let Spellings = [GCC<"init_priority">];
+  let Spellings = [GCC<"init_priority", /*AllowInC*/0>];
   let Args = [UnsignedArgument<"Priority">];
   let Subjects = SubjectList<[Var], ErrorDiag>;
   let Documentation = [Undocumented];
diff --git a/clang/test/Sema/attr-c2x.c b/clang/test/Sema/attr-c2x.c
index 561b88edfc84f..fae4c5d0fa907 100644
--- a/clang/test/Sema/attr-c2x.c
+++ b/clang/test/Sema/attr-c2x.c
@@ -27,3 +27,15 @@ void bar(void) {
 
 [[nodiscard]] int without_underscores(void);
 [[__nodiscard__]] int underscores(void);
+
+// Match GCC's behavior for C attributes as well.
+[[gnu::constructor]] void ctor_func(void);
+[[gnu::destructor]] void dtor_func(void);
+[[gnu::hot]] void hot_func(void);
+[[__gnu__::hot]] void hot_func2(void);
+[[gnu::__hot__]] void hot_func3(void);
+[[__gnu__::__hot__]] void hot_func4(void);
+
+// Note how not all GCC attributes are supported in C.
+[[gnu::abi_tag("")]] void abi_func(void); // expected-warning {{unknown attribute 'abi_tag' ignored}}
+struct S s [[gnu::init_priority(1)]]; // expected-warning {{unknown attribute 'init_priority' ignored}}
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 2b17195997858..1b9fd2d29bf90 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -48,7 +48,7 @@ namespace {
 
 class FlattenedSpelling {
   std::string V, N, NS;
-  bool K;
+  bool K = false;
 
 public:
   FlattenedSpelling(const std::string &Variety, const std::string &Name,
@@ -61,8 +61,6 @@ class FlattenedSpelling {
            "Given a GCC spelling, which means this hasn't been flattened!");
     if (V == "CXX11" || V == "C2x" || V == "Pragma")
       NS = std::string(Spelling.getValueAsString("Namespace"));
-    bool Unset;
-    K = Spelling.getValueAsBitOrUnset("KnownToGCC", Unset);
   }
 
   const std::string &variety() const { return V; }
@@ -82,9 +80,10 @@ GetFlattenedSpellings(const Record &Attr) {
     StringRef Variety = Spelling->getValueAsString("Variety");
     StringRef Name = Spelling->getValueAsString("Name");
     if (Variety == "GCC") {
-      // Gin up two new spelling objects to add into the list.
       Ret.emplace_back("GNU", std::string(Name), "", true);
       Ret.emplace_back("CXX11", std::string(Name), "gnu", true);
+      if (Spelling->getValueAsBit("AllowInC"))
+        Ret.emplace_back("C2x", std::string(Name), "gnu", true);
     } else if (Variety == "Clang") {
       Ret.emplace_back("GNU", std::string(Name), "", false);
       Ret.emplace_back("CXX11", std::string(Name), "clang", false);

From 5e111c5df8efde39c62d5e6906f590311782e30b Mon Sep 17 00:00:00 2001
From: AndreyChurbanov <andrey.churbanov@intel.com>
Date: Mon, 1 Jun 2020 17:51:02 +0300
Subject: [PATCH 715/770] [openmp] Fixed taskloop recursive splitting so that
 taskloop tasks have the same parent task.

Differential Revision: https://reviews.llvm.org/D80577
---
 openmp/runtime/src/kmp_tasking.cpp            | 12 ++++----
 .../test/tasking/omp_taskloop_taskwait.c      | 30 +++++++++++++++++++
 2 files changed, 37 insertions(+), 5 deletions(-)
 create mode 100644 openmp/runtime/test/tasking/omp_taskloop_taskwait.c

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 6e584731a85fe..a8da6146064cf 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3898,14 +3898,13 @@ void __kmp_fulfill_event(kmp_event_t *event) {
 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
-  kmp_taskdata_t *taskdata_src;
-  kmp_taskdata_t *parent_task = thread->th.th_current_task;
+  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
+  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
   size_t shareds_offset;
   size_t task_size;
 
   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                 task_src));
-  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                    TASK_FULL); // it should not be proxy task
   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
@@ -4280,7 +4279,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                           void *codeptr_ra,
 #endif
                           void *task_dup) {
-#if KMP_DEBUG
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   KMP_DEBUG_ASSERT(task != NULL);
   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
@@ -4288,7 +4286,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                 task_dup));
-#endif
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
   kmp_uint64 lower = *lb;
   kmp_info_t *thread = __kmp_threads[gtid];
@@ -4332,9 +4329,14 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
   *ub = ub0; // adjust upper bound for the 1st half
 
   // create auxiliary task for 2nd half of the loop
+  // make sure new task has same parent task as the pattern task
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+  thread->th.th_current_task = taskdata->td_parent;
   kmp_task_t *new_task =
       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+  // restore current task
+  thread->th.th_current_task = current_task;
   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
   p->task = next_task;
   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
diff --git a/openmp/runtime/test/tasking/omp_taskloop_taskwait.c b/openmp/runtime/test/tasking/omp_taskloop_taskwait.c
new file mode 100644
index 0000000000000..6cb226461c8eb
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_taskloop_taskwait.c
@@ -0,0 +1,30 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+int main()
+{
+  enum {ITERS = 500};
+  enum {SIZE = 5};
+  int err = 0;
+  #pragma omp parallel num_threads(2) reduction(+:err)
+  {
+    int r = 0;
+    int i;
+    #pragma omp taskloop grainsize(SIZE) shared(r) nogroup
+    for(i=0; i<ITERS; i++) {
+      #pragma omp atomic
+        ++r;
+    }
+    #pragma omp taskwait
+    printf("%d\n", r);
+    if (r != ITERS)
+      err++;
+  } // end of parallel
+  if (err != 0) {
+    printf("failed, err = %d\n", err);
+    return 1;
+  } else {
+    printf("passed\n");
+    return 0;
+  }
+}

From dd54432a0f5a6f042fa4d2db3094c6f02e5ad275 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 1 Jun 2020 10:56:27 -0400
Subject: [PATCH 716/770] [InstNamer] use 'i' for Instructions, not 'tmp'

As discussed in https://bugs.llvm.org/show_bug.cgi?id=45951 and
D80584, the name 'tmp' is almost always a bad choice, but we have
a legacy of regression tests with that name because it was baked
into utils/update_test_checks.py.

This change makes -instnamer more consistent (already using "arg"
and "bb", the common LLVM shorthand). And it avoids the conflict
in telling users of the FileCheck script to run "-instnamer" to
create a better regression test and having that cause a warn/fail
in update_test_checks.py.
---
 llvm/lib/Transforms/Utils/InstructionNamer.cpp | 2 +-
 llvm/test/Transforms/InstNamer/basic.ll        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/llvm/lib/Transforms/Utils/InstructionNamer.cpp
index aac0b55801c46..8e339fe46d457 100644
--- a/llvm/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -42,7 +42,7 @@ namespace {
 
         for (Instruction &I : BB)
           if (!I.hasName() && !I.getType()->isVoidTy())
-            I.setName("tmp");
+            I.setName("i");
       }
       return true;
     }
diff --git a/llvm/test/Transforms/InstNamer/basic.ll b/llvm/test/Transforms/InstNamer/basic.ll
index 4c819246b90b2..5fbcfca96a4f4 100644
--- a/llvm/test/Transforms/InstNamer/basic.ll
+++ b/llvm/test/Transforms/InstNamer/basic.ll
@@ -6,10 +6,10 @@ target triple = "x86_64-unknown-linux-gnu"
 define i32 @f_0(i32) {
 ; CHECK-LABEL: @f_0(
 ; CHECK: bb:
-; CHECK-NEXT:   %tmp = add i32 %arg, 2
+; CHECK-NEXT:   %i = add i32 %arg, 2
 ; CHECK-NEXT:   br label %bb1
 ; CHECK: bb1:
-; CHECK-NEXT:   ret i32 %tmp
+; CHECK-NEXT:   ret i32 %i
 
   %2 = add i32 %0, 2
   br label %3

From 26c78e3095f42c066804cf517339002a1028ed61 Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Mon, 1 Jun 2020 08:12:53 -0700
Subject: [PATCH 717/770] [WebAssembly] Update test expectations

simd-2.C now compiles thanks to:
  https://github.com/WebAssembly/wasi-libc/pull/183

Differential Revision: https://reviews.llvm.org/D80930
---
 llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 701b347bcbd76..c9f7574b9a41b 100644
--- a/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -86,7 +86,6 @@ lifetime2.C  # violates C++ DR1696
 
 # WASI doesn't have stdjmp.h yet
 pr56982.c
-simd-2.C
 
 # WASI doesn't have pthread.h yet
 thread_local3.C

From 1caedd0c550646557d8d2feb97b3cbba8c48b2d7 Mon Sep 17 00:00:00 2001
From: Siva Chandra Reddy <sivachandra@google.com>
Date: Mon, 18 May 2020 15:06:01 -0700
Subject: [PATCH 718/770] [libc] Add implementations of ceil[f], floor[f] and
 trunc[f] from math.h.

Reviewers: abrachet

Differential Revision: https://reviews.llvm.org/D80612
---
 libc/lib/CMakeLists.txt              |   6 ++
 libc/src/math/CMakeLists.txt         |  60 +++++++++++++
 libc/src/math/ceil.cpp               |  16 ++++
 libc/src/math/ceil.h                 |  18 ++++
 libc/src/math/ceilf.cpp              |  16 ++++
 libc/src/math/ceilf.h                |  18 ++++
 libc/src/math/floor.cpp              |  16 ++++
 libc/src/math/floor.h                |  18 ++++
 libc/src/math/floorf.cpp             |  16 ++++
 libc/src/math/floorf.h               |  18 ++++
 libc/src/math/trunc.cpp              |  16 ++++
 libc/src/math/trunc.h                |  18 ++++
 libc/src/math/truncf.cpp             |  16 ++++
 libc/src/math/truncf.h               |  18 ++++
 libc/test/src/math/CMakeLists.txt    |  78 +++++++++++++++++
 libc/test/src/math/ceil_test.cpp     |  75 ++++++++++++++++
 libc/test/src/math/ceilf_test.cpp    |  75 ++++++++++++++++
 libc/test/src/math/floor_test.cpp    |  75 ++++++++++++++++
 libc/test/src/math/floorf_test.cpp   |  76 ++++++++++++++++
 libc/test/src/math/trunc_test.cpp    |  75 ++++++++++++++++
 libc/test/src/math/truncf_test.cpp   |  77 +++++++++++++++++
 libc/utils/FPUtil/FloatOperations.h  | 125 +++++++++++++++++++++++++--
 libc/utils/FPUtil/FloatProperties.h  |   6 ++
 libc/utils/MPFRWrapper/MPFRUtils.cpp |  12 +++
 libc/utils/MPFRWrapper/MPFRUtils.h   |  12 ++-
 25 files changed, 948 insertions(+), 8 deletions(-)
 create mode 100644 libc/src/math/ceil.cpp
 create mode 100644 libc/src/math/ceil.h
 create mode 100644 libc/src/math/ceilf.cpp
 create mode 100644 libc/src/math/ceilf.h
 create mode 100644 libc/src/math/floor.cpp
 create mode 100644 libc/src/math/floor.h
 create mode 100644 libc/src/math/floorf.cpp
 create mode 100644 libc/src/math/floorf.h
 create mode 100644 libc/src/math/trunc.cpp
 create mode 100644 libc/src/math/trunc.h
 create mode 100644 libc/src/math/truncf.cpp
 create mode 100644 libc/src/math/truncf.h
 create mode 100644 libc/test/src/math/ceil_test.cpp
 create mode 100644 libc/test/src/math/ceilf_test.cpp
 create mode 100644 libc/test/src/math/floor_test.cpp
 create mode 100644 libc/test/src/math/floorf_test.cpp
 create mode 100644 libc/test/src/math/trunc_test.cpp
 create mode 100644 libc/test/src/math/truncf_test.cpp

diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 51f587a2a70ac..e0921d93d4107 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -50,14 +50,20 @@ add_entrypoint_library(
   llvmlibm
   DEPENDS
     # math.h entrypoints
+    libc.src.math.ceil
+    libc.src.math.ceilf
     libc.src.math.cosf
     libc.src.math.fabs
     libc.src.math.fabsf
+    libc.src.math.floor
+    libc.src.math.floorf
     libc.src.math.expf
     libc.src.math.exp2f
     libc.src.math.round
     libc.src.math.sincosf
     libc.src.math.sinf
+    libc.src.math.trunc
+    libc.src.math.truncf
 )
 
 add_redirector_library(
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 41a20b31996f6..a3b1b4f6ec018 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -91,6 +91,66 @@ add_entrypoint_object(
     libc.utils.FPUtil.fputil
 )
 
+add_entrypoint_object(
+  trunc
+  SRCS
+    trunc.cpp
+  HDRS
+    trunc.h
+  DEPENDS
+    libc.utils.FPUtil.fputil
+)
+
+add_entrypoint_object(
+  truncf
+  SRCS
+    truncf.cpp
+  HDRS
+    truncf.h
+  DEPENDS
+    libc.utils.FPUtil.fputil
+)
+
+add_entrypoint_object(
+  ceil
+  SRCS
+    ceil.cpp
+  HDRS
+    ceil.h
+  DEPENDS
+    libc.utils.FPUtil.fputil
+)
+
+add_entrypoint_object(
+  ceilf
+  SRCS
+    ceilf.cpp
+  HDRS
+    ceilf.h
+  DEPENDS
+    libc.utils.FPUtil.fputil
+)
+
+add_entrypoint_object(
+  floor
+  SRCS
+    floor.cpp
+  HDRS
+    floor.h
+  DEPENDS
+    libc.utils.FPUtil.fputil
+)
+
+add_entrypoint_object(
+  floorf
+  SRCS
+    floorf.cpp
+  HDRS
+    floorf.h
+  DEPENDS
+    libc.utils.FPUtil.fputil
+)
+
 add_object_library(
   exp_utils
   HDRS
diff --git a/libc/src/math/ceil.cpp b/libc/src/math/ceil.cpp
new file mode 100644
index 0000000000000..c9af520d1f1f9
--- /dev/null
+++ b/libc/src/math/ceil.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of ceil function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "utils/FPUtil/FloatOperations.h"
+
+namespace __llvm_libc {
+
+double LLVM_LIBC_ENTRYPOINT(ceil)(double x) { return fputil::ceil(x); }
+
+} // namespace __llvm_libc
diff --git a/libc/src/math/ceil.h b/libc/src/math/ceil.h
new file mode 100644
index 0000000000000..98188de20e405
--- /dev/null
+++ b/libc/src/math/ceil.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for ceil --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_CEIL_H
+#define LLVM_LIBC_SRC_MATH_CEIL_H
+
+namespace __llvm_libc {
+
+double ceil(double x);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_CEIL_H
diff --git a/libc/src/math/ceilf.cpp b/libc/src/math/ceilf.cpp
new file mode 100644
index 0000000000000..2d2fb90c47fdc
--- /dev/null
+++ b/libc/src/math/ceilf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of ceilf function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "utils/FPUtil/FloatOperations.h"
+
+namespace __llvm_libc {
+
+float LLVM_LIBC_ENTRYPOINT(ceilf)(float x) { return fputil::ceil(x); }
+
+} // namespace __llvm_libc
diff --git a/libc/src/math/ceilf.h b/libc/src/math/ceilf.h
new file mode 100644
index 0000000000000..e8e64565052a6
--- /dev/null
+++ b/libc/src/math/ceilf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for ceilf -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_CEILF_H
+#define LLVM_LIBC_SRC_MATH_CEILF_H
+
+namespace __llvm_libc {
+
+float ceilf(float x);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_CEILF_H
diff --git a/libc/src/math/floor.cpp b/libc/src/math/floor.cpp
new file mode 100644
index 0000000000000..b2d5f872b453f
--- /dev/null
+++ b/libc/src/math/floor.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of floor function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "utils/FPUtil/FloatOperations.h"
+
+namespace __llvm_libc {
+
+double LLVM_LIBC_ENTRYPOINT(floor)(double x) { return fputil::floor(x); }
+
+} // namespace __llvm_libc
diff --git a/libc/src/math/floor.h b/libc/src/math/floor.h
new file mode 100644
index 0000000000000..88a76ebf7d805
--- /dev/null
+++ b/libc/src/math/floor.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for floor -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FLOOR_H
+#define LLVM_LIBC_SRC_MATH_FLOOR_H
+
+namespace __llvm_libc {
+
+double floor(double x);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_FLOOR_H
diff --git a/libc/src/math/floorf.cpp b/libc/src/math/floorf.cpp
new file mode 100644
index 0000000000000..397602eacf986
--- /dev/null
+++ b/libc/src/math/floorf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of floorf function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "utils/FPUtil/FloatOperations.h"
+
+namespace __llvm_libc {
+
+float LLVM_LIBC_ENTRYPOINT(floorf)(float x) { return fputil::floor(x); }
+
+} // namespace __llvm_libc
diff --git a/libc/src/math/floorf.h b/libc/src/math/floorf.h
new file mode 100644
index 0000000000000..029df3ac5c9fe
--- /dev/null
+++ b/libc/src/math/floorf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for floorf ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FLOORF_H
+#define LLVM_LIBC_SRC_MATH_FLOORF_H
+
+namespace __llvm_libc {
+
+float floorf(float x);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_FLOORF_H
diff --git a/libc/src/math/trunc.cpp b/libc/src/math/trunc.cpp
new file mode 100644
index 0000000000000..7e42fe47d2e76
--- /dev/null
+++ b/libc/src/math/trunc.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of trunc function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "utils/FPUtil/FloatOperations.h"
+
+namespace __llvm_libc {
+
+double LLVM_LIBC_ENTRYPOINT(trunc)(double x) { return fputil::trunc(x); }
+
+} // namespace __llvm_libc
diff --git a/libc/src/math/trunc.h b/libc/src/math/trunc.h
new file mode 100644
index 0000000000000..f7fed01f30d25
--- /dev/null
+++ b/libc/src/math/trunc.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for trunc -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TRUNC_H
+#define LLVM_LIBC_SRC_MATH_TRUNC_H
+
+namespace __llvm_libc {
+
+double trunc(double x);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_TRUNC_H
diff --git a/libc/src/math/truncf.cpp b/libc/src/math/truncf.cpp
new file mode 100644
index 0000000000000..c567865f72271
--- /dev/null
+++ b/libc/src/math/truncf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of truncf function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "utils/FPUtil/FloatOperations.h"
+
+namespace __llvm_libc {
+
+float LLVM_LIBC_ENTRYPOINT(truncf)(float x) { return fputil::trunc(x); }
+
+} // namespace __llvm_libc
diff --git a/libc/src/math/truncf.h b/libc/src/math/truncf.h
new file mode 100644
index 0000000000000..b4f1cd7ea72f6
--- /dev/null
+++ b/libc/src/math/truncf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for truncf ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TRUNCF_H
+#define LLVM_LIBC_SRC_MATH_TRUNCF_H
+
+namespace __llvm_libc {
+
+float truncf(float x);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_TRUNCF_H
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index ed36faaee4d87..3568b20a537c2 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -97,6 +97,84 @@ add_math_unittest(
     libc.utils.FPUtil.fputil
 )
 
+add_math_unittest(
+  trunc_test
+  NEED_MPFR
+  SUITE
+    libc_math_unittests
+  SRCS
+    trunc_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.src.math.trunc
+    libc.utils.FPUtil.fputil
+)
+
+add_math_unittest(
+  truncf_test
+  NEED_MPFR
+  SUITE
+    libc_math_unittests
+  SRCS
+    truncf_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.src.math.truncf
+    libc.utils.FPUtil.fputil
+)
+
+add_math_unittest(
+  ceil_test
+  NEED_MPFR
+  SUITE
+    libc_math_unittests
+  SRCS
+    ceil_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.src.math.ceil
+    libc.utils.FPUtil.fputil
+)
+
+add_math_unittest(
+  ceilf_test
+  NEED_MPFR
+  SUITE
+    libc_math_unittests
+  SRCS
+    ceilf_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.src.math.ceilf
+    libc.utils.FPUtil.fputil
+)
+
+add_math_unittest(
+  floor_test
+  NEED_MPFR
+  SUITE
+    libc_math_unittests
+  SRCS
+    floor_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.src.math.floor
+    libc.utils.FPUtil.fputil
+)
+
+add_math_unittest(
+  floorf_test
+  NEED_MPFR
+  SUITE
+    libc_math_unittests
+  SRCS
+    floorf_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.src.math.floorf
+    libc.utils.FPUtil.fputil
+)
+
 add_math_unittest(
   expf_test
   NEED_MPFR
diff --git a/libc/test/src/math/ceil_test.cpp b/libc/test/src/math/ceil_test.cpp
new file mode 100644
index 0000000000000..fdb45a81db975
--- /dev/null
+++ b/libc/test/src/math/ceil_test.cpp
@@ -0,0 +1,75 @@
+//===-- Unittests for ceil ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/math.h"
+#include "src/math/ceil.h"
+#include "utils/FPUtil/BitPatterns.h"
+#include "utils/FPUtil/FloatOperations.h"
+#include "utils/FPUtil/FloatProperties.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::fputil::valueAsBits;
+using __llvm_libc::fputil::valueFromBits;
+
+using BitPatterns = __llvm_libc::fputil::BitPatterns<double>;
+using Properties = __llvm_libc::fputil::FloatProperties<double>;
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+// Zero tolerance; As in, exact match with MPFR result.
+static constexpr mpfr::Tolerance tolerance{mpfr::Tolerance::doublePrecision, 0,
+                                           0};
+
+TEST(ceilTest, SpecialNumbers) {
+  EXPECT_EQ(
+      BitPatterns::aQuietNaN,
+      valueAsBits(__llvm_libc::ceil(valueFromBits(BitPatterns::aQuietNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeQuietNaN,
+            valueAsBits(__llvm_libc::ceil(
+                valueFromBits(BitPatterns::aNegativeQuietNaN))));
+
+  EXPECT_EQ(BitPatterns::aSignallingNaN,
+            valueAsBits(
+                __llvm_libc::ceil(valueFromBits(BitPatterns::aSignallingNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeSignallingNaN,
+            valueAsBits(__llvm_libc::ceil(
+                valueFromBits(BitPatterns::aNegativeSignallingNaN))));
+
+  EXPECT_EQ(BitPatterns::inf,
+            valueAsBits(__llvm_libc::ceil(valueFromBits(BitPatterns::inf))));
+  EXPECT_EQ(BitPatterns::negInf,
+            valueAsBits(__llvm_libc::ceil(valueFromBits(BitPatterns::negInf))));
+
+  EXPECT_EQ(BitPatterns::zero,
+            valueAsBits(__llvm_libc::ceil(valueFromBits(BitPatterns::zero))));
+  EXPECT_EQ(BitPatterns::negZero, valueAsBits(__llvm_libc::ceil(
+                                      valueFromBits(BitPatterns::negZero))));
+}
+
+TEST(ceilTest, RoundedNumbers) {
+  EXPECT_EQ(valueAsBits(1.0), valueAsBits(__llvm_libc::ceil(1.0)));
+  EXPECT_EQ(valueAsBits(-1.0), valueAsBits(__llvm_libc::ceil(-1.0)));
+  EXPECT_EQ(valueAsBits(10.0), valueAsBits(__llvm_libc::ceil(10.0)));
+  EXPECT_EQ(valueAsBits(-10.0), valueAsBits(__llvm_libc::ceil(-10.0)));
+  EXPECT_EQ(valueAsBits(12345.0), valueAsBits(__llvm_libc::ceil(12345.0)));
+  EXPECT_EQ(valueAsBits(-12345.0), valueAsBits(__llvm_libc::ceil(-12345.0)));
+}
+
+TEST(ceilTest, InDoubleRange) {
+  using BitsType = Properties::BitsType;
+  constexpr BitsType count = 1000000;
+  constexpr BitsType step = UINT64_MAX / count;
+  for (BitsType i = 0, v = 0; i <= count; ++i, v += step) {
+    double x = valueFromBits(v);
+    if (isnan(x) || isinf(x))
+      continue;
+    ASSERT_MPFR_MATCH(mpfr::Operation::Ceil, x, __llvm_libc::ceil(x),
+                      tolerance);
+  }
+}
diff --git a/libc/test/src/math/ceilf_test.cpp b/libc/test/src/math/ceilf_test.cpp
new file mode 100644
index 0000000000000..7bfb2ac15b6a1
--- /dev/null
+++ b/libc/test/src/math/ceilf_test.cpp
@@ -0,0 +1,75 @@
+//===-- Unittests for ceilf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/math.h"
+#include "src/math/ceilf.h"
+#include "utils/FPUtil/BitPatterns.h"
+#include "utils/FPUtil/FloatOperations.h"
+#include "utils/FPUtil/FloatProperties.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::fputil::valueAsBits;
+using __llvm_libc::fputil::valueFromBits;
+
+using BitPatterns = __llvm_libc::fputil::BitPatterns<float>;
+using Properties = __llvm_libc::fputil::FloatProperties<float>;
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+// Zero tolerance at float precision; results must match MPFR exactly.
+static constexpr mpfr::Tolerance tolerance{mpfr::Tolerance::floatPrecision, 0,
+                                           0};
+
+TEST(CeilfTest, SpecialNumbers) {
+  EXPECT_EQ(
+      BitPatterns::aQuietNaN,
+      valueAsBits(__llvm_libc::ceilf(valueFromBits(BitPatterns::aQuietNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeQuietNaN,
+            valueAsBits(__llvm_libc::ceilf(
+                valueFromBits(BitPatterns::aNegativeQuietNaN))));
+
+  EXPECT_EQ(BitPatterns::aSignallingNaN,
+            valueAsBits(__llvm_libc::ceilf(
+                valueFromBits(BitPatterns::aSignallingNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeSignallingNaN,
+            valueAsBits(__llvm_libc::ceilf(
+                valueFromBits(BitPatterns::aNegativeSignallingNaN))));
+
+  EXPECT_EQ(BitPatterns::inf,
+            valueAsBits(__llvm_libc::ceilf(valueFromBits(BitPatterns::inf))));
+  EXPECT_EQ(BitPatterns::negInf, valueAsBits(__llvm_libc::ceilf(
+                                     valueFromBits(BitPatterns::negInf))));
+
+  EXPECT_EQ(BitPatterns::zero,
+            valueAsBits(__llvm_libc::ceilf(valueFromBits(BitPatterns::zero))));
+  EXPECT_EQ(BitPatterns::negZero, valueAsBits(__llvm_libc::ceilf(
+                                      valueFromBits(BitPatterns::negZero))));
+}
+
+TEST(ceilfTest, RoundedNumbers) {
+  EXPECT_EQ(valueAsBits(1.0f), valueAsBits(__llvm_libc::ceilf(1.0f)));
+  EXPECT_EQ(valueAsBits(-1.0f), valueAsBits(__llvm_libc::ceilf(-1.0f)));
+  EXPECT_EQ(valueAsBits(10.0f), valueAsBits(__llvm_libc::ceilf(10.0f)));
+  EXPECT_EQ(valueAsBits(-10.0f), valueAsBits(__llvm_libc::ceilf(-10.0f)));
+  EXPECT_EQ(valueAsBits(12345.0f), valueAsBits(__llvm_libc::ceilf(12345.0f)));
+  EXPECT_EQ(valueAsBits(-12345.0f), valueAsBits(__llvm_libc::ceilf(-12345.0f)));
+}
+
+TEST(ceilfTest, InFloatRange) {
+  using BitsType = Properties::BitsType;
+  constexpr BitsType count = 1000000;
+  constexpr BitsType step = UINT32_MAX / count;
+  for (BitsType i = 0, v = 0; i <= count; ++i, v += step) {
+    float x = valueFromBits(v); // Keep x in float; double would widen it.
+    if (isnan(x) || isinf(x))
+      continue;
+    ASSERT_MPFR_MATCH(mpfr::Operation::Ceil, x, __llvm_libc::ceilf(x),
+                      tolerance);
+  }
+}
diff --git a/libc/test/src/math/floor_test.cpp b/libc/test/src/math/floor_test.cpp
new file mode 100644
index 0000000000000..a45d75710b8a6
--- /dev/null
+++ b/libc/test/src/math/floor_test.cpp
@@ -0,0 +1,75 @@
+//===-- Unittests for floor -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/math.h"
+#include "src/math/floor.h"
+#include "utils/FPUtil/BitPatterns.h"
+#include "utils/FPUtil/FloatOperations.h"
+#include "utils/FPUtil/FloatProperties.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::fputil::valueAsBits;
+using __llvm_libc::fputil::valueFromBits;
+
+using BitPatterns = __llvm_libc::fputil::BitPatterns<double>;
+using Properties = __llvm_libc::fputil::FloatProperties<double>;
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+// Zero tolerance; As in, exact match with MPFR result.
+static constexpr mpfr::Tolerance tolerance{mpfr::Tolerance::doublePrecision, 0,
+                                           0};
+
+TEST(FloorTest, SpecialNumbers) {
+  EXPECT_EQ(
+      BitPatterns::aQuietNaN,
+      valueAsBits(__llvm_libc::floor(valueFromBits(BitPatterns::aQuietNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeQuietNaN,
+            valueAsBits(__llvm_libc::floor(
+                valueFromBits(BitPatterns::aNegativeQuietNaN))));
+
+  EXPECT_EQ(BitPatterns::aSignallingNaN,
+            valueAsBits(__llvm_libc::floor(
+                valueFromBits(BitPatterns::aSignallingNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeSignallingNaN,
+            valueAsBits(__llvm_libc::floor(
+                valueFromBits(BitPatterns::aNegativeSignallingNaN))));
+
+  EXPECT_EQ(BitPatterns::inf,
+            valueAsBits(__llvm_libc::floor(valueFromBits(BitPatterns::inf))));
+  EXPECT_EQ(BitPatterns::negInf, valueAsBits(__llvm_libc::floor(
+                                     valueFromBits(BitPatterns::negInf))));
+
+  EXPECT_EQ(BitPatterns::zero,
+            valueAsBits(__llvm_libc::floor(valueFromBits(BitPatterns::zero))));
+  EXPECT_EQ(BitPatterns::negZero, valueAsBits(__llvm_libc::floor(
+                                      valueFromBits(BitPatterns::negZero))));
+}
+
+TEST(floorTest, RoundedNumbers) {
+  EXPECT_EQ(valueAsBits(1.0), valueAsBits(__llvm_libc::floor(1.0)));
+  EXPECT_EQ(valueAsBits(-1.0), valueAsBits(__llvm_libc::floor(-1.0)));
+  EXPECT_EQ(valueAsBits(10.0), valueAsBits(__llvm_libc::floor(10.0)));
+  EXPECT_EQ(valueAsBits(-10.0), valueAsBits(__llvm_libc::floor(-10.0)));
+  EXPECT_EQ(valueAsBits(12345.0), valueAsBits(__llvm_libc::floor(12345.0)));
+  EXPECT_EQ(valueAsBits(-12345.0), valueAsBits(__llvm_libc::floor(-12345.0)));
+}
+
+TEST(floorTest, InDoubleRange) {
+  using BitsType = Properties::BitsType;
+  constexpr BitsType count = 1000000;
+  constexpr BitsType step = UINT64_MAX / count;
+  for (BitsType i = 0, v = 0; i <= count; ++i, v += step) {
+    double x = valueFromBits(v);
+    if (isnan(x) || isinf(x))
+      continue;
+    ASSERT_MPFR_MATCH(mpfr::Operation::Floor, x, __llvm_libc::floor(x),
+                      tolerance);
+  }
+}
diff --git a/libc/test/src/math/floorf_test.cpp b/libc/test/src/math/floorf_test.cpp
new file mode 100644
index 0000000000000..c25014da12577
--- /dev/null
+++ b/libc/test/src/math/floorf_test.cpp
@@ -0,0 +1,76 @@
+//===-- Unittests for floorf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/math.h"
+#include "src/math/floorf.h"
+#include "utils/FPUtil/BitPatterns.h"
+#include "utils/FPUtil/FloatOperations.h"
+#include "utils/FPUtil/FloatProperties.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::fputil::valueAsBits;
+using __llvm_libc::fputil::valueFromBits;
+
+using BitPatterns = __llvm_libc::fputil::BitPatterns<float>;
+using Properties = __llvm_libc::fputil::FloatProperties<float>;
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+// Zero tolerance at float precision; results must match MPFR exactly.
+static constexpr mpfr::Tolerance tolerance{mpfr::Tolerance::floatPrecision, 0,
+                                           0};
+
+TEST(FloorfTest, SpecialNumbers) {
+  EXPECT_EQ(
+      BitPatterns::aQuietNaN,
+      valueAsBits(__llvm_libc::floorf(valueFromBits(BitPatterns::aQuietNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeQuietNaN,
+            valueAsBits(__llvm_libc::floorf(
+                valueFromBits(BitPatterns::aNegativeQuietNaN))));
+
+  EXPECT_EQ(BitPatterns::aSignallingNaN,
+            valueAsBits(__llvm_libc::floorf(
+                valueFromBits(BitPatterns::aSignallingNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeSignallingNaN,
+            valueAsBits(__llvm_libc::floorf(
+                valueFromBits(BitPatterns::aNegativeSignallingNaN))));
+
+  EXPECT_EQ(BitPatterns::inf,
+            valueAsBits(__llvm_libc::floorf(valueFromBits(BitPatterns::inf))));
+  EXPECT_EQ(BitPatterns::negInf, valueAsBits(__llvm_libc::floorf(
+                                     valueFromBits(BitPatterns::negInf))));
+
+  EXPECT_EQ(BitPatterns::zero,
+            valueAsBits(__llvm_libc::floorf(valueFromBits(BitPatterns::zero))));
+  EXPECT_EQ(BitPatterns::negZero, valueAsBits(__llvm_libc::floorf(
+                                      valueFromBits(BitPatterns::negZero))));
+}
+
+TEST(floorfTest, RoundedNumbers) {
+  EXPECT_EQ(valueAsBits(1.0f), valueAsBits(__llvm_libc::floorf(1.0f)));
+  EXPECT_EQ(valueAsBits(-1.0f), valueAsBits(__llvm_libc::floorf(-1.0f)));
+  EXPECT_EQ(valueAsBits(10.0f), valueAsBits(__llvm_libc::floorf(10.0f)));
+  EXPECT_EQ(valueAsBits(-10.0f), valueAsBits(__llvm_libc::floorf(-10.0f)));
+  EXPECT_EQ(valueAsBits(12345.0f), valueAsBits(__llvm_libc::floorf(12345.0f)));
+  EXPECT_EQ(valueAsBits(-12345.0f),
+            valueAsBits(__llvm_libc::floorf(-12345.0f)));
+}
+
+TEST(floorfTest, InFloatRange) {
+  using BitsType = Properties::BitsType;
+  constexpr BitsType count = 1000000;
+  constexpr BitsType step = UINT32_MAX / count;
+  for (BitsType i = 0, v = 0; i <= count; ++i, v += step) {
+    float x = valueFromBits(v); // Keep x in float; double would widen it.
+    if (isnan(x) || isinf(x))
+      continue;
+    ASSERT_MPFR_MATCH(mpfr::Operation::Floor, x, __llvm_libc::floorf(x),
+                      tolerance);
+  }
+}
diff --git a/libc/test/src/math/trunc_test.cpp b/libc/test/src/math/trunc_test.cpp
new file mode 100644
index 0000000000000..55f34a649e9f3
--- /dev/null
+++ b/libc/test/src/math/trunc_test.cpp
@@ -0,0 +1,75 @@
+//===-- Unittests for trunc -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/math.h"
+#include "src/math/trunc.h"
+#include "utils/FPUtil/BitPatterns.h"
+#include "utils/FPUtil/FloatOperations.h"
+#include "utils/FPUtil/FloatProperties.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::fputil::valueAsBits;
+using __llvm_libc::fputil::valueFromBits;
+
+using BitPatterns = __llvm_libc::fputil::BitPatterns<double>;
+using Properties = __llvm_libc::fputil::FloatProperties<double>;
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+// Zero tolerance; As in, exact match with MPFR result.
+static constexpr mpfr::Tolerance tolerance{mpfr::Tolerance::doublePrecision, 0,
+                                           0};
+
+TEST(TruncTest, SpecialNumbers) {
+  EXPECT_EQ(
+      BitPatterns::aQuietNaN,
+      valueAsBits(__llvm_libc::trunc(valueFromBits(BitPatterns::aQuietNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeQuietNaN,
+            valueAsBits(__llvm_libc::trunc(
+                valueFromBits(BitPatterns::aNegativeQuietNaN))));
+
+  EXPECT_EQ(BitPatterns::aSignallingNaN,
+            valueAsBits(__llvm_libc::trunc(
+                valueFromBits(BitPatterns::aSignallingNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeSignallingNaN,
+            valueAsBits(__llvm_libc::trunc(
+                valueFromBits(BitPatterns::aNegativeSignallingNaN))));
+
+  EXPECT_EQ(BitPatterns::inf,
+            valueAsBits(__llvm_libc::trunc(valueFromBits(BitPatterns::inf))));
+  EXPECT_EQ(BitPatterns::negInf, valueAsBits(__llvm_libc::trunc(
+                                     valueFromBits(BitPatterns::negInf))));
+
+  EXPECT_EQ(BitPatterns::zero,
+            valueAsBits(__llvm_libc::trunc(valueFromBits(BitPatterns::zero))));
+  EXPECT_EQ(BitPatterns::negZero, valueAsBits(__llvm_libc::trunc(
+                                      valueFromBits(BitPatterns::negZero))));
+}
+
+TEST(TruncTest, RoundedNumbers) {
+  EXPECT_EQ(valueAsBits(1.0), valueAsBits(__llvm_libc::trunc(1.0)));
+  EXPECT_EQ(valueAsBits(-1.0), valueAsBits(__llvm_libc::trunc(-1.0)));
+  EXPECT_EQ(valueAsBits(10.0), valueAsBits(__llvm_libc::trunc(10.0)));
+  EXPECT_EQ(valueAsBits(-10.0), valueAsBits(__llvm_libc::trunc(-10.0)));
+  EXPECT_EQ(valueAsBits(12345.0), valueAsBits(__llvm_libc::trunc(12345.0)));
+  EXPECT_EQ(valueAsBits(-12345.0), valueAsBits(__llvm_libc::trunc(-12345.0)));
+}
+
+TEST(truncTest, InDoubleRange) {
+  using BitsType = Properties::BitsType;
+  constexpr BitsType count = 1000000;
+  constexpr BitsType step = UINT64_MAX / count;
+  for (BitsType i = 0, v = 0; i <= count; ++i, v += step) {
+    double x = valueFromBits(v);
+    if (isnan(x) || isinf(x))
+      continue;
+    ASSERT_MPFR_MATCH(mpfr::Operation::Trunc, x, __llvm_libc::trunc(x),
+                      tolerance);
+  }
+}
diff --git a/libc/test/src/math/truncf_test.cpp b/libc/test/src/math/truncf_test.cpp
new file mode 100644
index 0000000000000..90fc84a5b5f97
--- /dev/null
+++ b/libc/test/src/math/truncf_test.cpp
@@ -0,0 +1,77 @@
+//===-- Unittests for truncf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/math.h"
+#include "src/math/truncf.h"
+#include "utils/FPUtil/BitPatterns.h"
+#include "utils/FPUtil/FloatOperations.h"
+#include "utils/FPUtil/FloatProperties.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::fputil::valueAsBits;
+using __llvm_libc::fputil::valueFromBits;
+
+using BitPatterns = __llvm_libc::fputil::BitPatterns<float>;
+using Properties = __llvm_libc::fputil::FloatProperties<float>;
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+// Zero tolerance; As in, exact match with MPFR result.
+static constexpr mpfr::Tolerance tolerance{mpfr::Tolerance::floatPrecision, 0,
+                                           0};
+
+// NaNs, infinities and signed zeros must come back with identical bits.
+TEST(TruncfTest, SpecialNumbers) {
+  EXPECT_EQ(
+      BitPatterns::aQuietNaN,
+      valueAsBits(__llvm_libc::truncf(valueFromBits(BitPatterns::aQuietNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeQuietNaN,
+            valueAsBits(__llvm_libc::truncf(
+                valueFromBits(BitPatterns::aNegativeQuietNaN))));
+
+  EXPECT_EQ(BitPatterns::aSignallingNaN,
+            valueAsBits(__llvm_libc::truncf(
+                valueFromBits(BitPatterns::aSignallingNaN))));
+  EXPECT_EQ(BitPatterns::aNegativeSignallingNaN,
+            valueAsBits(__llvm_libc::truncf(
+                valueFromBits(BitPatterns::aNegativeSignallingNaN))));
+
+  EXPECT_EQ(BitPatterns::inf,
+            valueAsBits(__llvm_libc::truncf(valueFromBits(BitPatterns::inf))));
+  EXPECT_EQ(BitPatterns::negInf, valueAsBits(__llvm_libc::truncf(
+                                     valueFromBits(BitPatterns::negInf))));
+
+  EXPECT_EQ(BitPatterns::zero,
+            valueAsBits(__llvm_libc::truncf(valueFromBits(BitPatterns::zero))));
+  EXPECT_EQ(BitPatterns::negZero, valueAsBits(__llvm_libc::truncf(
+                                      valueFromBits(BitPatterns::negZero))));
+}
+
+TEST(TruncfTest, RoundedNumbers) {
+  EXPECT_EQ(valueAsBits(1.0f), valueAsBits(__llvm_libc::truncf(1.0f)));
+  EXPECT_EQ(valueAsBits(-1.0f), valueAsBits(__llvm_libc::truncf(-1.0f)));
+  EXPECT_EQ(valueAsBits(10.0f), valueAsBits(__llvm_libc::truncf(10.0f)));
+  EXPECT_EQ(valueAsBits(-10.0f), valueAsBits(__llvm_libc::truncf(-10.0f)));
+  EXPECT_EQ(valueAsBits(12345.0f), valueAsBits(__llvm_libc::truncf(12345.0f)));
+  EXPECT_EQ(valueAsBits(-12345.0f),
+            valueAsBits(__llvm_libc::truncf(-12345.0f)));
+}
+
+TEST(truncfTest, InFloatRange) {
+  using BitsType = Properties::BitsType;
+  constexpr BitsType count = 1000000;
+  constexpr BitsType step = UINT32_MAX / count;
+  for (BitsType i = 0, v = 0; i <= count; ++i, v += step) {
+    float x = valueFromBits(v); // Keep x in float; double would widen it.
+    if (isnan(x) || isinf(x))
+      continue;
+    ASSERT_MPFR_MATCH(mpfr::Operation::Trunc, x, __llvm_libc::truncf(x),
+                      tolerance);
+  }
+}
diff --git a/libc/utils/FPUtil/FloatOperations.h b/libc/utils/FPUtil/FloatOperations.h
index ff7604bc81021..a378903c38b40 100644
--- a/libc/utils/FPUtil/FloatOperations.h
+++ b/libc/utils/FPUtil/FloatOperations.h
@@ -40,18 +40,23 @@ static inline typename FloatProperties<T>::BitsType absBits(T x) {
   return valueAsBits(x) & (~FloatProperties<T>::signMask);
 }
 
-// Return the zero adjusted exponent value of x.
-template <typename T,
-          cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
-int getExponent(T x) {
-  using Properties = FloatProperties<T>;
-  using BitsType = typename Properties::BitsType;
-  BitsType bits = absBits(x);
+template <typename BitsType>
+static inline int getExponentFromBits(BitsType bits) {
+  using FPType = typename FloatType<BitsType>::Type;
+  using Properties = FloatProperties<FPType>;
+  bits &= Properties::exponentMask;
   int e = (bits >> Properties::mantissaWidth); // Shift out the mantissa.
   e -= Properties::exponentOffset;             // Zero adjust.
   return e;
 }
 
+// Return the zero adjusted exponent value of x.
+template <typename T,
+          cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
+static inline int getExponent(T x) {
+  return getExponentFromBits(valueAsBits(x));
+}
+
 // Return true if x is infinity (positive or negative.)
 template <typename T,
           cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
@@ -74,6 +79,24 @@ static inline bool isNaN(T x) {
          ((bits & Properties::mantissaMask) != 0);
 }
 
+template <typename BitsType> static inline bool bitsAreInfOrNaN(BitsType bits) {
+  using FPType = typename FloatType<BitsType>::Type;
+  return (bits & BitPatterns<FPType>::inf) == BitPatterns<FPType>::inf;
+}
+
+template <typename BitsType> static inline bool bitsAreZero(BitsType bits) {
+  // True for either zero bit pattern (BitPatterns::zero or ::negZero).
+  using Patterns = BitPatterns<typename FloatType<BitsType>::Type>;
+  return (bits == Patterns::zero) || (bits == Patterns::negZero);
+}
+
+// Return true if x is any kind of NaN or infinity.
+template <typename T,
+          cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
+static inline bool isInfOrNaN(T x) {
+  return bitsAreInfOrNaN(valueAsBits(x));
+}
+
 // Return true if x is a quiet NAN.
 template <typename T,
           cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
@@ -103,6 +126,94 @@ static inline T abs(T x) {
   return valueFromBits(absBits(x));
 }
 
+// Return the truncated value of x. If x is non-negative, the return value is
+// the greatest integer less than or equal to x. Otherwise, it is the smallest
+// integer greater than or equal to x. That is, return the integer value rounded
+// toward zero.
+template <typename T,
+          cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
+static inline T trunc(T x) {
+  using Properties = FloatProperties<T>;
+  using BitsType = typename FloatProperties<T>::BitsType;
+
+  BitsType bits = valueAsBits(x);
+
+  // If x is infinity, NaN or zero, return it.
+  if (bitsAreInfOrNaN(bits) || bitsAreZero(bits))
+    return x;
+
+  int exponent = getExponentFromBits(bits);
+
+  // If the exponent is at least the mantissa width, x has no fractional bits,
+  // so it is already an integer.
+  if (exponent >= static_cast<int>(Properties::mantissaWidth))
+    return x;
+
+  // If the exponent is such that abs(x) is less than 1, then return 0.
+  if (exponent <= -1) {
+    if (Properties::signMask & bits)
+      return T(-0.0);
+    else
+      return T(0.0);
+  }
+
+  uint32_t trimSize = Properties::mantissaWidth - exponent;
+  return valueFromBits((bits >> trimSize) << trimSize); // Drop fraction bits.
+}
+
+// Return the smallest integer value greater than or equal to x.
+template <typename T,
+          cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
+static inline T ceil(T x) {
+  using Properties = FloatProperties<T>;
+  using BitsType = typename FloatProperties<T>::BitsType;
+  BitsType bits = valueAsBits(x);
+
+  // If x is infinity, NaN or zero, return it.
+  if (bitsAreInfOrNaN(bits) || bitsAreZero(bits))
+    return x;
+
+  bool isNeg = bits & Properties::signMask;
+  int exponent = getExponentFromBits(bits);
+
+  // If the exponent is at least the mantissa width, x has no fractional bits,
+  // so it is already an integer.
+  if (exponent >= static_cast<int>(Properties::mantissaWidth))
+    return x;
+
+  if (exponent <= -1) { // abs(x) < 1: result is -0.0 or 1.0 by sign.
+    if (isNeg)
+      return T(-0.0);
+    else
+      return T(1.0);
+  }
+
+  uint32_t trimSize = Properties::mantissaWidth - exponent;
+  // If x is already an integer, return it.
+  if ((bits << (Properties::bitWidth - trimSize)) == 0)
+    return x;
+
+  BitsType truncBits = (bits >> trimSize) << trimSize;
+  T truncValue = valueFromBits(truncBits);
+
+  // If x is negative, the ceil operation is equivalent to the trunc operation.
+  if (isNeg)
+    return truncValue;
+
+  return truncValue + T(1.0);
+}
+
+template <typename T,
+          cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, int> = 0>
+static inline T floor(T x) {
+  // With the sign bit set, floor(x) == -ceil(-x); otherwise flooring is the
+  // same as rounding toward zero.
+  const bool signBitSet = (valueAsBits(x) & FloatProperties<T>::signMask) != 0;
+  if (signBitSet)
+    return -ceil(-x);
+  return trunc(x);
+}
+
 } // namespace fputil
 } // namespace __llvm_libc
 
diff --git a/libc/utils/FPUtil/FloatProperties.h b/libc/utils/FPUtil/FloatProperties.h
index a9584c7428a13..fb0917259b915 100644
--- a/libc/utils/FPUtil/FloatProperties.h
+++ b/libc/utils/FPUtil/FloatProperties.h
@@ -21,9 +21,12 @@ template <> struct FloatProperties<float> {
   static_assert(sizeof(BitsType) == sizeof(float),
                 "Unexpected size of 'float' type.");
 
+  static constexpr uint32_t bitWidth = sizeof(BitsType) << 3;
+
   static constexpr uint32_t mantissaWidth = 23;
   static constexpr BitsType mantissaMask = 0x007fffffU;
   static constexpr BitsType signMask = 0x80000000U;
+  static constexpr BitsType exponentMask = ~(signMask | mantissaMask);
   static constexpr uint32_t exponentOffset = 127;
 
   // If a number x is a NAN, then it is a quiet NAN if:
@@ -37,9 +40,12 @@ template <> struct FloatProperties<double> {
   static_assert(sizeof(BitsType) == sizeof(double),
                 "Unexpected size of 'double' type.");
 
+  static constexpr uint32_t bitWidth = sizeof(BitsType) << 3;
+
   static constexpr uint32_t mantissaWidth = 52;
   static constexpr BitsType mantissaMask = 0x000fffffffffffffU;
   static constexpr BitsType signMask = 0x8000000000000000ULL;
+  static constexpr BitsType exponentMask = ~(signMask | mantissaMask);
   static constexpr uint32_t exponentOffset = 1023;
 
   // If a number x is a NAN, then it is a quiet NAN if:
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 51c8c37592921..1bbc84d554c0e 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -89,6 +89,9 @@ class MPFRNumber {
     case Operation::Abs:
       mpfr_abs(value, mpfrInput.value, MPFR_RNDN);
       break;
+    case Operation::Ceil:
+      mpfr_ceil(value, mpfrInput.value);
+      break;
     case Operation::Cos:
       mpfr_cos(value, mpfrInput.value, MPFR_RNDN);
       break;
@@ -98,9 +101,18 @@ class MPFRNumber {
     case Operation::Exp2:
       mpfr_exp2(value, mpfrInput.value, MPFR_RNDN);
       break;
+    case Operation::Floor:
+      mpfr_floor(value, mpfrInput.value);
+      break;
+    case Operation::Round:
+      mpfr_round(value, mpfrInput.value);
+      break;
     case Operation::Sin:
       mpfr_sin(value, mpfrInput.value, MPFR_RNDN);
       break;
+    case Operation::Trunc:
+      mpfr_trunc(value, mpfrInput.value);
+      break;
     }
   }
 
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index e39ed91281a9d..5628165653325 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -39,7 +39,17 @@ struct Tolerance {
   uint32_t bits;
 };
 
-enum class Operation : int { Abs, Cos, Exp, Exp2, Sin };
+enum class Operation : int {
+  Abs,
+  Ceil,
+  Cos,
+  Exp,
+  Exp2,
+  Floor,
+  Round,
+  Sin,
+  Trunc
+};
 
 namespace internal {
 

From 745c6c8458babb31efc4e992a9c3e8598f03149f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Fri, 29 May 2020 14:00:51 -0700
Subject: [PATCH 719/770] Process gep (phi ptr1, ptr2) in SROA

Differential Revision: https://reviews.llvm.org/D79218
---
 llvm/lib/Transforms/Scalar/SROA.cpp  |  57 +++-
 llvm/test/Transforms/SROA/phi-gep.ll | 421 +++++++++++++++++++++++++++
 2 files changed, 474 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/SROA/phi-gep.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 1d486a3e74fd1..7e92f7b06ecbd 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3475,15 +3475,60 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
                                        Sel->getName() + ".sroa.sel");
     GEPI.replaceAllUsesWith(NSel);
     GEPI.eraseFromParent();
+    Instruction *NSelI = cast<Instruction>(NSel);
+    Visited.insert(NSelI);
+    enqueueUsers(*NSelI);
 
     LLVM_DEBUG(dbgs() << "\n          to: " << *NTrue
                       << "\n              " << *NFalse
                       << "\n              " << *NSel << '\n');
 
-    if (isa<Instruction>(NTrue))
-      visit(cast<Instruction>(NTrue));
-    if (isa<Instruction>(NFalse))
-      visit(cast<Instruction>(NFalse));
+    return true;
+  }
+
+  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
+  bool foldGEPPhi(GetElementPtrInst &GEPI) {
+    if (!GEPI.hasAllConstantIndices())
+      return false;
+
+    PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
+    if (GEPI.getParent() != PHI->getParent() ||
+        llvm::any_of(PHI->incoming_values(), [](Value *In)
+          { Instruction *I = dyn_cast<Instruction>(In);
+            return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
+                   !I->getParent()->isLegalToHoistInto();
+          }))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):"
+                      << "\n    original: " << *PHI
+                      << "\n              " << GEPI
+                      << "\n          to: ");
+
+    SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
+    bool IsInBounds = GEPI.isInBounds();
+    IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
+    PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
+                                          PHI->getNumIncomingValues(),
+                                          PHI->getName() + ".sroa.phi");
+    for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
+      Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
+
+      IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
+      Value *NewVal = IsInBounds
+          ? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
+          : B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
+      NewPN->addIncoming(NewVal, PHI->getIncomingBlock(I));
+    }
+
+    GEPI.replaceAllUsesWith(NewPN);
+    GEPI.eraseFromParent();
+    Visited.insert(NewPN);
+    enqueueUsers(*NewPN);
+
+    LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
+                 dbgs() << "\n              " << *In;
+               dbgs() << "\n              " << *NewPN << '\n');
 
     return true;
   }
@@ -3493,6 +3538,10 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
         foldGEPSelect(GEPI))
       return true;
 
+    if (isa<PHINode>(GEPI.getPointerOperand()) &&
+        foldGEPPhi(GEPI))
+      return true;
+
     enqueueUsers(GEPI);
     return false;
   }
diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
new file mode 100644
index 0000000000000..c808dfc4a5115
--- /dev/null
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -0,0 +1,421 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -sroa < %s | FileCheck %s
+
+%pair = type { i32, i32 }
+
+define i32 @test_sroa_phi_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 2, [[IF_THEN]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_non_inbound(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_non_inbound(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 2, [[IF_THEN]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
+  %gep = getelementptr %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_undef(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ undef, [[IF_THEN]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ undef, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+@g = global %pair zeroinitializer, align 4
+
+define i32 @test_sroa_phi_gep_global(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_global(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
+; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ @g, [[IF_THEN]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ @g, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_arg_phi_inspt(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_arg_phi_inspt(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_INSPT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_INSPT]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 2, [[FOR]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_inspt = phi i32 [ 0, %entry ], [ %i, %for ]
+  %i = add i32 %phi_inspt, 1
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %for ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_phi_gep_phi_inspt(i1 %cond) {
+; CHECK-LABEL: @test_sroa_phi_gep_phi_inspt(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca [[PAIR]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[B]], i32 0, i32 1
+; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    store i32 2, i32* [[GEP_B]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_IN:%.*]] = phi %pair* [ null, [[ENTRY:%.*]] ], [ [[B]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_INSPT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_INSPT]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY]] ], [ [[PHI_IN]], [[FOR]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_in = phi %pair * [ null, %entry ], [ %b, %for ]
+  %phi_inspt = phi i32 [ 0, %entry ], [ %i, %for ]
+  %i = add i32 %phi_inspt, 1
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %phi_in, %for ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_gep_phi_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_gep_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32* [ [[A_SROA_0]], [[ENTRY]] ], [ [[GEP_FOR:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
+; CHECK-NEXT:    [[GEP_FOR]] = getelementptr inbounds i32, i32* [[PHI]], i32 0
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_END:%.*]] = phi i32* [ [[A_SROA_0]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
+  %phi = phi i32* [ %gep_a, %entry], [ %gep_for, %for ]
+  %i = add i32 %phi_i, 1
+  %gep_for = getelementptr inbounds i32, i32* %phi, i32 0
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi_end = phi i32* [ %gep_a, %entry], [ %phi, %for ]
+  %load = load i32, i32* %phi_end, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_gep_cast_phi_gep(i1 %cond) {
+; CHECK-LABEL: @test_sroa_gep_cast_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST2:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    store i32 1065353216, i32* [[A_SROA_0]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
+; CHECK:       for:
+; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST2]], [[ENTRY]] ], [ [[GEP_FOR_2:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_SROA_PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_2_SROA_GEP:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
+; CHECK-NEXT:    [[GEP_FOR_1:%.*]] = bitcast float* [[PHI_SROA_PHI]] to i32*
+; CHECK-NEXT:    [[GEP_FOR_2]] = bitcast i32* [[GEP_FOR_1]] to float*
+; CHECK-NEXT:    [[GEP_FOR_2_SROA_GEP]] = getelementptr inbounds float, float* [[GEP_FOR_2]], i32 0
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_END:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_END_1:%.*]] = bitcast float* [[PHI_END]] to i32*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END_1]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_a.1 = bitcast i32* %gep_a to float*
+  store float 1.0, float* %gep_a.1, align 4
+  br i1 %cond, label %for, label %end
+
+for:
+  %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
+  %phi = phi float* [ %gep_a.1, %entry], [ %gep_for.2, %for ]
+  %i = add i32 %phi_i, 1
+  %gep_for = getelementptr inbounds float, float* %phi, i32 0
+  %gep_for.1 = bitcast float* %gep_for to i32*
+  %gep_for.2 = bitcast i32* %gep_for.1 to float*
+  %loop.cond = icmp ult i32 %i, 10
+  br i1 %loop.cond, label %for, label %end
+
+end:
+  %phi_end = phi float* [ %gep_a.1, %entry], [ %phi, %for ]
+  %phi_end.1 = bitcast float* %phi_end to i32*
+  %load = load i32, i32* %phi_end.1, align 4
+  ret i32 %load
+}
+
+define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-LABEL: @test_sroa_invoke_phi_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[CALL:%.*]], label [[END:%.*]]
+; CHECK:       call:
+; CHECK-NEXT:    [[B:%.*]] = invoke %pair* @foo()
+; CHECK-NEXT:    to label [[END]] unwind label [[INVOKE_CATCH:%.*]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[CALL]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+; CHECK:       invoke_catch:
+; CHECK-NEXT:    [[RES:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    catch i8* null
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %a = alloca %pair, align 4
+  br i1 %cond, label %call, label %end
+
+call:
+  %b = invoke %pair* @foo()
+  to label %end unwind label %invoke_catch
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %call ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 0, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+
+invoke_catch:
+  %res = landingpad { i8*, i32 }
+  catch i8* null
+  ret i32 0
+}
+
+define i32 @test_sroa_phi_gep_nonconst_idx(i1 %cond, i32 %idx) {
+; CHECK-LABEL: @test_sroa_phi_gep_nonconst_idx(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca [[PAIR]], align 4
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[A]], i32 0, i32 1
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[B]], i32 0, i32 1
+; CHECK-NEXT:    store i32 1, i32* [[GEP_A]], align 4
+; CHECK-NEXT:    store i32 2, i32* [[GEP_B]], align 4
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i32 [[IDX:%.*]], i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %a = alloca %pair, align 4
+  %b = alloca %pair, align 4
+  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
+  %gep_b = getelementptr inbounds %pair, %pair* %b, i32 0, i32 1
+  store i32 1, i32* %gep_a, align 4
+  store i32 2, i32* %gep_b, align 4
+  br i1 %cond, label %if.then, label %end
+
+if.then:
+  br label %end
+
+end:
+  %phi = phi %pair* [ %a, %entry], [ %b, %if.then ]
+  %gep = getelementptr inbounds %pair, %pair* %phi, i32 %idx, i32 1
+  %load = load i32, i32* %gep, align 4
+  ret i32 %load
+}
+
+define void @test_sroa_gep_phi_select_other_block() {
+; CHECK-LABEL: @test_sroa_gep_phi_select_other_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [[PAIR:%.*]], align 8
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[ALLOCA]], [[ENTRY:%.*]] ], [ [[SELECT:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[SELECT]] = select i1 undef, %pair* [[PHI]], %pair* undef
+; CHECK-NEXT:    br i1 undef, label [[EXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i64 1
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %alloca = alloca %pair, align 8
+  br label %while.body
+
+while.body:
+  %phi = phi %pair* [ %alloca, %entry ], [ %select, %while.body ]
+  %select = select i1 undef, %pair* %phi, %pair* undef
+  br i1 undef, label %exit, label %while.body
+
+exit:
+  %gep = getelementptr inbounds %pair, %pair* %phi, i64 1
+  unreachable
+}
+
+define void @test_sroa_gep_phi_select_same_block() {
+; CHECK-LABEL: @test_sroa_gep_phi_select_same_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [[PAIR:%.*]], align 8
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[ALLOCA]], [[ENTRY:%.*]] ], [ [[SELECT:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[SELECT]] = select i1 undef, %pair* [[PHI]], %pair* undef
+; CHECK-NEXT:    [[PHI_SROA_GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i64 1
+; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 undef, %pair* [[PHI_SROA_GEP]], %pair* undef
+; CHECK-NEXT:    br i1 undef, label [[EXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %alloca = alloca %pair, align 8
+  br label %while.body
+
+while.body:
+  %phi = phi %pair* [ %alloca, %entry ], [ %select, %while.body ]
+  %select = select i1 undef, %pair* %phi, %pair* undef
+  %gep = getelementptr inbounds %pair, %pair* %select, i64 1
+  br i1 undef, label %exit, label %while.body
+
+exit:
+  unreachable
+}
+
+declare %pair* @foo()
+
+declare i32 @__gxx_personality_v0(...)

From b874dc4ddabe9cada83e8e0b82274faf2eee95cc Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 1 Jun 2020 11:50:43 -0400
Subject: [PATCH 720/770] [InstCombine] add test for select-of-shuffle; NFC

This is based on an example in D80658
---
 .../InstCombine/vec_demanded_elts.ll          | 89 +++++++++++--------
 1 file changed, 52 insertions(+), 37 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
index f444404d14d0e..38117fbdbb3f0 100644
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -4,18 +4,18 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i32 @test2(float %f) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul float [[F:%.*]], [[F]]
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast float [[TMP5]] to i32
-; CHECK-NEXT:    ret i32 [[TMP21]]
+; CHECK-NEXT:    [[T5:%.*]] = fmul float [[F:%.*]], [[F]]
+; CHECK-NEXT:    [[T21:%.*]] = bitcast float [[T5]] to i32
+; CHECK-NEXT:    ret i32 [[T21]]
 ;
-  %tmp5 = fmul float %f, %f
-  %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
-  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
-  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
-  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
-  %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
-  %tmp21 = extractelement <4 x i32> %tmp19, i32 0
-  ret i32 %tmp21
+  %t5 = fmul float %f, %f
+  %t9 = insertelement <4 x float> undef, float %t5, i32 0
+  %t10 = insertelement <4 x float> %t9, float 0.000000e+00, i32 1
+  %t11 = insertelement <4 x float> %t10, float 0.000000e+00, i32 2
+  %t12 = insertelement <4 x float> %t11, float 0.000000e+00, i32 3
+  %t19 = bitcast <4 x float> %t12 to <4 x i32>
+  %t21 = extractelement <4 x i32> %t19, i32 0
+  ret i32 %t21
 }
 
 define void @get_image() nounwind {
@@ -29,11 +29,11 @@ define void @get_image() nounwind {
 ; CHECK-NEXT:    unreachable
 ;
 entry:
-  %0 = call i32 @fgetc(i8* null) nounwind               ; <i32> [#uses=1]
-  %1 = trunc i32 %0 to i8         ; <i8> [#uses=1]
-  %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1          ; <<100 x i8>> [#uses=1]
-  %tmp1 = extractelement <100 x i8> %tmp2, i32 0          ; <i8> [#uses=1]
-  %2 = icmp eq i8 %tmp1, 80               ; <i1> [#uses=1]
+  %0 = call i32 @fgetc(i8* null) nounwind
+  %1 = trunc i32 %0 to i8
+  %t2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1
+  %t1 = extractelement <100 x i8> %t2, i32 0
+  %2 = icmp eq i8 %t1, 80
   br i1 %2, label %bb2, label %bb3
 
 bb2:            ; preds = %entry
@@ -51,8 +51,8 @@ define void @vac(<4 x float>* nocapture %a) nounwind {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %tmp1 = load <4 x float>, <4 x float>* %a		; <<4 x float>> [#uses=1]
-  %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0	; <<4 x float>> [#uses=1]
+  %t1 = load <4 x float>, <4 x float>* %a		; <<4 x float>> [#uses=1]
+  %vecins = insertelement <4 x float> %t1, float 0.000000e+00, i32 0	; <<4 x float>> [#uses=1]
   %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1; <<4 x float>> [#uses=1]
   %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2; <<4 x float>> [#uses=1]
   %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3; <<4 x float>> [#uses=1]
@@ -79,12 +79,12 @@ define <2 x float> @test_fptrunc(double %f) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
 ; CHECK-NEXT:    ret <2 x float> [[TMP2]]
 ;
-  %tmp9 = insertelement <4 x double> undef, double %f, i32 0
-  %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
-  %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
-  %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
-  %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
-  %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %t9 = insertelement <4 x double> undef, double %f, i32 0
+  %t10 = insertelement <4 x double> %t9, double 0.000000e+00, i32 1
+  %t11 = insertelement <4 x double> %t10, double 0.000000e+00, i32 2
+  %t12 = insertelement <4 x double> %t11, double 0.000000e+00, i32 3
+  %t5 = fptrunc <4 x double> %t12 to <4 x float>
+  %ret = shufflevector <4 x float> %t5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %ret
 }
 
@@ -94,12 +94,12 @@ define <2 x double> @test_fpext(float %f) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
 ; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
-  %tmp9 = insertelement <4 x float> undef, float %f, i32 0
-  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
-  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
-  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
-  %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
-  %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  %t9 = insertelement <4 x float> undef, float %f, i32 0
+  %t10 = insertelement <4 x float> %t9, float 0.000000e+00, i32 1
+  %t11 = insertelement <4 x float> %t10, float 0.000000e+00, i32 2
+  %t12 = insertelement <4 x float> %t11, float 0.000000e+00, i32 3
+  %t5 = fpext <4 x float> %t12 to <4 x double>
+  %ret = shufflevector <4 x double> %t5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %ret
 }
 
@@ -663,17 +663,17 @@ define i32* @PR41624(<2 x { i32, i32 }*> %a) {
 define i32* @zero_sized_type_extract(<4 x i64> %arg, i64 %arg1) {
 ; CHECK-LABEL: @zero_sized_type_extract(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [0 x i32], <4 x [0 x i32]*> <[0 x i32]* @global, [0 x i32]* undef, [0 x i32]* undef, [0 x i32]* undef>, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i64> [[ARG:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32*> [[TMP]], i64 0
-; CHECK-NEXT:    ret i32* [[TMP2]]
+; CHECK-NEXT:    [[T:%.*]] = getelementptr inbounds [0 x i32], <4 x [0 x i32]*> <[0 x i32]* @global, [0 x i32]* undef, [0 x i32]* undef, [0 x i32]* undef>, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i64> [[ARG:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = extractelement <4 x i32*> [[T]], i64 0
+; CHECK-NEXT:    ret i32* [[T2]]
 ;
 bb:
-  %tmp = getelementptr inbounds [0 x i32], <4 x [0 x i32]*> <[0 x i32]* @global, [0 x i32]* @global, [0 x i32]* @global, [0 x i32]* @global>, <4 x i64> zeroinitializer, <4 x i64> %arg
-  %tmp2 = extractelement <4 x i32*> %tmp, i64 0
-  ret i32* %tmp2
+  %t = getelementptr inbounds [0 x i32], <4 x [0 x i32]*> <[0 x i32]* @global, [0 x i32]* @global, [0 x i32]* @global, [0 x i32]* @global>, <4 x i64> zeroinitializer, <4 x i64> %arg
+  %t2 = extractelement <4 x i32*> %t, i64 0
+  ret i32* %t2
 }
 
-; The non-zero elements of the result are always 'min', so the splat is unnecessary.
+; The non-zero elements of the result are always 'y', so the splat is unnecessary.
 
 define <4 x i8> @select_cond_with_eq_true_false_elts(<4 x i8> %x, <4 x i8> %y, <4 x i1> %cmp) {
 ; CHECK-LABEL: @select_cond_with_eq_true_false_elts(
@@ -733,3 +733,18 @@ define <4 x i8> @select_cond_with_undef_true_false_elts(<4 x i8> %x, <4 x i8> %y
   %r = select <4 x i1> %cond, <4 x i8> %tval, <4 x i8> %x
   ret <4 x i8> %r
 }
+
+; The insert can not be safely eliminated because cmp[0] might be poison.
+
+define <4 x i8> @select_cond_(<4 x i8> %x, <4 x i8> %min, <4 x i1> %cmp, i1 %poison_blocker) {
+; CHECK-LABEL: @select_cond_(
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <4 x i1> [[CMP:%.*]], i1 [[POISON_BLOCKER:%.*]], i32 0
+; CHECK-NEXT:    [[VECINS:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[MIN:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[INS]], <4 x i8> [[VECINS]], <4 x i8> [[X]]
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %ins = insertelement <4 x i1> %cmp, i1 %poison_blocker, i32 0
+  %vecins = shufflevector <4 x i8> %x, <4 x i8> %min, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %r = select <4 x i1> %ins, <4 x i8> %vecins, <4 x i8> %x
+  ret <4 x i8> %r
+}

From d9943e7f0ce888733ee7ba91da432e5f01f7aa85 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Mon, 1 Jun 2020 08:50:21 -0700
Subject: [PATCH 721/770] [Object] Add DF_1_PIE

This flag (and the whole field DT_FLAGS_1) originated from Solaris. I intend to use it in an LLD patch D80872.

Reviewed By: jhenderson

Differential Revision: https://reviews.llvm.org/D80871
---
 llvm/include/llvm/BinaryFormat/ELF.h               | 3 ++-
 llvm/test/tools/llvm-readobj/ELF/dynamic-tags.test | 8 ++++----
 llvm/tools/llvm-readobj/ELFDumper.cpp              | 3 ++-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 1a17135b60788..3957c134171e3 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1296,7 +1296,8 @@ enum {
   DF_1_NORELOC = 0x00400000,
   DF_1_SYMINTPOSE = 0x00800000, // Object has individual interposers.
   DF_1_GLOBAUDIT = 0x01000000,  // Global auditing required.
-  DF_1_SINGLETON = 0x02000000   // Singleton symbols are used.
+  DF_1_SINGLETON = 0x02000000,  // Singleton symbols are used.
+  DF_1_PIE = 0x08000000,        // Object is a position-independent executable.
 };
 
 // DT_MIPS_FLAGS values.
diff --git a/llvm/test/tools/llvm-readobj/ELF/dynamic-tags.test b/llvm/test/tools/llvm-readobj/ELF/dynamic-tags.test
index 0f3fe44b35430..c7bd551eb754b 100644
--- a/llvm/test/tools/llvm-readobj/ELF/dynamic-tags.test
+++ b/llvm/test/tools/llvm-readobj/ELF/dynamic-tags.test
@@ -59,7 +59,7 @@
 # LLVM64-NEXT:  0x000000006FFFFEF7 TLSDESC_GOT          0x1000
 # LLVM64-NEXT:  0x000000006FFFFFF9 RELACOUNT            0
 # LLVM64-NEXT:  0x000000006FFFFFFA RELCOUNT             0
-# LLVM64-NEXT:  0x000000006FFFFFFB FLAGS_1              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON {{$}}
+# LLVM64-NEXT:  0x000000006FFFFFFB FLAGS_1              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON PIE {{$}}
 # LLVM64-NEXT:  0x000000006FFFFFF0 VERSYM               0x1000
 # LLVM64-NEXT:  0x000000006FFFFFFC VERDEF               0x1000
 # LLVM64-NEXT:  0x000000006FFFFFFD VERDEFNUM            0
@@ -124,7 +124,7 @@
 # GNU64-NEXT:  0x000000006ffffef7 (TLSDESC_GOT)          0x1000
 # GNU64-NEXT:  0x000000006ffffff9 (RELACOUNT)            0
 # GNU64-NEXT:  0x000000006ffffffa (RELCOUNT)             0
-# GNU64-NEXT:  0x000000006ffffffb (FLAGS_1)              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON {{$}}
+# GNU64-NEXT:  0x000000006ffffffb (FLAGS_1)              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON PIE {{$}}
 # GNU64-NEXT:  0x000000006ffffff0 (VERSYM)               0x1000
 # GNU64-NEXT:  0x000000006ffffffc (VERDEF)               0x1000
 # GNU64-NEXT:  0x000000006ffffffd (VERDEFNUM)            0
@@ -349,7 +349,7 @@ ProgramHeaders:
 # LLVM32-NEXT:  0x6FFFFEF7 TLSDESC_GOT          0x1000
 # LLVM32-NEXT:  0x6FFFFFF9 RELACOUNT            0
 # LLVM32-NEXT:  0x6FFFFFFA RELCOUNT             0
-# LLVM32-NEXT:  0x6FFFFFFB FLAGS_1              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON {{$}}
+# LLVM32-NEXT:  0x6FFFFFFB FLAGS_1              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON PIE {{$}}
 # LLVM32-NEXT:  0x6FFFFFF0 VERSYM               0x1000
 # LLVM32-NEXT:  0x6FFFFFFC VERDEF               0x1000
 # LLVM32-NEXT:  0x6FFFFFFD VERDEFNUM            0
@@ -414,7 +414,7 @@ ProgramHeaders:
 # GNU32-NEXT:  0x6ffffef7 (TLSDESC_GOT)          0x1000
 # GNU32-NEXT:  0x6ffffff9 (RELACOUNT)            0
 # GNU32-NEXT:  0x6ffffffa (RELCOUNT)             0
-# GNU32-NEXT:  0x6ffffffb (FLAGS_1)              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON {{$}}
+# GNU32-NEXT:  0x6ffffffb (FLAGS_1)              NOW GLOBAL GROUP NODELETE LOADFLTR INITFIRST NOOPEN ORIGIN DIRECT TRANS INTERPOSE NODEFLIB NODUMP CONFALT ENDFILTEE DISPRELDNE DISPRELPND NODIRECT IGNMULDEF NOKSYMS NOHDR EDITED NORELOC SYMINTPOSE GLOBAUDIT SINGLETON PIE {{$}}
 # GNU32-NEXT:  0x6ffffff0 (VERSYM)               0x1000
 # GNU32-NEXT:  0x6ffffffc (VERDEF)               0x1000
 # GNU32-NEXT:  0x6ffffffd (VERDEFNUM)            0
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 2c9a4b9c4900c..861149ab9ca71 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -2350,7 +2350,8 @@ static const EnumEntry<unsigned> ElfDynamicDTFlags1[] = {
   LLVM_READOBJ_DT_FLAG_ENT(DF_1, NORELOC),
   LLVM_READOBJ_DT_FLAG_ENT(DF_1, SYMINTPOSE),
   LLVM_READOBJ_DT_FLAG_ENT(DF_1, GLOBAUDIT),
-  LLVM_READOBJ_DT_FLAG_ENT(DF_1, SINGLETON)
+  LLVM_READOBJ_DT_FLAG_ENT(DF_1, SINGLETON),
+  LLVM_READOBJ_DT_FLAG_ENT(DF_1, PIE),
 };
 
 static const EnumEntry<unsigned> ElfDynamicDTMipsFlags[] = {

From e132a9c012bab1425e6021a818befd17a73e2b64 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Mon, 1 Jun 2020 09:07:17 -0700
Subject: [PATCH 722/770] Update some names in test. NFC.

There seems to be some instability with IR naming between
platforms. Attempted to fix it by replacing dot-numbered
names.
---
 llvm/test/Transforms/SROA/phi-gep.ll | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
index c808dfc4a5115..cdabd7a420336 100644
--- a/llvm/test/Transforms/SROA/phi-gep.ll
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -241,23 +241,23 @@ define i32 @test_sroa_gep_cast_phi_gep(i1 %cond) {
 ; CHECK-LABEL: @test_sroa_gep_cast_phi_gep(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST2:%.*]] = bitcast i32* [[A_SROA_0]] to float*
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_1_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
+; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST2:%.*]] = bitcast i32* [[A_SROA_0]] to float*
 ; CHECK-NEXT:    [[A_SROA_0_0_GEP_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
 ; CHECK-NEXT:    store i32 1065353216, i32* [[A_SROA_0]], align 4
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST2]], [[ENTRY]] ], [ [[GEP_FOR_2:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI_SROA_PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_2_SROA_GEP:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_CAST_TO_I32:%.*]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_SROA_PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_CAST_TO_I32_SROA_GEP:%.*]], [[FOR]] ]
 ; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
-; CHECK-NEXT:    [[GEP_FOR_1:%.*]] = bitcast float* [[PHI_SROA_PHI]] to i32*
-; CHECK-NEXT:    [[GEP_FOR_2]] = bitcast i32* [[GEP_FOR_1]] to float*
-; CHECK-NEXT:    [[GEP_FOR_2_SROA_GEP]] = getelementptr inbounds float, float* [[GEP_FOR_2]], i32 0
+; CHECK-NEXT:    [[GEP_FOR_CAST:%.*]] = bitcast float* [[PHI_SROA_PHI]] to i32*
+; CHECK-NEXT:    [[GEP_FOR_CAST_TO_I32]] = bitcast i32* [[GEP_FOR_CAST]] to float*
+; CHECK-NEXT:    [[GEP_FOR_CAST_TO_I32_SROA_GEP]] = getelementptr inbounds float, float* [[GEP_FOR_CAST_TO_I32]], i32 0
 ; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
 ; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PHI_END:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_1_SROA_CAST]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
+; CHECK-NEXT:    [[PHI_END:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST2]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
 ; CHECK-NEXT:    [[PHI_END_1:%.*]] = bitcast float* [[PHI_END]] to i32*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END_1]], align 4
 ; CHECK-NEXT:    ret i32 [[LOAD]]
@@ -265,22 +265,22 @@ define i32 @test_sroa_gep_cast_phi_gep(i1 %cond) {
 entry:
   %a = alloca %pair, align 4
   %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  %gep_a.1 = bitcast i32* %gep_a to float*
-  store float 1.0, float* %gep_a.1, align 4
+  %gep_a_cast_to_i32 = bitcast i32* %gep_a to float*
+  store float 1.0, float* %gep_a_cast_to_i32, align 4
   br i1 %cond, label %for, label %end
 
 for:
   %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
-  %phi = phi float* [ %gep_a.1, %entry], [ %gep_for.2, %for ]
+  %phi = phi float* [ %gep_a_cast_to_i32, %entry], [ %gep_for_cast_to_i32, %for ]
   %i = add i32 %phi_i, 1
   %gep_for = getelementptr inbounds float, float* %phi, i32 0
-  %gep_for.1 = bitcast float* %gep_for to i32*
-  %gep_for.2 = bitcast i32* %gep_for.1 to float*
+  %gep_for_cast = bitcast float* %gep_for to i32*
+  %gep_for_cast_to_i32 = bitcast i32* %gep_for_cast to float*
   %loop.cond = icmp ult i32 %i, 10
   br i1 %loop.cond, label %for, label %end
 
 end:
-  %phi_end = phi float* [ %gep_a.1, %entry], [ %phi, %for ]
+  %phi_end = phi float* [ %gep_a_cast_to_i32, %entry], [ %phi, %for ]
   %phi_end.1 = bitcast float* %phi_end to i32*
   %load = load i32, i32* %phi_end.1, align 4
   ret i32 %load

From ae6e499d258c24fde433d02793c327367c402d50 Mon Sep 17 00:00:00 2001
From: Eric Schweitz <eschweitz@nvidia.com>
Date: Wed, 27 May 2020 14:18:40 -0700
Subject: [PATCH 723/770] [flang] This adds the lowering stubs for Open MP.

The lowering bridge will call these lowering hooks to process the Open
MP directives that it iterates over in the PFT.  This is a mock
interface without an implementation in this patch.

Reviewed By: kiranchandramohan

Differential Revision: https://reviews.llvm.org/D80815
---
 flang/include/flang/Lower/AbstractConverter.h | 19 ++++++++++
 flang/include/flang/Lower/OpenMP.h            | 35 +++++++++++++++++++
 flang/lib/Lower/CMakeLists.txt                |  1 +
 flang/lib/Lower/OpenMP.cpp                    | 26 ++++++++++++++
 4 files changed, 81 insertions(+)
 create mode 100644 flang/include/flang/Lower/AbstractConverter.h
 create mode 100644 flang/include/flang/Lower/OpenMP.h
 create mode 100644 flang/lib/Lower/OpenMP.cpp

diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
new file mode 100644
index 0000000000000..0716b841fa1e7
--- /dev/null
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -0,0 +1,19 @@
+//===-- Lower/AbstractConverter.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_LOWER_ABSTRACTCONVERTER_H
+#define FORTRAN_LOWER_ABSTRACTCONVERTER_H
+
+namespace Fortran::lower {
+
+// temporary stub
+class AbstractConverter;
+
+} // namespace Fortran::lower
+
+#endif // FORTRAN_LOWER_ABSTRACTCONVERTER_H
diff --git a/flang/include/flang/Lower/OpenMP.h b/flang/include/flang/Lower/OpenMP.h
new file mode 100644
index 0000000000000..0b273a6aa7340
--- /dev/null
+++ b/flang/include/flang/Lower/OpenMP.h
@@ -0,0 +1,35 @@
+//===-- Lower/OpenMP.h -- lower Open MP directives --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_LOWER_OPENMP_H
+#define FORTRAN_LOWER_OPENMP_H
+
+namespace Fortran {
+namespace parser {
+struct OpenMPConstruct;
+struct OmpEndLoopDirective;
+} // namespace parser
+
+namespace lower {
+
+class AbstractConverter;
+
+namespace pft {
+struct Evaluation;
+} // namespace pft
+
+void genOpenMPConstruct(AbstractConverter &, pft::Evaluation &,
+                        const parser::OpenMPConstruct &);
+
+void genOpenMPEndLoop(AbstractConverter &, pft::Evaluation &,
+                      const parser::OmpEndLoopDirective &);
+
+} // namespace lower
+} // namespace Fortran
+
+#endif // FORTRAN_LOWER_OPENMP_H
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index 6cbcfc3b630e8..9e1dbc782db6a 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -1,5 +1,6 @@
 
 add_flang_library(FortranLower
+  OpenMP.cpp
   PFTBuilder.cpp
 
   LINK_COMPONENTS
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
new file mode 100644
index 0000000000000..ad75eff0adb38
--- /dev/null
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -0,0 +1,26 @@
+//===-- OpenMP.cpp -- Open MP directive lowering --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Lower/OpenMP.h"
+#include "flang/Lower/AbstractConverter.h"
+#include "flang/Lower/PFTBuilder.h"
+#include "flang/Parser/parse-tree.h"
+
+#define TODO() llvm_unreachable("not yet implemented")
+
+void Fortran::lower::genOpenMPConstruct(
+    Fortran::lower::AbstractConverter &, Fortran::lower::pft::Evaluation &,
+    const Fortran::parser::OpenMPConstruct &) {
+  TODO();
+}
+
+void Fortran::lower::genOpenMPEndLoop(
+    Fortran::lower::AbstractConverter &, Fortran::lower::pft::Evaluation &,
+    const Fortran::parser::OmpEndLoopDirective &) {
+  TODO();
+}

From ef1d4bec891b2121fffa68675e3792b2527a75ee Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Mon, 1 Jun 2020 09:21:51 -0700
Subject: [PATCH 724/770] [Clang][CGM] style cleanups NFC

Summary:
Forked from:
https://reviews.llvm.org/D80242

Use the getter for access to DebugInfo consistently.
Use break in switch in CodeGenModule::EmitTopLevelDecl consistently.

Reviewers: dblaikie

Reviewed By: dblaikie

Subscribers: cfe-commits, srhines

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80840
---
 clang/lib/CodeGen/CodeGenModule.cpp | 30 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 89a95db086804..096e0bddf39cd 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -527,7 +527,7 @@ void CodeGenModule::Release() {
                               "StrictVTablePointersRequirement",
                               llvm::MDNode::get(VMContext, Ops));
   }
-  if (DebugInfo)
+  if (getModuleDebugInfo())
     // We support a single version in the linked module. The LLVM
     // parser will drop debug info with a different version number
     // (and warn about it, too).
@@ -653,8 +653,8 @@ void CodeGenModule::Release() {
   if (getCodeGenOpts().EmitGcovArcs || getCodeGenOpts().EmitGcovNotes)
     EmitCoverageFile();
 
-  if (DebugInfo)
-    DebugInfo->finalize();
+  if (CGDebugInfo *DI = getModuleDebugInfo())
+    DI->finalize();
 
   if (getCodeGenOpts().EmitVersionIdentMetadata)
     EmitVersionIdentMetadata();
@@ -5364,17 +5364,17 @@ void CodeGenModule::EmitTopLevelDecl(Decl *D) {
     break;
   case Decl::ClassTemplateSpecialization: {
     const auto *Spec = cast<ClassTemplateSpecializationDecl>(D);
-    if (DebugInfo &&
-        Spec->getSpecializationKind() == TSK_ExplicitInstantiationDefinition &&
-        Spec->hasDefinition())
-      DebugInfo->completeTemplateDefinition(*Spec);
+    if (CGDebugInfo *DI = getModuleDebugInfo())
+      if (Spec->getSpecializationKind() ==
+              TSK_ExplicitInstantiationDefinition &&
+          Spec->hasDefinition())
+        DI->completeTemplateDefinition(*Spec);
   } LLVM_FALLTHROUGH;
   case Decl::CXXRecord:
-    if (DebugInfo) {
+    if (CGDebugInfo *DI = getModuleDebugInfo())
       if (auto *ES = D->getASTContext().getExternalSource())
         if (ES->hasExternalDefinitions(D) == ExternalASTSource::EK_Never)
-          DebugInfo->completeUnusedClass(cast<CXXRecordDecl>(*D));
-    }
+          DI->completeUnusedClass(cast<CXXRecordDecl>(*D));
     // Emit any static data members, they may be definitions.
     for (auto *I : cast<CXXRecordDecl>(D)->decls())
       if (isa<VarDecl>(I) || isa<CXXRecordDecl>(I))
@@ -5395,15 +5395,15 @@ void CodeGenModule::EmitTopLevelDecl(Decl *D) {
   case Decl::Using:          // using X; [C++]
     if (CGDebugInfo *DI = getModuleDebugInfo())
         DI->EmitUsingDecl(cast<UsingDecl>(*D));
-    return;
+    break;
   case Decl::NamespaceAlias:
     if (CGDebugInfo *DI = getModuleDebugInfo())
         DI->EmitNamespaceAlias(cast<NamespaceAliasDecl>(*D));
-    return;
+    break;
   case Decl::UsingDirective: // using namespace X; [C++]
     if (CGDebugInfo *DI = getModuleDebugInfo())
       DI->EmitUsingDirective(cast<UsingDirectiveDecl>(*D));
-    return;
+    break;
   case Decl::CXXConstructor:
     getCXXABI().EmitCXXConstructors(cast<CXXConstructorDecl>(D));
     break;
@@ -5586,10 +5586,10 @@ void CodeGenModule::AddDeferredUnusedCoverageMapping(Decl *D) {
   case Decl::CXXConstructor:
   case Decl::CXXDestructor: {
     if (!cast<FunctionDecl>(D)->doesThisDeclarationHaveABody())
-      return;
+      break;
     SourceManager &SM = getContext().getSourceManager();
     if (LimitedCoverage && SM.getMainFileID() != SM.getFileID(D->getBeginLoc()))
-      return;
+      break;
     auto I = DeferredEmptyCoverageMappingDecls.find(D);
     if (I == DeferredEmptyCoverageMappingDecls.end())
       DeferredEmptyCoverageMappingDecls[D] = true;

From 796898172c48a475f27f038e587c35dbba9ab7a6 Mon Sep 17 00:00:00 2001
From: Christopher Tetreault <ctetreau@quicinc.com>
Date: Mon, 1 Jun 2020 09:55:24 -0700
Subject: [PATCH 725/770] [SVE] Eliminate calls to default-false
 VectorType::get() from Clang

Reviewers: efriedma, david-arm, fpetrogalli, ddunbar, rjmccall

Reviewed By: fpetrogalli, rjmccall

Subscribers: tschuett, rkruppe, psnobl, dmgreen, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80323
---
 clang/lib/CodeGen/CGBuiltin.cpp        | 289 +++++++++++++------------
 clang/lib/CodeGen/CGExpr.cpp           |  13 +-
 clang/lib/CodeGen/CGExprScalar.cpp     |  12 +-
 clang/lib/CodeGen/CodeGenTypes.cpp     |   9 +-
 clang/lib/CodeGen/SwiftCallingConv.cpp |   7 +-
 clang/lib/CodeGen/TargetInfo.cpp       |  31 +--
 clang/utils/TableGen/MveEmitter.cpp    |   4 +-
 7 files changed, 190 insertions(+), 175 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b5129249c016d..cf8c8a1669d76 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4486,29 +4486,29 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
   switch (TypeFlags.getEltType()) {
   case NeonTypeFlags::Int8:
   case NeonTypeFlags::Poly8:
-    return llvm::VectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
   case NeonTypeFlags::Int16:
   case NeonTypeFlags::Poly16:
-    return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
   case NeonTypeFlags::Float16:
     if (HasLegalHalfType)
-      return llvm::VectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
+      return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
     else
-      return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
+      return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
   case NeonTypeFlags::Int32:
-    return llvm::VectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
   case NeonTypeFlags::Int64:
   case NeonTypeFlags::Poly64:
-    return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
   case NeonTypeFlags::Poly128:
     // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
     // There is a lot of i128 and f128 API missing.
     // so we use v16i8 to represent poly128 and get pattern matched.
-    return llvm::VectorType::get(CGF->Int8Ty, 16);
+    return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
   case NeonTypeFlags::Float32:
-    return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
   case NeonTypeFlags::Float64:
-    return llvm::VectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
   }
   llvm_unreachable("Unknown vector element type!");
 }
@@ -4518,11 +4518,11 @@ static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
   int IsQuad = IntTypeFlags.isQuad();
   switch (IntTypeFlags.getEltType()) {
   case NeonTypeFlags::Int16:
-    return llvm::VectorType::get(CGF->HalfTy, (4 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
   case NeonTypeFlags::Int32:
-    return llvm::VectorType::get(CGF->FloatTy, (2 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
   case NeonTypeFlags::Int64:
-    return llvm::VectorType::get(CGF->DoubleTy, (1 << IsQuad));
+    return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
   default:
     llvm_unreachable("Type can't be converted to floating-point!");
   }
@@ -5403,7 +5403,7 @@ Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
   if (Modifier & AddRetType) {
     llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
     if (Modifier & VectorizeRetType)
-      Ty = llvm::VectorType::get(
+      Ty = llvm::FixedVectorType::get(
           Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
 
     Tys.push_back(Ty);
@@ -5412,7 +5412,7 @@ Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
   // Arguments.
   if (Modifier & VectorizeArgTypes) {
     int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
-    ArgType = llvm::VectorType::get(ArgType, Elts);
+    ArgType = llvm::FixedVectorType::get(ArgType, Elts);
   }
 
   if (Modifier & (Add1ArgType | Add2ArgTypes))
@@ -5586,7 +5586,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
       Ty = HalfTy;
       break;
     }
-    llvm::Type *VecFlt = llvm::VectorType::get(Ty, VTy->getNumElements());
+    auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements());
     llvm::Type *Tys[] = { VTy, VecFlt };
     Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
     return EmitNeonCall(F, Ops, NameHint);
@@ -5846,8 +5846,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
     llvm::Type *EltTy =
       llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
-    llvm::Type *NarrowTy =
-      llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
+    auto *NarrowTy =
+        llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
     llvm::Type *Tys[2] = { Ty, NarrowTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
   }
@@ -5856,8 +5856,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
     // The source operand type has twice as many elements of half the size.
     unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
     llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
-    llvm::Type *NarrowTy =
-      llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
+    auto *NarrowTy =
+        llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
     llvm::Type *Tys[2] = { Ty, NarrowTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
   }
@@ -5876,8 +5876,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
     auto *RTy = cast<llvm::VectorType>(Ty);
     if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
         BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
-      RTy = llvm::VectorType::get(RTy->getElementType(),
-                                  RTy->getNumElements() * 2);
+      RTy = llvm::FixedVectorType::get(RTy->getElementType(),
+                                       RTy->getNumElements() * 2);
     llvm::Type *Tys[2] = {
         RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                              /*isQuad*/ false))};
@@ -6064,57 +6064,57 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
   }
   case NEON::BI__builtin_neon_vdot_v:
   case NEON::BI__builtin_neon_vdotq_v: {
-    llvm::Type *InputTy =
-        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    auto *InputTy =
+        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
     llvm::Type *Tys[2] = { Ty, InputTy };
     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
   }
   case NEON::BI__builtin_neon_vfmlal_low_v:
   case NEON::BI__builtin_neon_vfmlalq_low_v: {
-    llvm::Type *InputTy =
-        llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    auto *InputTy =
+        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
   }
   case NEON::BI__builtin_neon_vfmlsl_low_v:
   case NEON::BI__builtin_neon_vfmlslq_low_v: {
-    llvm::Type *InputTy =
-        llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    auto *InputTy =
+        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
   }
   case NEON::BI__builtin_neon_vfmlal_high_v:
   case NEON::BI__builtin_neon_vfmlalq_high_v: {
-    llvm::Type *InputTy =
-           llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    auto *InputTy =
+        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
   }
   case NEON::BI__builtin_neon_vfmlsl_high_v:
   case NEON::BI__builtin_neon_vfmlslq_high_v: {
-    llvm::Type *InputTy =
-           llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    auto *InputTy =
+        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
   }
   case NEON::BI__builtin_neon_vmmlaq_v: {
-    llvm::Type *InputTy =
-           llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    auto *InputTy =
+        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
     llvm::Type *Tys[2] = { Ty, InputTy };
     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla");
   }
   case NEON::BI__builtin_neon_vusmmlaq_v: {
-    llvm::Type *InputTy =
-           llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    auto *InputTy =
+        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
   }
   case NEON::BI__builtin_neon_vusdot_v:
   case NEON::BI__builtin_neon_vusdotq_v: {
-    llvm::Type *InputTy =
-           llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    auto *InputTy =
+        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
   }
@@ -7003,7 +7003,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
       Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
       Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
       // Load the value as a one-element vector.
-      Ty = llvm::VectorType::get(VTy->getElementType(), 1);
+      Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
       llvm::Type *Tys[] = {Ty, Int8PtrTy};
       Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
       Value *Align = getAlignmentValue32(PtrOp0);
@@ -7497,7 +7497,7 @@ static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID
 }
 
 Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
-  llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4);
+  auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
   Op = Builder.CreateBitCast(Op, Int16Ty);
   Value *V = UndefValue::get(VTy);
   llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
@@ -8867,7 +8867,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
   }
   case NEON::BI__builtin_neon_vpaddd_s64: {
-    llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2);
+    auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2);
     Value *Vec = EmitScalarExpr(E->getArg(0));
     // The vector is v2f64, so make sure it's bitcast to that.
     Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
@@ -8879,8 +8879,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateAdd(Op0, Op1, "vpaddd");
   }
   case NEON::BI__builtin_neon_vpaddd_f64: {
-    llvm::Type *Ty =
-      llvm::VectorType::get(DoubleTy, 2);
+    auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2);
     Value *Vec = EmitScalarExpr(E->getArg(0));
     // The vector is v2f64, so make sure it's bitcast to that.
     Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
@@ -8892,8 +8891,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
   }
   case NEON::BI__builtin_neon_vpadds_f32: {
-    llvm::Type *Ty =
-      llvm::VectorType::get(FloatTy, 2);
+    auto *Ty = llvm::FixedVectorType::get(FloatTy, 2);
     Value *Vec = EmitScalarExpr(E->getArg(0));
     // The vector is v2f32, so make sure it's bitcast to that.
     Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
@@ -9066,87 +9064,95 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
   case NEON::BI__builtin_neon_vset_lane_f64:
     // The vector type needs a cast for the v1f64 variant.
-    Ops[1] = Builder.CreateBitCast(Ops[1],
-                                   llvm::VectorType::get(DoubleTy, 1));
+    Ops[1] =
+        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1));
     Ops.push_back(EmitScalarExpr(E->getArg(2)));
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
   case NEON::BI__builtin_neon_vsetq_lane_f64:
     // The vector type needs a cast for the v2f64 variant.
-    Ops[1] = Builder.CreateBitCast(Ops[1],
-        llvm::VectorType::get(DoubleTy, 2));
+    Ops[1] =
+        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 2));
     Ops.push_back(EmitScalarExpr(E->getArg(2)));
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
 
   case NEON::BI__builtin_neon_vget_lane_i8:
   case NEON::BI__builtin_neon_vdupb_lane_i8:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int8Ty, 8));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 8));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
   case NEON::BI__builtin_neon_vgetq_lane_i8:
   case NEON::BI__builtin_neon_vdupb_laneq_i8:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int8Ty, 16));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
   case NEON::BI__builtin_neon_vget_lane_i16:
   case NEON::BI__builtin_neon_vduph_lane_i16:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int16Ty, 4));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 4));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
   case NEON::BI__builtin_neon_vgetq_lane_i16:
   case NEON::BI__builtin_neon_vduph_laneq_i16:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int16Ty, 8));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 8));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
   case NEON::BI__builtin_neon_vget_lane_i32:
   case NEON::BI__builtin_neon_vdups_lane_i32:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 2));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
   case NEON::BI__builtin_neon_vdups_lane_f32:
-    Ops[0] = Builder.CreateBitCast(Ops[0],
-        llvm::VectorType::get(FloatTy, 2));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vdups_lane");
   case NEON::BI__builtin_neon_vgetq_lane_i32:
   case NEON::BI__builtin_neon_vdups_laneq_i32:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
   case NEON::BI__builtin_neon_vget_lane_i64:
   case NEON::BI__builtin_neon_vdupd_lane_i64:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 1));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 1));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
   case NEON::BI__builtin_neon_vdupd_lane_f64:
-    Ops[0] = Builder.CreateBitCast(Ops[0],
-        llvm::VectorType::get(DoubleTy, 1));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vdupd_lane");
   case NEON::BI__builtin_neon_vgetq_lane_i64:
   case NEON::BI__builtin_neon_vdupd_laneq_i64:
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
   case NEON::BI__builtin_neon_vget_lane_f32:
-    Ops[0] = Builder.CreateBitCast(Ops[0],
-        llvm::VectorType::get(FloatTy, 2));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
   case NEON::BI__builtin_neon_vget_lane_f64:
-    Ops[0] = Builder.CreateBitCast(Ops[0],
-        llvm::VectorType::get(DoubleTy, 1));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
   case NEON::BI__builtin_neon_vgetq_lane_f32:
   case NEON::BI__builtin_neon_vdups_laneq_f32:
-    Ops[0] = Builder.CreateBitCast(Ops[0],
-        llvm::VectorType::get(FloatTy, 4));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 4));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
   case NEON::BI__builtin_neon_vgetq_lane_f64:
   case NEON::BI__builtin_neon_vdupd_laneq_f64:
-    Ops[0] = Builder.CreateBitCast(Ops[0],
-        llvm::VectorType::get(DoubleTy, 2));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
   case NEON::BI__builtin_neon_vaddh_f16:
@@ -9187,7 +9193,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     SmallVector<Value *, 2> ProductOps;
     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
     ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
-    llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
+    auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
                           ProductOps, "vqdmlXl");
     Constant *CI = ConstantInt::get(SizeTy, 0);
@@ -9284,7 +9290,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     SmallVector<Value *, 2> ProductOps;
     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
-    llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
+    auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
                           ProductOps, "vqdmlXl");
     Constant *CI = ConstantInt::get(SizeTy, 0);
@@ -9532,9 +9538,10 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Ops[2] = Addend;
 
     // Now adjust things to handle the lane access.
-    llvm::Type *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v ?
-      llvm::VectorType::get(VTy->getElementType(), VTy->getNumElements() / 2) :
-      VTy;
+    auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
+                         ? llvm::FixedVectorType::get(VTy->getElementType(),
+                                                      VTy->getNumElements() / 2)
+                         : VTy;
     llvm::Constant *cst = cast<Constant>(Ops[3]);
     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), cst);
     Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
@@ -9564,8 +9571,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
 
-    llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
-                                            VTy->getNumElements() * 2);
+    auto *STy = llvm::FixedVectorType::get(VTy->getElementType(),
+                                           VTy->getNumElements() * 2);
     Ops[2] = Builder.CreateBitCast(Ops[2], STy);
     Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(),
                                                cast<ConstantInt>(Ops[3]));
@@ -9636,8 +9643,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     unsigned ArgElts = VTy->getNumElements();
     llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
     unsigned BitWidth = EltTy->getBitWidth();
-    llvm::Type *ArgTy = llvm::VectorType::get(
-        llvm::IntegerType::get(getLLVMContext(), BitWidth/2), 2*ArgElts);
+    auto *ArgTy = llvm::FixedVectorType::get(
+        llvm::IntegerType::get(getLLVMContext(), BitWidth / 2), 2 * ArgElts);
     llvm::Type* Tys[2] = { VTy, ArgTy };
     Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
     SmallVector<llvm::Value*, 1> TmpOps;
@@ -9968,7 +9975,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddv_s8: {
     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
@@ -9980,7 +9987,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddv_s16: {
     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 4);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
@@ -9992,7 +9999,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddvq_s8: {
     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 16);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
@@ -10004,7 +10011,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddvq_s16: {
     Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
@@ -10013,7 +10020,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxv_u8: {
     Int = Intrinsic::aarch64_neon_umaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10022,7 +10029,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxv_u16: {
     Int = Intrinsic::aarch64_neon_umaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 4);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10031,7 +10038,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxvq_u8: {
     Int = Intrinsic::aarch64_neon_umaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 16);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10040,7 +10047,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxvq_u16: {
     Int = Intrinsic::aarch64_neon_umaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10049,7 +10056,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxv_s8: {
     Int = Intrinsic::aarch64_neon_smaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10058,7 +10065,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxv_s16: {
     Int = Intrinsic::aarch64_neon_smaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 4);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10067,7 +10074,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxvq_s8: {
     Int = Intrinsic::aarch64_neon_smaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 16);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10076,7 +10083,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxvq_s16: {
     Int = Intrinsic::aarch64_neon_smaxv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10085,7 +10092,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxv_f16: {
     Int = Intrinsic::aarch64_neon_fmaxv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 4);
+    VTy = llvm::FixedVectorType::get(HalfTy, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10094,7 +10101,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxvq_f16: {
     Int = Intrinsic::aarch64_neon_fmaxv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 8);
+    VTy = llvm::FixedVectorType::get(HalfTy, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
@@ -10103,7 +10110,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminv_u8: {
     Int = Intrinsic::aarch64_neon_uminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10112,7 +10119,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminv_u16: {
     Int = Intrinsic::aarch64_neon_uminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 4);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10121,7 +10128,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminvq_u8: {
     Int = Intrinsic::aarch64_neon_uminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 16);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10130,7 +10137,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminvq_u16: {
     Int = Intrinsic::aarch64_neon_uminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10139,7 +10146,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminv_s8: {
     Int = Intrinsic::aarch64_neon_sminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10148,7 +10155,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminv_s16: {
     Int = Intrinsic::aarch64_neon_sminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 4);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10157,7 +10164,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminvq_s8: {
     Int = Intrinsic::aarch64_neon_sminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 16);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10166,7 +10173,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminvq_s16: {
     Int = Intrinsic::aarch64_neon_sminv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10175,7 +10182,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminv_f16: {
     Int = Intrinsic::aarch64_neon_fminv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 4);
+    VTy = llvm::FixedVectorType::get(HalfTy, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10184,7 +10191,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminvq_f16: {
     Int = Intrinsic::aarch64_neon_fminv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 8);
+    VTy = llvm::FixedVectorType::get(HalfTy, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
@@ -10193,7 +10200,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxnmv_f16: {
     Int = Intrinsic::aarch64_neon_fmaxnmv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 4);
+    VTy = llvm::FixedVectorType::get(HalfTy, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
@@ -10202,7 +10209,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmaxnmvq_f16: {
     Int = Intrinsic::aarch64_neon_fmaxnmv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 8);
+    VTy = llvm::FixedVectorType::get(HalfTy, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
@@ -10211,7 +10218,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminnmv_f16: {
     Int = Intrinsic::aarch64_neon_fminnmv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 4);
+    VTy = llvm::FixedVectorType::get(HalfTy, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
@@ -10220,7 +10227,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vminnmvq_f16: {
     Int = Intrinsic::aarch64_neon_fminnmv;
     Ty = HalfTy;
-    VTy = llvm::VectorType::get(HalfTy, 8);
+    VTy = llvm::FixedVectorType::get(HalfTy, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
@@ -10234,7 +10241,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlv_u8: {
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10243,7 +10250,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlv_u16: {
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 4);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10251,7 +10258,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlvq_u8: {
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 16);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10260,7 +10267,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlvq_u16: {
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10268,7 +10275,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlv_s8: {
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10277,7 +10284,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlv_s16: {
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 4);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10285,7 +10292,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlvq_s8: {
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int8Ty, 16);
+    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10294,7 +10301,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vaddlvq_s16: {
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
-    VTy = llvm::VectorType::get(Int16Ty, 8);
+    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
     llvm::Type *Tys[2] = { Ty, VTy };
     Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
@@ -10754,8 +10761,8 @@ BuildVector(ArrayRef<llvm::Value*> Ops) {
   }
 
   // Otherwise, insertelement the values to build the vector.
-  Value *Result =
-    llvm::UndefValue::get(llvm::VectorType::get(Ops[0]->getType(), Ops.size()));
+  Value *Result = llvm::UndefValue::get(
+      llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));
 
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt32(i));
@@ -10767,8 +10774,9 @@ BuildVector(ArrayRef<llvm::Value*> Ops) {
 static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
                               unsigned NumElts) {
 
-  llvm::VectorType *MaskTy = llvm::VectorType::get(CGF.Builder.getInt1Ty(),
-                         cast<IntegerType>(Mask->getType())->getBitWidth());
+  auto *MaskTy = llvm::FixedVectorType::get(
+      CGF.Builder.getInt1Ty(),
+      cast<IntegerType>(Mask->getType())->getBitWidth());
   Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
 
   // If we have less than 8 elements, then the starting mask was an i8 and
@@ -10946,9 +10954,8 @@ static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
     if (C->isAllOnesValue())
       return Op0;
 
-  llvm::VectorType *MaskTy =
-    llvm::VectorType::get(CGF.Builder.getInt1Ty(),
-                          Mask->getType()->getIntegerBitWidth());
+  auto *MaskTy = llvm::FixedVectorType::get(
+      CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth());
   Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
   Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
   return CGF.Builder.CreateSelect(Mask, Op0, Op1);
@@ -10987,10 +10994,10 @@ static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
 
   if (CC == 3) {
     Cmp = Constant::getNullValue(
-                       llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
+        llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
   } else if (CC == 7) {
     Cmp = Constant::getAllOnesValue(
-                       llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts));
+        llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
   } else {
     ICmpInst::Predicate Pred;
     switch (CC) {
@@ -11204,8 +11211,8 @@ static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
                            ArrayRef<Value *> Ops) {
   llvm::Type *Ty = Ops[0]->getType();
   // Arguments have a vXi32 type so cast to vXi64.
-  Ty = llvm::VectorType::get(CGF.Int64Ty,
-                             Ty->getPrimitiveSizeInBits() / 64);
+  Ty = llvm::FixedVectorType::get(CGF.Int64Ty,
+                                  Ty->getPrimitiveSizeInBits() / 64);
   Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
   Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
 
@@ -11306,7 +11313,7 @@ static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
   }
 
   // Bitcast from vXi16 to vXf16.
-  llvm::Type *HalfTy = llvm::VectorType::get(
+  auto *HalfTy = llvm::FixedVectorType::get(
       llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
   Src = CGF.Builder.CreateBitCast(Src, HalfTy);
 
@@ -12571,7 +12578,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
       }
     }
 
-    llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts);
+    auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
     Value *Zero = llvm::Constant::getNullValue(VecTy);
     Value *SV = Builder.CreateShuffleVector(Zero, Cast,
@@ -12601,7 +12608,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
       }
     }
 
-    llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts);
+    auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
     Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
     Value *Zero = llvm::Constant::getNullValue(VecTy);
     Value *SV = Builder.CreateShuffleVector(Cast, Zero,
@@ -14079,11 +14086,13 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
 
     // Need to cast the second argument from a vector of unsigned int to a
     // vector of long long.
-    Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2));
+    Ops[1] =
+        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int64Ty, 2));
 
     if (getTarget().isLittleEndian()) {
       // Reverse the double words in the vector we will extract from.
-      Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
+      Ops[0] =
+          Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
       Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{1, 0});
 
       // Reverse the index.
@@ -14091,7 +14100,8 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     }
 
     // Intrinsic expects the first arg to be a vector of int.
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
     Ops[2] = ConstantInt::getSigned(Int32Ty, Index);
     return Builder.CreateCall(F, Ops);
   }
@@ -14100,7 +14110,8 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
 
     // Intrinsic expects the first argument to be a vector of doublewords.
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
 
     // The second argument is a compile time constant int that needs to
     // be clamped to the range [0, 12].
@@ -14132,8 +14143,10 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     assert(ArgCI && "Third arg must be constant integer!");
 
     unsigned Index = ArgCI->getZExtValue();
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
-    Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
+    Ops[1] =
+        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int64Ty, 2));
 
     // Account for endianness by treating this as just a shuffle. So we use the
     // same indices for both LE and BE in order to produce expected results in
@@ -14153,8 +14166,10 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]);
     assert(ArgCI && "Third argument must be a compile time constant");
     unsigned Index = ArgCI->getZExtValue() & 0x3;
-    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4));
-    Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int32Ty, 4));
+    Ops[0] =
+        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
+    Ops[1] =
+        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(Int32Ty, 4));
 
     // Create a shuffle mask
     int ElemIdx0;
@@ -14188,7 +14203,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   case PPC::BI__builtin_pack_vector_int128: {
     bool isLittleEndian = getTarget().isLittleEndian();
     Value *UndefValue =
-        llvm::UndefValue::get(llvm::VectorType::get(Ops[0]->getType(), 2));
+        llvm::UndefValue::get(llvm::FixedVectorType::get(Ops[0]->getType(), 2));
     Value *Res = Builder.CreateInsertElement(
         UndefValue, Ops[0], (uint64_t)(isLittleEndian ? 1 : 0));
     Res = Builder.CreateInsertElement(Res, Ops[1],
@@ -14199,7 +14214,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
   case PPC::BI__builtin_unpack_vector_int128: {
     ConstantInt *Index = cast<ConstantInt>(Ops[1]);
     Value *Unpacked = Builder.CreateBitCast(
-        Ops[0], llvm::VectorType::get(ConvertType(E->getType()), 2));
+        Ops[0], llvm::FixedVectorType::get(ConvertType(E->getType()), 2));
 
     if (getTarget().isLittleEndian())
       Index = ConstantInt::get(Index->getType(), 1 - Index->getZExtValue());
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index ff8e7c57c0542..5cf3055a44658 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -150,8 +150,8 @@ Address CodeGenFunction::CreateMemTemp(QualType Ty, CharUnits Align,
 
   if (Ty->isConstantMatrixType()) {
     auto *ArrayTy = cast<llvm::ArrayType>(Result.getType()->getElementType());
-    auto *VectorTy = llvm::VectorType::get(ArrayTy->getElementType(),
-                                           ArrayTy->getNumElements());
+    auto *VectorTy = llvm::FixedVectorType::get(ArrayTy->getElementType(),
+                                                ArrayTy->getNumElements());
 
     Result = Address(
         Builder.CreateBitCast(Result.getPointer(), VectorTy->getPointerTo()),
@@ -1678,8 +1678,7 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
       if (VTy->getNumElements() == 3) {
 
         // Bitcast to vec4 type.
-        llvm::VectorType *vec4Ty =
-            llvm::VectorType::get(VTy->getElementType(), 4);
+        auto *vec4Ty = llvm::FixedVectorType::get(VTy->getElementType(), 4);
         Address Cast = Builder.CreateElementBitCast(Addr, vec4Ty, "castToVec4");
         // Now load value.
         llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVec4");
@@ -1750,8 +1749,8 @@ static Address MaybeConvertMatrixAddress(Address Addr, CodeGenFunction &CGF,
   auto *ArrayTy = dyn_cast<llvm::ArrayType>(
       cast<llvm::PointerType>(Addr.getPointer()->getType())->getElementType());
   if (ArrayTy && IsVector) {
-    auto *VectorTy = llvm::VectorType::get(ArrayTy->getElementType(),
-                                           ArrayTy->getNumElements());
+    auto *VectorTy = llvm::FixedVectorType::get(ArrayTy->getElementType(),
+                                                ArrayTy->getNumElements());
 
     return Address(CGF.Builder.CreateElementBitCast(Addr, VectorTy));
   }
@@ -1795,7 +1794,7 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr,
         Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy),
                                             ArrayRef<int>{0, 1, 2, -1},
                                             "extractVec");
-        SrcTy = llvm::VectorType::get(VecTy->getElementType(), 4);
+        SrcTy = llvm::FixedVectorType::get(VecTy->getElementType(), 4);
       }
       if (Addr.getElementType() != SrcTy) {
         Addr = Builder.CreateElementBitCast(Addr, SrcTy, "storetmp");
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 84620b1f7d81d..2406cb3b973a6 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1668,8 +1668,8 @@ Value *ScalarExprEmitter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) {
     //   n = extract mask i
     //   x = extract val n
     //   newv = insert newv, x, i
-    llvm::VectorType *RTy = llvm::VectorType::get(LTy->getElementType(),
-                                                  MTy->getNumElements());
+    auto *RTy = llvm::FixedVectorType::get(LTy->getElementType(),
+                                           MTy->getNumElements());
     Value* NewV = llvm::UndefValue::get(RTy);
     for (unsigned i = 0, e = MTy->getNumElements(); i != e; ++i) {
       Value *IIndx = llvm::ConstantInt::get(CGF.SizeTy, i);
@@ -4458,10 +4458,8 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
 
     llvm::Value *zeroVec = llvm::Constant::getNullValue(vecTy);
     llvm::Value *TestMSB = Builder.CreateICmpSLT(CondV, zeroVec);
-    llvm::Value *tmp = Builder.CreateSExt(TestMSB,
-                                          llvm::VectorType::get(elemType,
-                                                                numElem),
-                                          "sext");
+    llvm::Value *tmp = Builder.CreateSExt(
+        TestMSB, llvm::FixedVectorType::get(elemType, numElem), "sext");
     llvm::Value *tmp2 = Builder.CreateNot(tmp);
 
     // Cast float to int to perform ANDs if necessary.
@@ -4680,7 +4678,7 @@ Value *ScalarExprEmitter::VisitAsTypeExpr(AsTypeExpr *E) {
   // get a vec3.
   if (NumElementsSrc != 3 && NumElementsDst == 3) {
     if (!CGF.CGM.getCodeGenOpts().PreserveVec3Type) {
-      auto Vec4Ty = llvm::VectorType::get(
+      auto *Vec4Ty = llvm::FixedVectorType::get(
           cast<llvm::VectorType>(DstTy)->getElementType(), 4);
       Src = createCastsForTypeOfSameSize(Builder, CGF.CGM.getDataLayout(), Src,
                                          Vec4Ty);
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 19b8ff3e8b3f1..a9ca5e2c10430 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -649,14 +649,15 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
   case Type::ExtVector:
   case Type::Vector: {
     const VectorType *VT = cast<VectorType>(Ty);
-    ResultType = llvm::VectorType::get(ConvertType(VT->getElementType()),
-                                       VT->getNumElements());
+    ResultType = llvm::FixedVectorType::get(ConvertType(VT->getElementType()),
+                                            VT->getNumElements());
     break;
   }
   case Type::ConstantMatrix: {
     const ConstantMatrixType *MT = cast<ConstantMatrixType>(Ty);
-    ResultType = llvm::VectorType::get(ConvertType(MT->getElementType()),
-                                       MT->getNumRows() * MT->getNumColumns());
+    ResultType =
+        llvm::FixedVectorType::get(ConvertType(MT->getElementType()),
+                                   MT->getNumRows() * MT->getNumColumns());
     break;
   }
   case Type::FunctionNoProto:
diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp
index 8bce93b71c0c2..3d7421ac2e16c 100644
--- a/clang/lib/CodeGen/SwiftCallingConv.cpp
+++ b/clang/lib/CodeGen/SwiftCallingConv.cpp
@@ -694,7 +694,7 @@ swiftcall::splitLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize,
   // Try to split the vector type in half.
   if (numElts >= 4 && isPowerOf2(numElts)) {
     if (isLegalVectorType(CGM, vectorSize / 2, eltTy, numElts / 2))
-      return {llvm::VectorType::get(eltTy, numElts / 2), 2};
+      return {llvm::FixedVectorType::get(eltTy, numElts / 2), 2};
   }
 
   return {eltTy, numElts};
@@ -747,7 +747,8 @@ void swiftcall::legalizeVectorType(CodeGenModule &CGM, CharUnits origVectorSize,
 
     // Add the right number of vectors of this size.
     auto numVecs = numElts >> logCandidateNumElts;
-    components.append(numVecs, llvm::VectorType::get(eltTy, candidateNumElts));
+    components.append(numVecs,
+                      llvm::FixedVectorType::get(eltTy, candidateNumElts));
     numElts -= (numVecs << logCandidateNumElts);
 
     if (numElts == 0) return;
@@ -757,7 +758,7 @@ void swiftcall::legalizeVectorType(CodeGenModule &CGM, CharUnits origVectorSize,
     // This only needs to be separately checked if it's not a power of 2.
     if (numElts > 2 && !isPowerOf2(numElts) &&
         isLegalVectorType(CGM, eltSize * numElts, eltTy, numElts)) {
-      components.push_back(llvm::VectorType::get(eltTy, numElts));
+      components.push_back(llvm::FixedVectorType::get(eltTy, numElts));
       return;
     }
 
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 2f564f3e860b9..028b522d42b7c 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -1478,8 +1478,8 @@ ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy,
       // registers and we need to make sure to pick a type the LLVM
       // backend will like.
       if (Size == 128)
-        return ABIArgInfo::getDirect(llvm::VectorType::get(
-                  llvm::Type::getInt64Ty(getVMContext()), 2));
+        return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
+            llvm::Type::getInt64Ty(getVMContext()), 2));
 
       // Always return in register if it fits in a general purpose
       // register, or if it is 64 bits and has a single element.
@@ -3122,8 +3122,8 @@ llvm::Type *X86_64ABIInfo::GetByteVectorType(QualType Ty) const {
         cast<llvm::VectorType>(IRType)->getElementType()->isIntegerTy(128)) {
       // Use a vXi64 vector.
       uint64_t Size = getContext().getTypeSize(Ty);
-      return llvm::VectorType::get(llvm::Type::getInt64Ty(getVMContext()),
-                                   Size / 64);
+      return llvm::FixedVectorType::get(llvm::Type::getInt64Ty(getVMContext()),
+                                        Size / 64);
     }
 
     return IRType;
@@ -3138,8 +3138,8 @@ llvm::Type *X86_64ABIInfo::GetByteVectorType(QualType Ty) const {
 
 
   // Return a LLVM IR vector type based on the size of 'Ty'.
-  return llvm::VectorType::get(llvm::Type::getDoubleTy(getVMContext()),
-                               Size / 64);
+  return llvm::FixedVectorType::get(llvm::Type::getDoubleTy(getVMContext()),
+                                    Size / 64);
 }
 
 /// BitsContainNoUserData - Return true if the specified [start,end) bit range
@@ -3273,7 +3273,8 @@ GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset,
   // case.
   if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()) &&
       ContainsFloatAtOffset(IRType, IROffset+4, getDataLayout()))
-    return llvm::VectorType::get(llvm::Type::getFloatTy(getVMContext()), 2);
+    return llvm::FixedVectorType::get(llvm::Type::getFloatTy(getVMContext()),
+                                      2);
 
   return llvm::Type::getDoubleTy(getVMContext());
 }
@@ -4140,8 +4141,8 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
 
       // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that.
       // Clang matches them for compatibility.
-      return ABIArgInfo::getDirect(
-          llvm::VectorType::get(llvm::Type::getInt64Ty(getVMContext()), 2));
+      return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
+          llvm::Type::getInt64Ty(getVMContext()), 2));
 
     default:
       break;
@@ -5478,13 +5479,13 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty) const {
       return ABIArgInfo::getDirect(ResType);
     }
     if (Size == 64) {
-      llvm::Type *ResType =
-          llvm::VectorType::get(llvm::Type::getInt32Ty(getVMContext()), 2);
+      auto *ResType =
+          llvm::FixedVectorType::get(llvm::Type::getInt32Ty(getVMContext()), 2);
       return ABIArgInfo::getDirect(ResType);
     }
     if (Size == 128) {
-      llvm::Type *ResType =
-          llvm::VectorType::get(llvm::Type::getInt32Ty(getVMContext()), 4);
+      auto *ResType =
+          llvm::FixedVectorType::get(llvm::Type::getInt32Ty(getVMContext()), 4);
       return ABIArgInfo::getDirect(ResType);
     }
     return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
@@ -6209,7 +6210,7 @@ ABIArgInfo ARMABIInfo::coerceIllegalVector(QualType Ty) const {
     return ABIArgInfo::getDirect(ResType);
   }
   if (Size == 64 || Size == 128) {
-    llvm::Type *ResType = llvm::VectorType::get(
+    auto *ResType = llvm::FixedVectorType::get(
         llvm::Type::getInt32Ty(getVMContext()), Size / 32);
     return ABIArgInfo::getDirect(ResType);
   }
@@ -6225,7 +6226,7 @@ ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty,
     // FP16 vectors should be converted to integer vectors
     if (!getTarget().hasLegalHalfType() && containsAnyFP16Vectors(Ty)) {
       uint64_t Size = getContext().getTypeSize(VT);
-      llvm::Type *NewVecTy = llvm::VectorType::get(
+      auto *NewVecTy = llvm::FixedVectorType::get(
           llvm::Type::getInt32Ty(getVMContext()), Size / 32);
       llvm::Type *Ty = llvm::ArrayType::get(NewVecTy, Members);
       return ABIArgInfo::getDirect(Ty, 0, nullptr, false);
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index 36aa7534aafab..e9ae08ac4c051 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -300,7 +300,7 @@ class VectorType : public CRegularNamedType {
     return Element->cNameBase() + "x" + utostr(Lanes);
   }
   std::string llvmName() const override {
-    return "llvm::VectorType::get(" + Element->llvmName() + ", " +
+    return "llvm::FixedVectorType::get(" + Element->llvmName() + ", " +
            utostr(Lanes) + ")";
   }
 
@@ -354,7 +354,7 @@ class PredicateType : public CRegularNamedType {
     // explanation.
     unsigned ModifiedLanes = (Lanes == 2 ? 4 : Lanes);
 
-    return "llvm::VectorType::get(Builder.getInt1Ty(), " +
+    return "llvm::FixedVectorType::get(Builder.getInt1Ty(), " +
            utostr(ModifiedLanes) + ")";
   }
 

From 7ad36491cadee1ed91e18215c30328a3d5c104f9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 30 May 2020 11:53:54 -0400
Subject: [PATCH 726/770] AMDGPU: Fix alignment for dynamic allocas

The alignment value also needs to be scaled by the wave size.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    | 9 ++++++---
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bbd3737d2ef0e..08effeea18126 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3126,9 +3126,12 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
 
   unsigned StackAlign = TFL->getStackAlignment();
   Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
-  if (Align > StackAlign)
-    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
-                       DAG.getConstant(-(uint64_t)Align, dl, VT));
+  if (Align > StackAlign) {
+    Tmp1 = DAG.getNode(
+      ISD::AND, dl, VT, Tmp1,
+      DAG.getConstant(-(uint64_t)Align << ST.getWavefrontSizeLog2(), dl, VT));
+  }
+
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);    // Output chain
   Tmp2 = DAG.getCALLSEQ_END(
       Chain, DAG.getIntPtrConstant(0, dl, true),
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 060d66ae84282..0cd60bd8203eb 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -95,7 +95,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_cbranch_scc1 BB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
-; GCN-NEXT:    s_andn2_b32 s6, s6, 63
+; GCN-NEXT:    s_and_b32 s6, s6, 0xfffff000
 ; GCN-NEXT:    s_lshl_b32 s7, s7, 2
 ; GCN-NEXT:    s_mov_b32 s32, s6
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
@@ -223,7 +223,7 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
 ; GCN-NEXT:    s_cbranch_execz BB3_2
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
-; GCN-NEXT:    s_andn2_b32 s6, s6, 63
+; GCN-NEXT:    s_and_b32 s6, s6, 0xfffff000
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN-NEXT:    v_mov_b32_e32 v6, 1

From 4e963299ee0f6f413f9d25bf7a53e6a90441f7b4 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Mon, 1 Jun 2020 10:18:04 -0700
Subject: [PATCH 727/770] Temporarily removed unstable test. NFC.

---
 llvm/test/Transforms/SROA/phi-gep.ll | 49 ----------------------------
 1 file changed, 49 deletions(-)

diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
index cdabd7a420336..2763c71d401bb 100644
--- a/llvm/test/Transforms/SROA/phi-gep.ll
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -237,55 +237,6 @@ end:
   ret i32 %load
 }
 
-define i32 @test_sroa_gep_cast_phi_gep(i1 %cond) {
-; CHECK-LABEL: @test_sroa_gep_cast_phi_gep(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST2:%.*]] = bitcast i32* [[A_SROA_0]] to float*
-; CHECK-NEXT:    [[A_SROA_0_0_GEP_SROA_CAST:%.*]] = bitcast i32* [[A_SROA_0]] to float*
-; CHECK-NEXT:    store i32 1065353216, i32* [[A_SROA_0]], align 4
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR:%.*]], label [[END:%.*]]
-; CHECK:       for:
-; CHECK-NEXT:    [[PHI_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_CAST_TO_I32:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI_SROA_PHI:%.*]] = phi float* [ [[A_SROA_0_0_GEP_SROA_CAST]], [[ENTRY]] ], [ [[GEP_FOR_CAST_TO_I32_SROA_GEP:%.*]], [[FOR]] ]
-; CHECK-NEXT:    [[I]] = add i32 [[PHI_I]], 1
-; CHECK-NEXT:    [[GEP_FOR_CAST:%.*]] = bitcast float* [[PHI_SROA_PHI]] to i32*
-; CHECK-NEXT:    [[GEP_FOR_CAST_TO_I32]] = bitcast i32* [[GEP_FOR_CAST]] to float*
-; CHECK-NEXT:    [[GEP_FOR_CAST_TO_I32_SROA_GEP]] = getelementptr inbounds float, float* [[GEP_FOR_CAST_TO_I32]], i32 0
-; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp ult i32 [[I]], 10
-; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[FOR]], label [[END]]
-; CHECK:       end:
-; CHECK-NEXT:    [[PHI_END:%.*]] = phi float* [ [[A_SROA_0_0_GEP_A_CAST_TO_I32_SROA_CAST2]], [[ENTRY]] ], [ [[PHI]], [[FOR]] ]
-; CHECK-NEXT:    [[PHI_END_1:%.*]] = bitcast float* [[PHI_END]] to i32*
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PHI_END_1]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
-;
-entry:
-  %a = alloca %pair, align 4
-  %gep_a = getelementptr inbounds %pair, %pair* %a, i32 0, i32 1
-  %gep_a_cast_to_i32 = bitcast i32* %gep_a to float*
-  store float 1.0, float* %gep_a_cast_to_i32, align 4
-  br i1 %cond, label %for, label %end
-
-for:
-  %phi_i = phi i32 [ 0, %entry ], [ %i, %for ]
-  %phi = phi float* [ %gep_a_cast_to_i32, %entry], [ %gep_for_cast_to_i32, %for ]
-  %i = add i32 %phi_i, 1
-  %gep_for = getelementptr inbounds float, float* %phi, i32 0
-  %gep_for_cast = bitcast float* %gep_for to i32*
-  %gep_for_cast_to_i32 = bitcast i32* %gep_for_cast to float*
-  %loop.cond = icmp ult i32 %i, 10
-  br i1 %loop.cond, label %for, label %end
-
-end:
-  %phi_end = phi float* [ %gep_a_cast_to_i32, %entry], [ %phi, %for ]
-  %phi_end.1 = bitcast float* %phi_end to i32*
-  %load = load i32, i32* %phi_end.1, align 4
-  ret i32 %load
-}
-
 define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality i32 (...)* @__gxx_personality_v0 {
 ; CHECK-LABEL: @test_sroa_invoke_phi_gep(
 ; CHECK-NEXT:  entry:

From ee9a251caf1d785798c3602d473c3d2d84180d50 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Mon, 1 Jun 2020 10:17:48 -0700
Subject: [PATCH 728/770] [ELF] Set DF_1_PIE for -pie

DF_1_PIE originated from Solaris (https://docs.oracle.com/cd/E36784_01/html/E36857/chapter6-42444.html ).
GNU ld since
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=5fe2850dd96483f176858fd75c098313d5b20bc2
sets the flag on non-Solaris platforms.

It can help distinguish a PIE from other ET_DYN files (e.g. shared objects), since a PIE is itself ET_DYN.
eu-classify from elfutils uses this to recognize PIE (https://sourceware.org/git/?p=elfutils.git;a=commit;h=3f489b5c7c78df6d52f8982f79c36e9a220e8951 )

glibc uses this flag to reject dlopen'ing a PIE (https://sourceware.org/bugzilla/show_bug.cgi?id=24323 )

Reviewed By: psmith

Differential Revision: https://reviews.llvm.org/D80872
---
 lld/ELF/SyntheticSections.cpp                 |   2 +
 lld/test/ELF/aarch64-feature-bti.s            |   8 +-
 .../ELF/aarch64-gnu-ifunc-nonpreemptable.s    |   6 +-
 lld/test/ELF/aarch64-ifunc-bti.s              |  12 +-
 lld/test/ELF/arm-pie-relative.s               |   4 +-
 lld/test/ELF/gnu-ifunc-dyntags.s              |   4 +-
 lld/test/ELF/i386-retpoline-pic.s             |   4 +-
 lld/test/ELF/local-got-pie.s                  |  11 +-
 lld/test/ELF/pack-dyn-relocs-arm2.s           |   4 +-
 lld/test/ELF/pack-dyn-relocs-relr-loop.s      |   6 +-
 lld/test/ELF/pack-dyn-relocs.s                | 296 +++++++++---------
 lld/test/ELF/pie.s                            |   7 +-
 lld/test/ELF/ppc32-call-stub-pic.s            |  16 +-
 lld/test/ELF/ppc32-ifunc-nonpreemptible-pic.s |   6 +-
 lld/test/ELF/ppc64-long-branch-pi.s           |  14 +-
 lld/test/ELF/relative-dynamic-reloc-pie.s     |   6 +-
 lld/test/ELF/riscv-gp.s                       |  10 +-
 lld/test/ELF/riscv-ifunc-nonpreemptible.s     |  12 +-
 lld/test/ELF/separate-segments.s              |  10 +-
 19 files changed, 221 insertions(+), 217 deletions(-)

diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 16f4d1e96fa5b..f6d66fff6d4b8 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1317,6 +1317,8 @@ template <class ELFT> void DynamicSection<ELFT>::finalizeContents() {
     dtFlags1 |= DF_1_NODELETE;
   if (config->zNodlopen)
     dtFlags1 |= DF_1_NOOPEN;
+  if (config->pie)
+    dtFlags1 |= DF_1_PIE;
   if (config->zNow) {
     dtFlags |= DF_BIND_NOW;
     dtFlags1 |= DF_1_NOW;
diff --git a/lld/test/ELF/aarch64-feature-bti.s b/lld/test/ELF/aarch64-feature-bti.s
index 4c24061462f77..f4fb701cc77c3 100644
--- a/lld/test/ELF/aarch64-feature-bti.s
+++ b/lld/test/ELF/aarch64-feature-bti.s
@@ -127,16 +127,16 @@
 # PIE-NEXT:    10350: bti    c
 # PIE-NEXT:           stp    x16, x30, [sp, #-16]!
 # PIE-NEXT:           adrp   x16, #131072
-# PIE-NEXT:           ldr    x17, [x16, #1160]
-# PIE-NEXT:           add    x16, x16, #1160
+# PIE-NEXT:           ldr    x17, [x16, #1176]
+# PIE-NEXT:           add    x16, x16, #1176
 # PIE-NEXT:           br     x17
 # PIE-NEXT:           nop
 # PIE-NEXT:           nop
 # PIE: 0000000000010370 <func2@plt>:
 # PIE-NEXT:    10370: bti    c
 # PIE-NEXT:           adrp   x16, #131072
-# PIE-NEXT:           ldr    x17, [x16, #1168]
-# PIE-NEXT:           add    x16, x16, #1168
+# PIE-NEXT:           ldr    x17, [x16, #1184]
+# PIE-NEXT:           add    x16, x16, #1184
 # PIE-NEXT:           br     x17
 # PIE-NEXT:           nop
 
diff --git a/lld/test/ELF/aarch64-gnu-ifunc-nonpreemptable.s b/lld/test/ELF/aarch64-gnu-ifunc-nonpreemptable.s
index 31b826c8b580a..9f8a59367b0ef 100644
--- a/lld/test/ELF/aarch64-gnu-ifunc-nonpreemptable.s
+++ b/lld/test/ELF/aarch64-gnu-ifunc-nonpreemptable.s
@@ -63,10 +63,10 @@ main:
 # PIE-EMPTY:
 # PIE-NEXT: <myfunc>:
 # PIE-NEXT:    10270: adrp    x16, #131072
-# PIE-NEXT:    10274: ldr     x17, [x16, #880]
-# PIE-NEXT:    10278: add     x16, x16, #880
+# PIE-NEXT:    10274: ldr     x17, [x16, #896]
+# PIE-NEXT:    10278: add     x16, x16, #896
 # PIE-NEXT:    1027c: br      x17
 
 # PIE-RELOC:      .rela.dyn {
-# PIE-RELOC-NEXT:   0x30370 R_AARCH64_IRELATIVE - 0x10260
+# PIE-RELOC-NEXT:   0x30380 R_AARCH64_IRELATIVE - 0x10260
 # PIE-RELOC-NEXT: }
diff --git a/lld/test/ELF/aarch64-ifunc-bti.s b/lld/test/ELF/aarch64-ifunc-bti.s
index a6434289691a2..ab6c6236d2a00 100644
--- a/lld/test/ELF/aarch64-ifunc-bti.s
+++ b/lld/test/ELF/aarch64-ifunc-bti.s
@@ -15,16 +15,16 @@
 # CHECK-NEXT:    10380:         bti     c
 # CHECK-NEXT:                   stp     x16, x30, [sp, #-16]!
 # CHECK-NEXT:                   adrp    x16, #131072
-# CHECK-NEXT:                   ldr     x17, [x16, #1272]
-# CHECK-NEXT:                   add     x16, x16, #1272
+# CHECK-NEXT:                   ldr     x17, [x16, #1288]
+# CHECK-NEXT:                   add     x16, x16, #1288
 # CHECK-NEXT:                   br      x17
 # CHECK-NEXT:                   nop
 # CHECK-NEXT:                   nop
 # CHECK: 00000000000103a0 <func1@plt>:
 # CHECK-NEXT:    103a0:         bti     c
 # CHECK-NEXT:                   adrp    x16, #131072
-# CHECK-NEXT:                   ldr     x17, [x16, #1280]
-# CHECK-NEXT:                   add     x16, x16, #1280
+# CHECK-NEXT:                   ldr     x17, [x16, #1296]
+# CHECK-NEXT:                   add     x16, x16, #1296
 # CHECK-NEXT:                   br      x17
 # CHECK-NEXT:                   nop
 # CHECK-EMPTY:
@@ -33,8 +33,8 @@
 # CHECK-NEXT: 00000000000103c0 <myfunc>:
 # CHECK-NEXT:    103c0:         bti     c
 # CHECK-NEXT:                   adrp    x16, #131072
-# CHECK-NEXT:                   ldr     x17, [x16, #1288]
-# CHECK-NEXT:                   add     x16, x16, #1288
+# CHECK-NEXT:                   ldr     x17, [x16, #1304]
+# CHECK-NEXT:                   add     x16, x16, #1304
 # CHECK-NEXT:                   br      x17
 # CHECK-NEXT:                   nop
 
diff --git a/lld/test/ELF/arm-pie-relative.s b/lld/test/ELF/arm-pie-relative.s
index e0758895a9dbe..20421f2d0c8e8 100644
--- a/lld/test/ELF/arm-pie-relative.s
+++ b/lld/test/ELF/arm-pie-relative.s
@@ -19,7 +19,7 @@ sym:
 
 // CHECK:      Relocations [
 // CHECK-NEXT:   Section (5) .rel.dyn {
-// CHECK-NEXT:     0x201DC R_ARM_RELATIVE
+// CHECK-NEXT:     0x201E4 R_ARM_RELATIVE
 
 // GOT:      section '.got':
-// GOT-NEXT: 0x000201dc e0010300
+// GOT-NEXT: 0x000201e4 e8010300
diff --git a/lld/test/ELF/gnu-ifunc-dyntags.s b/lld/test/ELF/gnu-ifunc-dyntags.s
index 9d1a0c5e23ad7..fd80dc24f2f8e 100644
--- a/lld/test/ELF/gnu-ifunc-dyntags.s
+++ b/lld/test/ELF/gnu-ifunc-dyntags.s
@@ -9,14 +9,14 @@
 
 # CHECK:  Name          Size   VMA
 # CHECK:  .rela.dyn   00000030 0000000000000248
-# CHECK:  .got.plt    00000010 00000000000033a0
+# CHECK:  .got.plt    00000010 00000000000033b0
 
 # TAGS:   Tag                Type                 Name/Value
 # TAGS:   0x0000000000000007 RELA                 0x248
 # TAGS:   0x0000000000000008 RELASZ               48 (bytes)
 # TAGS:   0x0000000000000017 JMPREL               0x0
 # TAGS:   0x0000000000000002 PLTRELSZ             0 (bytes)
-# TAGS:   0x0000000000000003 PLTGOT               0x33A0
+# TAGS:   0x0000000000000003 PLTGOT               0x33B0
 # TAGS:   0x0000000000000014 PLTREL               RELA
 
 # TAGS:      Relocations [
diff --git a/lld/test/ELF/i386-retpoline-pic.s b/lld/test/ELF/i386-retpoline-pic.s
index b6a01243c175e..293506d746284 100644
--- a/lld/test/ELF/i386-retpoline-pic.s
+++ b/lld/test/ELF/i386-retpoline-pic.s
@@ -7,8 +7,8 @@
 // RUN: llvm-objdump -d -s --no-show-raw-insn %t.exe | FileCheck %s
 
 // CHECK:      Contents of section .got.plt:
-// CHECK-NEXT: 32a8 40220000 00000000 00000000 11120000
-// CHECK-NEXT: 32b8 31120000
+// CHECK-NEXT: 32b0 40220000 00000000 00000000 11120000
+// CHECK-NEXT: 32c0 31120000
 
 // CHECK:      Disassembly of section .plt:
 // CHECK-EMPTY:
diff --git a/lld/test/ELF/local-got-pie.s b/lld/test/ELF/local-got-pie.s
index 41c33c2513b5b..b9c72076a15ce 100644
--- a/lld/test/ELF/local-got-pie.s
+++ b/lld/test/ELF/local-got-pie.s
@@ -2,7 +2,7 @@
 // RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
 // RUN: ld.lld --hash-style=sysv %t.o -o %t -pie
 // RUN: llvm-readobj -S -d -r %t | FileCheck %s
-// RUN: llvm-objdump -d %t | FileCheck --check-prefix=DISASM %s
+// RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=DISASM %s
 
 .globl _start
 _start:
@@ -13,13 +13,12 @@ _start:
 foo:
  nop
 
-// 0x22C8 - 0x1210 - 5 = 4275
 // DISASM:      Disassembly of section .text:
 // DISASM-EMPTY:
 // DISASM-NEXT: <_start>:
-// DISASM-NEXT:   1210: {{.*}} callq 0x22c8
+// DISASM-NEXT:   1210: callq 0x22d8
 // DISASM:      <foo>:
-// DISASM-NEXT:   1215: {{.*}} nop
+// DISASM-NEXT:   1215: nop
 
 // CHECK:      Name: .got
 // CHECK-NEXT: Type: SHT_PROGBITS
@@ -27,7 +26,7 @@ foo:
 // CHECK-NEXT:   SHF_ALLOC
 // CHECK-NEXT:   SHF_WRITE
 // CHECK-NEXT: ]
-// CHECK-NEXT: Address: 0x22C8
+// CHECK-NEXT: Address: 0x22D8
 // CHECK-NEXT: Offset:
 // CHECK-NEXT: Size: 8
 
@@ -35,6 +34,6 @@ foo:
 
 // CHECK:      Relocations [
 // CHECK-NEXT:   Section ({{.*}}) .rela.dyn {
-// CHECK-NEXT:     0x22C8 R_X86_64_RELATIVE - 0x1215
+// CHECK-NEXT:     0x22D8 R_X86_64_RELATIVE - 0x1215
 // CHECK-NEXT:   }
 // CHECK-NEXT: ]
diff --git a/lld/test/ELF/pack-dyn-relocs-arm2.s b/lld/test/ELF/pack-dyn-relocs-arm2.s
index dbe6d1aca21ca..b2aa8a5f0add4 100644
--- a/lld/test/ELF/pack-dyn-relocs-arm2.s
+++ b/lld/test/ELF/pack-dyn-relocs-arm2.s
@@ -8,8 +8,6 @@
 // RUN: llvm-readobj -r %t.exe | FileCheck %s
 
 // CHECK:      Section (5) .relr.dyn {
-// CHECK-NEXT:   0x301E0 R_ARM_RELATIVE - 0x0
-// CHECK-NEXT:   0x301E4 R_ARM_RELATIVE - 0x0
 // CHECK-NEXT:   0x301E8 R_ARM_RELATIVE - 0x0
 // CHECK-NEXT:   0x301EC R_ARM_RELATIVE - 0x0
 // CHECK-NEXT:   0x301F0 R_ARM_RELATIVE - 0x0
@@ -42,6 +40,8 @@
 // CHECK-NEXT:   0x3025C R_ARM_RELATIVE - 0x0
 // CHECK-NEXT:   0x30260 R_ARM_RELATIVE - 0x0
 // CHECK-NEXT:   0x30264 R_ARM_RELATIVE - 0x0
+// CHECK-NEXT:   0x30268 R_ARM_RELATIVE - 0x0
+// CHECK-NEXT:   0x3026C R_ARM_RELATIVE - 0x0
 // CHECK-NEXT: }
 
 // RUN: llvm-readobj -S --dynamic-table %t.exe | FileCheck --check-prefix=HEADER %s
diff --git a/lld/test/ELF/pack-dyn-relocs-relr-loop.s b/lld/test/ELF/pack-dyn-relocs-relr-loop.s
index 3fd524a65dc23..883b8a3339a66 100644
--- a/lld/test/ELF/pack-dyn-relocs-relr-loop.s
+++ b/lld/test/ELF/pack-dyn-relocs-relr-loop.s
@@ -12,14 +12,14 @@
 # CHECK: .relr.dyn needs 1 padding word(s)
 
 # RELR:      .relr.dyn {
-# RELR-NEXT:   0x2F40 R_AARCH64_RELATIVE - 0x0
-# RELR-NEXT:   0x2F48 R_AARCH64_RELATIVE - 0x0
+# RELR-NEXT:   0x2F30 R_AARCH64_RELATIVE - 0x0
+# RELR-NEXT:   0x2F38 R_AARCH64_RELATIVE - 0x0
 # RELR-NEXT:   0x3000 R_AARCH64_RELATIVE - 0x0
 # RELR-NEXT: }
 
 .section .data.rel.ro
 .align 3
-.space 0xce0
+.space 0xcd0
 foo:
 ## Encoded by the first word of .relr.dyn
 .quad foo
diff --git a/lld/test/ELF/pack-dyn-relocs.s b/lld/test/ELF/pack-dyn-relocs.s
index 6c78a91c9d61c..5b4d91070f5ec 100644
--- a/lld/test/ELF/pack-dyn-relocs.s
+++ b/lld/test/ELF/pack-dyn-relocs.s
@@ -8,25 +8,23 @@
 
 /// Unpacked should have the relative relocations in their natural order.
 /// UNPACKED32:          Section ({{.+}}) .rel.dyn {
-// UNPACKED32-NEXT:     0x3031C R_ARM_RELATIVE - 0x0
-// UNPACKED32-NEXT:     0x30320 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30324 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30328 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x3032C R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30330 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30334 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30338 R_ARM_RELATIVE - 0x0
-
+// UNPACKED32-NEXT:     0x3033C R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30340 R_ARM_RELATIVE - 0x0
-// UNPACKED32-NEXT:     0x30344 R_ARM_RELATIVE - 0x0
+
 // UNPACKED32-NEXT:     0x30348 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x3034C R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30350 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30354 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30358 R_ARM_RELATIVE - 0x0
+// UNPACKED32-NEXT:     0x3035C R_ARM_RELATIVE - 0x0
+// UNPACKED32-NEXT:     0x30360 R_ARM_RELATIVE - 0x0
 
-// UNPACKED32-NEXT:     0x30364 R_ARM_RELATIVE - 0x0
-// UNPACKED32-NEXT:     0x30368 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x3036C R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30370 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30374 R_ARM_RELATIVE - 0x0
@@ -34,16 +32,18 @@
 // UNPACKED32-NEXT:     0x3037C R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30380 R_ARM_RELATIVE - 0x0
 // UNPACKED32-NEXT:     0x30384 R_ARM_RELATIVE - 0x0
-// UNPACKED32-NEXT:     0x30389 R_ARM_RELATIVE - 0x0
+// UNPACKED32-NEXT:     0x30388 R_ARM_RELATIVE - 0x0
+// UNPACKED32-NEXT:     0x3038C R_ARM_RELATIVE - 0x0
+// UNPACKED32-NEXT:     0x30391 R_ARM_RELATIVE - 0x0
 
-// UNPACKED32-NEXT:     0x3033C R_ARM_ABS32 bar2 0x0
-// UNPACKED32-NEXT:     0x30360 R_ARM_ABS32 bar2 0x0
-// UNPACKED32-NEXT:     0x3038D R_ARM_ABS32 bar2 0x0
-// UNPACKED32-NEXT:     0x30391 R_ARM_ABS32 bar2 0x0
+// UNPACKED32-NEXT:     0x30344 R_ARM_ABS32 bar2 0x0
+// UNPACKED32-NEXT:     0x30368 R_ARM_ABS32 bar2 0x0
 // UNPACKED32-NEXT:     0x30395 R_ARM_ABS32 bar2 0x0
 // UNPACKED32-NEXT:     0x30399 R_ARM_ABS32 bar2 0x0
 // UNPACKED32-NEXT:     0x3039D R_ARM_ABS32 bar2 0x0
-// UNPACKED32-NEXT:     0x3035C R_ARM_ABS32 zed2 0x0
+// UNPACKED32-NEXT:     0x303A1 R_ARM_ABS32 bar2 0x0
+// UNPACKED32-NEXT:     0x303A5 R_ARM_ABS32 bar2 0x0
+// UNPACKED32-NEXT:     0x30364 R_ARM_ABS32 zed2 0x0
 // UNPACKED32-NEXT:     }
 
 // RUN: ld.lld -pie --pack-dyn-relocs=android %t.a32.o %t.a32.so -o %t3.a32
@@ -73,42 +73,42 @@
 /// by the larger groups of relative relocations (i.e. the 8 and 9 followed
 /// by the 7.)
 // ANDROID32:          Section ({{.+}}) .rel.dyn {
-// ANDROID32-NEXT:     0x3024C R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30250 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30254 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30258 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x3025C R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30260 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30264 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30268 R_ARM_RELATIVE - 0
-
-// ANDROID32-NEXT:     0x30294 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30298 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x3029C R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x302A0 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x302A4 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x302A8 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x302AC R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x302B0 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x302B4 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30270 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30274 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30278 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x3027C R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30280 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30284 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x30288 R_ARM_RELATIVE - 0
-// ANDROID32-NEXT:     0x302B9 R_ARM_RELATIVE - 0
-
-// ANDROID32-NEXT:     0x3026C R_ARM_ABS32 bar2 0
-// ANDROID32-NEXT:     0x30290 R_ARM_ABS32 bar2 0
-// ANDROID32-NEXT:     0x302BD R_ARM_ABS32 bar2 0
-// ANDROID32-NEXT:     0x302C1 R_ARM_ABS32 bar2 0
-// ANDROID32-NEXT:     0x302C5 R_ARM_ABS32 bar2 0
-// ANDROID32-NEXT:     0x302C9 R_ARM_ABS32 bar2 0
-// ANDROID32-NEXT:     0x302CD R_ARM_ABS32 bar2 0
-
-// ANDROID32-NEXT:     0x3028C R_ARM_ABS32 zed2 0
+// ANDROID32-NEXT:     0x30254 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30258 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x3025C R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30260 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30264 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30268 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x3026C R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30270 R_ARM_RELATIVE - 0x0
+
+// ANDROID32-NEXT:     0x3029C R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302A0 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302A4 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302A8 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302AC R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302B0 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302B4 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302B8 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302BC R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30278 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x3027C R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30280 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30284 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30288 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x3028C R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x30290 R_ARM_RELATIVE - 0x0
+// ANDROID32-NEXT:     0x302C1 R_ARM_RELATIVE - 0x0
+
+// ANDROID32-NEXT:     0x30274 R_ARM_ABS32 bar2 0x0
+// ANDROID32-NEXT:     0x30298 R_ARM_ABS32 bar2 0x0
+// ANDROID32-NEXT:     0x302C5 R_ARM_ABS32 bar2 0x0
+// ANDROID32-NEXT:     0x302C9 R_ARM_ABS32 bar2 0x0
+// ANDROID32-NEXT:     0x302CD R_ARM_ABS32 bar2 0x0
+// ANDROID32-NEXT:     0x302D1 R_ARM_ABS32 bar2 0x0
+// ANDROID32-NEXT:     0x302D5 R_ARM_ABS32 bar2 0x0
+
+// ANDROID32-NEXT:     0x30294 R_ARM_ABS32 zed2 0x0
 // ANDROID32-NEXT:     }
 
 // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.a32.o %t.a32.so -o %t4.a32
@@ -139,7 +139,7 @@
 /// SHT_RELR section contains address/bitmap entries
 /// encoding the offsets for relative relocation.
 // RAW-RELR32:           Section ({{.+}}) .relr.dyn {
-// RAW-RELR32-NEXT:      0x3027C
+// RAW-RELR32-NEXT:      0x30284
 // RAW-RELR32-NEXT:      0x7FCFEFF
 // RAW-RELR32-NEXT:      }
 
@@ -147,36 +147,34 @@
 /// but contains only the relative relocations.
 /// Any relative relocations with odd offset stay in SHT_REL.
 // RELR32:               Section ({{.+}}) .rel.dyn {
-// RELR32-NEXT:          0x302E9 R_ARM_RELATIVE - 0x0
-// RELR32-NEXT:          0x3029C R_ARM_ABS32 bar2 0x0
-// RELR32-NEXT:          0x302C0 R_ARM_ABS32 bar2 0x0
-// RELR32-NEXT:          0x302ED R_ARM_ABS32 bar2 0x0
-// RELR32-NEXT:          0x302F1 R_ARM_ABS32 bar2 0x0
+// RELR32-NEXT:          0x302F1 R_ARM_RELATIVE - 0x0
+// RELR32-NEXT:          0x302A4 R_ARM_ABS32 bar2 0x0
+// RELR32-NEXT:          0x302C8 R_ARM_ABS32 bar2 0x0
 // RELR32-NEXT:          0x302F5 R_ARM_ABS32 bar2 0x0
 // RELR32-NEXT:          0x302F9 R_ARM_ABS32 bar2 0x0
 // RELR32-NEXT:          0x302FD R_ARM_ABS32 bar2 0x0
-// RELR32-NEXT:          0x302BC R_ARM_ABS32 zed2 0x0
+// RELR32-NEXT:          0x30301 R_ARM_ABS32 bar2 0x0
+// RELR32-NEXT:          0x30305 R_ARM_ABS32 bar2 0x0
+// RELR32-NEXT:          0x302C4 R_ARM_ABS32 zed2 0x0
 // RELR32-NEXT:          }
 // RELR32-NEXT:          Section ({{.+}}) .relr.dyn {
-// RELR32-NEXT:          0x3027C R_ARM_RELATIVE - 0x0
-// RELR32-NEXT:          0x30280 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x30284 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x30288 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x3028C R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x30290 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x30294 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x30298 R_ARM_RELATIVE - 0x0
-
+// RELR32-NEXT:          0x3029C R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302A0 R_ARM_RELATIVE - 0x0
-// RELR32-NEXT:          0x302A4 R_ARM_RELATIVE - 0x0
+
 // RELR32-NEXT:          0x302A8 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302AC R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302B0 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302B4 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302B8 R_ARM_RELATIVE - 0x0
+// RELR32-NEXT:          0x302BC R_ARM_RELATIVE - 0x0
+// RELR32-NEXT:          0x302C0 R_ARM_RELATIVE - 0x0
 
-// RELR32-NEXT:          0x302C4 R_ARM_RELATIVE - 0x0
-// RELR32-NEXT:          0x302C8 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302CC R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302D0 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302D4 R_ARM_RELATIVE - 0x0
@@ -184,6 +182,8 @@
 // RELR32-NEXT:          0x302DC R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302E0 R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          0x302E4 R_ARM_RELATIVE - 0x0
+// RELR32-NEXT:          0x302E8 R_ARM_RELATIVE - 0x0
+// RELR32-NEXT:          0x302EC R_ARM_RELATIVE - 0x0
 // RELR32-NEXT:          }
 
 // RUN: llvm-mc -filetype=obj -triple=aarch64-unknown-linux %p/Inputs/shared2.s -o %t.a64.so.o
@@ -193,42 +193,42 @@
 // RUN: llvm-readobj -r %t2.a64 | FileCheck --check-prefix=UNPACKED64 %s
 
 // UNPACKED64:          Section ({{.+}}) .rela.dyn {
-// UNPACKED64-NEXT:     0x30680 R_AARCH64_RELATIVE - 0x1
-// UNPACKED64-NEXT:     0x30688 R_AARCH64_RELATIVE - 0x2
-// UNPACKED64-NEXT:     0x30690 R_AARCH64_RELATIVE - 0x3
-// UNPACKED64-NEXT:     0x30698 R_AARCH64_RELATIVE - 0x4
-// UNPACKED64-NEXT:     0x306A0 R_AARCH64_RELATIVE - 0x5
-// UNPACKED64-NEXT:     0x306A8 R_AARCH64_RELATIVE - 0x6
-// UNPACKED64-NEXT:     0x306B0 R_AARCH64_RELATIVE - 0x7
-// UNPACKED64-NEXT:     0x306B8 R_AARCH64_RELATIVE - 0x8
-
-// UNPACKED64-NEXT:     0x306C8 R_AARCH64_RELATIVE - 0x1
-// UNPACKED64-NEXT:     0x306D0 R_AARCH64_RELATIVE - 0x2
-// UNPACKED64-NEXT:     0x306D8 R_AARCH64_RELATIVE - 0x3
-// UNPACKED64-NEXT:     0x306E0 R_AARCH64_RELATIVE - 0x4
-// UNPACKED64-NEXT:     0x306E8 R_AARCH64_RELATIVE - 0x5
-// UNPACKED64-NEXT:     0x306F0 R_AARCH64_RELATIVE - 0x6
-// UNPACKED64-NEXT:     0x306F8 R_AARCH64_RELATIVE - 0x7
-
-// UNPACKED64-NEXT:     0x30710 R_AARCH64_RELATIVE - 0x1
-// UNPACKED64-NEXT:     0x30718 R_AARCH64_RELATIVE - 0x2
-// UNPACKED64-NEXT:     0x30720 R_AARCH64_RELATIVE - 0x3
-// UNPACKED64-NEXT:     0x30728 R_AARCH64_RELATIVE - 0x4
-// UNPACKED64-NEXT:     0x30730 R_AARCH64_RELATIVE - 0x5
-// UNPACKED64-NEXT:     0x30738 R_AARCH64_RELATIVE - 0x6
-// UNPACKED64-NEXT:     0x30740 R_AARCH64_RELATIVE - 0x7
-// UNPACKED64-NEXT:     0x30748 R_AARCH64_RELATIVE - 0x8
-// UNPACKED64-NEXT:     0x30750 R_AARCH64_RELATIVE - 0x9
-
-// UNPACKED64-NEXT:     0x30759 R_AARCH64_RELATIVE - 0xA
-// UNPACKED64-NEXT:     0x306C0 R_AARCH64_ABS64 bar2 0x1
-// UNPACKED64-NEXT:     0x30708 R_AARCH64_ABS64 bar2 0x0
-// UNPACKED64-NEXT:     0x30761 R_AARCH64_ABS64 bar2 0x0
-// UNPACKED64-NEXT:     0x30769 R_AARCH64_ABS64 bar2 0x0
-// UNPACKED64-NEXT:     0x30771 R_AARCH64_ABS64 bar2 0x1
-// UNPACKED64-NEXT:     0x30779 R_AARCH64_ABS64 bar2 0x1
-// UNPACKED64-NEXT:     0x30781 R_AARCH64_ABS64 bar2 0x0
-// UNPACKED64-NEXT:     0x30700 R_AARCH64_ABS64 zed2 0x0
+// UNPACKED64-NEXT:     0x30690 R_AARCH64_RELATIVE - 0x1
+// UNPACKED64-NEXT:     0x30698 R_AARCH64_RELATIVE - 0x2
+// UNPACKED64-NEXT:     0x306A0 R_AARCH64_RELATIVE - 0x3
+// UNPACKED64-NEXT:     0x306A8 R_AARCH64_RELATIVE - 0x4
+// UNPACKED64-NEXT:     0x306B0 R_AARCH64_RELATIVE - 0x5
+// UNPACKED64-NEXT:     0x306B8 R_AARCH64_RELATIVE - 0x6
+// UNPACKED64-NEXT:     0x306C0 R_AARCH64_RELATIVE - 0x7
+// UNPACKED64-NEXT:     0x306C8 R_AARCH64_RELATIVE - 0x8
+
+// UNPACKED64-NEXT:     0x306D8 R_AARCH64_RELATIVE - 0x1
+// UNPACKED64-NEXT:     0x306E0 R_AARCH64_RELATIVE - 0x2
+// UNPACKED64-NEXT:     0x306E8 R_AARCH64_RELATIVE - 0x3
+// UNPACKED64-NEXT:     0x306F0 R_AARCH64_RELATIVE - 0x4
+// UNPACKED64-NEXT:     0x306F8 R_AARCH64_RELATIVE - 0x5
+// UNPACKED64-NEXT:     0x30700 R_AARCH64_RELATIVE - 0x6
+// UNPACKED64-NEXT:     0x30708 R_AARCH64_RELATIVE - 0x7
+
+// UNPACKED64-NEXT:     0x30720 R_AARCH64_RELATIVE - 0x1
+// UNPACKED64-NEXT:     0x30728 R_AARCH64_RELATIVE - 0x2
+// UNPACKED64-NEXT:     0x30730 R_AARCH64_RELATIVE - 0x3
+// UNPACKED64-NEXT:     0x30738 R_AARCH64_RELATIVE - 0x4
+// UNPACKED64-NEXT:     0x30740 R_AARCH64_RELATIVE - 0x5
+// UNPACKED64-NEXT:     0x30748 R_AARCH64_RELATIVE - 0x6
+// UNPACKED64-NEXT:     0x30750 R_AARCH64_RELATIVE - 0x7
+// UNPACKED64-NEXT:     0x30758 R_AARCH64_RELATIVE - 0x8
+// UNPACKED64-NEXT:     0x30760 R_AARCH64_RELATIVE - 0x9
+
+// UNPACKED64-NEXT:     0x30769 R_AARCH64_RELATIVE - 0xA
+// UNPACKED64-NEXT:     0x306D0 R_AARCH64_ABS64 bar2 0x1
+// UNPACKED64-NEXT:     0x30718 R_AARCH64_ABS64 bar2 0x0
+// UNPACKED64-NEXT:     0x30771 R_AARCH64_ABS64 bar2 0x0
+// UNPACKED64-NEXT:     0x30779 R_AARCH64_ABS64 bar2 0x0
+// UNPACKED64-NEXT:     0x30781 R_AARCH64_ABS64 bar2 0x1
+// UNPACKED64-NEXT:     0x30789 R_AARCH64_ABS64 bar2 0x1
+// UNPACKED64-NEXT:     0x30791 R_AARCH64_ABS64 bar2 0x0
+// UNPACKED64-NEXT:     0x30710 R_AARCH64_ABS64 zed2 0x0
 // UNPACKED64-NEXT:     }
 
 // RUN: ld.lld -pie --pack-dyn-relocs=android %t.a64.o %t.a64.so -o %t3.a64
@@ -255,42 +255,42 @@
 // ANDROID64-HEADERS: 0x0000000060000012 ANDROID_RELASZ        [[SIZE]]
 
 // ANDROID64:          Section ({{.+}}) .rela.dyn {
-// ANDROID64-NEXT:     0x303E0 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x303E8 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x303F0 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x303F8 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30400 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30408 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x30410 R_AARCH64_RELATIVE - 0x7
-// ANDROID64-NEXT:     0x30418 R_AARCH64_RELATIVE - 0x8
-
-// ANDROID64-NEXT:     0x30470 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x30478 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x30480 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x30488 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30490 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30498 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x304A0 R_AARCH64_RELATIVE - 0x7
-// ANDROID64-NEXT:     0x304A8 R_AARCH64_RELATIVE - 0x8
-// ANDROID64-NEXT:     0x304B0 R_AARCH64_RELATIVE - 0x9
-
-// ANDROID64-NEXT:     0x30428 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x30430 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x30438 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x30440 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30448 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30450 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x30458 R_AARCH64_RELATIVE - 0x7
-// ANDROID64-NEXT:     0x304B9 R_AARCH64_RELATIVE - 0xA
-
-// ANDROID64-NEXT:     0x30468 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304C1 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304C9 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304E1 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x30420 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     0x30460 R_AARCH64_ABS64 zed2 0x0
-// ANDROID64-NEXT:     0x304D1 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     0x304D9 R_AARCH64_ABS64 bar2 0x1
+// ANDROID64-NEXT:     0x303F0 R_AARCH64_RELATIVE - 0x1
+// ANDROID64-NEXT:     0x303F8 R_AARCH64_RELATIVE - 0x2
+// ANDROID64-NEXT:     0x30400 R_AARCH64_RELATIVE - 0x3
+// ANDROID64-NEXT:     0x30408 R_AARCH64_RELATIVE - 0x4
+// ANDROID64-NEXT:     0x30410 R_AARCH64_RELATIVE - 0x5
+// ANDROID64-NEXT:     0x30418 R_AARCH64_RELATIVE - 0x6
+// ANDROID64-NEXT:     0x30420 R_AARCH64_RELATIVE - 0x7
+// ANDROID64-NEXT:     0x30428 R_AARCH64_RELATIVE - 0x8
+
+// ANDROID64-NEXT:     0x30480 R_AARCH64_RELATIVE - 0x1
+// ANDROID64-NEXT:     0x30488 R_AARCH64_RELATIVE - 0x2
+// ANDROID64-NEXT:     0x30490 R_AARCH64_RELATIVE - 0x3
+// ANDROID64-NEXT:     0x30498 R_AARCH64_RELATIVE - 0x4
+// ANDROID64-NEXT:     0x304A0 R_AARCH64_RELATIVE - 0x5
+// ANDROID64-NEXT:     0x304A8 R_AARCH64_RELATIVE - 0x6
+// ANDROID64-NEXT:     0x304B0 R_AARCH64_RELATIVE - 0x7
+// ANDROID64-NEXT:     0x304B8 R_AARCH64_RELATIVE - 0x8
+// ANDROID64-NEXT:     0x304C0 R_AARCH64_RELATIVE - 0x9
+
+// ANDROID64-NEXT:     0x30438 R_AARCH64_RELATIVE - 0x1
+// ANDROID64-NEXT:     0x30440 R_AARCH64_RELATIVE - 0x2
+// ANDROID64-NEXT:     0x30448 R_AARCH64_RELATIVE - 0x3
+// ANDROID64-NEXT:     0x30450 R_AARCH64_RELATIVE - 0x4
+// ANDROID64-NEXT:     0x30458 R_AARCH64_RELATIVE - 0x5
+// ANDROID64-NEXT:     0x30460 R_AARCH64_RELATIVE - 0x6
+// ANDROID64-NEXT:     0x30468 R_AARCH64_RELATIVE - 0x7
+// ANDROID64-NEXT:     0x304C9 R_AARCH64_RELATIVE - 0xA
+
+// ANDROID64-NEXT:     0x30478 R_AARCH64_ABS64 bar2 0x0
+// ANDROID64-NEXT:     0x304D1 R_AARCH64_ABS64 bar2 0x0
+// ANDROID64-NEXT:     0x304D9 R_AARCH64_ABS64 bar2 0x0
+// ANDROID64-NEXT:     0x304F1 R_AARCH64_ABS64 bar2 0x0
+// ANDROID64-NEXT:     0x30430 R_AARCH64_ABS64 bar2 0x1
+// ANDROID64-NEXT:     0x30470 R_AARCH64_ABS64 zed2 0x0
+// ANDROID64-NEXT:     0x304E1 R_AARCH64_ABS64 bar2 0x1
+// ANDROID64-NEXT:     0x304E9 R_AARCH64_ABS64 bar2 0x1
 // ANDROID64-NEXT:     }
 
 // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.a64.o %t.a64.so -o %t4.a64
@@ -321,7 +321,7 @@
 /// SHT_RELR section contains address/bitmap entries
 /// encoding the offsets for relative relocation.
 // RAW-RELR64:           Section ({{.+}}) .relr.dyn {
-// RAW-RELR64-NEXT:      0x30480
+// RAW-RELR64-NEXT:      0x30490
 // RAW-RELR64-NEXT:      0x7FCFEFF
 // RAW-RELR64-NEXT:      }
 
@@ -329,34 +329,32 @@
 /// but contains only the relative relocations.
 /// Any relative relocations with odd offset stay in SHT_RELA.
 // RELR64:      Section ({{.+}}) .rela.dyn {
-// RELR64-NEXT:   0x30559 R_AARCH64_RELATIVE - 0xA
-// RELR64-NEXT:   0x304C0 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30508 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30561 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30569 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30571 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30579 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30581 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30500 R_AARCH64_ABS64 zed2 0x0
+// RELR64-NEXT:   0x30569 R_AARCH64_RELATIVE - 0xA
+// RELR64-NEXT:   0x304D0 R_AARCH64_ABS64 bar2 0x1
+// RELR64-NEXT:   0x30518 R_AARCH64_ABS64 bar2 0x0
+// RELR64-NEXT:   0x30571 R_AARCH64_ABS64 bar2 0x0
+// RELR64-NEXT:   0x30579 R_AARCH64_ABS64 bar2 0x0
+// RELR64-NEXT:   0x30581 R_AARCH64_ABS64 bar2 0x1
+// RELR64-NEXT:   0x30589 R_AARCH64_ABS64 bar2 0x1
+// RELR64-NEXT:   0x30591 R_AARCH64_ABS64 bar2 0x0
+// RELR64-NEXT:   0x30510 R_AARCH64_ABS64 zed2 0x0
 // RELR64-NEXT: }
 // RELR64-NEXT: Section ({{.+}}) .relr.dyn {
-// RELR64-NEXT:   0x30480 R_AARCH64_RELATIVE - 0x0
-// RELR64-NEXT:   0x30488 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x30490 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x30498 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304A0 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304A8 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304B0 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304B8 R_AARCH64_RELATIVE - 0x0
+// RELR64-NEXT:   0x304C0 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304C8 R_AARCH64_RELATIVE - 0x0
-// RELR64-NEXT:   0x304D0 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304D8 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304E0 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304E8 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304F0 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x304F8 R_AARCH64_RELATIVE - 0x0
-// RELR64-NEXT:   0x30510 R_AARCH64_RELATIVE - 0x0
-// RELR64-NEXT:   0x30518 R_AARCH64_RELATIVE - 0x0
+// RELR64-NEXT:   0x30500 R_AARCH64_RELATIVE - 0x0
+// RELR64-NEXT:   0x30508 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x30520 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x30528 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x30530 R_AARCH64_RELATIVE - 0x0
@@ -364,6 +362,8 @@
 // RELR64-NEXT:   0x30540 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x30548 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT:   0x30550 R_AARCH64_RELATIVE - 0x0
+// RELR64-NEXT:   0x30558 R_AARCH64_RELATIVE - 0x0
+// RELR64-NEXT:   0x30560 R_AARCH64_RELATIVE - 0x0
 // RELR64-NEXT: }
 
 .data
diff --git a/lld/test/ELF/pie.s b/lld/test/ELF/pie.s
index 5626cca8617a6..4545b5d78048e 100644
--- a/lld/test/ELF/pie.s
+++ b/lld/test/ELF/pie.s
@@ -8,11 +8,11 @@
 
 ## Check -pie.
 # RUN: ld.lld -pie %t1.o -o %t
-# RUN: llvm-readobj --file-headers --sections -l --symbols -r %t | FileCheck %s
+# RUN: llvm-readobj --file-headers --sections -l -d --symbols -r %t | FileCheck %s
 
 ## Test --pic-executable alias
 # RUN: ld.lld --pic-executable %t1.o -o %t
-# RUN: llvm-readobj --file-headers --sections -l --symbols -r %t | FileCheck %s
+# RUN: llvm-readobj --file-headers --sections -l -d --symbols -r %t | FileCheck %s
 
 # CHECK:      ElfHeader {
 # CHECK-NEXT:  Ident {
@@ -47,6 +47,9 @@
 
 # CHECK:         Type: PT_DYNAMIC
 
+# CHECK:      DynamicSection [
+# CHECK:        0x000000006FFFFFFB FLAGS_1 PIE
+
 ## Check -nopie
 # RUN: ld.lld -no-pie %t1.o -o %t2
 # RUN: llvm-readobj --file-headers -r %t2 | FileCheck %s --check-prefix=NOPIE
diff --git a/lld/test/ELF/ppc32-call-stub-pic.s b/lld/test/ELF/ppc32-call-stub-pic.s
index 7dae81f77c89c..392be1fa2c9b7 100644
--- a/lld/test/ELF/ppc32-call-stub-pic.s
+++ b/lld/test/ELF/ppc32-call-stub-pic.s
@@ -28,15 +28,15 @@
 # RELOC-NEXT:   R_PPC_JMP_SLOT h 0x0
 # RELOC-NEXT: }
 
-# SEC: .got PROGBITS 00020368
-# DYN: PPC_GOT 0x20368
+# SEC: .got PROGBITS 00020370
+# DYN: PPC_GOT 0x20370
 
 ## .got2+0x8000-0x10004 = 0x30000+0x8000-0x10004 = 65536*2+32764
 # CHECK-LABEL: <_start>:
 # PIE-NEXT:           bcl 20, 31, 0x10210
 # PIE-NEXT:    10210: mflr 30
 # PIE-NEXT:           addis 30, 30, 3
-# PIE-NEXT:           addi 30, 30, -32412
+# PIE-NEXT:           addi 30, 30, -32404
 ## Two bl 00008000.got2.plt_pic32.f
 # PIE-NEXT:           bl 0x10244
 # PIE-NEXT:           bl 0x10244
@@ -104,7 +104,7 @@
 # CHECK-NEXT:  <00008000.got2.plt_pic32.f>:
 
 ## In Secure PLT ABI, .plt stores function pointers to first instructions of .glink
-# HEX: 0x0004036c 00010294 00010298 0001029c
+# HEX: 0x00040374 00010294 00010298 0001029c
 
 ## These instructions are referenced by .plt entries.
 # CHECK:      [[#%x,GLINK:]] <.glink>:
@@ -113,7 +113,7 @@
 # CHECK-NEXT: b 0x[[#%x,GLINK+12]]
 
 ## PLTresolve
-## Operand of addi: 0x100a8-.glink = 24
+## Operand of addi: 0x102cc-.glink = 24
 # CHECK-NEXT:         addis 11, 11, 0
 # CHECK-NEXT:         mflr 0
 # CHECK-NEXT:         bcl 20, 31, 0x[[#%x,NEXT:]]
@@ -123,12 +123,12 @@
 # CHECK-NEXT: mtlr 0
 # CHECK-NEXT: sub 11, 11, 12
 
-## Operand of lwz in -pie mode: &.got[1] - 0x100a8 = 0x20088+4 - 0x100a8 = 65536*1-28
+## Operand of lwz in -pie mode: &.got[1] - 0x102bc = 0x20380+4 - 0x102bc = 65536*1+200
 # CHECK-NEXT:  addis 12, 12, 1
-# PIE-NEXT:    lwz 0, 192(12)
+# PIE-NEXT:    lwz 0, 200(12)
 # SHARED-NEXT: lwz 0, 184(12)
 
-# PIE-NEXT:    lwz 12, 196(12)
+# PIE-NEXT:    lwz 12, 204(12)
 # SHARED-NEXT: lwz 12, 188(12)
 # CHECK-NEXT:  mtctr 0
 # CHECK-NEXT:  add 0, 11, 11
diff --git a/lld/test/ELF/ppc32-ifunc-nonpreemptible-pic.s b/lld/test/ELF/ppc32-ifunc-nonpreemptible-pic.s
index 0bb715a191d83..a88927fdfd75e 100644
--- a/lld/test/ELF/ppc32-ifunc-nonpreemptible-pic.s
+++ b/lld/test/ELF/ppc32-ifunc-nonpreemptible-pic.s
@@ -7,12 +7,12 @@
 # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 
 # RELOC:      .rela.dyn {
-# RELOC-NEXT:   0x30240 R_PPC_RELATIVE - 0x101A8
-# RELOC-NEXT:   0x30244 R_PPC_IRELATIVE - 0x10188
+# RELOC-NEXT:   0x30248 R_PPC_RELATIVE - 0x101A8
+# RELOC-NEXT:   0x3024C R_PPC_IRELATIVE - 0x10188
 # RELOC-NEXT: }
 
 # SYM: 000101a8 0 FUNC GLOBAL DEFAULT {{.*}} func
-# HEX: 0x00030240 00000000
+# HEX: 0x00030248 00000000
 
 .section .got2,"aw"
 .long func
diff --git a/lld/test/ELF/ppc64-long-branch-pi.s b/lld/test/ELF/ppc64-long-branch-pi.s
index dca63878c0e78..36f14f5cc319a 100644
--- a/lld/test/ELF/ppc64-long-branch-pi.s
+++ b/lld/test/ELF/ppc64-long-branch-pi.s
@@ -14,19 +14,19 @@
 # RUN: llvm-objdump -d --no-show-raw-insn %t.so | FileCheck %s
 
 # SEC-PIE:    Name       Type     Address          Off     Size   ES Flg Lk Inf Al
-# SEC-PIE:    .got       PROGBITS 00000000020020e0 20120e0 000008 00  WA  0   0  8
-# SEC-PIE:    .branch_lt NOBITS   00000000020020f0 20120f0 000020 00  WA  0   0  8
+# SEC-PIE:    .got       PROGBITS 00000000020020f0 20120f0 000008 00  WA  0   0  8
+# SEC-PIE:    .branch_lt NOBITS   0000000002002100 2012100 000020 00  WA  0   0  8
 
 # SEC-SHARED: Name       Type     Address          Off     Size   ES Flg Lk Inf Al
 # SEC-SHARED: .got       PROGBITS 00000000020020d0 20120d0 000008 00  WA  0   0  8
 # SEC-SHARED: .branch_lt NOBITS   00000000020020e0 20120e0 000020 00  WA  0   0  8
 
 # RELOC:      .rela.dyn {
-# RELOC-NEXT:   0x20020E8 R_PPC64_RELATIVE - 0x8000
-# RELOC-NEXT:   0x20020F0 R_PPC64_RELATIVE - 0x2002000
-# RELOC-NEXT:   0x20020F8 R_PPC64_RELATIVE - 0x2002008
-# RELOC-NEXT:   0x2002100 R_PPC64_RELATIVE - 0x200200C
-# RELOC-NEXT:   0x2002108 R_PPC64_RELATIVE - 0x2000
+# RELOC-NEXT:   0x20020F8 R_PPC64_RELATIVE - 0x8000
+# RELOC-NEXT:   0x2002100 R_PPC64_RELATIVE - 0x2002000
+# RELOC-NEXT:   0x2002108 R_PPC64_RELATIVE - 0x2002008
+# RELOC-NEXT:   0x2002110 R_PPC64_RELATIVE - 0x200200C
+# RELOC-NEXT:   0x2002118 R_PPC64_RELATIVE - 0x2000
 # RELOC-NEXT: }
 
 # CHECK:      <_start>:
diff --git a/lld/test/ELF/relative-dynamic-reloc-pie.s b/lld/test/ELF/relative-dynamic-reloc-pie.s
index 22a1c01b35c52..8c6e4a8402f44 100644
--- a/lld/test/ELF/relative-dynamic-reloc-pie.s
+++ b/lld/test/ELF/relative-dynamic-reloc-pie.s
@@ -6,9 +6,9 @@
 ## Test that we create R_X86_64_RELATIVE relocations with -pie.
 # CHECK:      Relocations [
 # CHECK-NEXT:   Section ({{.*}}) .rela.dyn {
-# CHECK-NEXT:     0x3358 R_X86_64_RELATIVE - 0x3358
-# CHECK-NEXT:     0x3360 R_X86_64_RELATIVE - 0x3360
-# CHECK-NEXT:     0x3368 R_X86_64_RELATIVE - 0x3361
+# CHECK-NEXT:     0x3368 R_X86_64_RELATIVE - 0x3368
+# CHECK-NEXT:     0x3370 R_X86_64_RELATIVE - 0x3370
+# CHECK-NEXT:     0x3378 R_X86_64_RELATIVE - 0x3371
 # CHECK-NEXT:   }
 # CHECK-NEXT: ]
 
diff --git a/lld/test/ELF/riscv-gp.s b/lld/test/ELF/riscv-gp.s
index 5f0819fcc8566..e21cc0cc5bcec 100644
--- a/lld/test/ELF/riscv-gp.s
+++ b/lld/test/ELF/riscv-gp.s
@@ -11,12 +11,12 @@
 # RUN: llvm-readelf -S %t.64 | FileCheck --check-prefix=SEC64 %s
 # RUN: not ld.lld -shared %t.64.o -o /dev/null 2>&1 | FileCheck --check-prefix=ERR %s
 
-## __global_pointer$ = .sdata+0x800 = 0x39b8
-# SEC32: [ 7] .sdata PROGBITS {{0*}}000031b8
-# SYM32: {{0*}}000039b8 0 NOTYPE GLOBAL DEFAULT 7 __global_pointer$
+## __global_pointer$ = .sdata+0x800 = 0x39c0
+# SEC32: [ 7] .sdata PROGBITS {{0*}}000031c0
+# SYM32: {{0*}}000039c0 0 NOTYPE GLOBAL DEFAULT 7 __global_pointer$
 
-# SEC64: [ 7] .sdata PROGBITS {{0*}}000032d0
-# SYM64: {{0*}}00003ad0 0 NOTYPE GLOBAL DEFAULT 7 __global_pointer$
+# SEC64: [ 7] .sdata PROGBITS {{0*}}000032e0
+# SYM64: {{0*}}00003ae0 0 NOTYPE GLOBAL DEFAULT 7 __global_pointer$
 
 ## __global_pointer$ - 0x1000 = 4096*3-2048
 # DIS:      1000: auipc gp, 3
diff --git a/lld/test/ELF/riscv-ifunc-nonpreemptible.s b/lld/test/ELF/riscv-ifunc-nonpreemptible.s
index 3e255456ad212..99d75a28a391a 100644
--- a/lld/test/ELF/riscv-ifunc-nonpreemptible.s
+++ b/lld/test/ELF/riscv-ifunc-nonpreemptible.s
@@ -12,7 +12,7 @@
 # RUN: llvm-objdump -d --no-show-raw-insn %t.64 | FileCheck --check-prefix=DIS64 %s
 
 # RELOC32:      .rela.dyn {
-# RELOC32-NEXT:   0x3218 R_RISCV_IRELATIVE - 0x117C
+# RELOC32-NEXT:   0x3220 R_RISCV_IRELATIVE - 0x117C
 # RELOC32-NEXT: }
 
 # SYM32: 0001190 0 FUNC GLOBAL DEFAULT {{.*}} func
@@ -22,14 +22,14 @@
 # DIS32-NEXT:       addi a0, a0, 16
 # DIS32:      Disassembly of section .iplt:
 # DIS32:      <func>:
-## 32-bit: &.got.plt[func]-. = 0x3218-0x1190 = 4096*2+136
+## 32-bit: &.got.plt[func]-. = 0x3220-0x1190 = 4096*2+144
 # DIS32-NEXT: 1190: auipc t3, 2
-# DIS32-NEXT:       lw t3, 136(t3)
+# DIS32-NEXT:       lw t3, 144(t3)
 # DIS32-NEXT:       jalr t1, t3
 # DIS32-NEXT:       nop
 
 # RELOC64:      .rela.dyn {
-# RELOC64-NEXT:   0x3370 R_RISCV_IRELATIVE - 0x1260
+# RELOC64-NEXT:   0x3380 R_RISCV_IRELATIVE - 0x1260
 # RELOC64-NEXT: }
 
 # SYM64: 000000000001270 0 FUNC GLOBAL DEFAULT {{.*}} func
@@ -39,9 +39,9 @@
 # DIS64-NEXT:       addi a0, a0, 12
 # DIS64:      Disassembly of section .iplt:
 # DIS64:      <func>:
-## 64-bit: &.got.plt[func]-. = 0x3370-0x1270 = 4096*2+256
+## 64-bit: &.got.plt[func]-. = 0x3380-0x1270 = 4096*2+272
 # DIS64-NEXT: 1270: auipc t3, 2
-# DIS64-NEXT:       ld t3, 256(t3)
+# DIS64-NEXT:       ld t3, 272(t3)
 # DIS64-NEXT:       jalr t1, t3
 # DIS64-NEXT:       nop
 
diff --git a/lld/test/ELF/separate-segments.s b/lld/test/ELF/separate-segments.s
index a0c910f31338b..d0e4afe7fb668 100644
--- a/lld/test/ELF/separate-segments.s
+++ b/lld/test/ELF/separate-segments.s
@@ -7,8 +7,8 @@
 # RUN: llvm-readelf -l %t | FileCheck --check-prefix=NONE %s
 # NONE:      LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x000245 0x000245 R   0x1000
 # NONE-NEXT: LOAD 0x000248 0x0000000000001248 0x0000000000001248 0x000001 0x000001 R E 0x1000
-# NONE-NEXT: LOAD 0x000250 0x0000000000002250 0x0000000000002250 0x000080 0x000080 RW  0x1000
-# NONE-NEXT: LOAD 0x0002d0 0x00000000000032d0 0x00000000000032d0 0x000001 0x000001 RW  0x1000
+# NONE-NEXT: LOAD 0x000250 0x0000000000002250 0x0000000000002250 0x000090 0x000090 RW  0x1000
+# NONE-NEXT: LOAD 0x0002e0 0x00000000000032e0 0x00000000000032e0 0x000001 0x000001 RW  0x1000
 
 ## -z separate-code makes text segment (RX) separate.
 ## The two RW can have overlapping p_offset ranges at runtime.
@@ -16,15 +16,15 @@
 # RUN: llvm-readelf -l %t | FileCheck --check-prefix=CODE %s
 # CODE:      LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x000245 0x000245 R   0x1000
 # CODE-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000001 0x000001 R E 0x1000
-# CODE-NEXT: LOAD 0x002000 0x0000000000002000 0x0000000000002000 0x000080 0x000080 RW  0x1000
-# CODE-NEXT: LOAD 0x002080 0x0000000000003080 0x0000000000003080 0x000001 0x000001 RW  0x1000
+# CODE-NEXT: LOAD 0x002000 0x0000000000002000 0x0000000000002000 0x000090 0x000090 RW  0x1000
+# CODE-NEXT: LOAD 0x002090 0x0000000000003090 0x0000000000003090 0x000001 0x000001 RW  0x1000
 
 ## -z separate-loadable-segments makes all segments separate.
 # RUN: ld.lld -pie %t.o -z separate-loadable-segments -o %t
 # RUN: llvm-readelf -l %t | FileCheck --check-prefix=ALL %s
 # ALL:       LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x000245 0x000245 R   0x1000
 # ALL-NEXT:  LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000001 0x000001 R E 0x1000
-# ALL-NEXT:  LOAD 0x002000 0x0000000000002000 0x0000000000002000 0x000080 0x000080 RW  0x1000
+# ALL-NEXT:  LOAD 0x002000 0x0000000000002000 0x0000000000002000 0x000090 0x000090 RW  0x1000
 # ALL-NEXT:  LOAD 0x003000 0x0000000000003000 0x0000000000003000 0x000001 0x000001 RW  0x1000
 
 nop

From 0ed2c046362e2248eaf3d81e235115b28d4af262 Mon Sep 17 00:00:00 2001
From: hsmahesha <mahesha.comp@gmail.com>
Date: Mon, 1 Jun 2020 22:50:29 +0530
Subject: [PATCH 729/770] [AMDGPU/MemOpsCluster] Let mem ops clustering logic
 also consider number of clustered bytes

Summary:
While clustering mem ops, the AMDGPU target needs to consider the number of clustered bytes
to decide on the maximum number of mem ops that can be clustered. This patch adds support for
passing the number of clustered bytes to the target's mem ops clustering logic.

Reviewers: foad, rampitec, arsenm, vpykhtin, javedabsar

Reviewed By: foad

Subscribers: MatzeB, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, javed.absar, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80545
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 13 ++--
 llvm/lib/CodeGen/MachineScheduler.cpp         | 22 ++++---
 llvm/lib/CodeGen/TargetInstrInfo.cpp          |  4 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 10 +--
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |  8 +--
 .../lib/Target/AMDGPU/SIInsertHardClauses.cpp |  7 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 62 ++++++++++++++++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          | 15 +++--
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp  |  8 +--
 llvm/lib/Target/Hexagon/HexagonInstrInfo.h    | 10 +--
 llvm/lib/Target/Lanai/LanaiInstrInfo.cpp      |  6 +-
 llvm/lib/Target/Lanai/LanaiInstrInfo.h        | 10 +--
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 10 ++-
 llvm/lib/Target/X86/X86InstrInfo.h            | 10 +--
 14 files changed, 132 insertions(+), 63 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 709030b620768..b3b2fa218627e 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1254,11 +1254,10 @@ class TargetInstrInfo : public MCInstrInfo {
   /// It returns false if no base operands and offset was found.
   /// It is not guaranteed to always recognize base operands and offsets in all
   /// cases.
-  virtual bool
-  getMemOperandsWithOffset(const MachineInstr &MI,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const {
+  virtual bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const {
     return false;
   }
 
@@ -1286,9 +1285,11 @@ class TargetInstrInfo : public MCInstrInfo {
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p NumLoads is the number of loads that will be in the cluster if this
   /// hook returns true.
+  /// \p NumBytes is the number of bytes that will be loaded from all the
+  /// clustered loads if this hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
-                                   unsigned NumLoads) const {
+                                   unsigned NumLoads, unsigned NumBytes) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 92fd3edf92364..a688991913749 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1473,10 +1473,12 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
     SUnit *SU;
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
+    unsigned Width;
 
     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
-              int64_t Offset)
-        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset) {}
+              int64_t Offset, unsigned Width)
+        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
+          Width(Width) {}
 
     static bool Compare(const MachineOperand *const &A,
                         const MachineOperand *const &B) {
@@ -1565,12 +1567,14 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
     ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
   SmallVector<MemOpInfo, 32> MemOpRecords;
   for (SUnit *SU : MemOps) {
+    const MachineInstr &MI = *SU->getInstr();
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     bool OffsetIsScalable;
-    if (TII->getMemOperandsWithOffset(*SU->getInstr(), BaseOps, Offset,
-                                      OffsetIsScalable, TRI))
-      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset));
+    unsigned Width;
+    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+                                           OffsetIsScalable, Width, TRI))
+      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
 #ifndef NDEBUG
     for (auto *Op : BaseOps)
       assert(Op);
@@ -1584,16 +1588,19 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
   // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
   // cluster mem ops collected within `MemOpRecords` array.
   unsigned ClusterLength = 1;
+  unsigned CurrentClusterBytes = MemOpRecords[0].Width;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     // Decision to cluster mem ops is taken based on target dependent logic
     auto MemOpa = MemOpRecords[Idx];
     auto MemOpb = MemOpRecords[Idx + 1];
     ++ClusterLength;
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps,
-                                  ClusterLength)) {
+    CurrentClusterBytes += MemOpb.Width;
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes)) {
       // Current mem ops pair could not be clustered, reset cluster length, and
       // go to next pair
       ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
       continue;
     }
 
@@ -1605,6 +1612,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
     // FIXME: Is this check really required?
     if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
       ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
       continue;
     }
 
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 54913175f167e..228e3c1051ab7 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1037,7 +1037,9 @@ bool TargetInstrInfo::getMemOperandWithOffset(
     const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset,
     bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
   SmallVector<const MachineOperand *, 4> BaseOps;
-  if (!getMemOperandsWithOffset(MI, BaseOps, Offset, OffsetIsScalable, TRI) ||
+  unsigned Width;
+  if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
+                                     Width, TRI) ||
       BaseOps.size() != 1)
     return false;
   BaseOp = BaseOps.front();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 801f162937ede..ed8c5f6ce8793 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2033,15 +2033,14 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
   return true;
 }
 
-bool AArch64InstrInfo::getMemOperandsWithOffset(
+bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-    const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   if (!LdSt.mayLoadOrStore())
     return false;
 
   const MachineOperand *BaseOp;
-  unsigned Width;
   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
                                     Width, TRI))
     return false;
@@ -2513,7 +2512,8 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(
     ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads) const {
+    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+    unsigned NumBytes) const {
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
   const MachineOperand &BaseOp1 = *BaseOps1.front();
   const MachineOperand &BaseOp2 = *BaseOps2.front();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 13ef9845ae3d2..e05b1837edc39 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -113,10 +113,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   /// Hint that pairing the given load or store is unprofitable.
   static void suppressLdStPair(MachineInstr &MI);
 
-  bool getMemOperandsWithOffset(
+  bool getMemOperandsWithOffsetWidth(
       const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
-      int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-      const override;
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
 
   /// If \p OffsetIsScalable is set to 'true', the offset is scaled by `vscale`.
   /// This is true for some SVE instructions like ldr/str that have a
@@ -140,7 +140,7 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;
 
   void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, MCRegister DestReg,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 4c8405120548a..35c49ae8c0dd1 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -146,10 +146,11 @@ class SIInsertHardClauses : public MachineFunctionPass {
 
         int64_t Dummy1;
         bool Dummy2;
+        unsigned Dummy3;
         SmallVector<const MachineOperand *, 4> BaseOps;
         if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
-          if (!SII->getMemOperandsWithOffset(MI, BaseOps, Dummy1, Dummy2,
-                                             TRI)) {
+          if (!SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Dummy1, Dummy2,
+                                                  Dummy3, TRI)) {
             // We failed to get the base operands, so we'll never clause this
             // instruction with any other, so pretend it's illegal.
             Type = HARDCLAUSE_ILLEGAL;
@@ -164,7 +165,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
               // scheduler it limits the size of the cluster to avoid increasing
               // register pressure too much, but this pass runs after register
               // allocation so there is no need for that kind of limit.
-              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2)))) {
+              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
           // Finish the current clause.
           Changed |= emitClause(CI, SII);
           CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d7508c3b1ea2a..5feed9d53bc75 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -264,16 +264,27 @@ static bool isStride64(unsigned Opc) {
   }
 }
 
-bool SIInstrInfo::getMemOperandsWithOffset(
+unsigned SIInstrInfo::getOperandSizeInBytes(const MachineInstr &LdSt,
+                                            const MachineOperand *MOp) const {
+  assert(MOp && "Unexpected null machine operand!");
+  const MachineRegisterInfo &MRI = LdSt.getParent()->getParent()->getRegInfo();
+  const Register Reg = MOp->getReg();
+  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
+                                         ? MRI.getRegClass(Reg)
+                                         : RI.getPhysRegClass(Reg);
+  return (RI.getRegSizeInBits(*DstRC) / 8);
+}
+
+bool SIInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-    const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   if (!LdSt.mayLoadOrStore())
     return false;
 
   unsigned Opc = LdSt.getOpcode();
   OffsetIsScalable = false;
-  const MachineOperand *BaseOp, *OffsetOp;
+  const MachineOperand *BaseOp, *OffsetOp, *MOp;
 
   if (isDS(LdSt)) {
     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
@@ -287,6 +298,11 @@ bool SIInstrInfo::getMemOperandsWithOffset(
       }
       BaseOps.push_back(BaseOp);
       Offset = OffsetOp->getImm();
+      // Get appropriate operand, and compute width accordingly.
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+      if (!MOp)
+        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data0);
+      Width = getOperandSizeInBytes(LdSt, MOp);
     } else {
       // The 2 offset instructions use offset0 and offset1 instead. We can treat
       // these as a load with a single offset if the 2 offsets are consecutive.
@@ -318,6 +334,16 @@ bool SIInstrInfo::getMemOperandsWithOffset(
 
       BaseOps.push_back(BaseOp);
       Offset = EltSize * Offset0;
+      // Get appropriate operand(s), and compute width accordingly.
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+      if (!MOp) {
+        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data0);
+        Width = getOperandSizeInBytes(LdSt, MOp);
+        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data1);
+        Width += getOperandSizeInBytes(LdSt, MOp);
+      } else {
+        Width = getOperandSizeInBytes(LdSt, MOp);
+      }
     }
     return true;
   }
@@ -342,6 +368,11 @@ bool SIInstrInfo::getMemOperandsWithOffset(
       BaseOps.push_back(RSrc);
       BaseOps.push_back(SOffset);
       Offset = OffsetImm->getImm();
+      // Get appropriate operand, and compute width accordingly.
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+      if (!MOp)
+        MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+      Width = getOperandSizeInBytes(LdSt, MOp);
       return true;
     }
 
@@ -359,6 +390,11 @@ bool SIInstrInfo::getMemOperandsWithOffset(
     Offset = OffsetImm->getImm();
     if (SOffset) // soffset can be an inline immediate.
       Offset += SOffset->getImm();
+    // Get appropriate operand, and compute width accordingly.
+    MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+    if (!MOp)
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+    Width = getOperandSizeInBytes(LdSt, MOp);
     return true;
   }
 
@@ -369,6 +405,9 @@ bool SIInstrInfo::getMemOperandsWithOffset(
     BaseOps.push_back(BaseOp);
     OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
     Offset = OffsetOp ? OffsetOp->getImm() : 0;
+    // Get appropriate operand, and compute width accordingly.
+    MOp = getNamedOperand(LdSt, AMDGPU::OpName::sdst);
+    Width = getOperandSizeInBytes(LdSt, MOp);
     return true;
   }
 
@@ -381,6 +420,11 @@ bool SIInstrInfo::getMemOperandsWithOffset(
     if (BaseOp)
       BaseOps.push_back(BaseOp);
     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
+    // Get appropriate operand, and compute width accordingly.
+    MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+    if (!MOp)
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+    Width = getOperandSizeInBytes(LdSt, MOp);
     return true;
   }
 
@@ -430,7 +474,8 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
-                                      unsigned NumLoads) const {
+                                      unsigned NumLoads,
+                                      unsigned NumBytes) const {
   assert(!BaseOps1.empty() && !BaseOps2.empty());
   const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
   const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
@@ -2730,9 +2775,12 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
                                                const MachineInstr &MIb) const {
   SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
   int64_t Offset0, Offset1;
+  unsigned Dummy0, Dummy1;
   bool Offset0IsScalable, Offset1IsScalable;
-  if (!getMemOperandsWithOffset(MIa, BaseOps0, Offset0, Offset0IsScalable, &RI) ||
-      !getMemOperandsWithOffset(MIb, BaseOps1, Offset1, Offset1IsScalable, &RI))
+  if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
+                                     Dummy0, &RI) ||
+      !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
+                                     Dummy1, &RI))
     return false;
 
   if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 8231a96f5f6b2..c6d0349d3575c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -181,15 +181,18 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                int64_t &Offset1,
                                int64_t &Offset2) const override;
 
-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const final;
+  unsigned getOperandSizeInBytes(const MachineInstr &LdSt,
+                                 const MachineOperand *MOp) const;
+
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const final;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;
 
   bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
                                int64_t Offset1, unsigned NumLoads) const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 0bfb28b935c3a..64922d30c4151 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -2963,12 +2963,12 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
 }
 
 /// Get the base register and byte offset of a load/store instr.
-bool HexagonInstrInfo::getMemOperandsWithOffset(
+bool HexagonInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
-  unsigned AccessSize = 0;
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   OffsetIsScalable = false;
-  const MachineOperand *BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize);
+  const MachineOperand *BaseOp = getBaseAndOffset(LdSt, Offset, Width);
   if (!BaseOp || !BaseOp->isReg())
     return false;
   BaseOps.push_back(BaseOp);
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index d8998d265477d..847b9a6728916 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -204,11 +204,11 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
   /// Get the base register and byte offset of a load/store instr.
-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
 
   /// Reverses the branch condition of the specified condition list,
   /// returning false on success and true if it cannot be reversed.
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index 8afd27d53469c..c821429703570 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -795,9 +795,10 @@ bool LanaiInstrInfo::getMemOperandWithOffsetWidth(
   return true;
 }
 
-bool LanaiInstrInfo::getMemOperandsWithOffset(
+bool LanaiInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   switch (LdSt.getOpcode()) {
   default:
     return false;
@@ -811,7 +812,6 @@ bool LanaiInstrInfo::getMemOperandsWithOffset(
   case Lanai::LDBs_RI:
   case Lanai::LDBz_RI:
     const MachineOperand *BaseOp;
-    unsigned Width;
     OffsetIsScalable = false;
     if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
       return false;
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
index 216dbfb63accc..44c1e629a8e66 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -67,11 +67,11 @@ class LanaiInstrInfo : public LanaiGenInstrInfo {
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
 
   bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
                                     const MachineOperand *&BaseOp,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index c8939e348a70c..46ff62f7a4ed5 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3662,9 +3662,10 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
   }
 }
 
-bool X86InstrInfo::getMemOperandsWithOffset(
+bool X86InstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   const MCInstrDesc &Desc = MemOp.getDesc();
   int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
   if (MemRefBegin < 0)
@@ -3696,6 +3697,11 @@ bool X86InstrInfo::getMemOperandsWithOffset(
     return false;
 
   OffsetIsScalable = false;
+  // FIXME: Relying on memoperands() may not be right thing to do here. Check
+  // with X86 maintainers, and fix it accordingly. For now, it is ok, since
+  // there is no use of `Width` for X86 back-end at the moment.
+  Width =
+      !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
   BaseOps.push_back(BaseOp);
   return true;
 }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index fe79073ae3702..89f2ff118c378 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -317,11 +317,11 @@ class X86InstrInfo final : public X86GenInstrInfo {
                      SmallVectorImpl<MachineOperand> &Cond,
                      bool AllowModify) const override;
 
-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
   bool analyzeBranchPredicate(MachineBasicBlock &MBB,
                               TargetInstrInfo::MachineBranchPredicate &MBP,
                               bool AllowModify = false) const override;

From ed08c4fb2e63daed03987f80a17cbececeb2c656 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 1 Jun 2020 13:14:18 -0400
Subject: [PATCH 730/770] AMDGPU: Remove dead file

---
 llvm/lib/Target/AMDGPU/VIInstructions.td | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/VIInstructions.td

diff --git a/llvm/lib/Target/AMDGPU/VIInstructions.td b/llvm/lib/Target/AMDGPU/VIInstructions.td
deleted file mode 100644
index c668acc83ff1d..0000000000000
--- a/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ /dev/null
@@ -1,13 +0,0 @@
-//===-- VIInstructions.td - VI Instruction Definitions --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Instruction definitions for VI and newer.
-//===----------------------------------------------------------------------===//
-
-FIXME: Deleting this file broke buildbots that don't do full rebuilds.  This
-file is no longer used by the backend, so it can be deleted once all
-the buildbots update there dependencies.

From 20793b2aef1c7589cff1c35194f1463e747cb1a5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 1 Jun 2020 13:15:06 -0400
Subject: [PATCH 731/770] AMDGPU: Fix test in code directory

---
 llvm/{lib/Target => test/CodeGen}/AMDGPU/sroa-before-unroll.ll | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/{lib/Target => test/CodeGen}/AMDGPU/sroa-before-unroll.ll (100%)

diff --git a/llvm/lib/Target/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
similarity index 100%
rename from llvm/lib/Target/AMDGPU/sroa-before-unroll.ll
rename to llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll

From 6c27c61d32fd2951a290c6d4363bd495f6feae96 Mon Sep 17 00:00:00 2001
From: Hiroshi Yamauchi <yamauchi@google.com>
Date: Wed, 8 Apr 2020 16:06:25 -0700
Subject: [PATCH 732/770] [PGO] Improve the working set size heuristics under
 the partial sample PGO.

Summary:
The working set size heuristics (ProfileSummaryInfo::hasHugeWorkingSetSize)
under the partial sample PGO may not be accurate because the profile is partial
and the number of hot profile counters in the ProfileSummary may not reflect the
actual working set size of the program being compiled.

To improve this, the (approximated) ratio of the number of profile counters
of the program being compiled to the number of profile counters in the partial
sample profile is computed (which is called the partial profile ratio) and the
working set size of the profile is scaled by this ratio to reflect the working
set size of the program being compiled and used for the working set size
heuristics.

The partial profile ratio is approximated based on the number of the basic
blocks in the program and the NumCounts field in the ProfileSummary and computed
through the thin LTO indexing. This means that there is the limitation that the
scaled working set size is available to the thin LTO post link passes only.

Reviewers: davidxl

Subscribers: mgorny, eraman, hiraditya, steven_wu, dexonsmith, arphaman, dang, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79831
---
 llvm/include/llvm/IR/Module.h                 |  5 +
 llvm/include/llvm/IR/ProfileSummary.h         |  3 +-
 llvm/lib/Analysis/ProfileSummaryInfo.cpp      | 38 +++++++-
 llvm/lib/IR/Module.cpp                        | 21 ++++
 llvm/lib/LTO/LTOBackend.cpp                   |  4 +
 llvm/lib/Transforms/IPO/FunctionImport.cpp    |  6 ++
 .../Analysis/ProfileSummaryInfoTest.cpp       | 97 +++++++++++++++----
 llvm/unittests/IR/ModuleTest.cpp              | 37 +++++++
 8 files changed, 185 insertions(+), 26 deletions(-)

diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index ead003007904c..36d58661ae4cb 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -46,6 +46,7 @@ class FunctionType;
 class GVMaterializer;
 class LLVMContext;
 class MemoryBuffer;
+class ModuleSummaryIndex;
 class Pass;
 class RandomNumberGenerator;
 template <class PtrType> class SmallPtrSetImpl;
@@ -882,6 +883,10 @@ class Module {
 
   /// Take ownership of the given memory buffer.
   void setOwnedMemoryBuffer(std::unique_ptr<MemoryBuffer> MB);
+
+  /// Set the partial sample profile ratio in the profile summary module flag,
+  /// if applicable.
+  void setPartialSampleProfileRatio(const ModuleSummaryIndex &Index);
 };
 
 /// Given "llvm.used" or "llvm.compiler.used" as a global name, collect
diff --git a/llvm/include/llvm/IR/ProfileSummary.h b/llvm/include/llvm/IR/ProfileSummary.h
index 00af0c5e05c89..889568e7946bb 100644
--- a/llvm/include/llvm/IR/ProfileSummary.h
+++ b/llvm/include/llvm/IR/ProfileSummary.h
@@ -59,7 +59,8 @@ class ProfileSummary {
   bool Partial = false;
   /// This approximately represents the ratio of the number of profile counters
   /// of the program being built to the number of profile counters in the
-  /// partial sample profile. When 'Partial' is false, it is undefined.
+  /// partial sample profile. When 'Partial' is false, it is undefined. This is
+  /// currently only available under thin LTO mode.
   double PartialProfileRatio = 0;
   /// Return detailed summary as metadata.
   Metadata *getDetailedSummaryMD(LLVMContext &Context);
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index 3360fd4c37c02..e3a76a6d075e7 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -70,6 +70,23 @@ static cl::opt<bool> PartialProfile(
     "partial-profile", cl::Hidden, cl::init(false),
     cl::desc("Specify the current profile is used as a partial profile."));
 
+cl::opt<bool> ScalePartialSampleProfileWorkingSetSize(
+    "scale-partial-sample-profile-working-set-size", cl::Hidden,
+    cl::init(false),
+    cl::desc(
+        "If true, scale the working set size of the partial sample profile "
+        "by the partial profile ratio to reflect the size of the program "
+        "being compiled."));
+
+static cl::opt<double> PartialSampleProfileWorkingSetSizeScaleFactor(
+    "partial-sample-profile-working-set-size-scale-factor", cl::Hidden,
+    cl::init(0.008),
+    cl::desc("The scale factor used to scale the working set size of the "
+             "partial sample profile along with the partial profile ratio. "
+             "This includes the factor of the profile counter per block "
+             "and the factor to scale the working set size to use the same "
+             "shared thresholds as PGO."));
+
 // Find the summary entry for a desired percentile of counts.
 static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS,
                                                         uint64_t Percentile) {
@@ -280,10 +297,23 @@ void ProfileSummaryInfo::computeThresholds() {
     ColdCountThreshold = ProfileSummaryColdCount;
   assert(ColdCountThreshold <= HotCountThreshold &&
          "Cold count threshold cannot exceed hot count threshold!");
-  HasHugeWorkingSetSize =
-      HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
-  HasLargeWorkingSetSize =
-      HotEntry.NumCounts > ProfileSummaryLargeWorkingSetSizeThreshold;
+  if (!hasPartialSampleProfile() || !ScalePartialSampleProfileWorkingSetSize) {
+    HasHugeWorkingSetSize =
+        HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
+    HasLargeWorkingSetSize =
+        HotEntry.NumCounts > ProfileSummaryLargeWorkingSetSizeThreshold;
+  } else {
+    // Scale the working set size of the partial sample profile to reflect the
+    // size of the program being compiled.
+    double PartialProfileRatio = Summary->getPartialProfileRatio();
+    uint64_t ScaledHotEntryNumCounts =
+        static_cast<uint64_t>(HotEntry.NumCounts * PartialProfileRatio *
+                              PartialSampleProfileWorkingSetSizeScaleFactor);
+    HasHugeWorkingSetSize =
+        ScaledHotEntryNumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
+    HasLargeWorkingSetSize =
+        ScaledHotEntryNumCounts > ProfileSummaryLargeWorkingSetSizeThreshold;
+  }
 }
 
 Optional<uint64_t>
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 1416cdce99749..3ea181a9b48d3 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -33,6 +33,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/SymbolTableListTraits.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/TypeFinder.h"
@@ -673,3 +674,23 @@ GlobalVariable *llvm::collectUsedGlobalVariables(
   }
   return GV;
 }
+
+void Module::setPartialSampleProfileRatio(const ModuleSummaryIndex &Index) {
+  if (auto *SummaryMD = getProfileSummary(/*IsCS*/ false)) {
+    std::unique_ptr<ProfileSummary> ProfileSummary(
+        ProfileSummary::getFromMD(SummaryMD));
+    if (ProfileSummary) {
+      if (ProfileSummary->getKind() != ProfileSummary::PSK_Sample ||
+          !ProfileSummary->isPartialProfile())
+        return;
+      uint64_t BlockCount = Index.getBlockCount();
+      uint32_t NumCounts = ProfileSummary->getNumCounts();
+      if (!NumCounts)
+        return;
+      double Ratio = (double)BlockCount / NumCounts;
+      ProfileSummary->setPartialProfileRatio(Ratio);
+      setProfileSummary(ProfileSummary->getMD(getContext()),
+                        ProfileSummary::PSK_Sample);
+    }
+  }
+}
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 22019e465ac11..79c528176f257 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -541,6 +541,10 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
     return DiagFileOrErr.takeError();
   auto DiagnosticOutputFile = std::move(*DiagFileOrErr);
 
+  // Set the partial sample profile ratio in the profile summary module flag of
+  // the module, if applicable.
+  Mod.setPartialSampleProfileRatio(CombinedIndex);
+
   if (Conf.CodeGenOnly) {
     codegen(Conf, TM.get(), AddStream, Task, Mod);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index a73ba84696e7b..468bf19f2e48a 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1232,6 +1232,12 @@ Expected<bool> FunctionImporter::importFunctions(
     // have loaded all the required metadata!
     UpgradeDebugInfo(*SrcModule);
 
+    // Set the partial sample profile ratio in the profile summary module flag
+    // of the imported source module, if applicable, so that the profile summary
+    // module flag will match with that of the destination module when it's
+    // imported.
+    SrcModule->setPartialSampleProfileRatio(Index);
+
     // Link in the specified functions.
     if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
                                &GlobalsToImport))
diff --git a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp
index ae60c41435481..cbd2236e4cb3d 100644
--- a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp
+++ b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp
@@ -23,6 +23,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include "gtest/gtest.h"
 
+extern llvm::cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
+
 namespace llvm {
 namespace {
 
@@ -42,7 +44,12 @@ class ProfileSummaryInfoTest : public testing::Test {
     BPI.reset(new BranchProbabilityInfo(F, *LI));
     return BlockFrequencyInfo(F, *BPI, *LI);
   }
-  std::unique_ptr<Module> makeLLVMModule(const char *ProfKind = nullptr) {
+  std::unique_ptr<Module> makeLLVMModule(const char *ProfKind = nullptr,
+                                         uint64_t NumCounts = 3,
+                                         uint64_t IsPartialProfile = 0,
+                                         double PartialProfileRatio = 0.0,
+                                         uint64_t HotNumCounts = 3,
+                                         uint64_t ColdNumCounts = 10) {
     const char *ModuleString =
         "define i32 @g(i32 %x) !prof !21 {{\n"
         "  ret i32 0\n"
@@ -83,27 +90,32 @@ class ProfileSummaryInfoTest : public testing::Test {
         "!22 = !{{!\"function_entry_count\", i64 100}\n"
         "!23 = !{{!\"branch_weights\", i32 64, i32 4}\n"
         "{0}";
-    const char *SummaryString = "!llvm.module.flags = !{{!1}"
-                                "!1 = !{{i32 1, !\"ProfileSummary\", !2}"
-                                "!2 = !{{!3, !4, !5, !6, !7, !8, !9, !10}"
-                                "!3 = !{{!\"ProfileFormat\", !\"{0}\"}"
-                                "!4 = !{{!\"TotalCount\", i64 10000}"
-                                "!5 = !{{!\"MaxCount\", i64 10}"
-                                "!6 = !{{!\"MaxInternalCount\", i64 1}"
-                                "!7 = !{{!\"MaxFunctionCount\", i64 1000}"
-                                "!8 = !{{!\"NumCounts\", i64 3}"
-                                "!9 = !{{!\"NumFunctions\", i64 3}"
-                                "!10 = !{{!\"DetailedSummary\", !11}"
-                                "!11 = !{{!12, !13, !14}"
-                                "!12 = !{{i32 10000, i64 1000, i32 1}"
-                                "!13 = !{{i32 999000, i64 300, i32 3}"
-                                "!14 = !{{i32 999999, i64 5, i32 10}";
+    const char *SummaryString =
+        "!llvm.module.flags = !{{!1}\n"
+        "!1 = !{{i32 1, !\"ProfileSummary\", !2}\n"
+        "!2 = !{{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12}\n"
+        "!3 = !{{!\"ProfileFormat\", !\"{0}\"}\n"
+        "!4 = !{{!\"TotalCount\", i64 10000}\n"
+        "!5 = !{{!\"MaxCount\", i64 10}\n"
+        "!6 = !{{!\"MaxInternalCount\", i64 1}\n"
+        "!7 = !{{!\"MaxFunctionCount\", i64 1000}\n"
+        "!8 = !{{!\"NumCounts\", i64 {1}}\n"
+        "!9 = !{{!\"NumFunctions\", i64 3}\n"
+        "!10 = !{{!\"IsPartialProfile\", i64 {2}}\n"
+        "!11 = !{{!\"PartialProfileRatio\", double {3}}\n"
+        "!12 = !{{!\"DetailedSummary\", !13}\n"
+        "!13 = !{{!14, !15, !16}\n"
+        "!14 = !{{i32 10000, i64 1000, i32 1}\n"
+        "!15 = !{{i32 990000, i64 300, i32 {4}}\n"
+        "!16 = !{{i32 999999, i64 5, i32 {5}}\n";
     SMDiagnostic Err;
-    if (ProfKind)
-      return parseAssemblyString(
-          formatv(ModuleString, formatv(SummaryString, ProfKind).str()).str(),
-          Err, C);
-    else
+    if (ProfKind) {
+      auto Summary =
+          formatv(SummaryString, ProfKind, NumCounts, IsPartialProfile,
+                  PartialProfileRatio, HotNumCounts, ColdNumCounts)
+              .str();
+      return parseAssemblyString(formatv(ModuleString, Summary).str(), Err, C);
+    } else
       return parseAssemblyString(formatv(ModuleString, "").str(), Err, C);
   }
 };
@@ -280,6 +292,7 @@ TEST_F(ProfileSummaryInfoTest, SampleProf) {
   ProfileSummaryInfo PSI = buildPSI(M.get());
   EXPECT_TRUE(PSI.hasProfileSummary());
   EXPECT_TRUE(PSI.hasSampleProfile());
+  EXPECT_FALSE(PSI.hasPartialSampleProfile());
 
   BasicBlock &BB0 = F->getEntryBlock();
   BasicBlock *BB1 = BB0.getTerminator()->getSuccessor(0);
@@ -373,5 +386,47 @@ TEST_F(ProfileSummaryInfoTest, SampleProfNoFuncEntryCount) {
   EXPECT_FALSE(PSI.isFunctionColdInCallGraphNthPercentile(990000, F, BFI));
 }
 
+TEST_F(ProfileSummaryInfoTest, PartialSampleProfWorkingSetSize) {
+  ScalePartialSampleProfileWorkingSetSize.setValue(true);
+
+  // With PartialProfileRatio unset (zero).
+  auto M1 = makeLLVMModule("SampleProfile", /*NumCounts*/ 3,
+                           /*IsPartialProfile*/ 1,
+                           /*PartialProfileRatio*/ 0.0,
+                           /*HotNumCounts*/ 3, /*ColdNumCounts*/ 10);
+  ProfileSummaryInfo PSI1 = buildPSI(M1.get());
+  EXPECT_TRUE(PSI1.hasProfileSummary());
+  EXPECT_TRUE(PSI1.hasSampleProfile());
+  EXPECT_TRUE(PSI1.hasPartialSampleProfile());
+  EXPECT_FALSE(PSI1.hasHugeWorkingSetSize());
+  EXPECT_FALSE(PSI1.hasLargeWorkingSetSize());
+
+  // With PartialProfileRatio set (non-zero) and a small working set size.
+  auto M2 = makeLLVMModule("SampleProfile", /*NumCounts*/ 27493235,
+                           /*IsPartialProfile*/ 1,
+                           /*PartialProfileRatio*/ 0.00000012,
+                           /*HotNumCounts*/ 3102082,
+                           /*ColdNumCounts*/ 18306149);
+  ProfileSummaryInfo PSI2 = buildPSI(M2.get());
+  EXPECT_TRUE(PSI2.hasProfileSummary());
+  EXPECT_TRUE(PSI2.hasSampleProfile());
+  EXPECT_TRUE(PSI2.hasPartialSampleProfile());
+  EXPECT_FALSE(PSI2.hasHugeWorkingSetSize());
+  EXPECT_FALSE(PSI2.hasLargeWorkingSetSize());
+
+  // With PartialProfileRatio set (non-zero) and a large working set size.
+  auto M3 = makeLLVMModule("SampleProfile", /*NumCounts*/ 27493235,
+                           /*IsPartialProfile*/ 1,
+                           /*PartialProfileRatio*/ 0.9,
+                           /*HotNumCounts*/ 3102082,
+                           /*ColdNumCounts*/ 18306149);
+  ProfileSummaryInfo PSI3 = buildPSI(M3.get());
+  EXPECT_TRUE(PSI3.hasProfileSummary());
+  EXPECT_TRUE(PSI3.hasSampleProfile());
+  EXPECT_TRUE(PSI3.hasPartialSampleProfile());
+  EXPECT_TRUE(PSI3.hasHugeWorkingSetSize());
+  EXPECT_TRUE(PSI3.hasLargeWorkingSetSize());
+}
+
 } // end anonymous namespace
 } // end namespace llvm
diff --git a/llvm/unittests/IR/ModuleTest.cpp b/llvm/unittests/IR/ModuleTest.cpp
index 7b34d5d0ee554..67338f797d3ab 100644
--- a/llvm/unittests/IR/ModuleTest.cpp
+++ b/llvm/unittests/IR/ModuleTest.cpp
@@ -9,6 +9,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/RandomNumberGenerator.h"
 #include "gtest/gtest.h"
@@ -121,4 +122,40 @@ TEST(ModuleTest, setProfileSummary) {
   delete PS;
 }
 
+TEST(ModuleTest, setPartialSampleProfileRatio) {
+  const char *IRString = R"IR(
+  !llvm.module.flags = !{!0}
+
+  !0 = !{i32 1, !"ProfileSummary", !1}
+  !1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+  !2 = !{!"ProfileFormat", !"SampleProfile"}
+  !3 = !{!"TotalCount", i64 10000}
+  !4 = !{!"MaxCount", i64 10}
+  !5 = !{!"MaxInternalCount", i64 1}
+  !6 = !{!"MaxFunctionCount", i64 1000}
+  !7 = !{!"NumCounts", i64 200}
+  !8 = !{!"NumFunctions", i64 3}
+  !9 = !{!"IsPartialProfile", i64 1}
+  !10 = !{!"PartialProfileRatio", double 0.0}
+  !11 = !{!"DetailedSummary", !12}
+  !12 = !{!13, !14, !15}
+  !13 = !{i32 10000, i64 1000, i32 1}
+  !14 = !{i32 990000, i64 300, i32 10}
+  !15 = !{i32 999999, i64 5, i32 100}
+  )IR";
+
+  SMDiagnostic Err;
+  LLVMContext Context;
+  std::unique_ptr<Module> M = parseAssemblyString(IRString, Err, Context);
+  ModuleSummaryIndex Index(/*HaveGVs*/ false);
+  const unsigned BlockCount = 100;
+  const unsigned NumCounts = 200;
+  Index.setBlockCount(BlockCount);
+  M->setPartialSampleProfileRatio(Index);
+  double Ratio = (double)BlockCount / NumCounts;
+  std::unique_ptr<ProfileSummary> ProfileSummary(
+      ProfileSummary::getFromMD(M->getProfileSummary(/*IsCS*/ false)));
+  EXPECT_EQ(Ratio, ProfileSummary->getPartialProfileRatio());
+}
+
 } // end namespace

From f97a609b1763d7e343f52a7061727c4874bc26df Mon Sep 17 00:00:00 2001
From: Julian Lettner <julian.lettner@apple.com>
Date: Thu, 14 May 2020 13:43:33 -0700
Subject: [PATCH 733/770] [Darwin] Add and adopt a way to query the Darwin
 kernel version

This applies the learnings from [1].  What I intended as a simple
cleanup made me realize that the compiler-rt version checks have two
separate issues:

1) In some places (e.g., mmap flag setting) what matters is the kernel
   version, not the OS version.
2) OS version checks are implemented by querying the kernel version.
   This is not necessarily correct inside the simulators if the
   simulator runtime isn't aligned with the host macOS.

This commit tackles 1) by adopting a separate query function for the
Darwin kernel version.  2) (and cleanups) will be dealt with in
follow-ups.

[1] https://reviews.llvm.org/D78942

rdar://63031937

Reviewed By: delcypher

Differential Revision: https://reviews.llvm.org/D79965
---
 .../lib/sanitizer_common/sanitizer_mac.cpp    | 27 ++++++++++++++-----
 .../lib/sanitizer_common/sanitizer_mac.h      | 17 +++++++++++-
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index 85fc21a96232e..7550545ea6fa2 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -635,11 +635,7 @@ MacosVersion GetMacosVersionInternal() {
     case 14: return MACOS_VERSION_YOSEMITE;
     case 15: return MACOS_VERSION_EL_CAPITAN;
     case 16: return MACOS_VERSION_SIERRA;
-    case 17:
-      // Not a typo, 17.5 Darwin Kernel Version maps to High Sierra 10.13.4.
-      if (minor >= 5)
-        return MACOS_VERSION_HIGH_SIERRA_DOT_RELEASE_4;
-      return MACOS_VERSION_HIGH_SIERRA;
+    case 17: return MACOS_VERSION_HIGH_SIERRA;
     case 18: return MACOS_VERSION_MOJAVE;
     case 19: return MACOS_VERSION_CATALINA;
     default:
@@ -660,6 +656,23 @@ MacosVersion GetMacosVersion() {
   return result;
 }
 
+DarwinKernelVersion GetDarwinKernelVersion() {
+  char buf[100];
+  size_t len = sizeof(buf);
+  int res = internal_sysctlbyname("kern.osrelease", buf, &len, nullptr, 0);
+  CHECK_EQ(res, 0);
+
+  // Format: <major>.<minor>.<patch>\0 -- the shortest such string is 6 bytes.
+  CHECK_GE(len, 6);
+  const char *p = buf;
+  u16 major = internal_simple_strtoll(p, &p, /*base=*/10);
+  CHECK_EQ(*p, '.');
+  p += 1;
+  u16 minor = internal_simple_strtoll(p, &p, /*base=*/10);  // Patch is unused.
+
+  return DarwinKernelVersion(major, minor);
+}
+
 uptr GetRSS() {
   struct task_basic_info info;
   unsigned count = TASK_BASIC_INFO_COUNT;
@@ -796,10 +809,10 @@ void SignalContext::InitPcSpBp() {
 }
 
 void InitializePlatformEarly() {
-  // Only use xnu_fast_mmap when on x86_64 and the OS supports it.
+  // Only use xnu_fast_mmap when on x86_64 and the kernel supports it.
   use_xnu_fast_mmap =
 #if defined(__x86_64__)
-      GetMacosVersion() >= MACOS_VERSION_HIGH_SIERRA_DOT_RELEASE_4;
+      GetDarwinKernelVersion() >= DarwinKernelVersion(17, 5);
 #else
       false;
 #endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h
index 2226fcb5d03a8..34dc2c05dcf4b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h
@@ -40,13 +40,28 @@ enum MacosVersion {
   MACOS_VERSION_EL_CAPITAN,
   MACOS_VERSION_SIERRA,
   MACOS_VERSION_HIGH_SIERRA,
-  MACOS_VERSION_HIGH_SIERRA_DOT_RELEASE_4,
   MACOS_VERSION_MOJAVE,
   MACOS_VERSION_CATALINA,
   MACOS_VERSION_UNKNOWN_NEWER
 };
 
+struct DarwinKernelVersion {  // <major>.<minor> of the Darwin kernel.
+  u16 major;
+  u16 minor;
+
+  DarwinKernelVersion(u16 major, u16 minor) : major(major), minor(minor) {}
+  // Versions are ordered lexicographically: by major first, then by minor.
+  bool operator==(const DarwinKernelVersion &other) const {
+    return major == other.major && minor == other.minor;
+  }
+  bool operator>=(const DarwinKernelVersion &other) const {
+    return major > other.major ||  // Newer major wins regardless of minor.
+           (major == other.major && minor >= other.minor);
+  }
+};
+
 MacosVersion GetMacosVersion();
+DarwinKernelVersion GetDarwinKernelVersion();
 
 char **GetEnviron();
 

From 836c7dcf1238683ff18882affac1dae5ae5c5f79 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 27 May 2020 21:39:17 -0400
Subject: [PATCH 734/770] DAG: Fix getNode dropping flags if there's a glue
 output

The AMDGPU non-strict fdiv lowering needs to introduce an FP mode
switch in some cases, and has custom nodes to provide chain/glue for
the intermediate FP operations. We need to propagate nofpexcept here,
but getNode was dropping the flags.

Adding nofpexcept in the AMDGPU custom lowering is left to a future
patch.

Also fix a second case where flags were dropped, but in this case it
seems it just didn't handle this number of operands.

Test will be included in future AMDGPU patch.
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 232b1ea6bb78e..7e1991c866eb8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7444,6 +7444,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     createOperands(N, Ops);
   }
 
+  N->setFlags(Flags);
   InsertNode(N);
   SDValue V(N, 0);
   NewSDValueDbgMsg(V, "Creating new node: ", this);
@@ -7525,13 +7526,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
       return SDValue(E, 0);
 
     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
-    N->setFlags(Flags);
     createOperands(N, Ops);
     CSEMap.InsertNode(N, IP);
   } else {
     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
     createOperands(N, Ops);
   }
+
+  N->setFlags(Flags);
   InsertNode(N);
   SDValue V(N, 0);
   NewSDValueDbgMsg(V, "Creating new node: ", this);

From 2ecaf93525fe4b271117d3932118ecaccdacaa03 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Wed, 27 May 2020 18:19:54 -0700
Subject: [PATCH 735/770] [LiveDebugValues] Speed up removeEntryValue, NFC

Summary:
Instead of iterating over all VarLoc IDs in removeEntryValue(), just
iterate over the interval reserved for entry value VarLocs. This changes
the iteration order, hence the test update -- otherwise this is NFC.

This appears to give an ~8.5x wall time speed-up for LiveDebugValues when
compiling sqlite3.c 3.30.1 with a Release clang (on my machine):

```
          ---User Time---   --System Time--   --User+System--   ---Wall Time--- --- Name ---
  Before: 2.5402 ( 18.8%)   0.0050 (  0.4%)   2.5452 ( 17.3%)   2.5452 ( 17.3%) Live DEBUG_VALUE analysis
   After: 0.2364 (  2.1%)   0.0034 (  0.3%)   0.2399 (  2.0%)   0.2398 (  2.0%) Live DEBUG_VALUE analysis
```

The change in removeEntryValue() is the only one that appears to affect
wall time, but for consistency (and to resolve a pending TODO), I made
the analogous changes for iterating over SpillLocKind VarLocs.

Reviewers: nikic, aprantl, jmorse, djtodoro

Subscribers: hiraditya, dexonsmith, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80684
---
 llvm/include/llvm/ADT/CoalescingBitVector.h   |  14 ++
 llvm/lib/CodeGen/LiveDebugValues.cpp          | 124 ++++++++++++++----
 .../MIR/X86/entry-values-diamond-bbs.mir      |   6 +-
 .../unittests/ADT/CoalescingBitVectorTest.cpp |  55 ++++++++
 4 files changed, 170 insertions(+), 29 deletions(-)

diff --git a/llvm/include/llvm/ADT/CoalescingBitVector.h b/llvm/include/llvm/ADT/CoalescingBitVector.h
index 647857435b119..f8c8fec0ec9e7 100644
--- a/llvm/include/llvm/ADT/CoalescingBitVector.h
+++ b/llvm/include/llvm/ADT/CoalescingBitVector.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -352,6 +353,19 @@ template <typename IndexT, unsigned N = 16> class CoalescingBitVector {
     return It;
   }
 
+  /// Return a range iterator which iterates over all of the set bits in the
+  /// half-open range [Start, End).
+  iterator_range<const_iterator> half_open_range(IndexT Start,
+                                                 IndexT End) const {
+    assert(Start < End && "Not a valid range");
+    auto StartIt = find(Start);
+    if (StartIt == end() || *StartIt >= End)
+      return {end(), end()};
+    auto EndIt = StartIt;
+    EndIt.advanceToLowerBound(End);
+    return {StartIt, EndIt};
+  }
+
   void print(raw_ostream &OS) const {
     OS << "{";
     for (auto It = Intervals.begin(), End = Intervals.end(); It != End;
diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp
index 2d11a23e9ede4..abda001fa4ae7 100644
--- a/llvm/lib/CodeGen/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -137,16 +137,24 @@ using VarLocSet = CoalescingBitVector<uint64_t>;
 /// Why encode a location /into/ the VarLocMap index? This makes it possible
 /// to find the open VarLocs killed by a register def very quickly. This is a
 /// performance-critical operation for LiveDebugValues.
-///
-/// TODO: Consider adding reserved intervals for kinds of VarLocs other than
-/// RegisterKind, like SpillLocKind or EntryValueKind, to optimize iteration
-/// over open locations.
 struct LocIndex {
   uint32_t Location; // Physical registers live in the range [1;2^30) (see
                      // \ref MCRegister), so we have plenty of range left here
                      // to encode non-register locations.
   uint32_t Index;
 
+  /// The first location greater than 0 that is not reserved for VarLocs of
+  /// kind RegisterKind.
+  static constexpr uint32_t kFirstInvalidRegLocation = 1 << 30;
+
+  /// A special location reserved for VarLocs of kind SpillLocKind.
+  static constexpr uint32_t kSpillLocation = kFirstInvalidRegLocation;
+
+  /// A special location reserved for VarLocs of kind EntryValueBackupKind and
+  /// EntryValueCopyBackupKind.
+  static constexpr uint32_t kEntryValueBackupLocation =
+      kFirstInvalidRegLocation + 1;
+
   LocIndex(uint32_t Location, uint32_t Index)
       : Location(Location), Index(Index) {}
 
@@ -166,6 +174,14 @@ struct LocIndex {
   static uint64_t rawIndexForReg(uint32_t Reg) {
     return LocIndex(Reg, 0).getAsRawInteger();
   }
+
+  /// Return a range covering all set indices in the interval reserved for
+  /// \p Location in \p Set.
+  static auto indexRangeForLocation(const VarLocSet &Set, uint32_t Location) {
+    uint64_t Start = LocIndex(Location, 0).getAsRawInteger();
+    uint64_t End = LocIndex(Location + 1, 0).getAsRawInteger();
+    return Set.half_open_range(Start, End);
+  }
 };
 
 class LiveDebugValues : public MachineFunctionPass {
@@ -211,6 +227,9 @@ class LiveDebugValues : public MachineFunctionPass {
       bool operator==(const SpillLoc &Other) const {
         return SpillBase == Other.SpillBase && SpillOffset == Other.SpillOffset;
       }
+      bool operator!=(const SpillLoc &Other) const {
+        return !(*this == Other);
+      }
     };
 
     /// Identity of the variable at this location.
@@ -477,10 +496,27 @@ class LiveDebugValues : public MachineFunctionPass {
     /// location.
     SmallDenseMap<uint32_t, std::vector<VarLoc>> Loc2Vars;
 
+    /// Determine the 32-bit location reserved for \p VL, based on its kind.
+    static uint32_t getLocationForVar(const VarLoc &VL) {
+      switch (VL.Kind) {
+      case VarLoc::RegisterKind:
+        assert((VL.Loc.RegNo < LocIndex::kFirstInvalidRegLocation) &&
+               "Physreg out of range?");
+        return VL.Loc.RegNo;
+      case VarLoc::SpillLocKind:
+        return LocIndex::kSpillLocation;
+      case VarLoc::EntryValueBackupKind:
+      case VarLoc::EntryValueCopyBackupKind:
+        return LocIndex::kEntryValueBackupLocation;
+      default:
+        return 0;
+      }
+    }
+
   public:
     /// Retrieve a unique LocIndex for \p VL.
     LocIndex insert(const VarLoc &VL) {
-      uint32_t Location = VL.isDescribedByReg();
+      uint32_t Location = getLocationForVar(VL);
       uint32_t &Index = Var2Index[VL];
       if (!Index) {
         auto &Vars = Loc2Vars[Location];
@@ -577,6 +613,30 @@ class LiveDebugValues : public MachineFunctionPass {
              "open ranges are inconsistent");
       return VarLocs.empty();
     }
+
+    /// Get an empty range of VarLoc IDs.
+    auto getEmptyVarLocRange() const {
+      return iterator_range<VarLocSet::const_iterator>(getVarLocs().end(),
+                                                       getVarLocs().end());
+    }
+
+    /// Get all set IDs for VarLocs of kind RegisterKind in \p Reg.
+    auto getRegisterVarLocs(Register Reg) const {
+      return LocIndex::indexRangeForLocation(getVarLocs(), Reg);
+    }
+
+    /// Get all set IDs for VarLocs of kind SpillLocKind.
+    auto getSpillVarLocs() const {
+      return LocIndex::indexRangeForLocation(getVarLocs(),
+                                             LocIndex::kSpillLocation);
+    }
+
+    /// Get all set IDs for VarLocs of kind EntryValueBackupKind or
+    /// EntryValueCopyBackupKind.
+    auto getEntryValueBackupVarLocs() const {
+      return LocIndex::indexRangeForLocation(
+          getVarLocs(), LocIndex::kEntryValueBackupLocation);
+    }
   };
 
   /// Collect all VarLoc IDs from \p CollectFrom for VarLocs of kind
@@ -821,7 +881,10 @@ void LiveDebugValues::getUsedRegs(const VarLocSet &CollectFrom,
   // All register-based VarLocs are assigned indices greater than or equal to
   // FirstRegIndex.
   uint64_t FirstRegIndex = LocIndex::rawIndexForReg(1);
-  for (auto It = CollectFrom.find(FirstRegIndex), End = CollectFrom.end();
+  uint64_t FirstInvalidIndex =
+      LocIndex::rawIndexForReg(LocIndex::kFirstInvalidRegLocation);
+  for (auto It = CollectFrom.find(FirstRegIndex),
+            End = CollectFrom.find(FirstInvalidIndex);
        It != End;) {
     // We found a VarLoc ID for a VarLoc that lives in a register. Figure out
     // which register and add it to UsedRegs.
@@ -924,11 +987,8 @@ bool LiveDebugValues::removeEntryValue(const MachineInstr &MI,
   }
 
   if (TrySalvageEntryValue) {
-    for (uint64_t ID : OpenRanges.getVarLocs()) {
+    for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
       const VarLoc &VL = VarLocIDs[LocIndex::fromRawInteger(ID)];
-      if (!VL.isEntryBackupLoc())
-        continue;
-
       if (VL.getEntryValueCopyBackupReg() == Reg &&
           VL.MI.getOperand(0).getReg() == SrcRegOp->getReg())
         return false;
@@ -1259,10 +1319,11 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
   VarLocSet KillSet(Alloc);
   if (isSpillInstruction(MI, MF)) {
     Loc = extractSpillBaseRegAndOffset(MI);
-    for (uint64_t ID : OpenRanges.getVarLocs()) {
+    for (uint64_t ID : OpenRanges.getSpillVarLocs()) {
       LocIndex Idx = LocIndex::fromRawInteger(ID);
       const VarLoc &VL = VarLocIDs[Idx];
-      if (VL.Kind == VarLoc::SpillLocKind && VL.Loc.SpillLocation == *Loc) {
+      assert(VL.Kind == VarLoc::SpillLocKind && "Broken VarLocSet?");
+      if (VL.Loc.SpillLocation == *Loc) {
         // This location is overwritten by the current instruction -- terminate
         // the open range, and insert an explicit DBG_VALUE $noreg.
         //
@@ -1298,21 +1359,31 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
                       << "\n");
   }
   // Check if the register or spill location is the location of a debug value.
-  for (uint64_t ID : OpenRanges.getVarLocs()) {
+  auto TransferCandidates = OpenRanges.getEmptyVarLocRange();
+  if (TKind == TransferKind::TransferSpill)
+    TransferCandidates = OpenRanges.getRegisterVarLocs(Reg);
+  else if (TKind == TransferKind::TransferRestore)
+    TransferCandidates = OpenRanges.getSpillVarLocs();
+  for (uint64_t ID : TransferCandidates) {
     LocIndex Idx = LocIndex::fromRawInteger(ID);
     const VarLoc &VL = VarLocIDs[Idx];
-    if (TKind == TransferKind::TransferSpill && VL.isDescribedByReg() == Reg) {
+    if (TKind == TransferKind::TransferSpill) {
+      assert(VL.isDescribedByReg() == Reg && "Broken VarLocSet?");
       LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
                         << VL.Var.getVariable()->getName() << ")\n");
-    } else if (TKind == TransferKind::TransferRestore &&
-               VL.Kind == VarLoc::SpillLocKind &&
-               VL.Loc.SpillLocation == *Loc) {
+    } else {
+      assert(TKind == TransferKind::TransferRestore &&
+             VL.Kind == VarLoc::SpillLocKind && "Broken VarLocSet?");
+      if (VL.Loc.SpillLocation != *Loc)
+        // The spill location is not the location of a debug value.
+        continue;
       LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '('
                         << VL.Var.getVariable()->getName() << ")\n");
-    } else
-      continue;
+    }
     insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, Idx, TKind,
                             Reg);
+    // FIXME: A comment should explain why it's correct to return early here,
+    // if that is in fact correct.
     return;
   }
 }
@@ -1356,7 +1427,7 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
   // a parameter describing only a moving of the value around, rather then
   // modifying it, we are still able to use the entry value if needed.
   if (isRegOtherThanSPAndFP(*DestRegOp, MI, TRI)) {
-    for (uint64_t ID : OpenRanges.getVarLocs()) {
+    for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
       LocIndex Idx = LocIndex::fromRawInteger(ID);
       const VarLoc &VL = VarLocIDs[Idx];
       if (VL.getEntryValueBackupReg() == SrcReg) {
@@ -1378,13 +1449,14 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
   if (!SrcRegOp->isKill())
     return;
 
-  for (uint64_t ID : OpenRanges.getVarLocs()) {
+  for (uint64_t ID : OpenRanges.getRegisterVarLocs(SrcReg)) {
     LocIndex Idx = LocIndex::fromRawInteger(ID);
-    if (VarLocIDs[Idx].isDescribedByReg() == SrcReg) {
-      insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, Idx,
-                              TransferKind::TransferCopy, DestReg);
-      return;
-    }
+    assert(VarLocIDs[Idx].isDescribedByReg() == SrcReg && "Broken VarLocSet?");
+    insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, Idx,
+                            TransferKind::TransferCopy, DestReg);
+    // FIXME: A comment should explain why it's correct to return early here,
+    // if that is in fact correct.
+    return;
   }
 }
 
diff --git a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir
index fc7bd93d0223c..734ae7127a804 100644
--- a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir
+++ b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir
@@ -23,9 +23,9 @@
 # CHECK-NEXT: $ebx = MOV32ri 2
 # CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1)
 # CHECK: bb.3.if.end
-# CHECK-NEXT: DBG_VALUE $edx, $noreg, ![[ARG_Q]], !DIExpression()
-# CHECK-NEXT: DBG_VALUE $ebp, $noreg, ![[ARG_C]], !DIExpression()
-# CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1)
+# CHECK-DAG: DBG_VALUE $edx, $noreg, ![[ARG_Q]], !DIExpression()
+# CHECK-DAG: DBG_VALUE $ebp, $noreg, ![[ARG_C]], !DIExpression()
+# CHECK-DAG: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1)
 --- |
   ; ModuleID = 'test.c'
   source_filename = "test.c"
diff --git a/llvm/unittests/ADT/CoalescingBitVectorTest.cpp b/llvm/unittests/ADT/CoalescingBitVectorTest.cpp
index 4f87bf415bebb..355426c4d84ec 100644
--- a/llvm/unittests/ADT/CoalescingBitVectorTest.cpp
+++ b/llvm/unittests/ADT/CoalescingBitVectorTest.cpp
@@ -31,6 +31,11 @@ bool elementsMatch(const UBitVec &BV, std::initializer_list<unsigned> List) {
   return true;
 }
 
+bool rangesMatch(iterator_range<UBitVec::const_iterator> R,
+                 std::initializer_list<unsigned> List) {
+  return std::equal(R.begin(), R.end(), List.begin(), List.end());
+}
+
 TEST(CoalescingBitVectorTest, Set) {
   UBitVec::Allocator Alloc;
   UBitVec BV1(Alloc);
@@ -486,6 +491,56 @@ TEST(CoalescingBitVectorTest, AdvanceToLowerBound) {
   EXPECT_TRUE(It == BV.end());
 }
 
+TEST(CoalescingBitVectorTest, HalfOpenRange) {
+  UBitVec::Allocator Alloc;
+
+  {
+    UBitVec BV(Alloc);
+    BV.set({1, 2, 3});
+
+    EXPECT_EQ(*BV.find(0), 1U); // find(Start) > Start
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(0, 5), {1, 2, 3}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(1, 4), {1, 2, 3}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(1, 3), {1, 2}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(2, 3), {2}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(2, 4), {2, 3}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(4, 5), {}));
+  }
+
+  {
+    UBitVec BV(Alloc);
+    BV.set({1, 2, 11, 12});
+
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(0, 15), {1, 2, 11, 12}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(1, 13), {1, 2, 11, 12}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(1, 12), {1, 2, 11}));
+
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(0, 5), {1, 2}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(1, 5), {1, 2}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(2, 5), {2}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(1, 2), {1}));
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(13, 14), {}));
+
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(2, 10), {2}));
+  }
+
+  {
+    UBitVec BV(Alloc);
+    BV.set({1});
+
+    EXPECT_EQ(*BV.find(0), 1U); // find(Start) == End
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(0, 1), {}));
+  }
+
+  {
+    UBitVec BV(Alloc);
+    BV.set({5});
+
+    EXPECT_EQ(*BV.find(3), 5U); // find(Start) > End
+    EXPECT_TRUE(rangesMatch(BV.half_open_range(3, 4), {}));
+  }
+}
+
 TEST(CoalescingBitVectorTest, Print) {
   std::string S;
   {

From 11c617c417766c7ff36a8fefb9bd2b608c971e19 Mon Sep 17 00:00:00 2001
From: Vedant Kumar <vsk@apple.com>
Date: Mon, 1 Jun 2020 11:01:49 -0700
Subject: [PATCH 736/770] [LiveDebugValues] Add
 LocIndex::u32_{location,index}_t types for readability, NFC

This is per Adrian's suggestion in https://reviews.llvm.org/D80684.
---
 llvm/lib/CodeGen/LiveDebugValues.cpp | 35 ++++++++++++++++------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp
index abda001fa4ae7..6f0f9bd01ed3a 100644
--- a/llvm/lib/CodeGen/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -138,24 +138,27 @@ using VarLocSet = CoalescingBitVector<uint64_t>;
 /// to find the open VarLocs killed by a register def very quickly. This is a
 /// performance-critical operation for LiveDebugValues.
 struct LocIndex {
-  uint32_t Location; // Physical registers live in the range [1;2^30) (see
-                     // \ref MCRegister), so we have plenty of range left here
-                     // to encode non-register locations.
-  uint32_t Index;
+  using u32_location_t = uint32_t;
+  using u32_index_t = uint32_t;
+
+  u32_location_t Location; // Physical registers live in the range [1;2^30) (see
+                           // \ref MCRegister), so we have plenty of range left
+                           // here to encode non-register locations.
+  u32_index_t Index;
 
   /// The first location greater than 0 that is not reserved for VarLocs of
   /// kind RegisterKind.
-  static constexpr uint32_t kFirstInvalidRegLocation = 1 << 30;
+  static constexpr u32_location_t kFirstInvalidRegLocation = 1 << 30;
 
   /// A special location reserved for VarLocs of kind SpillLocKind.
-  static constexpr uint32_t kSpillLocation = kFirstInvalidRegLocation;
+  static constexpr u32_location_t kSpillLocation = kFirstInvalidRegLocation;
 
   /// A special location reserved for VarLocs of kind EntryValueBackupKind and
   /// EntryValueCopyBackupKind.
-  static constexpr uint32_t kEntryValueBackupLocation =
+  static constexpr u32_location_t kEntryValueBackupLocation =
       kFirstInvalidRegLocation + 1;
 
-  LocIndex(uint32_t Location, uint32_t Index)
+  LocIndex(u32_location_t Location, u32_index_t Index)
       : Location(Location), Index(Index) {}
 
   uint64_t getAsRawInteger() const {
@@ -166,7 +169,8 @@ struct LocIndex {
     static_assert(std::is_unsigned<IntT>::value &&
                       sizeof(ID) == sizeof(uint64_t),
                   "Cannot convert raw integer to LocIndex");
-    return {static_cast<uint32_t>(ID >> 32), static_cast<uint32_t>(ID)};
+    return {static_cast<u32_location_t>(ID >> 32),
+            static_cast<u32_index_t>(ID)};
   }
 
   /// Get the start of the interval reserved for VarLocs of kind RegisterKind
@@ -177,7 +181,8 @@ struct LocIndex {
 
   /// Return a range covering all set indices in the interval reserved for
   /// \p Location in \p Set.
-  static auto indexRangeForLocation(const VarLocSet &Set, uint32_t Location) {
+  static auto indexRangeForLocation(const VarLocSet &Set,
+                                    u32_location_t Location) {
     uint64_t Start = LocIndex(Location, 0).getAsRawInteger();
     uint64_t End = LocIndex(Location + 1, 0).getAsRawInteger();
     return Set.half_open_range(Start, End);
@@ -490,14 +495,14 @@ class LiveDebugValues : public MachineFunctionPass {
   class VarLocMap {
     /// Map a VarLoc to an index within the vector reserved for its location
     /// within Loc2Vars.
-    std::map<VarLoc, uint32_t> Var2Index;
+    std::map<VarLoc, LocIndex::u32_index_t> Var2Index;
 
     /// Map a location to a vector which holds VarLocs which live in that
     /// location.
-    SmallDenseMap<uint32_t, std::vector<VarLoc>> Loc2Vars;
+    SmallDenseMap<LocIndex::u32_location_t, std::vector<VarLoc>> Loc2Vars;
 
     /// Determine the 32-bit location reserved for \p VL, based on its kind.
-    static uint32_t getLocationForVar(const VarLoc &VL) {
+    static LocIndex::u32_location_t getLocationForVar(const VarLoc &VL) {
       switch (VL.Kind) {
       case VarLoc::RegisterKind:
         assert((VL.Loc.RegNo < LocIndex::kFirstInvalidRegLocation) &&
@@ -516,8 +521,8 @@ class LiveDebugValues : public MachineFunctionPass {
   public:
     /// Retrieve a unique LocIndex for \p VL.
     LocIndex insert(const VarLoc &VL) {
-      uint32_t Location = getLocationForVar(VL);
-      uint32_t &Index = Var2Index[VL];
+      LocIndex::u32_location_t Location = getLocationForVar(VL);
+      LocIndex::u32_index_t &Index = Var2Index[VL];
       if (!Index) {
         auto &Vars = Loc2Vars[Location];
         Vars.push_back(VL);

From 89d48ccabe6a950369b2bd922b1d8e987b856ac7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 27 May 2020 21:41:52 -0400
Subject: [PATCH 737/770] AMDGPU: Fix not emitting nofpexcept on fdiv expansion

In this awkward case, we have to emit custom pseudo-constrained FP
wrappers. InstrEmitter concludes that since a mayRaiseFPException
instruction had a chain, it can't add nofpexcept.

Test deferred until mayRaiseFPException is really set on everything.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 55 ++++++++++++++---------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 08effeea18126..ffbf8529fdeb0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7907,9 +7907,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
 }
 
 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
-                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+                          EVT VT, SDValue A, SDValue B, SDValue GlueChain,
+                          SDNodeFlags Flags) {
   if (GlueChain->getNumValues() <= 1) {
-    return DAG.getNode(Opcode, SL, VT, A, B);
+    return DAG.getNode(Opcode, SL, VT, A, B, Flags);
   }
 
   assert(GlueChain->getNumValues() == 3);
@@ -7922,15 +7923,16 @@ static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
     break;
   }
 
-  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
-                     GlueChain.getValue(2));
+  return DAG.getNode(Opcode, SL, VTList,
+                     {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
+                     Flags);
 }
 
 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                            EVT VT, SDValue A, SDValue B, SDValue C,
-                           SDValue GlueChain) {
+                           SDValue GlueChain, SDNodeFlags Flags) {
   if (GlueChain->getNumValues() <= 1) {
-    return DAG.getNode(Opcode, SL, VT, A, B, C);
+    return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
   }
 
   assert(GlueChain->getNumValues() == 3);
@@ -7943,8 +7945,9 @@ static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
     break;
   }
 
-  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
-                     GlueChain.getValue(2));
+  return DAG.getNode(Opcode, SL, VTList,
+                     {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
+                     Flags);
 }
 
 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
@@ -8018,6 +8021,13 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
     return FastLowered;
 
+  // The selection matcher assumes anything with a chain that selects to a
+  // mayRaiseFPException machine instruction may raise an FP exception. Since
+  // we're introducing a chain here, we need to explicitly report nofpexcept
+  // for the regular fdiv lowering.
+  SDNodeFlags Flags = Op->getFlags();
+  Flags.setNoFPExcept(true);
+
   SDLoc SL(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -8027,15 +8037,15 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
 
   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
-                                          RHS, RHS, LHS);
+                                          {RHS, RHS, LHS}, Flags);
   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
-                                        LHS, RHS, LHS);
+                                        {LHS, RHS, LHS}, Flags);
 
   // Denominator is scaled to not be denormal, so using rcp is ok.
   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
-                                  DenominatorScaled);
+                                  DenominatorScaled, Flags);
   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
-                                     DenominatorScaled);
+                                     DenominatorScaled, Flags);
 
   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
@@ -8045,6 +8055,10 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
 
   if (!HasFP32Denormals) {
+    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
+    // lowering. The chain dependence is insufficient, and we need glue. We do
+    // not need the glue variants in a strictfp function.
+
     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
 
     SDNode *EnableDenorm;
@@ -8072,21 +8086,22 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   }
 
   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
-                             ApproxRcp, One, NegDivScale0);
+                             ApproxRcp, One, NegDivScale0, Flags);
 
   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
-                             ApproxRcp, Fma0);
+                             ApproxRcp, Fma0, Flags);
 
   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
-                           Fma1, Fma1);
+                           Fma1, Fma1, Flags);
 
   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
-                             NumeratorScaled, Mul);
+                             NumeratorScaled, Mul, Flags);
 
-  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
+                             Fma2, Fma1, Mul, Fma2, Flags);
 
   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
-                             NumeratorScaled, Fma3);
+                             NumeratorScaled, Fma3, Flags);
 
   if (!HasFP32Denormals) {
     SDNode *DisableDenorm;
@@ -8113,9 +8128,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue Scale = NumeratorScaled.getValue(1);
   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
-                             Fma4, Fma1, Fma3, Scale);
+                             {Fma4, Fma1, Fma3, Scale}, Flags);
 
-  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
 }
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {

From 64a06b53e202217868117b2e3172b1e96a362a60 Mon Sep 17 00:00:00 2001
From: Viktoria Maksimova <viktoria.maksimova@intel.com>
Date: Tue, 19 May 2020 21:18:32 +0300
Subject: [PATCH 738/770] Fix translation of undefined function call when debug
 info is present

This fix resolves issue:
https://github.com/KhronosGroup/SPIRV-LLVM-Translator/issues/524

Function translation invokes translation of LLVM loop metadata as its last
step. SPIRVToLLVM keeps a map of loop metadata. If a function with loop
metadata is forward-referenced, its translation starts and the metadata is
placed in the map. If this function contains calls to undefined functions,
translation fails: the metadata map is not empty, but there is no loop info
inside the undefined function.

Example:
```
float bar(int x);

__kernel void foo(__global float* outPtr, int i) {
  for (int j = 0; j < i; ++j) {
    outPtr[j] = bar(j);
  }
}
```
---
 llvm-spirv/lib/SPIRV/SPIRVReader.cpp          |   4 +
 .../test/DebugInfo/Generic/undef-func-call.ll | 100 ++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 llvm-spirv/test/DebugInfo/Generic/undef-func-call.ll

diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
index a677986195ee9..54353dfc3dd99 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
@@ -837,6 +837,10 @@ void SPIRVToLLVM::transLLVMLoopMetadata(const Function *F) {
   if (FuncLoopMetadataMap.empty())
     return;
 
+  // Function declaration doesn't contain loop metadata.
+  if (F->isDeclaration())
+    return;
+
   DominatorTree DomTree(*(const_cast<Function *>(F)));
   LoopInfo LI(DomTree);
 
diff --git a/llvm-spirv/test/DebugInfo/Generic/undef-func-call.ll b/llvm-spirv/test/DebugInfo/Generic/undef-func-call.ll
new file mode 100644
index 0000000000000..315a01d3bbcf6
--- /dev/null
+++ b/llvm-spirv/test/DebugInfo/Generic/undef-func-call.ll
@@ -0,0 +1,100 @@
+; RUN: llvm-as < %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv -r %t.spv
+
+; This is a regression test for the reported issue https://github.com/KhronosGroup/SPIRV-LLVM-Translator/issues/524.
+; The test checks that reverse translation does not fail with an assertion.
+
+; Built from the following source with clang -c -emit-llvm -O0 -target spir64 -gline-tables-only
+; float bar(int x);
+
+; __kernel void foo(__global float* outPtr, int i) {
+; #pragma clang loop unroll(enable)
+;   for (int j = 0; j < i; ++j) {
+;     outPtr[j] = bar(j);
+;   }
+; }
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @foo(float addrspace(1)* %outPtr, i32 %i) #0 !dbg !9 !kernel_arg_addr_space !6 !kernel_arg_access_qual !11 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !13 {
+entry:
+  %outPtr.addr = alloca float addrspace(1)*, align 8
+  %i.addr = alloca i32, align 4
+  %j = alloca i32, align 4
+  store float addrspace(1)* %outPtr, float addrspace(1)** %outPtr.addr, align 8
+  store i32 %i, i32* %i.addr, align 4
+  store i32 0, i32* %j, align 4, !dbg !14
+  br label %for.cond, !dbg !15
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %j, align 4, !dbg !16
+  %1 = load i32, i32* %i.addr, align 4, !dbg !17
+  %cmp = icmp slt i32 %0, %1, !dbg !18
+  br i1 %cmp, label %for.body, label %for.end, !dbg !19
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, i32* %j, align 4, !dbg !20
+  %call = call spir_func float @bar(i32 %2) #2, !dbg !21
+  %3 = load float addrspace(1)*, float addrspace(1)** %outPtr.addr, align 8, !dbg !22
+  %4 = load i32, i32* %j, align 4, !dbg !23
+  %idxprom = sext i32 %4 to i64, !dbg !22
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %3, i64 %idxprom, !dbg !22
+  store float %call, float addrspace(1)* %arrayidx, align 4, !dbg !24
+  br label %for.inc, !dbg !25
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32, i32* %j, align 4, !dbg !26
+  %inc = add nsw i32 %5, 1, !dbg !26
+  store i32 %inc, i32* %j, align 4, !dbg !26
+  br label %for.cond, !dbg !19, !llvm.loop !27
+
+for.end:                                          ; preds = %for.cond
+  ret void, !dbg !29
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func float @bar(i32) #1
+
+attributes #0 = { convergent noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!opencl.ocl.version = !{!6}
+!opencl.spir.version = !{!7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (https://github.com/llvm/llvm-project.git 6671a81bc71cc2635c5a10d6f688fea46ca4e5d6)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "loop.cl", directory: "/export/users/work/khr_spirv/llvm/build/bin")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 1, i32 0}
+!7 = !{i32 1, i32 2}
+!8 = !{!"clang version 11.0.0"}
+!9 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !10, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!10 = !DISubroutineType(types: !2)
+!11 = !{!"none", !"none"}
+!12 = !{!"float*", !"int"}
+!13 = !{!"", !""}
+!14 = !DILocation(line: 5, column: 12, scope: !9)
+!15 = !DILocation(line: 5, column: 8, scope: !9)
+!16 = !DILocation(line: 5, column: 19, scope: !9)
+!17 = !DILocation(line: 5, column: 23, scope: !9)
+!18 = !DILocation(line: 5, column: 21, scope: !9)
+!19 = !DILocation(line: 5, column: 3, scope: !9)
+!20 = !DILocation(line: 6, column: 21, scope: !9)
+!21 = !DILocation(line: 6, column: 17, scope: !9)
+!22 = !DILocation(line: 6, column: 5, scope: !9)
+!23 = !DILocation(line: 6, column: 12, scope: !9)
+!24 = !DILocation(line: 6, column: 15, scope: !9)
+!25 = !DILocation(line: 7, column: 3, scope: !9)
+!26 = !DILocation(line: 5, column: 26, scope: !9)
+!27 = distinct !{!27, !19, !25, !28}
+!28 = !{!"llvm.loop.unroll.enable"}
+!29 = !DILocation(line: 8, column: 1, scope: !9)

From edd5461debd9e10e19480cc7bf7d17f37b1e49cf Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Thu, 12 Mar 2020 15:22:02 -0700
Subject: [PATCH 739/770] added cl_khr_subgroup_extended_types and
 cl_khr_subgroup_non_uniform_vote

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp         |   5 +-
 .../lib/SPIRV/libSPIRV/SPIRVInstruction.h     |  33 +
 .../lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h      |  32 +
 llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp       | 705 +++++++++++++++++-
 4 files changed, 760 insertions(+), 15 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 33fe7e8e7e05a..3c6971836b9fb 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -965,14 +965,15 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
 
   bool IsGroupAllAny = (DemangledName.find("_all") != std::string::npos ||
                         DemangledName.find("_any") != std::string::npos);
+  bool IsGroupAllEqual = DemangledName.find("_all_equal") != std::string::npos;
 
   auto Consts = getInt32(M, PreOps);
   OCLBuiltinTransInfo Info;
-  if (IsGroupAllAny)
+  if (IsGroupAllAny || IsGroupAllEqual)
     Info.RetTy = Type::getInt1Ty(*Ctx);
   Info.UniqName = DemangledName;
   Info.PostProc = [=](std::vector<Value *> &Ops) {
-    if (IsGroupAllAny) {
+    if (IsGroupAllAny && !IsGroupAllEqual) {
       IRBuilder<> IRB(CI);
       Ops[0] =
           IRB.CreateICmpNE(Ops[0], ConstantInt::get(Type::getInt32Ty(*Ctx), 0));
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 8f28bd3265556..d0af944004367 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -2516,6 +2516,39 @@ _SPIRV_OP(GroupReserveReadPipePackets, true, 8)
 _SPIRV_OP(GroupReserveWritePipePackets, true, 8)
 _SPIRV_OP(GroupCommitReadPipe, false, 6)
 _SPIRV_OP(GroupCommitWritePipe, false, 6)
+_SPIRV_OP(GroupNonUniformElect, true, 4)
+_SPIRV_OP(GroupNonUniformAll, true, 5)
+_SPIRV_OP(GroupNonUniformAny, true, 5)
+_SPIRV_OP(GroupNonUniformAllEqual, true, 5)
+_SPIRV_OP(GroupNonUniformBroadcast, true, 6)
+_SPIRV_OP(GroupNonUniformBroadcastFirst, true, 5)
+_SPIRV_OP(GroupNonUniformBallot, true, 5)
+_SPIRV_OP(GroupNonUniformInverseBallot, true, 5)
+_SPIRV_OP(GroupNonUniformBallotBitExtract, true, 6)
+_SPIRV_OP(GroupNonUniformBallotBitCount, true, 6)
+_SPIRV_OP(GroupNonUniformBallotFindLSB, true, 5)
+_SPIRV_OP(GroupNonUniformBallotFindMSB, true, 5)
+_SPIRV_OP(GroupNonUniformShuffle, true, 6)
+_SPIRV_OP(GroupNonUniformShuffleXor, true, 6)
+_SPIRV_OP(GroupNonUniformShuffleUp, true, 6)
+_SPIRV_OP(GroupNonUniformShuffleDown, true, 6)
+_SPIRV_OP(GroupNonUniformIAdd, true, 6, true)
+_SPIRV_OP(GroupNonUniformFAdd, true, 6, true)
+_SPIRV_OP(GroupNonUniformIMul, true, 6, true)
+_SPIRV_OP(GroupNonUniformFMul, true, 6, true)
+_SPIRV_OP(GroupNonUniformSMin, true, 6, true)
+_SPIRV_OP(GroupNonUniformUMin, true, 6, true)
+_SPIRV_OP(GroupNonUniformFMin, true, 6, true)
+_SPIRV_OP(GroupNonUniformSMax, true, 6, true)
+_SPIRV_OP(GroupNonUniformUMax, true, 6, true)
+_SPIRV_OP(GroupNonUniformFMax, true, 6, true)
+_SPIRV_OP(GroupNonUniformBitwiseAnd, true, 6, true)
+_SPIRV_OP(GroupNonUniformBitwiseOr, true, 6, true)
+_SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, true)
+_SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, true)
+_SPIRV_OP(GroupNonUniformLogicalOr, true, 6, true)
+_SPIRV_OP(GroupNonUniformLogicalXor, true, 6, true)
+
 #undef _SPIRV_OP
 
 class SPIRVBlockingPipesIntelInst : public SPIRVInstTemplateBase {
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h
index e5f02881bdfb5..a0e6643b98f0d 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h
@@ -294,6 +294,38 @@ _SPIRV_OP(AtomicFlagClear, 319)
 _SPIRV_OP(TypePipeStorage, 322)
 _SPIRV_OP(ConstantPipeStorage, 323)
 _SPIRV_OP(CreatePipeFromPipeStorage, 324)
+_SPIRV_OP(GroupNonUniformElect, 333)
+_SPIRV_OP(GroupNonUniformAll, 334)
+_SPIRV_OP(GroupNonUniformAny, 335)
+_SPIRV_OP(GroupNonUniformAllEqual, 336)
+_SPIRV_OP(GroupNonUniformBroadcast, 337)
+_SPIRV_OP(GroupNonUniformBroadcastFirst, 338)
+_SPIRV_OP(GroupNonUniformBallot, 339)
+_SPIRV_OP(GroupNonUniformInverseBallot, 340)
+_SPIRV_OP(GroupNonUniformBallotBitExtract, 341)
+_SPIRV_OP(GroupNonUniformBallotBitCount, 342)
+_SPIRV_OP(GroupNonUniformBallotFindLSB, 343)
+_SPIRV_OP(GroupNonUniformBallotFindMSB, 344)
+_SPIRV_OP(GroupNonUniformShuffle, 345)
+_SPIRV_OP(GroupNonUniformShuffleXor, 346)
+_SPIRV_OP(GroupNonUniformShuffleUp, 347)
+_SPIRV_OP(GroupNonUniformShuffleDown, 348)
+_SPIRV_OP(GroupNonUniformIAdd, 349)
+_SPIRV_OP(GroupNonUniformFAdd, 350)
+_SPIRV_OP(GroupNonUniformIMul, 351)
+_SPIRV_OP(GroupNonUniformFMul, 352)
+_SPIRV_OP(GroupNonUniformSMin, 353)
+_SPIRV_OP(GroupNonUniformUMin, 354)
+_SPIRV_OP(GroupNonUniformFMin, 355)
+_SPIRV_OP(GroupNonUniformSMax, 356)
+_SPIRV_OP(GroupNonUniformUMax, 357)
+_SPIRV_OP(GroupNonUniformFMax, 358)
+_SPIRV_OP(GroupNonUniformBitwiseAnd, 359)
+_SPIRV_OP(GroupNonUniformBitwiseOr, 360)
+_SPIRV_OP(GroupNonUniformBitwiseXor, 361)
+_SPIRV_OP(GroupNonUniformLogicalAnd, 362)
+_SPIRV_OP(GroupNonUniformLogicalOr, 363)
+_SPIRV_OP(GroupNonUniformLogicalXor, 364)
 _SPIRV_OP(Forward, 1024)
 _SPIRV_OP(SubgroupShuffleINTEL, 5571)
 _SPIRV_OP(SubgroupShuffleDownINTEL, 5572)
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp b/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp
index 1db39409e5104..f432a85c80177 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp
@@ -1,19 +1,19 @@
-// Copyright (c) 2014-2017 The Khronos Group Inc.
-//
+// Copyright (c) 2014-2019 The Khronos Group Inc.
+// 
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and/or associated documentation files (the "Materials"),
 // to deal in the Materials without restriction, including without limitation
 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Materials, and to permit persons to whom the
 // Materials are furnished to do so, subject to the following conditions:
-//
+// 
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Materials.
-//
+// 
 // MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS
 // STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND
-// HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/
-//
+// HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ 
+// 
 // THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
@@ -26,14 +26,17 @@
 // the Binary Section of the SPIR-V specification.
 
 // Enumeration tokens for SPIR-V, in various styles:
-//   C, C++, C++11, JSON, Lua, Python
-//
+//   C, C++, C++11, JSON, Lua, Python, C#, D
+// 
 // - C will have tokens with a "Spv" prefix, e.g.: SpvSourceLanguageGLSL
 // - C++ will have tokens in the "spv" name space, e.g.: spv::SourceLanguageGLSL
 // - C++11 will use enum classes in the spv namespace, e.g.: spv::SourceLanguage::GLSL
 // - Lua will use tables, e.g.: spv.SourceLanguage.GLSL
 // - Python will use dictionaries, e.g.: spv['SourceLanguage']['GLSL']
-//
+// - C# will use enum classes in the Specification class located in the "Spv" namespace,
+//     e.g.: Spv.Specification.SourceLanguage.GLSL
+// - D will have tokens under the "spv" module, e.g: spv.SourceLanguage.GLSL
+// 
 // Some tokens act like mask values, which can be OR'd together,
 // while others are mutually exclusive.  The mask-like ones have
 // "Mask" in their name, and a parallel enum that has the shift
@@ -46,12 +49,12 @@ namespace spv {
 
 typedef unsigned int Id;
 
-#define SPV_VERSION 0x10100
-#define SPV_REVISION 7
+#define SPV_VERSION 0x10500
+#define SPV_REVISION 1
 
 static const unsigned int MagicNumber = 0x07230203;
-static const unsigned int Version = 0x00010100;
-static const unsigned int Revision = 7;
+static const unsigned int Version = 0x00010500;
+static const unsigned int Revision = 1;
 static const unsigned int OpCodeMask = 0xffff;
 static const unsigned int WordCountShift = 16;
 
@@ -73,6 +76,14 @@ enum ExecutionModel {
     ExecutionModelFragment = 4,
     ExecutionModelGLCompute = 5,
     ExecutionModelKernel = 6,
+    ExecutionModelTaskNV = 5267,
+    ExecutionModelMeshNV = 5268,
+    ExecutionModelRayGenerationNV = 5313,
+    ExecutionModelIntersectionNV = 5314,
+    ExecutionModelAnyHitNV = 5315,
+    ExecutionModelClosestHitNV = 5316,
+    ExecutionModelMissNV = 5317,
+    ExecutionModelCallableNV = 5318,
     ExecutionModelMax = 0x7fffffff,
 };
 
@@ -80,6 +91,8 @@ enum AddressingModel {
     AddressingModelLogical = 0,
     AddressingModelPhysical32 = 1,
     AddressingModelPhysical64 = 2,
+    AddressingModelPhysicalStorageBuffer64 = 5348,
+    AddressingModelPhysicalStorageBuffer64EXT = 5348,
     AddressingModelMax = 0x7fffffff,
 };
 
@@ -87,6 +100,8 @@ enum MemoryModel {
     MemoryModelSimple = 0,
     MemoryModelGLSL450 = 1,
     MemoryModelOpenCL = 2,
+    MemoryModelVulkan = 3,
+    MemoryModelVulkanKHR = 3,
     MemoryModelMax = 0x7fffffff,
 };
 
@@ -126,6 +141,27 @@ enum ExecutionMode {
     ExecutionModeFinalizer = 34,
     ExecutionModeSubgroupSize = 35,
     ExecutionModeSubgroupsPerWorkgroup = 36,
+    ExecutionModeSubgroupsPerWorkgroupId = 37,
+    ExecutionModeLocalSizeId = 38,
+    ExecutionModeLocalSizeHintId = 39,
+    ExecutionModePostDepthCoverage = 4446,
+    ExecutionModeDenormPreserve = 4459,
+    ExecutionModeDenormFlushToZero = 4460,
+    ExecutionModeSignedZeroInfNanPreserve = 4461,
+    ExecutionModeRoundingModeRTE = 4462,
+    ExecutionModeRoundingModeRTZ = 4463,
+    ExecutionModeStencilRefReplacingEXT = 5027,
+    ExecutionModeOutputLinesNV = 5269,
+    ExecutionModeOutputPrimitivesNV = 5270,
+    ExecutionModeDerivativeGroupQuadsNV = 5289,
+    ExecutionModeDerivativeGroupLinearNV = 5290,
+    ExecutionModeOutputTrianglesNV = 5298,
+    ExecutionModePixelInterlockOrderedEXT = 5366,
+    ExecutionModePixelInterlockUnorderedEXT = 5367,
+    ExecutionModeSampleInterlockOrderedEXT = 5368,
+    ExecutionModeSampleInterlockUnorderedEXT = 5369,
+    ExecutionModeShadingRateInterlockOrderedEXT = 5370,
+    ExecutionModeShadingRateInterlockUnorderedEXT = 5371,
     ExecutionModeMaxWorkgroupSizeINTEL = 5893,
     ExecutionModeMaxWorkDimINTEL = 5894,
     ExecutionModeNoGlobalOffsetINTEL = 5895,
@@ -147,6 +183,14 @@ enum StorageClass {
     StorageClassAtomicCounter = 10,
     StorageClassImage = 11,
     StorageClassStorageBuffer = 12,
+    StorageClassCallableDataNV = 5328,
+    StorageClassIncomingCallableDataNV = 5329,
+    StorageClassRayPayloadNV = 5338,
+    StorageClassHitAttributeNV = 5339,
+    StorageClassIncomingRayPayloadNV = 5342,
+    StorageClassShaderRecordBufferNV = 5343,
+    StorageClassPhysicalStorageBuffer = 5349,
+    StorageClassPhysicalStorageBufferEXT = 5349,
     StorageClassMax = 0x7fffffff,
 };
 
@@ -274,6 +318,16 @@ enum ImageOperandsShift {
     ImageOperandsConstOffsetsShift = 5,
     ImageOperandsSampleShift = 6,
     ImageOperandsMinLodShift = 7,
+    ImageOperandsMakeTexelAvailableShift = 8,
+    ImageOperandsMakeTexelAvailableKHRShift = 8,
+    ImageOperandsMakeTexelVisibleShift = 9,
+    ImageOperandsMakeTexelVisibleKHRShift = 9,
+    ImageOperandsNonPrivateTexelShift = 10,
+    ImageOperandsNonPrivateTexelKHRShift = 10,
+    ImageOperandsVolatileTexelShift = 11,
+    ImageOperandsVolatileTexelKHRShift = 11,
+    ImageOperandsSignExtendShift = 12,
+    ImageOperandsZeroExtendShift = 13,
     ImageOperandsMax = 0x7fffffff,
 };
 
@@ -287,6 +341,16 @@ enum ImageOperandsMask {
     ImageOperandsConstOffsetsMask = 0x00000020,
     ImageOperandsSampleMask = 0x00000040,
     ImageOperandsMinLodMask = 0x00000080,
+    ImageOperandsMakeTexelAvailableMask = 0x00000100,
+    ImageOperandsMakeTexelAvailableKHRMask = 0x00000100,
+    ImageOperandsMakeTexelVisibleMask = 0x00000200,
+    ImageOperandsMakeTexelVisibleKHRMask = 0x00000200,
+    ImageOperandsNonPrivateTexelMask = 0x00000400,
+    ImageOperandsNonPrivateTexelKHRMask = 0x00000400,
+    ImageOperandsVolatileTexelMask = 0x00000800,
+    ImageOperandsVolatileTexelKHRMask = 0x00000800,
+    ImageOperandsSignExtendMask = 0x00001000,
+    ImageOperandsZeroExtendMask = 0x00002000,
 };
 
 enum FPFastMathModeShift {
@@ -453,21 +517,67 @@ enum BuiltIn {
     BuiltInSubgroupLocalInvocationId = 41,
     BuiltInVertexIndex = 42,
     BuiltInInstanceIndex = 43,
+    BuiltInSubgroupEqMask = 4416,
     BuiltInSubgroupEqMaskKHR = 4416,
+    BuiltInSubgroupGeMask = 4417,
     BuiltInSubgroupGeMaskKHR = 4417,
+    BuiltInSubgroupGtMask = 4418,
     BuiltInSubgroupGtMaskKHR = 4418,
+    BuiltInSubgroupLeMask = 4419,
     BuiltInSubgroupLeMaskKHR = 4419,
+    BuiltInSubgroupLtMask = 4420,
     BuiltInSubgroupLtMaskKHR = 4420,
     BuiltInBaseVertex = 4424,
     BuiltInBaseInstance = 4425,
     BuiltInDrawIndex = 4426,
     BuiltInDeviceIndex = 4438,
     BuiltInViewIndex = 4440,
+    BuiltInBaryCoordNoPerspAMD = 4992,
+    BuiltInBaryCoordNoPerspCentroidAMD = 4993,
+    BuiltInBaryCoordNoPerspSampleAMD = 4994,
+    BuiltInBaryCoordSmoothAMD = 4995,
+    BuiltInBaryCoordSmoothCentroidAMD = 4996,
+    BuiltInBaryCoordSmoothSampleAMD = 4997,
+    BuiltInBaryCoordPullModelAMD = 4998,
+    BuiltInFragStencilRefEXT = 5014,
     BuiltInViewportMaskNV = 5253,
     BuiltInSecondaryPositionNV = 5257,
     BuiltInSecondaryViewportMaskNV = 5258,
     BuiltInPositionPerViewNV = 5261,
     BuiltInViewportMaskPerViewNV = 5262,
+    BuiltInFullyCoveredEXT = 5264,
+    BuiltInTaskCountNV = 5274,
+    BuiltInPrimitiveCountNV = 5275,
+    BuiltInPrimitiveIndicesNV = 5276,
+    BuiltInClipDistancePerViewNV = 5277,
+    BuiltInCullDistancePerViewNV = 5278,
+    BuiltInLayerPerViewNV = 5279,
+    BuiltInMeshViewCountNV = 5280,
+    BuiltInMeshViewIndicesNV = 5281,
+    BuiltInBaryCoordNV = 5286,
+    BuiltInBaryCoordNoPerspNV = 5287,
+    BuiltInFragSizeEXT = 5292,
+    BuiltInFragmentSizeNV = 5292,
+    BuiltInFragInvocationCountEXT = 5293,
+    BuiltInInvocationsPerPixelNV = 5293,
+    BuiltInLaunchIdNV = 5319,
+    BuiltInLaunchSizeNV = 5320,
+    BuiltInWorldRayOriginNV = 5321,
+    BuiltInWorldRayDirectionNV = 5322,
+    BuiltInObjectRayOriginNV = 5323,
+    BuiltInObjectRayDirectionNV = 5324,
+    BuiltInRayTminNV = 5325,
+    BuiltInRayTmaxNV = 5326,
+    BuiltInInstanceCustomIndexNV = 5327,
+    BuiltInObjectToWorldNV = 5330,
+    BuiltInWorldToObjectNV = 5331,
+    BuiltInHitTNV = 5332,
+    BuiltInHitKindNV = 5333,
+    BuiltInIncomingRayFlagsNV = 5351,
+    BuiltInWarpsPerSMNV = 5374,
+    BuiltInSMCountNV = 5375,
+    BuiltInWarpIDNV = 5376,
+    BuiltInSMIDNV = 5377,
     BuiltInMax = 0x7fffffff,
 };
 
@@ -488,6 +598,11 @@ enum LoopControlShift {
     LoopControlDontUnrollShift = 1,
     LoopControlDependencyInfiniteShift = 2,
     LoopControlDependencyLengthShift = 3,
+    LoopControlMinIterationsShift = 4,
+    LoopControlMaxIterationsShift = 5,
+    LoopControlIterationMultipleShift = 6,
+    LoopControlPeelCountShift = 7,
+    LoopControlPartialCountShift = 8,
     LoopControlMax = 0x7fffffff,
 };
 
@@ -538,6 +653,13 @@ enum MemorySemanticsShift {
     MemorySemanticsCrossWorkgroupMemoryShift = 9,
     MemorySemanticsAtomicCounterMemoryShift = 10,
     MemorySemanticsImageMemoryShift = 11,
+    MemorySemanticsOutputMemoryShift = 12,
+    MemorySemanticsOutputMemoryKHRShift = 12,
+    MemorySemanticsMakeAvailableShift = 13,
+    MemorySemanticsMakeAvailableKHRShift = 13,
+    MemorySemanticsMakeVisibleShift = 14,
+    MemorySemanticsMakeVisibleKHRShift = 14,
+    MemorySemanticsVolatileShift = 15,
     MemorySemanticsMax = 0x7fffffff,
 };
 
@@ -553,12 +675,25 @@ enum MemorySemanticsMask {
     MemorySemanticsCrossWorkgroupMemoryMask = 0x00000200,
     MemorySemanticsAtomicCounterMemoryMask = 0x00000400,
     MemorySemanticsImageMemoryMask = 0x00000800,
+    MemorySemanticsOutputMemoryMask = 0x00001000,
+    MemorySemanticsOutputMemoryKHRMask = 0x00001000,
+    MemorySemanticsMakeAvailableMask = 0x00002000,
+    MemorySemanticsMakeAvailableKHRMask = 0x00002000,
+    MemorySemanticsMakeVisibleMask = 0x00004000,
+    MemorySemanticsMakeVisibleKHRMask = 0x00004000,
+    MemorySemanticsVolatileMask = 0x00008000,
 };
 
 enum MemoryAccessShift {
     MemoryAccessVolatileShift = 0,
     MemoryAccessAlignedShift = 1,
     MemoryAccessNontemporalShift = 2,
+    MemoryAccessMakePointerAvailableShift = 3,  // bit index of MemoryAccessMakePointerAvailableMask (0x00000008)
+    MemoryAccessMakePointerAvailableKHRShift = 3,  // KHR-suffixed name; same value as MemoryAccessMakePointerAvailableShift
+    MemoryAccessMakePointerVisibleShift = 4,  // bit index of MemoryAccessMakePointerVisibleMask (0x00000010)
+    MemoryAccessMakePointerVisibleKHRShift = 4,  // KHR-suffixed name; same value as MemoryAccessMakePointerVisibleShift
+    MemoryAccessNonPrivatePointerShift = 5,  // bit index of MemoryAccessNonPrivatePointerMask (0x00000020)
+    MemoryAccessNonPrivatePointerKHRShift = 5,  // KHR-suffixed name; same value as MemoryAccessNonPrivatePointerShift
     MemoryAccessMax = 0x7fffffff,
 };
 
@@ -567,6 +702,12 @@ enum MemoryAccessMask {
     MemoryAccessVolatileMask = 0x00000001,
     MemoryAccessAlignedMask = 0x00000002,
     MemoryAccessNontemporalMask = 0x00000004,
+    MemoryAccessMakePointerAvailableMask = 0x00000008,
+    MemoryAccessMakePointerAvailableKHRMask = 0x00000008,
+    MemoryAccessMakePointerVisibleMask = 0x00000010,
+    MemoryAccessMakePointerVisibleKHRMask = 0x00000010,
+    MemoryAccessNonPrivatePointerMask = 0x00000020,
+    MemoryAccessNonPrivatePointerKHRMask = 0x00000020,
 };
 
 enum Scope {
@@ -575,6 +716,8 @@ enum Scope {
     ScopeWorkgroup = 2,
     ScopeSubgroup = 3,
     ScopeInvocation = 4,
+    ScopeQueueFamily = 5,
+    ScopeQueueFamilyKHR = 5,
     ScopeMax = 0x7fffffff,
 };
 
@@ -582,6 +725,10 @@ enum GroupOperation {
     GroupOperationReduce = 0,
     GroupOperationInclusiveScan = 1,
     GroupOperationExclusiveScan = 2,
+    GroupOperationClusteredReduce = 3,
+    GroupOperationPartitionedReduceNV = 6,
+    GroupOperationPartitionedInclusiveScanNV = 7,
+    GroupOperationPartitionedExclusiveScanNV = 8,
     GroupOperationMax = 0x7fffffff,
 };
 
@@ -1159,6 +1306,538 @@ enum Op {
   OpMax = 0x7fffffff,
 };
 
+#ifdef SPV_ENABLE_UTILITY_CODE
+inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) {
+    *hasResult = *hasResultType = false;
+    switch (opcode) {
+    default: /* unknown opcode */ break;
+    case OpNop: *hasResult = false; *hasResultType = false; break;
+    case OpUndef: *hasResult = true; *hasResultType = true; break;
+    case OpSourceContinued: *hasResult = false; *hasResultType = false; break;
+    case OpSource: *hasResult = false; *hasResultType = false; break;
+    case OpSourceExtension: *hasResult = false; *hasResultType = false; break;
+    case OpName: *hasResult = false; *hasResultType = false; break;
+    case OpMemberName: *hasResult = false; *hasResultType = false; break;
+    case OpString: *hasResult = true; *hasResultType = false; break;
+    case OpLine: *hasResult = false; *hasResultType = false; break;
+    case OpExtension: *hasResult = false; *hasResultType = false; break;
+    case OpExtInstImport: *hasResult = true; *hasResultType = false; break;
+    case OpExtInst: *hasResult = true; *hasResultType = true; break;
+    case OpMemoryModel: *hasResult = false; *hasResultType = false; break;
+    case OpEntryPoint: *hasResult = false; *hasResultType = false; break;
+    case OpExecutionMode: *hasResult = false; *hasResultType = false; break;
+    case OpCapability: *hasResult = false; *hasResultType = false; break;
+    case OpTypeVoid: *hasResult = true; *hasResultType = false; break;
+    case OpTypeBool: *hasResult = true; *hasResultType = false; break;
+    case OpTypeInt: *hasResult = true; *hasResultType = false; break;
+    case OpTypeFloat: *hasResult = true; *hasResultType = false; break;
+    case OpTypeVector: *hasResult = true; *hasResultType = false; break;
+    case OpTypeMatrix: *hasResult = true; *hasResultType = false; break;
+    case OpTypeImage: *hasResult = true; *hasResultType = false; break;
+    case OpTypeSampler: *hasResult = true; *hasResultType = false; break;
+    case OpTypeSampledImage: *hasResult = true; *hasResultType = false; break;
+    case OpTypeArray: *hasResult = true; *hasResultType = false; break;
+    case OpTypeRuntimeArray: *hasResult = true; *hasResultType = false; break;
+    case OpTypeStruct: *hasResult = true; *hasResultType = false; break;
+    case OpTypeOpaque: *hasResult = true; *hasResultType = false; break;
+    case OpTypePointer: *hasResult = true; *hasResultType = false; break;
+    case OpTypeFunction: *hasResult = true; *hasResultType = false; break;
+    case OpTypeEvent: *hasResult = true; *hasResultType = false; break;
+    case OpTypeDeviceEvent: *hasResult = true; *hasResultType = false; break;
+    case OpTypeReserveId: *hasResult = true; *hasResultType = false; break;
+    case OpTypeQueue: *hasResult = true; *hasResultType = false; break;
+    case OpTypePipe: *hasResult = true; *hasResultType = false; break;
+    case OpTypeForwardPointer: *hasResult = false; *hasResultType = false; break;
+    case OpConstantTrue: *hasResult = true; *hasResultType = true; break;
+    case OpConstantFalse: *hasResult = true; *hasResultType = true; break;
+    case OpConstant: *hasResult = true; *hasResultType = true; break;
+    case OpConstantComposite: *hasResult = true; *hasResultType = true; break;
+    case OpConstantSampler: *hasResult = true; *hasResultType = true; break;
+    case OpConstantNull: *hasResult = true; *hasResultType = true; break;
+    case OpSpecConstantTrue: *hasResult = true; *hasResultType = true; break;
+    case OpSpecConstantFalse: *hasResult = true; *hasResultType = true; break;
+    case OpSpecConstant: *hasResult = true; *hasResultType = true; break;
+    case OpSpecConstantComposite: *hasResult = true; *hasResultType = true; break;
+    case OpSpecConstantOp: *hasResult = true; *hasResultType = true; break;
+    case OpFunction: *hasResult = true; *hasResultType = true; break;
+    case OpFunctionParameter: *hasResult = true; *hasResultType = true; break;
+    case OpFunctionEnd: *hasResult = false; *hasResultType = false; break;
+    case OpFunctionCall: *hasResult = true; *hasResultType = true; break;
+    case OpVariable: *hasResult = true; *hasResultType = true; break;
+    case OpImageTexelPointer: *hasResult = true; *hasResultType = true; break;
+    case OpLoad: *hasResult = true; *hasResultType = true; break;
+    case OpStore: *hasResult = false; *hasResultType = false; break;
+    case OpCopyMemory: *hasResult = false; *hasResultType = false; break;
+    case OpCopyMemorySized: *hasResult = false; *hasResultType = false; break;
+    case OpAccessChain: *hasResult = true; *hasResultType = true; break;
+    case OpInBoundsAccessChain: *hasResult = true; *hasResultType = true; break;
+    case OpPtrAccessChain: *hasResult = true; *hasResultType = true; break;
+    case OpArrayLength: *hasResult = true; *hasResultType = true; break;
+    case OpGenericPtrMemSemantics: *hasResult = true; *hasResultType = true; break;
+    case OpInBoundsPtrAccessChain: *hasResult = true; *hasResultType = true; break;
+    case OpDecorate: *hasResult = false; *hasResultType = false; break;
+    case OpMemberDecorate: *hasResult = false; *hasResultType = false; break;
+    case OpDecorationGroup: *hasResult = true; *hasResultType = false; break;
+    case OpGroupDecorate: *hasResult = false; *hasResultType = false; break;
+    case OpGroupMemberDecorate: *hasResult = false; *hasResultType = false; break;
+    case OpVectorExtractDynamic: *hasResult = true; *hasResultType = true; break;
+    case OpVectorInsertDynamic: *hasResult = true; *hasResultType = true; break;
+    case OpVectorShuffle: *hasResult = true; *hasResultType = true; break;
+    case OpCompositeConstruct: *hasResult = true; *hasResultType = true; break;
+    case OpCompositeExtract: *hasResult = true; *hasResultType = true; break;
+    case OpCompositeInsert: *hasResult = true; *hasResultType = true; break;
+    case OpCopyObject: *hasResult = true; *hasResultType = true; break;
+    case OpTranspose: *hasResult = true; *hasResultType = true; break;
+    case OpSampledImage: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleDrefImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleDrefExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleProjImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleProjExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleProjDrefImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleProjDrefExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageFetch: *hasResult = true; *hasResultType = true; break;
+    case OpImageGather: *hasResult = true; *hasResultType = true; break;
+    case OpImageDrefGather: *hasResult = true; *hasResultType = true; break;
+    case OpImageRead: *hasResult = true; *hasResultType = true; break;
+    case OpImageWrite: *hasResult = false; *hasResultType = false; break;
+    case OpImage: *hasResult = true; *hasResultType = true; break;
+    case OpImageQueryFormat: *hasResult = true; *hasResultType = true; break;
+    case OpImageQueryOrder: *hasResult = true; *hasResultType = true; break;
+    case OpImageQuerySizeLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageQuerySize: *hasResult = true; *hasResultType = true; break;
+    case OpImageQueryLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageQueryLevels: *hasResult = true; *hasResultType = true; break;
+    case OpImageQuerySamples: *hasResult = true; *hasResultType = true; break;
+    case OpConvertFToU: *hasResult = true; *hasResultType = true; break;
+    case OpConvertFToS: *hasResult = true; *hasResultType = true; break;
+    case OpConvertSToF: *hasResult = true; *hasResultType = true; break;
+    case OpConvertUToF: *hasResult = true; *hasResultType = true; break;
+    case OpUConvert: *hasResult = true; *hasResultType = true; break;
+    case OpSConvert: *hasResult = true; *hasResultType = true; break;
+    case OpFConvert: *hasResult = true; *hasResultType = true; break;
+    case OpQuantizeToF16: *hasResult = true; *hasResultType = true; break;
+    case OpConvertPtrToU: *hasResult = true; *hasResultType = true; break;
+    case OpSatConvertSToU: *hasResult = true; *hasResultType = true; break;
+    case OpSatConvertUToS: *hasResult = true; *hasResultType = true; break;
+    case OpConvertUToPtr: *hasResult = true; *hasResultType = true; break;
+    case OpPtrCastToGeneric: *hasResult = true; *hasResultType = true; break;
+    case OpGenericCastToPtr: *hasResult = true; *hasResultType = true; break;
+    case OpGenericCastToPtrExplicit: *hasResult = true; *hasResultType = true; break;
+    case OpBitcast: *hasResult = true; *hasResultType = true; break;
+    case OpSNegate: *hasResult = true; *hasResultType = true; break;
+    case OpFNegate: *hasResult = true; *hasResultType = true; break;
+    case OpIAdd: *hasResult = true; *hasResultType = true; break;
+    case OpFAdd: *hasResult = true; *hasResultType = true; break;
+    case OpISub: *hasResult = true; *hasResultType = true; break;
+    case OpFSub: *hasResult = true; *hasResultType = true; break;
+    case OpIMul: *hasResult = true; *hasResultType = true; break;
+    case OpFMul: *hasResult = true; *hasResultType = true; break;
+    case OpUDiv: *hasResult = true; *hasResultType = true; break;
+    case OpSDiv: *hasResult = true; *hasResultType = true; break;
+    case OpFDiv: *hasResult = true; *hasResultType = true; break;
+    case OpUMod: *hasResult = true; *hasResultType = true; break;
+    case OpSRem: *hasResult = true; *hasResultType = true; break;
+    case OpSMod: *hasResult = true; *hasResultType = true; break;
+    case OpFRem: *hasResult = true; *hasResultType = true; break;
+    case OpFMod: *hasResult = true; *hasResultType = true; break;
+    case OpVectorTimesScalar: *hasResult = true; *hasResultType = true; break;
+    case OpMatrixTimesScalar: *hasResult = true; *hasResultType = true; break;
+    case OpVectorTimesMatrix: *hasResult = true; *hasResultType = true; break;
+    case OpMatrixTimesVector: *hasResult = true; *hasResultType = true; break;
+    case OpMatrixTimesMatrix: *hasResult = true; *hasResultType = true; break;
+    case OpOuterProduct: *hasResult = true; *hasResultType = true; break;
+    case OpDot: *hasResult = true; *hasResultType = true; break;
+    case OpIAddCarry: *hasResult = true; *hasResultType = true; break;
+    case OpISubBorrow: *hasResult = true; *hasResultType = true; break;
+    case OpUMulExtended: *hasResult = true; *hasResultType = true; break;
+    case OpSMulExtended: *hasResult = true; *hasResultType = true; break;
+    case OpAny: *hasResult = true; *hasResultType = true; break;
+    case OpAll: *hasResult = true; *hasResultType = true; break;
+    case OpIsNan: *hasResult = true; *hasResultType = true; break;
+    case OpIsInf: *hasResult = true; *hasResultType = true; break;
+    case OpIsFinite: *hasResult = true; *hasResultType = true; break;
+    case OpIsNormal: *hasResult = true; *hasResultType = true; break;
+    case OpSignBitSet: *hasResult = true; *hasResultType = true; break;
+    case OpLessOrGreater: *hasResult = true; *hasResultType = true; break;
+    case OpOrdered: *hasResult = true; *hasResultType = true; break;
+    case OpUnordered: *hasResult = true; *hasResultType = true; break;
+    case OpLogicalEqual: *hasResult = true; *hasResultType = true; break;
+    case OpLogicalNotEqual: *hasResult = true; *hasResultType = true; break;
+    case OpLogicalOr: *hasResult = true; *hasResultType = true; break;
+    case OpLogicalAnd: *hasResult = true; *hasResultType = true; break;
+    case OpLogicalNot: *hasResult = true; *hasResultType = true; break;
+    case OpSelect: *hasResult = true; *hasResultType = true; break;
+    case OpIEqual: *hasResult = true; *hasResultType = true; break;
+    case OpINotEqual: *hasResult = true; *hasResultType = true; break;
+    case OpUGreaterThan: *hasResult = true; *hasResultType = true; break;
+    case OpSGreaterThan: *hasResult = true; *hasResultType = true; break;
+    case OpUGreaterThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpSGreaterThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpULessThan: *hasResult = true; *hasResultType = true; break;
+    case OpSLessThan: *hasResult = true; *hasResultType = true; break;
+    case OpULessThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpSLessThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFOrdEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFUnordEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFOrdNotEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFUnordNotEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFOrdLessThan: *hasResult = true; *hasResultType = true; break;
+    case OpFUnordLessThan: *hasResult = true; *hasResultType = true; break;
+    case OpFOrdGreaterThan: *hasResult = true; *hasResultType = true; break;
+    case OpFUnordGreaterThan: *hasResult = true; *hasResultType = true; break;
+    case OpFOrdLessThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFUnordLessThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFOrdGreaterThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpFUnordGreaterThanEqual: *hasResult = true; *hasResultType = true; break;
+    case OpShiftRightLogical: *hasResult = true; *hasResultType = true; break;
+    case OpShiftRightArithmetic: *hasResult = true; *hasResultType = true; break;
+    case OpShiftLeftLogical: *hasResult = true; *hasResultType = true; break;
+    case OpBitwiseOr: *hasResult = true; *hasResultType = true; break;
+    case OpBitwiseXor: *hasResult = true; *hasResultType = true; break;
+    case OpBitwiseAnd: *hasResult = true; *hasResultType = true; break;
+    case OpNot: *hasResult = true; *hasResultType = true; break;
+    case OpBitFieldInsert: *hasResult = true; *hasResultType = true; break;
+    case OpBitFieldSExtract: *hasResult = true; *hasResultType = true; break;
+    case OpBitFieldUExtract: *hasResult = true; *hasResultType = true; break;
+    case OpBitReverse: *hasResult = true; *hasResultType = true; break;
+    case OpBitCount: *hasResult = true; *hasResultType = true; break;
+    case OpDPdx: *hasResult = true; *hasResultType = true; break;
+    case OpDPdy: *hasResult = true; *hasResultType = true; break;
+    case OpFwidth: *hasResult = true; *hasResultType = true; break;
+    case OpDPdxFine: *hasResult = true; *hasResultType = true; break;
+    case OpDPdyFine: *hasResult = true; *hasResultType = true; break;
+    case OpFwidthFine: *hasResult = true; *hasResultType = true; break;
+    case OpDPdxCoarse: *hasResult = true; *hasResultType = true; break;
+    case OpDPdyCoarse: *hasResult = true; *hasResultType = true; break;
+    case OpFwidthCoarse: *hasResult = true; *hasResultType = true; break;
+    case OpEmitVertex: *hasResult = false; *hasResultType = false; break;
+    case OpEndPrimitive: *hasResult = false; *hasResultType = false; break;
+    case OpEmitStreamVertex: *hasResult = false; *hasResultType = false; break;
+    case OpEndStreamPrimitive: *hasResult = false; *hasResultType = false; break;
+    case OpControlBarrier: *hasResult = false; *hasResultType = false; break;
+    case OpMemoryBarrier: *hasResult = false; *hasResultType = false; break;
+    case OpAtomicLoad: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicStore: *hasResult = false; *hasResultType = false; break;
+    case OpAtomicExchange: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicCompareExchange: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicCompareExchangeWeak: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicIIncrement: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicIDecrement: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicIAdd: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicISub: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicSMin: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicUMin: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicSMax: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicUMax: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicAnd: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicOr: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicXor: *hasResult = true; *hasResultType = true; break;
+    case OpPhi: *hasResult = true; *hasResultType = true; break;
+    case OpLoopMerge: *hasResult = false; *hasResultType = false; break;
+    case OpSelectionMerge: *hasResult = false; *hasResultType = false; break;
+    case OpLabel: *hasResult = true; *hasResultType = false; break;
+    case OpBranch: *hasResult = false; *hasResultType = false; break;
+    case OpBranchConditional: *hasResult = false; *hasResultType = false; break;
+    case OpSwitch: *hasResult = false; *hasResultType = false; break;
+    case OpKill: *hasResult = false; *hasResultType = false; break;
+    case OpReturn: *hasResult = false; *hasResultType = false; break;
+    case OpReturnValue: *hasResult = false; *hasResultType = false; break;
+    case OpUnreachable: *hasResult = false; *hasResultType = false; break;
+    case OpLifetimeStart: *hasResult = false; *hasResultType = false; break;
+    case OpLifetimeStop: *hasResult = false; *hasResultType = false; break;
+    case OpGroupAsyncCopy: *hasResult = true; *hasResultType = true; break;
+    case OpGroupWaitEvents: *hasResult = false; *hasResultType = false; break;
+    case OpGroupAll: *hasResult = true; *hasResultType = true; break;
+    case OpGroupAny: *hasResult = true; *hasResultType = true; break;
+    case OpGroupBroadcast: *hasResult = true; *hasResultType = true; break;
+    case OpGroupIAdd: *hasResult = true; *hasResultType = true; break;
+    case OpGroupFAdd: *hasResult = true; *hasResultType = true; break;
+    case OpGroupFMin: *hasResult = true; *hasResultType = true; break;
+    case OpGroupUMin: *hasResult = true; *hasResultType = true; break;
+    case OpGroupSMin: *hasResult = true; *hasResultType = true; break;
+    case OpGroupFMax: *hasResult = true; *hasResultType = true; break;
+    case OpGroupUMax: *hasResult = true; *hasResultType = true; break;
+    case OpGroupSMax: *hasResult = true; *hasResultType = true; break;
+    case OpReadPipe: *hasResult = true; *hasResultType = true; break;
+    case OpWritePipe: *hasResult = true; *hasResultType = true; break;
+    case OpReservedReadPipe: *hasResult = true; *hasResultType = true; break;
+    case OpReservedWritePipe: *hasResult = true; *hasResultType = true; break;
+    case OpReserveReadPipePackets: *hasResult = true; *hasResultType = true; break;
+    case OpReserveWritePipePackets: *hasResult = true; *hasResultType = true; break;
+    case OpCommitReadPipe: *hasResult = false; *hasResultType = false; break;
+    case OpCommitWritePipe: *hasResult = false; *hasResultType = false; break;
+    case OpIsValidReserveId: *hasResult = true; *hasResultType = true; break;
+    case OpGetNumPipePackets: *hasResult = true; *hasResultType = true; break;
+    case OpGetMaxPipePackets: *hasResult = true; *hasResultType = true; break;
+    case OpGroupReserveReadPipePackets: *hasResult = true; *hasResultType = true; break;
+    case OpGroupReserveWritePipePackets: *hasResult = true; *hasResultType = true; break;
+    case OpGroupCommitReadPipe: *hasResult = false; *hasResultType = false; break;
+    case OpGroupCommitWritePipe: *hasResult = false; *hasResultType = false; break;
+    case OpEnqueueMarker: *hasResult = true; *hasResultType = true; break;
+    case OpEnqueueKernel: *hasResult = true; *hasResultType = true; break;
+    case OpGetKernelNDrangeSubGroupCount: *hasResult = true; *hasResultType = true; break;
+    case OpGetKernelNDrangeMaxSubGroupSize: *hasResult = true; *hasResultType = true; break;
+    case OpGetKernelWorkGroupSize: *hasResult = true; *hasResultType = true; break;
+    case OpGetKernelPreferredWorkGroupSizeMultiple: *hasResult = true; *hasResultType = true; break;
+    case OpRetainEvent: *hasResult = false; *hasResultType = false; break;
+    case OpReleaseEvent: *hasResult = false; *hasResultType = false; break;
+    case OpCreateUserEvent: *hasResult = true; *hasResultType = true; break;
+    case OpIsValidEvent: *hasResult = true; *hasResultType = true; break;
+    case OpSetUserEventStatus: *hasResult = false; *hasResultType = false; break;
+    case OpCaptureEventProfilingInfo: *hasResult = false; *hasResultType = false; break;
+    case OpGetDefaultQueue: *hasResult = true; *hasResultType = true; break;
+    case OpBuildNDRange: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleDrefImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleDrefExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleProjImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleProjExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleProjDrefImplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseSampleProjDrefExplicitLod: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseFetch: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseGather: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseDrefGather: *hasResult = true; *hasResultType = true; break;
+    case OpImageSparseTexelsResident: *hasResult = true; *hasResultType = true; break;
+    case OpNoLine: *hasResult = false; *hasResultType = false; break;
+    case OpAtomicFlagTestAndSet: *hasResult = true; *hasResultType = true; break;
+    case OpAtomicFlagClear: *hasResult = false; *hasResultType = false; break;
+    case OpImageSparseRead: *hasResult = true; *hasResultType = true; break;
+    case OpSizeOf: *hasResult = true; *hasResultType = true; break;
+    case OpTypePipeStorage: *hasResult = true; *hasResultType = false; break;
+    case OpConstantPipeStorage: *hasResult = true; *hasResultType = true; break;
+    case OpCreatePipeFromPipeStorage: *hasResult = true; *hasResultType = true; break;
+    case OpGetKernelLocalSizeForSubgroupCount: *hasResult = true; *hasResultType = true; break;
+    case OpGetKernelMaxNumSubgroups: *hasResult = true; *hasResultType = true; break;
+    case OpTypeNamedBarrier: *hasResult = true; *hasResultType = false; break;
+    case OpNamedBarrierInitialize: *hasResult = true; *hasResultType = true; break;
+    case OpMemoryNamedBarrier: *hasResult = false; *hasResultType = false; break;
+    case OpModuleProcessed: *hasResult = false; *hasResultType = false; break;
+    case OpExecutionModeId: *hasResult = false; *hasResultType = false; break;
+    case OpDecorateId: *hasResult = false; *hasResultType = false; break;
+    case OpGroupNonUniformElect: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformAll: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformAny: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformAllEqual: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBroadcast: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBroadcastFirst: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBallot: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformInverseBallot: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBallotBitExtract: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBallotBitCount: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBallotFindLSB: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBallotFindMSB: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformShuffle: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformShuffleXor: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformShuffleUp: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformShuffleDown: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformIAdd: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformFAdd: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformIMul: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformFMul: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformSMin: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformUMin: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformFMin: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformSMax: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformUMax: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformFMax: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBitwiseAnd: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBitwiseOr: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformBitwiseXor: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformLogicalAnd: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformLogicalOr: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformLogicalXor: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformQuadBroadcast: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformQuadSwap: *hasResult = true; *hasResultType = true; break;
+    case OpCopyLogical: *hasResult = true; *hasResultType = true; break;
+    case OpPtrEqual: *hasResult = true; *hasResultType = true; break;
+    case OpPtrNotEqual: *hasResult = true; *hasResultType = true; break;
+    case OpPtrDiff: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupBallotKHR: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupFirstInvocationKHR: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAllKHR: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAnyKHR: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAllEqualKHR: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupReadInvocationKHR: *hasResult = true; *hasResultType = true; break;
+    case OpGroupIAddNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpGroupFAddNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpGroupFMinNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpGroupUMinNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpGroupSMinNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpGroupFMaxNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpGroupUMaxNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpGroupSMaxNonUniformAMD: *hasResult = true; *hasResultType = true; break;
+    case OpFragmentMaskFetchAMD: *hasResult = true; *hasResultType = true; break;
+    case OpFragmentFetchAMD: *hasResult = true; *hasResultType = true; break;
+    case OpReadClockKHR: *hasResult = true; *hasResultType = true; break;
+    case OpImageSampleFootprintNV: *hasResult = true; *hasResultType = true; break;
+    case OpGroupNonUniformPartitionNV: *hasResult = true; *hasResultType = true; break;
+    case OpWritePackedPrimitiveIndices4x8NV: *hasResult = false; *hasResultType = false; break;
+    case OpReportIntersectionNV: *hasResult = true; *hasResultType = true; break;
+    case OpIgnoreIntersectionNV: *hasResult = false; *hasResultType = false; break;
+    case OpTerminateRayNV: *hasResult = false; *hasResultType = false; break;
+    case OpTraceNV: *hasResult = false; *hasResultType = false; break;
+    case OpTypeAccelerationStructureNV: *hasResult = true; *hasResultType = false; break;
+    case OpExecuteCallableNV: *hasResult = false; *hasResultType = false; break;
+    case OpTypeCooperativeMatrixNV: *hasResult = true; *hasResultType = false; break;
+    case OpCooperativeMatrixLoadNV: *hasResult = true; *hasResultType = true; break;
+    case OpCooperativeMatrixStoreNV: *hasResult = false; *hasResultType = false; break;
+    case OpCooperativeMatrixMulAddNV: *hasResult = true; *hasResultType = true; break;
+    case OpCooperativeMatrixLengthNV: *hasResult = true; *hasResultType = true; break;
+    case OpBeginInvocationInterlockEXT: *hasResult = false; *hasResultType = false; break;
+    case OpEndInvocationInterlockEXT: *hasResult = false; *hasResultType = false; break;
+    case OpDemoteToHelperInvocationEXT: *hasResult = false; *hasResultType = false; break;
+    case OpIsHelperInvocationEXT: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupShuffleINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupShuffleDownINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupShuffleUpINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupShuffleXorINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupBlockReadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupBlockWriteINTEL: *hasResult = false; *hasResultType = false; break;
+    case OpSubgroupImageBlockReadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupImageBlockWriteINTEL: *hasResult = false; *hasResultType = false; break;
+    case OpSubgroupImageMediaBlockReadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupImageMediaBlockWriteINTEL: *hasResult = false; *hasResultType = false; break;
+    case OpUCountLeadingZerosINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpUCountTrailingZerosINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpAbsISubINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpAbsUSubINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpIAddSatINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpUAddSatINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpIAverageINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpUAverageINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpIAverageRoundedINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpUAverageRoundedINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpISubSatINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpUSubSatINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpIMul32x16INTEL: *hasResult = true; *hasResultType = true; break;
+    case OpUMul32x16INTEL: *hasResult = true; *hasResultType = true; break;
+    case OpDecorateString: *hasResult = false; *hasResultType = false; break;
+    case OpMemberDecorateString: *hasResult = false; *hasResultType = false; break;
+    case OpVmeImageINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpTypeVmeImageINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcImePayloadINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcRefPayloadINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcSicPayloadINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcMcePayloadINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcMceResultINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcImeResultINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcImeResultSingleReferenceStreamoutINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcImeResultDualReferenceStreamoutINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcImeSingleReferenceStreaminINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcImeDualReferenceStreaminINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcRefResultINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpTypeAvcSicResultINTEL: *hasResult = true; *hasResultType = false; break;
+    case OpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetInterShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetInterDirectionPenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetMotionVectorCostFunctionINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetAcOnlyHaarINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceConvertToImePayloadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceConvertToImeResultINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceConvertToRefPayloadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceConvertToRefResultINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceConvertToSicPayloadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceConvertToSicResultINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetMotionVectorsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetInterDistortionsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetBestInterDistortionsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetInterMajorShapeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetInterMinorShapeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetInterDirectionsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetInterMotionVectorCountINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetInterReferenceIdsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeInitializeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeSetSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeSetDualReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeRefWindowSizeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeAdjustRefOffsetINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeConvertToMcePayloadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeSetMaxMotionVectorCountINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeSetUnidirectionalMixDisableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeSetWeightedSadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithDualReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeConvertToMceResultINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetSingleReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetDualReferenceStreaminINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeStripSingleReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeStripDualReferenceStreamoutINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetBorderReachedINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetTruncatedSearchIndicationINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcFmeInitializeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcBmeInitializeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefConvertToMcePayloadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefSetBidirectionalMixDisableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefSetBilinearFilterEnableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefEvaluateWithSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefEvaluateWithDualReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefEvaluateWithMultiReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcRefConvertToMceResultINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicInitializeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicConfigureSkcINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicConfigureIpeLumaINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicConfigureIpeLumaChromaINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetMotionVectorMaskINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicConvertToMcePayloadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicSetBilinearFilterEnableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicSetSkcForwardTransformEnableINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicEvaluateIpeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicEvaluateWithSingleReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicEvaluateWithDualReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicEvaluateWithMultiReferenceINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicConvertToMceResultINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetIpeLumaShapeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetBestIpeLumaDistortionINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetBestIpeChromaDistortionINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetPackedIpeLumaModesINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetIpeChromaModeINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL: *hasResult = true; *hasResultType = true; break;
+    case OpSubgroupAvcSicGetInterRawSadsINTEL: *hasResult = true; *hasResultType = true; break;
+    }
+}
+#endif /* SPV_ENABLE_UTILITY_CODE */
+
 // Overload operator| for mask bit combining
 
 inline ImageOperandsMask operator|(ImageOperandsMask a, ImageOperandsMask b) { return ImageOperandsMask(unsigned(a) | unsigned(b)); }

From 596d78942c4beb61a5c19282bc6f33d4f21b0b0d Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Sun, 22 Mar 2020 23:06:53 -0700
Subject: [PATCH 740/770] added cl_khr_subgroup_ballot

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp         | 25 +++++++++++++------
 llvm-spirv/lib/SPIRV/OCLUtil.h                | 24 ++++++++++++++++++
 llvm-spirv/lib/SPIRV/SPIRVInternal.h          |  1 +
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h     |  5 ++++
 .../lib/SPIRV/libSPIRV/SPIRVInstruction.h     |  2 +-
 .../lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h     | 13 ++++++++++
 .../lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h     |  5 ++++
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h   |  4 ++-
 8 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 3c6971836b9fb..2b5bfd852d4c3 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -931,17 +931,25 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
     return;
 
   if (DemangledName != kOCLBuiltinName::WaitGroupEvent) {
-    StringRef GroupOp = DemangledName;
-    GroupOp = GroupOp.drop_front(strlen(kSPIRVName::GroupPrefix));
+    StringRef FuncName = DemangledName;
+    FuncName = FuncName.drop_front(strlen(kSPIRVName::GroupPrefix));
     SPIRSPIRVGroupOperationMap::foreachConditional(
         [&](const std::string &S, SPIRVGroupOperationKind G) {
-          if (!GroupOp.startswith(S))
+          if (!FuncName.startswith(S))
             return true; // continue
           PreOps.push_back(G);
-          StringRef Op = GroupOp.drop_front(S.size() + 1);
-          assert(!Op.empty() && "Invalid OpenCL group builtin function");
+          StringRef Op = StringSwitch<StringRef>(FuncName)
+              .StartsWith("ballot", "group_ballot_bit_count_")
+              .StartsWith("non_uniform_group", kSPIRVName::GroupNonUniformPrefix)
+              .Default(kSPIRVName::GroupPrefix);
+          StringRef GroupOp = StringSwitch<StringRef>(FuncName)
+              .Case("ballot_bit_count", "add")
+              .Case("ballot_inclusive_scan", "add")
+              .Case("ballot_exclusive_scan", "add")
+              .Default(FuncName.take_back(3));   // assumes op is three characters
+          assert(!GroupOp.empty() && "Invalid OpenCL group builtin function");
           char OpTyC = 0;
-          auto NeedSign = Op == "max" || Op == "min";
+          auto NeedSign = GroupOp == "max" || GroupOp == "min";
           auto OpTy = F->getReturnType();
           if (OpTy->isFloatingPointTy())
             OpTyC = 'f';
@@ -957,8 +965,7 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
           } else
             llvm_unreachable("Invalid OpenCL group builtin argument type");
 
-          DemangledName =
-              std::string(kSPIRVName::GroupPrefix) + OpTyC + Op.str();
+          DemangledName = Op.str() + OpTyC + GroupOp.str();
           return false; // break out of loop
         });
   }
@@ -967,6 +974,8 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
                         DemangledName.find("_any") != std::string::npos);
   bool IsGroupAllEqual = DemangledName.find("_all_equal") != std::string::npos;
 
+  // TODO: Need to convert arg to sub_group_ballot to i1!
+
   auto Consts = getInt32(M, PreOps);
   OCLBuiltinTransInfo Info;
   if (IsGroupAllAny || IsGroupAllEqual)
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index 6acbc7de83b7d..48b9a289c9952 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -627,6 +627,9 @@ template <> inline void SPIRVMap<std::string, SPIRVGroupOperationKind>::init() {
   add("reduce", GroupOperationReduce);
   add("scan_inclusive", GroupOperationInclusiveScan);
   add("scan_exclusive", GroupOperationExclusiveScan);
+  add("ballot_bit_count", GroupOperationReduce);
+  add("ballot_inclusive_scan", GroupOperationInclusiveScan);
+  add("ballot_exclusive_scan", GroupOperationExclusiveScan);
 }
 
 template <> inline void SPIRVMap<std::string, SPIRVFPRoundingModeKind>::init() {
@@ -690,12 +693,19 @@ inline void SPIRVMap<std::string, SPIRVBuiltinVariableKind>::init() {
   add("get_group_id", BuiltInWorkgroupId);
   add("get_global_linear_id", BuiltInGlobalLinearId);
   add("get_local_linear_id", BuiltInLocalInvocationIndex);
+  // cl_khr_subgroups
   add("get_sub_group_size", BuiltInSubgroupSize);
   add("get_max_sub_group_size", BuiltInSubgroupMaxSize);
   add("get_num_sub_groups", BuiltInNumSubgroups);
   add("get_enqueued_num_sub_groups", BuiltInNumEnqueuedSubgroups);
   add("get_sub_group_id", BuiltInSubgroupId);
   add("get_sub_group_local_id", BuiltInSubgroupLocalInvocationId);
+  // cl_khr_subgroup_ballot
+  add("get_sub_group_eq_mask", BuiltInSubgroupEqMask);
+  add("get_sub_group_ge_mask", BuiltInSubgroupGeMask);
+  add("get_sub_group_gt_mask", BuiltInSubgroupGtMask);
+  add("get_sub_group_le_mask", BuiltInSubgroupLeMask);
+  add("get_sub_group_lt_mask", BuiltInSubgroupLtMask);
 }
 
 // Maps uniqued OCL builtin function name to SPIR-V op code.
@@ -839,6 +849,20 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   _SPIRV_OP(intel_sub_group_media_block_read, SubgroupImageMediaBlockReadINTEL)
   _SPIRV_OP(intel_sub_group_media_block_write,
             SubgroupImageMediaBlockWriteINTEL)
+  // cl_khr_subgroup_non_uniform_vote
+  _SPIRV_OP(group_elect, GroupNonUniformElect)
+  _SPIRV_OP(group_non_uniform_all, GroupNonUniformAll)
+  _SPIRV_OP(group_non_uniform_any, GroupNonUniformAny)
+  _SPIRV_OP(group_non_uniform_all_equal, GroupNonUniformAllEqual)
+  // cl_khr_subgroup_ballot
+  _SPIRV_OP(group_non_uniform_broadcast, GroupNonUniformBroadcast)
+  _SPIRV_OP(group_broadcast_first, GroupNonUniformBroadcastFirst)
+  _SPIRV_OP(group_ballot, GroupNonUniformBallot)
+  _SPIRV_OP(group_inverse_ballot, GroupNonUniformInverseBallot)
+  _SPIRV_OP(group_ballot_bit_extract, GroupNonUniformBallotBitExtract)
+  _SPIRV_OP(group_ballot_bit_count_iadd, GroupNonUniformBallotBitCount)
+  _SPIRV_OP(group_ballot_find_lsb, GroupNonUniformBallotFindLSB)
+  _SPIRV_OP(group_ballot_find_msb, GroupNonUniformBallotFindMSB)
 #undef _SPIRV_OP
 }
 
diff --git a/llvm-spirv/lib/SPIRV/SPIRVInternal.h b/llvm-spirv/lib/SPIRV/SPIRVInternal.h
index 0853281a2e89f..0275a5e21f02d 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVInternal.h
+++ b/llvm-spirv/lib/SPIRV/SPIRVInternal.h
@@ -330,6 +330,7 @@ const static char AtomicPrefixInternal[] = "atomic_";
 
 namespace kSPIRVName {
 const static char GroupPrefix[] = "group_";
+const static char GroupNonUniformPrefix[] = "group_non_uniform_";
 const static char Prefix[] = "__spirv_";
 const static char Postfix[] = "__";
 const static char ImageQuerySize[] = "ImageQuerySize";
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
index 2fbd8d2603f1b..3de9929617622 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
@@ -407,6 +407,11 @@ template <> inline void SPIRVMap<BuiltIn, SPIRVCapVec>::init() {
   ADD_VEC_INIT(BuiltInNumEnqueuedSubgroups, {CapabilityKernel});
   ADD_VEC_INIT(BuiltInSubgroupId, {CapabilityKernel});
   ADD_VEC_INIT(BuiltInSubgroupLocalInvocationId, {CapabilityKernel});
+  ADD_VEC_INIT(BuiltInSubgroupEqMask, {CapabilityGroupNonUniformBallot});
+  ADD_VEC_INIT(BuiltInSubgroupGeMask, {CapabilityGroupNonUniformBallot});
+  ADD_VEC_INIT(BuiltInSubgroupGtMask, {CapabilityGroupNonUniformBallot});
+  ADD_VEC_INIT(BuiltInSubgroupLeMask, {CapabilityGroupNonUniformBallot});
+  ADD_VEC_INIT(BuiltInSubgroupLtMask, {CapabilityGroupNonUniformBallot});
   ADD_VEC_INIT(BuiltInVertexIndex, {CapabilityShader});
   ADD_VEC_INIT(BuiltInInstanceIndex, {CapabilityShader});
 }
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index d0af944004367..1a1a0716b302e 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -2525,7 +2525,7 @@ _SPIRV_OP(GroupNonUniformBroadcastFirst, true, 5)
 _SPIRV_OP(GroupNonUniformBallot, true, 5)
 _SPIRV_OP(GroupNonUniformInverseBallot, true, 5)
 _SPIRV_OP(GroupNonUniformBallotBitExtract, true, 6)
-_SPIRV_OP(GroupNonUniformBallotBitCount, true, 6)
+_SPIRV_OP(GroupNonUniformBallotBitCount, true, 6, false, 1)
 _SPIRV_OP(GroupNonUniformBallotFindLSB, true, 5)
 _SPIRV_OP(GroupNonUniformBallotFindMSB, true, 5)
 _SPIRV_OP(GroupNonUniformShuffle, true, 6)
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
index 393471b874711..121a6522007f2 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
@@ -459,6 +459,11 @@ inline bool isValid(spv::BuiltIn V) {
   case BuiltInNumEnqueuedSubgroups:
   case BuiltInSubgroupId:
   case BuiltInSubgroupLocalInvocationId:
+  case BuiltInSubgroupEqMask:
+  case BuiltInSubgroupGeMask:
+  case BuiltInSubgroupGtMask:
+  case BuiltInSubgroupLeMask:
+  case BuiltInSubgroupLtMask:
   case BuiltInVertexIndex:
   case BuiltInInstanceIndex:
     return true;
@@ -563,6 +568,14 @@ inline bool isValid(spv::Capability V) {
   case CapabilitySubgroupDispatch:
   case CapabilityNamedBarrier:
   case CapabilityPipeStorage:
+  case CapabilityGroupNonUniform:
+  case CapabilityGroupNonUniformVote:
+  case CapabilityGroupNonUniformArithmetic:
+  case CapabilityGroupNonUniformBallot:
+  case CapabilityGroupNonUniformShuffle:
+  case CapabilityGroupNonUniformShuffleRelative:
+  case CapabilityGroupNonUniformClustered:
+  case CapabilityGroupNonUniformQuad:
   case CapabilityFPGAMemoryAttributesINTEL:
   case CapabilityArbitraryPrecisionIntegersINTEL:
   case CapabilityFPGALoopControlsINTEL:
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
index 909ee64541799..723cc7b9b1af7 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
@@ -394,6 +394,11 @@ template <> inline void SPIRVMap<BuiltIn, std::string>::init() {
   add(BuiltInNumEnqueuedSubgroups, "BuiltInNumEnqueuedSubgroups");
   add(BuiltInSubgroupId, "BuiltInSubgroupId");
   add(BuiltInSubgroupLocalInvocationId, "BuiltInSubgroupLocalInvocationId");
+  add(BuiltInSubgroupEqMask, "BuiltInSubgroupEqMask");
+  add(BuiltInSubgroupGeMask, "BuiltInSubgroupGeMask");
+  add(BuiltInSubgroupGtMask, "BuiltInSubgroupGtMask");
+  add(BuiltInSubgroupLeMask, "BuiltInSubgroupLeMask");
+  add(BuiltInSubgroupLtMask, "BuiltInSubgroupLtMask");
   add(BuiltInVertexIndex, "BuiltInVertexIndex");
   add(BuiltInInstanceIndex, "BuiltInInstanceIndex");
 }
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h
index 271378eba11b1..03478b62c6166 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h
@@ -131,7 +131,9 @@ inline bool hasExecScope(Op OpCode) {
 
 inline bool hasGroupOperation(Op OpCode) {
   unsigned OC = OpCode;
-  return OpGroupIAdd <= OC && OC <= OpGroupSMax;
+  return (OpGroupIAdd <= OC && OC <= OpGroupSMax) ||
+         (OpGroupNonUniformBallotBitCount == OC) || // not FindLSB/FindMSB
+         (OpGroupNonUniformIAdd <= OC && OC <= OpGroupNonUniformLogicalXor);
 }
 
 inline bool isGroupOpCode(Op OpCode) {

From 6f85d6470d790e9f7ba500dee6255fe01c822825 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Sun, 22 Mar 2020 23:52:34 -0700
Subject: [PATCH 741/770] initial changes for
 cl_khr_subgroup_non_uniform_arithmetic

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp         |  2 +-
 llvm-spirv/lib/SPIRV/OCLUtil.h                | 23 +++++++++++++
 .../lib/SPIRV/libSPIRV/SPIRVInstruction.h     | 32 +++++++++----------
 3 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 2b5bfd852d4c3..2a80eef9133b0 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -940,7 +940,7 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
           PreOps.push_back(G);
           StringRef Op = StringSwitch<StringRef>(FuncName)
               .StartsWith("ballot", "group_ballot_bit_count_")
-              .StartsWith("non_uniform_group", kSPIRVName::GroupNonUniformPrefix)
+              .StartsWith("non_uniform", kSPIRVName::GroupNonUniformPrefix)
               .Default(kSPIRVName::GroupPrefix);
           StringRef GroupOp = StringSwitch<StringRef>(FuncName)
               .Case("ballot_bit_count", "add")
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index 48b9a289c9952..61241be1ee38d 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -630,6 +630,12 @@ template <> inline void SPIRVMap<std::string, SPIRVGroupOperationKind>::init() {
   add("ballot_bit_count", GroupOperationReduce);
   add("ballot_inclusive_scan", GroupOperationInclusiveScan);
   add("ballot_exclusive_scan", GroupOperationExclusiveScan);
+  add("non_uniform_reduce", GroupOperationReduce);
+  add("non_uniform_scan_inclusive", GroupOperationInclusiveScan);
+  add("non_uniform_scan_exclusive", GroupOperationExclusiveScan);
+  add("non_uniform_reduce_logical", GroupOperationReduce);
+  add("non_uniform_scan_inclusive_logical", GroupOperationInclusiveScan);
+  add("non_uniform_scan_exclusive_logical", GroupOperationExclusiveScan);
 }
 
 template <> inline void SPIRVMap<std::string, SPIRVFPRoundingModeKind>::init() {
@@ -863,6 +869,23 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   _SPIRV_OP(group_ballot_bit_count_iadd, GroupNonUniformBallotBitCount)
   _SPIRV_OP(group_ballot_find_lsb, GroupNonUniformBallotFindLSB)
   _SPIRV_OP(group_ballot_find_msb, GroupNonUniformBallotFindMSB)
+  // cl_khr_subgroup_non_uniform_arithmetic
+  _SPIRV_OP(group_non_uniform_iadd, GroupNonUniformIAdd)
+  _SPIRV_OP(group_non_uniform_fadd, GroupNonUniformFAdd)
+  _SPIRV_OP(group_non_uniform_imul, GroupNonUniformIMul)
+  _SPIRV_OP(group_non_uniform_fmul, GroupNonUniformFMul)
+  _SPIRV_OP(group_non_uniform_smin, GroupNonUniformSMin)
+  _SPIRV_OP(group_non_uniform_umin, GroupNonUniformUMin)
+  _SPIRV_OP(group_non_uniform_fmin, GroupNonUniformFMin)
+  _SPIRV_OP(group_non_uniform_smax, GroupNonUniformSMax)
+  _SPIRV_OP(group_non_uniform_umax, GroupNonUniformUMax)
+  _SPIRV_OP(group_non_uniform_fmax, GroupNonUniformFMax)
+  _SPIRV_OP(group_non_uniform_iand, GroupNonUniformBitwiseAnd)
+  _SPIRV_OP(group_non_uniform_ior, GroupNonUniformBitwiseOr)
+  _SPIRV_OP(group_non_uniform_ixor, GroupNonUniformBitwiseXor)
+  //_SPIRV_OP(group_non_uniform_xxx, GroupNonUniformLogicalAnd)
+  //_SPIRV_OP(group_non_uniform_xxx, GroupNonUniformLogicalOr)
+  //_SPIRV_OP(group_non_uniform_xxx, GroupNonUniformLogicalXor)
 #undef _SPIRV_OP
 }
 
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 1a1a0716b302e..6d5b6daf69d48 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -2532,22 +2532,22 @@ _SPIRV_OP(GroupNonUniformShuffle, true, 6)
 _SPIRV_OP(GroupNonUniformShuffleXor, true, 6)
 _SPIRV_OP(GroupNonUniformShuffleUp, true, 6)
 _SPIRV_OP(GroupNonUniformShuffleDown, true, 6)
-_SPIRV_OP(GroupNonUniformIAdd, true, 6, true)
-_SPIRV_OP(GroupNonUniformFAdd, true, 6, true)
-_SPIRV_OP(GroupNonUniformIMul, true, 6, true)
-_SPIRV_OP(GroupNonUniformFMul, true, 6, true)
-_SPIRV_OP(GroupNonUniformSMin, true, 6, true)
-_SPIRV_OP(GroupNonUniformUMin, true, 6, true)
-_SPIRV_OP(GroupNonUniformFMin, true, 6, true)
-_SPIRV_OP(GroupNonUniformSMax, true, 6, true)
-_SPIRV_OP(GroupNonUniformUMax, true, 6, true)
-_SPIRV_OP(GroupNonUniformFMax, true, 6, true)
-_SPIRV_OP(GroupNonUniformBitwiseAnd, true, 6, true)
-_SPIRV_OP(GroupNonUniformBitwiseOr, true, 6, true)
-_SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, true)
-_SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, true)
-_SPIRV_OP(GroupNonUniformLogicalOr, true, 6, true)
-_SPIRV_OP(GroupNonUniformLogicalXor, true, 6, true)
+_SPIRV_OP(GroupNonUniformIAdd, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformFAdd, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformIMul, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformFMul, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformSMin, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformUMin, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformFMin, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformSMax, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformUMax, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformFMax, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformBitwiseAnd, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformBitwiseOr, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformLogicalOr, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformLogicalXor, true, 6, false, 1)
 
 #undef _SPIRV_OP
 

From ff35f174643b255710bd3542f6dc4480a456d063 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Mon, 23 Mar 2020 18:06:06 -0700
Subject: [PATCH 742/770] added cl_khr_subgroup_non_uniform_arithmetic

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp       | 21 +++++++++++++--------
 llvm-spirv/lib/SPIRV/OCLUtil.h              |  6 +++---
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h |  2 +-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 2a80eef9133b0..b8ee100084cac 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -939,14 +939,19 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
             return true; // continue
           PreOps.push_back(G);
           StringRef Op = StringSwitch<StringRef>(FuncName)
-              .StartsWith("ballot", "group_ballot_bit_count_")
-              .StartsWith("non_uniform", kSPIRVName::GroupNonUniformPrefix)
-              .Default(kSPIRVName::GroupPrefix);
+            .StartsWith("ballot", "group_ballot_bit_count_")
+            .StartsWith("non_uniform", kSPIRVName::GroupNonUniformPrefix)
+            .Default(kSPIRVName::GroupPrefix);
+          StringRef LogicalOp =
+            FuncName.contains("logical_") ?
+            "logical_" : "";
           StringRef GroupOp = StringSwitch<StringRef>(FuncName)
-              .Case("ballot_bit_count", "add")
-              .Case("ballot_inclusive_scan", "add")
-              .Case("ballot_exclusive_scan", "add")
-              .Default(FuncName.take_back(3));   // assumes op is three characters
+            .Case("ballot_bit_count", "add")
+            .Case("ballot_inclusive_scan", "add")
+            .Case("ballot_exclusive_scan", "add")
+            .Default(FuncName.take_back(3));    // assumes op is three characters
+          if (GroupOp.startswith("_"))
+            GroupOp = GroupOp.take_back(2);     // when op is two characters
           assert(!GroupOp.empty() && "Invalid OpenCL group builtin function");
           char OpTyC = 0;
           auto NeedSign = GroupOp == "max" || GroupOp == "min";
@@ -965,7 +970,7 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
           } else
             llvm_unreachable("Invalid OpenCL group builtin argument type");
 
-          DemangledName = Op.str() + OpTyC + GroupOp.str();
+          DemangledName = Op.str() + LogicalOp.str() + OpTyC + GroupOp.str();
           return false; // break out of loop
         });
   }
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index 61241be1ee38d..c4de3f5daecd4 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -883,9 +883,9 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   _SPIRV_OP(group_non_uniform_iand, GroupNonUniformBitwiseAnd)
   _SPIRV_OP(group_non_uniform_ior, GroupNonUniformBitwiseOr)
   _SPIRV_OP(group_non_uniform_ixor, GroupNonUniformBitwiseXor)
-  //_SPIRV_OP(group_non_uniform_xxx, GroupNonUniformLogicalAnd)
-  //_SPIRV_OP(group_non_uniform_xxx, GroupNonUniformLogicalOr)
-  //_SPIRV_OP(group_non_uniform_xxx, GroupNonUniformLogicalXor)
+  _SPIRV_OP(group_non_uniform_logical_iand, GroupNonUniformLogicalAnd)
+  _SPIRV_OP(group_non_uniform_logical_ior, GroupNonUniformLogicalOr)
+  _SPIRV_OP(group_non_uniform_logical_ixor, GroupNonUniformLogicalXor)
 #undef _SPIRV_OP
 }
 
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h
index 03478b62c6166..f7a4c7c854578 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVOpCode.h
@@ -132,7 +132,7 @@ inline bool hasExecScope(Op OpCode) {
 inline bool hasGroupOperation(Op OpCode) {
   unsigned OC = OpCode;
   return (OpGroupIAdd <= OC && OC <= OpGroupSMax) ||
-         (OpGroupNonUniformBallotBitCount <= OC && OpGroupNonUniformBallotFindMSB <= OC) ||
+         (OpGroupNonUniformBallotBitCount == OC) ||
          (OpGroupNonUniformIAdd <= OC && OC <= OpGroupNonUniformLogicalXor);
 }
 

From 8c6b5138bf6e98a6667a7183d2aae480e5e6b088 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Mon, 23 Mar 2020 23:12:46 -0700
Subject: [PATCH 743/770] added cl_khr_subgroup_shuffle

---
 llvm-spirv/lib/SPIRV/OCLUtil.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index c4de3f5daecd4..e4c8bca78e5c3 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -886,6 +886,9 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   _SPIRV_OP(group_non_uniform_logical_iand, GroupNonUniformLogicalAnd)
   _SPIRV_OP(group_non_uniform_logical_ior, GroupNonUniformLogicalOr)
   _SPIRV_OP(group_non_uniform_logical_ixor, GroupNonUniformLogicalXor)
+  // cl_khr_subgroup_shuffle
+  _SPIRV_OP(group_shuffle, GroupNonUniformShuffle)
+  _SPIRV_OP(group_shuffle_xor, GroupNonUniformShuffleXor)
 #undef _SPIRV_OP
 }
 

From 9ad880e677c6f3560fe7b1735e6e7780030ca615 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Mon, 23 Mar 2020 23:20:55 -0700
Subject: [PATCH 744/770] added cl_khr_subgroup_shuffle_relative

---
 llvm-spirv/lib/SPIRV/OCLUtil.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index e4c8bca78e5c3..8d62ae76d96f1 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -889,6 +889,9 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   // cl_khr_subgroup_shuffle
   _SPIRV_OP(group_shuffle, GroupNonUniformShuffle)
   _SPIRV_OP(group_shuffle_xor, GroupNonUniformShuffleXor)
+  // cl_khr_subgroup_shuffle_relative
+  _SPIRV_OP(group_shuffle_up, GroupNonUniformShuffleUp)
+  _SPIRV_OP(group_shuffle_down, GroupNonUniformShuffleDown)
 #undef _SPIRV_OP
 }
 

From b9baae5f4f308aae49117d7f4128660e9ed73805 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Fri, 27 Mar 2020 16:59:05 -0700
Subject: [PATCH 745/770] initial changes for cl_khr_subgroup_clustered_reduce

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp         |  6 +++-
 llvm-spirv/lib/SPIRV/OCLUtil.h                | 19 +++++++++++
 .../lib/SPIRV/libSPIRV/SPIRVInstruction.h     | 32 +++++++++----------
 3 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index b8ee100084cac..1855d72b83a94 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -942,6 +942,9 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
             .StartsWith("ballot", "group_ballot_bit_count_")
             .StartsWith("non_uniform", kSPIRVName::GroupNonUniformPrefix)
             .Default(kSPIRVName::GroupPrefix);
+          StringRef ClusteredOp =
+            FuncName.contains("clustered_") ?
+            "clustered_" : "";
           StringRef LogicalOp =
             FuncName.contains("logical_") ?
             "logical_" : "";
@@ -970,7 +973,8 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
           } else
             llvm_unreachable("Invalid OpenCL group builtin argument type");
 
-          DemangledName = Op.str() + LogicalOp.str() + OpTyC + GroupOp.str();
+          DemangledName = Op.str() + ClusteredOp.str() + LogicalOp.str() +
+            OpTyC + GroupOp.str();
           return false; // break out of loop
         });
   }
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index 8d62ae76d96f1..23bee5d96c9e4 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -636,6 +636,7 @@ template <> inline void SPIRVMap<std::string, SPIRVGroupOperationKind>::init() {
   add("non_uniform_reduce_logical", GroupOperationReduce);
   add("non_uniform_scan_inclusive_logical", GroupOperationInclusiveScan);
   add("non_uniform_scan_exclusive_logical", GroupOperationExclusiveScan);
+  add("clustered_reduce", GroupOperationClusteredReduce);
 }
 
 template <> inline void SPIRVMap<std::string, SPIRVFPRoundingModeKind>::init() {
@@ -892,6 +893,24 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   // cl_khr_subgroup_shuffle_relative
   _SPIRV_OP(group_shuffle_up, GroupNonUniformShuffleUp)
   _SPIRV_OP(group_shuffle_down, GroupNonUniformShuffleDown)
+  // cl_khr_subgroup_clustered_reduce
+  _SPIRV_OP(group_clustered_iadd, GroupNonUniformIAdd)
+  _SPIRV_OP(group_clustered_iadd, GroupNonUniformIAdd)
+  _SPIRV_OP(group_clustered_fadd, GroupNonUniformFAdd)
+  _SPIRV_OP(group_clustered_imul, GroupNonUniformIMul)
+  _SPIRV_OP(group_clustered_fmul, GroupNonUniformFMul)
+  _SPIRV_OP(group_clustered_smin, GroupNonUniformSMin)
+  _SPIRV_OP(group_clustered_umin, GroupNonUniformUMin)
+  _SPIRV_OP(group_clustered_fmin, GroupNonUniformFMin)
+  _SPIRV_OP(group_clustered_smax, GroupNonUniformSMax)
+  _SPIRV_OP(group_clustered_umax, GroupNonUniformUMax)
+  _SPIRV_OP(group_clustered_fmax, GroupNonUniformFMax)
+  _SPIRV_OP(group_clustered_iand, GroupNonUniformBitwiseAnd)
+  _SPIRV_OP(group_clustered_ior, GroupNonUniformBitwiseOr)
+  _SPIRV_OP(group_clustered_ixor, GroupNonUniformBitwiseXor)
+  _SPIRV_OP(group_clustered_logical_iand, GroupNonUniformLogicalAnd)
+  _SPIRV_OP(group_clustered_logical_ior, GroupNonUniformLogicalOr)
+  _SPIRV_OP(group_clustered_logical_ixor, GroupNonUniformLogicalXor)
 #undef _SPIRV_OP
 }
 
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 6d5b6daf69d48..73e681e6521dd 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -2532,22 +2532,22 @@ _SPIRV_OP(GroupNonUniformShuffle, true, 6)
 _SPIRV_OP(GroupNonUniformShuffleXor, true, 6)
 _SPIRV_OP(GroupNonUniformShuffleUp, true, 6)
 _SPIRV_OP(GroupNonUniformShuffleDown, true, 6)
-_SPIRV_OP(GroupNonUniformIAdd, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformFAdd, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformIMul, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformFMul, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformSMin, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformUMin, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformFMin, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformSMax, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformUMax, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformFMax, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformBitwiseAnd, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformBitwiseOr, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformLogicalOr, true, 6, false, 1)
-_SPIRV_OP(GroupNonUniformLogicalXor, true, 6, false, 1)
+_SPIRV_OP(GroupNonUniformIAdd, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformFAdd, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformIMul, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformFMul, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformSMin, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformUMin, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformFMin, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformSMax, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformUMax, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformFMax, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformBitwiseAnd, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformBitwiseOr, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformLogicalOr, true, 6, true, 1)
+_SPIRV_OP(GroupNonUniformLogicalXor, true, 6, true, 1)
 
 #undef _SPIRV_OP
 

From e32f8908d29eab71b26bfce2e37ad92f3ea78520 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Sun, 29 Mar 2020 23:37:36 -0700
Subject: [PATCH 746/770] properly return boolean types for elect and logical
 instructions

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 1855d72b83a94..cfb89e64b8cfb 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -979,19 +979,20 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
         });
   }
 
-  bool IsGroupAllAny = (DemangledName.find("_all") != std::string::npos ||
-                        DemangledName.find("_any") != std::string::npos);
-  bool IsGroupAllEqual = DemangledName.find("_all_equal") != std::string::npos;
-
-  // TODO: Need to convert arg to sub_group_ballot to i1!
+  const bool IsElect = DemangledName == "group_elect";
+  const bool IsAllOrAny = (DemangledName.find("_all") != std::string::npos ||
+                           DemangledName.find("_any") != std::string::npos);
+  const bool IsAllEqual = DemangledName.find("_all_equal") != std::string::npos;
+  const bool IsBallot = DemangledName == "group_ballot";
+  const bool IsLogical = DemangledName.find("_logical") != std::string::npos;
 
   auto Consts = getInt32(M, PreOps);
   OCLBuiltinTransInfo Info;
-  if (IsGroupAllAny || IsGroupAllEqual)
+  if (IsElect || IsAllOrAny || IsAllEqual || IsLogical)
     Info.RetTy = Type::getInt1Ty(*Ctx);
   Info.UniqName = DemangledName;
   Info.PostProc = [=](std::vector<Value *> &Ops) {
-    if (IsGroupAllAny && !IsGroupAllEqual) {
+    if ((IsAllOrAny && !IsAllEqual) || IsBallot || IsLogical) {
       IRBuilder<> IRB(CI);
       Ops[0] =
           IRB.CreateICmpNE(Ops[0], ConstantInt::get(Type::getInt32Ty(*Ctx), 0));

From 614ed23b66bb65f123db5fcccec1443debaf4995 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Sun, 29 Mar 2020 23:43:55 -0700
Subject: [PATCH 747/770] properly return boolean type for inverse ballot
 instruction

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index cfb89e64b8cfb..1970605ef182c 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -984,11 +984,12 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
                            DemangledName.find("_any") != std::string::npos);
   const bool IsAllEqual = DemangledName.find("_all_equal") != std::string::npos;
   const bool IsBallot = DemangledName == "group_ballot";
+  const bool IsInverseBallot = DemangledName == "group_inverse_ballot";
   const bool IsLogical = DemangledName.find("_logical") != std::string::npos;
 
   auto Consts = getInt32(M, PreOps);
   OCLBuiltinTransInfo Info;
-  if (IsElect || IsAllOrAny || IsAllEqual || IsLogical)
+  if (IsElect || IsAllOrAny || IsAllEqual || IsInverseBallot || IsLogical)
     Info.RetTy = Type::getInt1Ty(*Ctx);
   Info.UniqName = DemangledName;
   Info.PostProc = [=](std::vector<Value *> &Ops) {

From 42a98a16c11117f4fca886d4cf202fd266f660a3 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Mon, 6 Apr 2020 13:13:27 -0700
Subject: [PATCH 748/770] distinguish between signed and unsigned clustered min
 and max

---
 llvm-spirv/include/LLVMSPIRVOpts.h         |  6 +++++-
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp      | 20 ++++++++++++++++----
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h |  9 +++++++++
 3 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h
index 02230c4374aca..d5ce801de4af8 100644
--- a/llvm-spirv/include/LLVMSPIRVOpts.h
+++ b/llvm-spirv/include/LLVMSPIRVOpts.h
@@ -51,10 +51,14 @@ enum class VersionNumber : uint32_t {
   // Instruction
   SPIRV_1_0 = 0x00010000,
   SPIRV_1_1 = 0x00010100,
+  SPIRV_1_2 = 0x00010200,
+  SPIRV_1_3 = 0x00010300,
+  SPIRV_1_4 = 0x00010400,
+  SPIRV_1_5 = 0x00010500,
   // TODO: populate this enum with the latest versions (up to 1.4) once
   // translator get support of correponding features
   MinimumVersion = SPIRV_1_0,
-  MaximumVersion = SPIRV_1_1
+  MaximumVersion = SPIRV_1_5
 };
 
 enum class ExtensionID : uint32_t {
diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 1970605ef182c..7e68cb86f3592 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -957,15 +957,22 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
             GroupOp = GroupOp.take_back(2);     // when op is two characters
           assert(!GroupOp.empty() && "Invalid OpenCL group builtin function");
           char OpTyC = 0;
-          auto NeedSign = GroupOp == "max" || GroupOp == "min";
           auto OpTy = F->getReturnType();
           if (OpTy->isFloatingPointTy())
             OpTyC = 'f';
           else if (OpTy->isIntegerTy()) {
+            auto NeedSign = GroupOp == "max" || GroupOp == "min";
             if (!NeedSign)
               OpTyC = 'i';
             else {
-              if (isLastFuncParamSigned(F->getName()))
+              // clustered reduce args are (type, uint)
+              // other operation args are (type)
+              auto mangledName = F->getName();
+              auto mangledTyC =
+                ClusteredOp.empty() ?
+                mangledName.back() :
+                mangledName.take_back(2).front();
+              if (isMangledTypeSigned(mangledTyC))
                 OpTyC = 's';
               else
                 OpTyC = 'u';
@@ -985,15 +992,20 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
   const bool IsAllEqual = DemangledName.find("_all_equal") != std::string::npos;
   const bool IsBallot = DemangledName == "group_ballot";
   const bool IsInverseBallot = DemangledName == "group_inverse_ballot";
+  const bool IsBallotBitExtract = DemangledName == "group_ballot_bit_extract";
   const bool IsLogical = DemangledName.find("_logical") != std::string::npos;
 
+  const bool HasBoolReturnType = IsElect || IsAllOrAny || IsAllEqual ||
+      IsInverseBallot || IsBallotBitExtract || IsLogical;
+  const bool HasBoolArg = (IsAllOrAny && !IsAllEqual) || IsBallot || IsLogical;
+
   auto Consts = getInt32(M, PreOps);
   OCLBuiltinTransInfo Info;
-  if (IsElect || IsAllOrAny || IsAllEqual || IsInverseBallot || IsLogical)
+  if (HasBoolReturnType)
     Info.RetTy = Type::getInt1Ty(*Ctx);
   Info.UniqName = DemangledName;
   Info.PostProc = [=](std::vector<Value *> &Ops) {
-    if ((IsAllOrAny && !IsAllEqual) || IsBallot || IsLogical) {
+    if (HasBoolArg) {
       IRBuilder<> IRB(CI);
       Ops[0] =
           IRB.CreateICmpNE(Ops[0], ConstantInt::get(Type::getInt32Ty(*Ctx), 0));
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
index 772a9e7dc3556..17da5f0f1bf84 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
@@ -743,6 +743,15 @@ class SPIRVCapability : public SPIRVEntryNoId<OpCapability> {
 
   SPIRVWord getRequiredSPIRVVersion() const override {
     switch (Kind) {
+    case CapabilityGroupNonUniform:
+    case CapabilityGroupNonUniformVote:
+    case CapabilityGroupNonUniformArithmetic:
+    case CapabilityGroupNonUniformBallot:
+    case CapabilityGroupNonUniformShuffle:
+    case CapabilityGroupNonUniformShuffleRelative:
+    case CapabilityGroupNonUniformClustered:
+      return static_cast<SPIRVWord>(VersionNumber::SPIRV_1_3);
+
     case CapabilityNamedBarrier:
     case CapabilitySubgroupDispatch:
     case CapabilityPipeStorage:

From 97f0125a991eb4f57a32f4a09ab2d90510427eb2 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Fri, 10 Apr 2020 10:13:09 +0200
Subject: [PATCH 749/770] Add two-way translation test for
 sub_group_non_uniform_vote.

The SPIRV->LLVM half of this test is disabled in this commit (its run lines
are marked DISABLED instead of RUN). Change them back to RUN once the
SPIRV->LLVM translation is implemented.
---
 .../transcoding/sub_group_non_uniform_vote.ll | 254 ++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll

diff --git a/llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll b/llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll
new file mode 100644
index 0000000000000..6210a5a26f995
--- /dev/null
+++ b/llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll
@@ -0,0 +1,254 @@
+;; #pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
+;; 
+;; kernel void testSubGroupElect(global int* dst){
+;; 	dst[0] = sub_group_elect();
+;; }
+;; 
+;; kernel void testSubGroupNonUniformAll(global int* dst){
+;; 	dst[0] = sub_group_non_uniform_all(0); 
+;; }
+;; 
+;; kernel void testSubGroupNonUniformAny(global int* dst){
+;; 	dst[0] = sub_group_non_uniform_any(0);
+;; }
+;; 
+;; #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+;; kernel void testSubGroupNonUniformAllEqual(global int* dst){
+;;     {
+;;         char v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         uchar v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         short v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         ushort v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         int v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         uint v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         long v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         ulong v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         float v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         half v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;;     {
+;;         double v = 0;
+;;         dst[0] = sub_group_non_uniform_all_equal( v );
+;;     }
+;; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; Change DISABLED to RUN once SPIRV->LLVM translation is implemented
+; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
+; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; ModuleID = 'sub_group_non_uniform_vote.cl'
+source_filename = "sub_group_non_uniform_vote.cl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+; CHECK-SPIRV-DAG: TypeBool  [[bool:[0-9]+]]
+; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
+; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
+; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
+; CHECK-SPIRV-DAG: TypeInt   [[long:[0-9]+]]   64 0
+; CHECK-SPIRV-DAG: TypeFloat [[half:[0-9]+]]   16
+; CHECK-SPIRV-DAG: TypeFloat [[float:[0-9]+]]  32
+; CHECK-SPIRV-DAG: TypeFloat [[double:[0-9]+]] 64
+
+; CHECK-SPIRV-DAG: ConstantFalse [[bool]] [[false:[0-9]+]]
+; CHECK-SPIRV-DAG: Constant [[int]]    [[ScopeSubgroup:[0-9]+]] 3
+; CHECK-SPIRV-DAG: Constant [[char]]   [[char_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[short]]  [[short_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_0:[0-9]+]]         0
+; CHECK-SPIRV-DAG: Constant [[long]]   [[long_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[half]]   [[half_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[float]]  [[float_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[double]] [[double_0:[0-9]+]]      0
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformElect [[bool]] {{[0-9]+}} [[ScopeSubgroup]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testSubGroupElect
+; CHECK-LLVM: call spir_func i32 @_Z15sub_group_electv()
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testSubGroupElect(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z15sub_group_electv() #2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z15sub_group_electv() local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformAll [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[false]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testSubGroupNonUniformAll
+; CHECK-LLVM: call spir_func i32 @_Z25sub_group_non_uniform_alli(i32 {{.*}})
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testSubGroupNonUniformAll(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z25sub_group_non_uniform_alli(i32 0) #2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z25sub_group_non_uniform_alli(i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformAny [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[false]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testSubGroupNonUniformAny
+; CHECK-LLVM: call spir_func i32 @_Z25sub_group_non_uniform_anyi(i32 {{.*}})
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testSubGroupNonUniformAny(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z25sub_group_non_uniform_anyi(i32 0) #2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z25sub_group_non_uniform_anyi(i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[float_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[half_0]]
+; CHECK-SPIRV: GroupNonUniformAllEqual [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[double_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testSubGroupNonUniformAllEqual
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equalc(i8 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equalc(i8 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equals(i16 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equals(i16 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equali(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equali(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equall(i64 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equall(i64 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equalf(float {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equalDh(half {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_all_equald(double {{.*}})
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testSubGroupNonUniformAllEqual(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equalc(i8 signext 0) #2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %3 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equalh(i8 zeroext 0) #2
+  store i32 %3, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %4 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equals(i16 signext 0) #2
+  store i32 %4, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %5 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equalt(i16 zeroext 0) #2
+  store i32 %5, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %6 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equali(i32 0) #2
+  store i32 %6, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %7 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equalj(i32 0) #2
+  store i32 %7, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %8 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equall(i64 0) #2
+  store i32 %8, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %9 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equalm(i64 0) #2
+  store i32 %9, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %10 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equalf(float 0.000000e+00) #2
+  store i32 %10, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %11 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equalDh(half 0xH0000) #2
+  store i32 %11, i32 addrspace(1)* %0, align 4, !tbaa !7
+  %12 = tail call spir_func i32 @_Z31sub_group_non_uniform_all_equald(double 0.000000e+00) #2
+  store i32 %12, i32 addrspace(1)* %0, align 4, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equalc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equalh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equals(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equalt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equali(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equalj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equall(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equalm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equalf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equalDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_all_equald(double) local_unnamed_addr #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 9.0.1 (https://github.com/llvm/llvm-project.git cb6d58d1dcf36a29ae5dd24ff891d6552f00bac7)"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"int*"}
+!6 = !{!""}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C/C++ TBAA"}

From 49b65720cb0497e49beb53bb397a10551c90b5d3 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Wed, 15 Apr 2020 09:35:33 +0200
Subject: [PATCH 750/770] Fix group_ops test

---
 llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp b/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp
index 1c09fc76a5ef4..74fff6c5b02fb 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp
@@ -257,14 +257,29 @@ void SPIRVToOCL::visitCallSPIRVGroupBuiltin(CallInst *CI, Op OC) {
   if (!HasGroupOperation) {
     DemangledName = Prefix + DemangledName;
   } else {
-    auto GO = getArgAs<spv::GroupOperation>(CI, 1);
     StringRef Op = DemangledName;
     Op = Op.drop_front(strlen(kSPIRVName::GroupPrefix));
     bool Unsigned = Op.front() == 'u';
     if (!Unsigned)
       Op = Op.drop_front(1);
-    DemangledName = Prefix + kSPIRVName::GroupPrefix +
-                    SPIRSPIRVGroupOperationMap::rmap(GO) + '_' + Op.str();
+
+    auto GO = getArgAs<spv::GroupOperation>(CI, 1);
+    std::string GroupOp = "";
+    switch (GO) {
+    case GroupOperationReduce:
+      GroupOp = "reduce";
+      break;
+    case GroupOperationInclusiveScan:
+      GroupOp = "scan_inclusive";
+      break;
+    case GroupOperationExclusiveScan:
+      GroupOp = "scan_exclusive";
+      break;
+    default:
+      assert(!"Unsupported group operation");
+      break;
+    }
+    DemangledName = Prefix + kSPIRVName::GroupPrefix + GroupOp + '_' + Op.str();
   }
   assert(CI->getCalledFunction() && "Unexpected indirect call");
   AttributeList Attrs = CI->getCalledFunction()->getAttributes();

From dd1bf56fa88c03dc14ede5f7ffa6e5cf064e17c0 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Wed, 15 Apr 2020 13:04:39 +0200
Subject: [PATCH 751/770] Add two-way translation test for sub_group_ballot.

---
 .../test/transcoding/sub_group_ballot.ll      | 1192 +++++++++++++++++
 1 file changed, 1192 insertions(+)
 create mode 100644 llvm-spirv/test/transcoding/sub_group_ballot.ll

diff --git a/llvm-spirv/test/transcoding/sub_group_ballot.ll b/llvm-spirv/test/transcoding/sub_group_ballot.ll
new file mode 100644
index 0000000000000..ede9b6f5cbf8e
--- /dev/null
+++ b/llvm-spirv/test/transcoding/sub_group_ballot.ll
@@ -0,0 +1,1192 @@
+;; #pragma OPENCL EXTENSION cl_khr_subgroup_ballot : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+;; 
+;; kernel void testNonUniformBroadcastChars()
+;; {
+;;     char16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastUChars()
+;; {
+;;     uchar16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastShorts()
+;; {
+;;     short16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastUShorts()
+;; {
+;;     ushort16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastInts()
+;; {
+;;     int16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastUInts()
+;; {
+;;     uint16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastLongs()
+;; {
+;;     long16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastULongs()
+;; {
+;;     ulong16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastFloats()
+;; {
+;;     float16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastHalfs()
+;; {
+;;     half16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testNonUniformBroadcastDoubles()
+;; {
+;;     double16 v = 0;
+;;     v.s0 = sub_group_non_uniform_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_non_uniform_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_non_uniform_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_non_uniform_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_non_uniform_broadcast(v.s01234567, 0);
+;;     v = sub_group_non_uniform_broadcast(v, 0);
+;;     v.s0 = sub_group_broadcast_first(v.s0);
+;; }
+;; 
+;; kernel void testBallotOperations(global uint* dst)
+;; {
+;;     uint4 v = sub_group_ballot(0);
+;;     dst[0] = sub_group_inverse_ballot(v);
+;;     dst[1] = sub_group_ballot_bit_extract(v, 0);
+;;     dst[2] = sub_group_ballot_bit_count(v);
+;;     dst[3] = sub_group_ballot_inclusive_scan(v);
+;;     dst[4] = sub_group_ballot_exclusive_scan(v);
+;;     dst[5] = sub_group_ballot_find_lsb(v);
+;;     dst[6] = sub_group_ballot_find_msb(v);
+;; }
+;; 
+;; kernel void testSubgroupMasks(global uint4* dst)
+;; {
+;;     dst[0] = get_sub_group_eq_mask();
+;;     dst[1] = get_sub_group_ge_mask();
+;;     dst[2] = get_sub_group_gt_mask();
+;;     dst[3] = get_sub_group_le_mask();
+;;     dst[4] = get_sub_group_lt_mask();
+;; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; TODO: Change DISABLED to RUN once SPIRV->LLVM translation is implemented; that enables the CHECK-LLVM checks below.
+; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
+; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; ModuleID = 'subgroup_ballot.cl'
+source_filename = "subgroup_ballot.cl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+; CHECK-SPIRV-DAG: Decorate [[eqMask:[0-9]+]] BuiltIn 4416
+; CHECK-SPIRV-DAG: Decorate [[geMask:[0-9]+]] BuiltIn 4417
+; CHECK-SPIRV-DAG: Decorate [[gtMask:[0-9]+]] BuiltIn 4418
+; CHECK-SPIRV-DAG: Decorate [[leMask:[0-9]+]] BuiltIn 4419
+; CHECK-SPIRV-DAG: Decorate [[ltMask:[0-9]+]] BuiltIn 4420
+
+; CHECK-SPIRV-DAG: TypeBool  [[bool:[0-9]+]]
+; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
+; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
+; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
+; CHECK-SPIRV-DAG: TypeInt   [[long:[0-9]+]]   64 0
+; CHECK-SPIRV-DAG: TypeFloat [[half:[0-9]+]]   16
+; CHECK-SPIRV-DAG: TypeFloat [[float:[0-9]+]]  32
+; CHECK-SPIRV-DAG: TypeFloat [[double:[0-9]+]] 64
+
+; CHECK-SPIRV-DAG: TypeVector [[char2:[0-9]+]]  [[char]] 2
+; CHECK-SPIRV-DAG: TypeVector [[char3:[0-9]+]]  [[char]] 3
+; CHECK-SPIRV-DAG: TypeVector [[char4:[0-9]+]]  [[char]] 4
+; CHECK-SPIRV-DAG: TypeVector [[char8:[0-9]+]]  [[char]] 8
+; CHECK-SPIRV-DAG: TypeVector [[char16:[0-9]+]] [[char]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[short2:[0-9]+]]  [[short]] 2
+; CHECK-SPIRV-DAG: TypeVector [[short3:[0-9]+]]  [[short]] 3
+; CHECK-SPIRV-DAG: TypeVector [[short4:[0-9]+]]  [[short]] 4
+; CHECK-SPIRV-DAG: TypeVector [[short8:[0-9]+]]  [[short]] 8
+; CHECK-SPIRV-DAG: TypeVector [[short16:[0-9]+]] [[short]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[int2:[0-9]+]]  [[int]] 2
+; CHECK-SPIRV-DAG: TypeVector [[int3:[0-9]+]]  [[int]] 3
+; CHECK-SPIRV-DAG: TypeVector [[int4:[0-9]+]]  [[int]] 4
+; CHECK-SPIRV-DAG: TypeVector [[int8:[0-9]+]]  [[int]] 8
+; CHECK-SPIRV-DAG: TypeVector [[int16:[0-9]+]] [[int]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[long2:[0-9]+]]  [[long]] 2
+; CHECK-SPIRV-DAG: TypeVector [[long3:[0-9]+]]  [[long]] 3
+; CHECK-SPIRV-DAG: TypeVector [[long4:[0-9]+]]  [[long]] 4
+; CHECK-SPIRV-DAG: TypeVector [[long8:[0-9]+]]  [[long]] 8
+; CHECK-SPIRV-DAG: TypeVector [[long16:[0-9]+]] [[long]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[float2:[0-9]+]]  [[float]] 2
+; CHECK-SPIRV-DAG: TypeVector [[float3:[0-9]+]]  [[float]] 3
+; CHECK-SPIRV-DAG: TypeVector [[float4:[0-9]+]]  [[float]] 4
+; CHECK-SPIRV-DAG: TypeVector [[float8:[0-9]+]]  [[float]] 8
+; CHECK-SPIRV-DAG: TypeVector [[float16:[0-9]+]] [[float]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[half2:[0-9]+]]  [[half]] 2
+; CHECK-SPIRV-DAG: TypeVector [[half3:[0-9]+]]  [[half]] 3
+; CHECK-SPIRV-DAG: TypeVector [[half4:[0-9]+]]  [[half]] 4
+; CHECK-SPIRV-DAG: TypeVector [[half8:[0-9]+]]  [[half]] 8
+; CHECK-SPIRV-DAG: TypeVector [[half16:[0-9]+]] [[half]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[double2:[0-9]+]]  [[double]] 2
+; CHECK-SPIRV-DAG: TypeVector [[double3:[0-9]+]]  [[double]] 3
+; CHECK-SPIRV-DAG: TypeVector [[double4:[0-9]+]]  [[double]] 4
+; CHECK-SPIRV-DAG: TypeVector [[double8:[0-9]+]]  [[double]] 8
+; CHECK-SPIRV-DAG: TypeVector [[double16:[0-9]+]] [[double]] 16
+
+; CHECK-SPIRV-DAG: ConstantFalse [[bool]] [[false:[0-9]+]]
+; CHECK-SPIRV-DAG: Constant [[int]]    [[ScopeSubgroup:[0-9]+]] 3
+; CHECK-SPIRV-DAG: Constant [[char]]   [[char_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[short]]  [[short_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_0:[0-9]+]]         0
+; CHECK-SPIRV-DAG: Constant [[long]]   [[long_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[half]]   [[half_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[float]]  [[float_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[double]] [[double_0:[0-9]+]]      0
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char2]] [[char2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char2]] {{[0-9]+}} [[ScopeSubgroup]] [[char2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char3]] [[char3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char3]] {{[0-9]+}} [[ScopeSubgroup]] [[char3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char4]] [[char4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char4]] {{[0-9]+}} [[ScopeSubgroup]] [[char4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char8]] [[char8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char8]] {{[0-9]+}} [[ScopeSubgroup]] [[char8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[char16]] [[char16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char16]] {{[0-9]+}} [[ScopeSubgroup]] [[char16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[char]] [[char_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastChars
+; CHECK-LLVM: call spir_func i8 @_Z31sub_group_non_uniform_broadcasthj(i8 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i8> @_Z31sub_group_non_uniform_broadcastDv2_hj(<2 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i8> @_Z31sub_group_non_uniform_broadcastDv3_hj(<3 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i8> @_Z31sub_group_non_uniform_broadcastDv4_hj(<4 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i8> @_Z31sub_group_non_uniform_broadcastDv8_hj(<8 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i8> @_Z31sub_group_non_uniform_broadcastDv16_hj(<16 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i8 @_Z25sub_group_broadcast_firsth(i8 {{.*}})
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformBroadcastChars() local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; IR for the OpenCL kernel testNonUniformBroadcastChars in the header comment (char16 v = 0)
+  %1 = tail call spir_func signext i8 @_Z31sub_group_non_uniform_broadcastcj(i8 signext 0, i32 0) #7 ; v.s0 = sub_group_non_uniform_broadcast(v.s0, 0)
+  %2 = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %1, i64 0 ; v with lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <2 x i32> <i32 0, i32 1> ; extract v.s01
+  %4 = tail call spir_func <2 x i8> @_Z31sub_group_non_uniform_broadcastDv2_cj(<2 x i8> %3, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s01, 0)
+  %5 = shufflevector <2 x i8> %4, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 2-lane result to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i8> %5, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s01 = result: lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 pick the second operand)
+  %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract v.s012
+  %8 = tail call spir_func <3 x i8> @_Z31sub_group_non_uniform_broadcastDv3_cj(<3 x i8> %7, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s012, 0)
+  %9 = shufflevector <3 x i8> %8, <3 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 3-lane result to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i8> %9, <16 x i8> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s012 = result: lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i8> %10, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract v.s0123
+  %12 = tail call spir_func <4 x i8> @_Z31sub_group_non_uniform_broadcastDv4_cj(<4 x i8> %11, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s0123, 0)
+  %13 = shufflevector <4 x i8> %12, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 4-lane result to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i8> %13, <16 x i8> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s0123 = result: lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i8> %14, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract v.s01234567
+  %16 = tail call spir_func <8 x i8> @_Z31sub_group_non_uniform_broadcastDv8_cj(<8 x i8> %15, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s01234567, 0)
+  %17 = shufflevector <8 x i8> %16, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 8-lane result to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i8> %17, <16 x i8> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s01234567 = result: lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i8> @_Z31sub_group_non_uniform_broadcastDv16_cj(<16 x i8> %18, i32 0) #7 ; v = sub_group_non_uniform_broadcast(v, 0)
+  %20 = extractelement <16 x i8> %19, i64 0 ; extract v.s0
+  %21 = tail call spir_func signext i8 @_Z25sub_group_broadcast_firstc(i8 signext %20) #7 ; sub_group_broadcast_first(v.s0); the result %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z31sub_group_non_uniform_broadcastcj(i8 signext, i32) local_unnamed_addr #1 ; Itanium-mangled sub_group_non_uniform_broadcast(char, uint): suffix "c" = char, "j" = unsigned int
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i8> @_Z31sub_group_non_uniform_broadcastDv2_cj(<2 x i8>, i32) local_unnamed_addr #1 ; "Dv2_c" = vector of 2 char (char2 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i8> @_Z31sub_group_non_uniform_broadcastDv3_cj(<3 x i8>, i32) local_unnamed_addr #1 ; "Dv3_c" = vector of 3 char (char3 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i8> @_Z31sub_group_non_uniform_broadcastDv4_cj(<4 x i8>, i32) local_unnamed_addr #1 ; "Dv4_c" = vector of 4 char (char4 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i8> @_Z31sub_group_non_uniform_broadcastDv8_cj(<8 x i8>, i32) local_unnamed_addr #1 ; "Dv8_c" = vector of 8 char (char8 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i8> @_Z31sub_group_non_uniform_broadcastDv16_cj(<16 x i8>, i32) local_unnamed_addr #1 ; "Dv16_c" = vector of 16 char (char16 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z25sub_group_broadcast_firstc(i8 signext) local_unnamed_addr #1 ; Itanium-mangled sub_group_broadcast_first(char)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char2]] [[char2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char2]] {{[0-9]+}} [[ScopeSubgroup]] [[char2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char3]] [[char3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char3]] {{[0-9]+}} [[ScopeSubgroup]] [[char3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char4]] [[char4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char4]] {{[0-9]+}} [[ScopeSubgroup]] [[char4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char8]] [[char8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char8]] {{[0-9]+}} [[ScopeSubgroup]] [[char8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[char16]] [[char16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[char16]] {{[0-9]+}} [[ScopeSubgroup]] [[char16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[char]] [[char_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastUChars
+; CHECK-LLVM: call spir_func i8 @_Z31sub_group_non_uniform_broadcasthj(i8 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i8> @_Z31sub_group_non_uniform_broadcastDv2_hj(<2 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i8> @_Z31sub_group_non_uniform_broadcastDv3_hj(<3 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i8> @_Z31sub_group_non_uniform_broadcastDv4_hj(<4 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i8> @_Z31sub_group_non_uniform_broadcastDv8_hj(<8 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i8> @_Z31sub_group_non_uniform_broadcastDv16_hj(<16 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i8 @_Z25sub_group_broadcast_firsth(i8 {{.*}})
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformBroadcastUChars() local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; IR for the OpenCL kernel testNonUniformBroadcastUChars in the header comment (uchar16 v = 0)
+  %1 = tail call spir_func zeroext i8 @_Z31sub_group_non_uniform_broadcasthj(i8 zeroext 0, i32 0) #7 ; v.s0 = sub_group_non_uniform_broadcast(v.s0, 0)
+  %2 = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %1, i64 0 ; v with lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <2 x i32> <i32 0, i32 1> ; extract v.s01
+  %4 = tail call spir_func <2 x i8> @_Z31sub_group_non_uniform_broadcastDv2_hj(<2 x i8> %3, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s01, 0)
+  %5 = shufflevector <2 x i8> %4, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 2-lane result to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i8> %5, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s01 = result: lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 pick the second operand)
+  %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract v.s012
+  %8 = tail call spir_func <3 x i8> @_Z31sub_group_non_uniform_broadcastDv3_hj(<3 x i8> %7, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s012, 0)
+  %9 = shufflevector <3 x i8> %8, <3 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 3-lane result to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i8> %9, <16 x i8> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s012 = result: lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i8> %10, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract v.s0123
+  %12 = tail call spir_func <4 x i8> @_Z31sub_group_non_uniform_broadcastDv4_hj(<4 x i8> %11, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s0123, 0)
+  %13 = shufflevector <4 x i8> %12, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 4-lane result to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i8> %13, <16 x i8> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s0123 = result: lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i8> %14, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract v.s01234567
+  %16 = tail call spir_func <8 x i8> @_Z31sub_group_non_uniform_broadcastDv8_hj(<8 x i8> %15, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s01234567, 0)
+  %17 = shufflevector <8 x i8> %16, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 8-lane result to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i8> %17, <16 x i8> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s01234567 = result: lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i8> @_Z31sub_group_non_uniform_broadcastDv16_hj(<16 x i8> %18, i32 0) #7 ; v = sub_group_non_uniform_broadcast(v, 0)
+  %20 = extractelement <16 x i8> %19, i64 0 ; extract v.s0
+  %21 = tail call spir_func zeroext i8 @_Z25sub_group_broadcast_firsth(i8 zeroext %20) #7 ; sub_group_broadcast_first(v.s0); the result %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z31sub_group_non_uniform_broadcasthj(i8 zeroext, i32) local_unnamed_addr #1 ; Itanium-mangled sub_group_non_uniform_broadcast(uchar, uint): suffix "h" = unsigned char, "j" = unsigned int
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i8> @_Z31sub_group_non_uniform_broadcastDv2_hj(<2 x i8>, i32) local_unnamed_addr #1 ; "Dv2_h" = vector of 2 unsigned char (uchar2 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i8> @_Z31sub_group_non_uniform_broadcastDv3_hj(<3 x i8>, i32) local_unnamed_addr #1 ; "Dv3_h" = vector of 3 unsigned char (uchar3 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i8> @_Z31sub_group_non_uniform_broadcastDv4_hj(<4 x i8>, i32) local_unnamed_addr #1 ; "Dv4_h" = vector of 4 unsigned char (uchar4 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i8> @_Z31sub_group_non_uniform_broadcastDv8_hj(<8 x i8>, i32) local_unnamed_addr #1 ; "Dv8_h" = vector of 8 unsigned char (uchar8 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i8> @_Z31sub_group_non_uniform_broadcastDv16_hj(<16 x i8>, i32) local_unnamed_addr #1 ; "Dv16_h" = vector of 16 unsigned char (uchar16 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z25sub_group_broadcast_firsth(i8 zeroext) local_unnamed_addr #1 ; Itanium-mangled sub_group_broadcast_first(uchar)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short2]] [[short2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short2]] {{[0-9]+}} [[ScopeSubgroup]] [[short2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short3]] [[short3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short3]] {{[0-9]+}} [[ScopeSubgroup]] [[short3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short4]] [[short4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short4]] {{[0-9]+}} [[ScopeSubgroup]] [[short4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short8]] [[short8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short8]] {{[0-9]+}} [[ScopeSubgroup]] [[short8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[short16]] [[short16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short16]] {{[0-9]+}} [[ScopeSubgroup]] [[short16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[short]] [[short_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastShorts
+; CHECK-LLVM: call spir_func i16 @_Z31sub_group_non_uniform_broadcasttj(i16 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i16> @_Z31sub_group_non_uniform_broadcastDv2_tj(<2 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i16> @_Z31sub_group_non_uniform_broadcastDv3_tj(<3 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i16> @_Z31sub_group_non_uniform_broadcastDv4_tj(<4 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i16> @_Z31sub_group_non_uniform_broadcastDv8_tj(<8 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i16> @_Z31sub_group_non_uniform_broadcastDv16_tj(<16 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i16 @_Z25sub_group_broadcast_firstt(i16 {{.*}})
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformBroadcastShorts() local_unnamed_addr #2 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; IR for the OpenCL kernel testNonUniformBroadcastShorts in the header comment (short16 v = 0)
+  %1 = tail call spir_func signext i16 @_Z31sub_group_non_uniform_broadcastsj(i16 signext 0, i32 0) #7 ; v.s0 = sub_group_non_uniform_broadcast(v.s0, 0)
+  %2 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %1, i64 0 ; v with lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <2 x i32> <i32 0, i32 1> ; extract v.s01
+  %4 = tail call spir_func <2 x i16> @_Z31sub_group_non_uniform_broadcastDv2_sj(<2 x i16> %3, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s01, 0)
+  %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 2-lane result to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i16> %5, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s01 = result: lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 pick the second operand)
+  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract v.s012
+  %8 = tail call spir_func <3 x i16> @_Z31sub_group_non_uniform_broadcastDv3_sj(<3 x i16> %7, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s012, 0)
+  %9 = shufflevector <3 x i16> %8, <3 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 3-lane result to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i16> %9, <16 x i16> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s012 = result: lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i16> %10, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract v.s0123
+  %12 = tail call spir_func <4 x i16> @_Z31sub_group_non_uniform_broadcastDv4_sj(<4 x i16> %11, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s0123, 0)
+  %13 = shufflevector <4 x i16> %12, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 4-lane result to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i16> %13, <16 x i16> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s0123 = result: lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i16> %14, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract v.s01234567
+  %16 = tail call spir_func <8 x i16> @_Z31sub_group_non_uniform_broadcastDv8_sj(<8 x i16> %15, i32 0) #7 ; sub_group_non_uniform_broadcast(v.s01234567, 0)
+  %17 = shufflevector <8 x i16> %16, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 8-lane result to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i16> %17, <16 x i16> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; v.s01234567 = result: lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i16> @_Z31sub_group_non_uniform_broadcastDv16_sj(<16 x i16> %18, i32 0) #7 ; v = sub_group_non_uniform_broadcast(v, 0)
+  %20 = extractelement <16 x i16> %19, i64 0 ; extract v.s0
+  %21 = tail call spir_func signext i16 @_Z25sub_group_broadcast_firsts(i16 signext %20) #7 ; sub_group_broadcast_first(v.s0); the result %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z31sub_group_non_uniform_broadcastsj(i16 signext, i32) local_unnamed_addr #1 ; Itanium-mangled sub_group_non_uniform_broadcast(short, uint): suffix "s" = short, "j" = unsigned int
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i16> @_Z31sub_group_non_uniform_broadcastDv2_sj(<2 x i16>, i32) local_unnamed_addr #1 ; "Dv2_s" = vector of 2 short (short2 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i16> @_Z31sub_group_non_uniform_broadcastDv3_sj(<3 x i16>, i32) local_unnamed_addr #1 ; "Dv3_s" = vector of 3 short (short3 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i16> @_Z31sub_group_non_uniform_broadcastDv4_sj(<4 x i16>, i32) local_unnamed_addr #1 ; "Dv4_s" = vector of 4 short (short4 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i16> @_Z31sub_group_non_uniform_broadcastDv8_sj(<8 x i16>, i32) local_unnamed_addr #1 ; "Dv8_s" = vector of 8 short (short8 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i16> @_Z31sub_group_non_uniform_broadcastDv16_sj(<16 x i16>, i32) local_unnamed_addr #1 ; "Dv16_s" = vector of 16 short (short16 overload)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z25sub_group_broadcast_firsts(i16 signext) local_unnamed_addr #1 ; Itanium-mangled sub_group_broadcast_first(short)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short2]] [[short2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short2]] {{[0-9]+}} [[ScopeSubgroup]] [[short2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short3]] [[short3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short3]] {{[0-9]+}} [[ScopeSubgroup]] [[short3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short4]] [[short4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short4]] {{[0-9]+}} [[ScopeSubgroup]] [[short4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short8]] [[short8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short8]] {{[0-9]+}} [[ScopeSubgroup]] [[short8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[short16]] [[short16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[short16]] {{[0-9]+}} [[ScopeSubgroup]] [[short16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[short]] [[short_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastUShorts
+; CHECK-LLVM: call spir_func i16 @_Z31sub_group_non_uniform_broadcasttj(i16 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i16> @_Z31sub_group_non_uniform_broadcastDv2_tj(<2 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i16> @_Z31sub_group_non_uniform_broadcastDv3_tj(<3 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i16> @_Z31sub_group_non_uniform_broadcastDv4_tj(<4 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i16> @_Z31sub_group_non_uniform_broadcastDv8_tj(<8 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i16> @_Z31sub_group_non_uniform_broadcastDv16_tj(<16 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i16 @_Z25sub_group_broadcast_firstt(i16 {{.*}})
+
+; Function Attrs: convergent nounwind -- kernel under test (unsigned short, mangled 't'): sub_group_non_uniform_broadcast on a scalar and on 2/3/4/8/16-lane vectors, then sub_group_broadcast_first.
+define dso_local spir_kernel void @testNonUniformBroadcastUShorts() local_unnamed_addr #2 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func zeroext i16 @_Z31sub_group_non_uniform_broadcasttj(i16 zeroext 0, i32 0) #7
+  %2 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %1, i64 0 ; lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <2 x i32> <i32 0, i32 1> ; narrow: keep lanes 0-1 as the 2-lane argument
+  %4 = tail call spir_func <2 x i16> @_Z31sub_group_non_uniform_broadcastDv2_tj(<2 x i16> %3, i32 0) #7
+  %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen back to 16 lanes; lanes 2-15 are undef
+  %6 = shufflevector <16 x i16> %5, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; merge: lanes 0-1 from %5, lanes 2-15 from %2 (mask values 18-31 index the second operand)
+  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2> ; the same narrow / call / widen / merge steps repeat below for 3, 4 and 8 lanes
+  %8 = tail call spir_func <3 x i16> @_Z31sub_group_non_uniform_broadcastDv3_tj(<3 x i16> %7, i32 0) #7
+  %9 = shufflevector <3 x i16> %8, <3 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %10 = shufflevector <16 x i16> %9, <16 x i16> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %11 = shufflevector <16 x i16> %10, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = tail call spir_func <4 x i16> @_Z31sub_group_non_uniform_broadcastDv4_tj(<4 x i16> %11, i32 0) #7
+  %13 = shufflevector <4 x i16> %12, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = shufflevector <16 x i16> %13, <16 x i16> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %15 = shufflevector <16 x i16> %14, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = tail call spir_func <8 x i16> @_Z31sub_group_non_uniform_broadcastDv8_tj(<8 x i16> %15, i32 0) #7
+  %17 = shufflevector <8 x i16> %16, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %18 = shufflevector <16 x i16> %17, <16 x i16> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %19 = tail call spir_func <16 x i16> @_Z31sub_group_non_uniform_broadcastDv16_tj(<16 x i16> %18, i32 0) #7 ; %18 is already 16 lanes, so it is passed without narrowing
+  %20 = extractelement <16 x i16> %19, i64 0 ; lane 0 of the 16-lane result
+  %21 = tail call spir_func zeroext i16 @_Z25sub_group_broadcast_firstt(i16 zeroext %20) #7 ; %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z31sub_group_non_uniform_broadcasttj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i16> @_Z31sub_group_non_uniform_broadcastDv2_tj(<2 x i16>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i16> @_Z31sub_group_non_uniform_broadcastDv3_tj(<3 x i16>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i16> @_Z31sub_group_non_uniform_broadcastDv4_tj(<4 x i16>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i16> @_Z31sub_group_non_uniform_broadcastDv8_tj(<8 x i16>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i16> @_Z31sub_group_non_uniform_broadcastDv16_tj(<16 x i16>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z25sub_group_broadcast_firstt(i16 zeroext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int2]] [[int2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int2]] {{[0-9]+}} [[ScopeSubgroup]] [[int2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int3]] [[int3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int3]] {{[0-9]+}} [[ScopeSubgroup]] [[int3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int4]] [[int4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int4]] {{[0-9]+}} [[ScopeSubgroup]] [[int4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int8]] [[int8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int8]] {{[0-9]+}} [[ScopeSubgroup]] [[int8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[int16]] [[int16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int16]] {{[0-9]+}} [[ScopeSubgroup]] [[int16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[int]] [[int_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastInts
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_broadcastjj(i32 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i32> @_Z31sub_group_non_uniform_broadcastDv2_jj(<2 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i32> @_Z31sub_group_non_uniform_broadcastDv3_jj(<3 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i32> @_Z31sub_group_non_uniform_broadcastDv4_jj(<4 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i32> @_Z31sub_group_non_uniform_broadcastDv8_jj(<8 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i32> @_Z31sub_group_non_uniform_broadcastDv16_jj(<16 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z25sub_group_broadcast_firstj(i32 {{.*}})
+
+; Function Attrs: convergent nounwind -- kernel under test (signed int, mangled 'i'): sub_group_non_uniform_broadcast on a scalar and on 2/3/4/8/16-lane vectors, then sub_group_broadcast_first. NOTE(review): the LLVM-side FileCheck patterns above name the 'j' (unsigned) manglings; presumably signedness is not preserved by the round trip -- confirm.
+define dso_local spir_kernel void @testNonUniformBroadcastInts() local_unnamed_addr #3 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func i32 @_Z31sub_group_non_uniform_broadcastij(i32 0, i32 0) #7
+  %2 = insertelement <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %1, i64 0 ; lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <2 x i32> <i32 0, i32 1> ; narrow: keep lanes 0-1 as the 2-lane argument
+  %4 = tail call spir_func <2 x i32> @_Z31sub_group_non_uniform_broadcastDv2_ij(<2 x i32> %3, i32 0) #7
+  %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen back to 16 lanes; lanes 2-15 are undef
+  %6 = shufflevector <16 x i32> %5, <16 x i32> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; merge: lanes 0-1 from %5, lanes 2-15 from %2 (mask values 18-31 index the second operand)
+  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> ; the same narrow / call / widen / merge steps repeat below for 3, 4 and 8 lanes
+  %8 = tail call spir_func <3 x i32> @_Z31sub_group_non_uniform_broadcastDv3_ij(<3 x i32> %7, i32 0) #7
+  %9 = shufflevector <3 x i32> %8, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %10 = shufflevector <16 x i32> %9, <16 x i32> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %11 = shufflevector <16 x i32> %10, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = tail call spir_func <4 x i32> @_Z31sub_group_non_uniform_broadcastDv4_ij(<4 x i32> %11, i32 0) #7
+  %13 = shufflevector <4 x i32> %12, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = shufflevector <16 x i32> %13, <16 x i32> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %15 = shufflevector <16 x i32> %14, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = tail call spir_func <8 x i32> @_Z31sub_group_non_uniform_broadcastDv8_ij(<8 x i32> %15, i32 0) #7
+  %17 = shufflevector <8 x i32> %16, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %18 = shufflevector <16 x i32> %17, <16 x i32> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %19 = tail call spir_func <16 x i32> @_Z31sub_group_non_uniform_broadcastDv16_ij(<16 x i32> %18, i32 0) #7 ; %18 is already 16 lanes, so it is passed without narrowing
+  %20 = extractelement <16 x i32> %19, i64 0 ; lane 0 of the 16-lane result
+  %21 = tail call spir_func i32 @_Z25sub_group_broadcast_firsti(i32 %20) #7 ; %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_broadcastij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i32> @_Z31sub_group_non_uniform_broadcastDv2_ij(<2 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i32> @_Z31sub_group_non_uniform_broadcastDv3_ij(<3 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i32> @_Z31sub_group_non_uniform_broadcastDv4_ij(<4 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i32> @_Z31sub_group_non_uniform_broadcastDv8_ij(<8 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i32> @_Z31sub_group_non_uniform_broadcastDv16_ij(<16 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z25sub_group_broadcast_firsti(i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int2]] [[int2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int2]] {{[0-9]+}} [[ScopeSubgroup]] [[int2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int3]] [[int3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int3]] {{[0-9]+}} [[ScopeSubgroup]] [[int3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int4]] [[int4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int4]] {{[0-9]+}} [[ScopeSubgroup]] [[int4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int8]] [[int8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int8]] {{[0-9]+}} [[ScopeSubgroup]] [[int8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[int16]] [[int16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[int16]] {{[0-9]+}} [[ScopeSubgroup]] [[int16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[int]] [[int_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastUInts
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_broadcastjj(i32 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i32> @_Z31sub_group_non_uniform_broadcastDv2_jj(<2 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i32> @_Z31sub_group_non_uniform_broadcastDv3_jj(<3 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i32> @_Z31sub_group_non_uniform_broadcastDv4_jj(<4 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i32> @_Z31sub_group_non_uniform_broadcastDv8_jj(<8 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i32> @_Z31sub_group_non_uniform_broadcastDv16_jj(<16 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z25sub_group_broadcast_firstj(i32 {{.*}})
+
+
+; Function Attrs: convergent nounwind -- kernel under test (unsigned int, mangled 'j'): sub_group_non_uniform_broadcast on a scalar and on 2/3/4/8/16-lane vectors, then sub_group_broadcast_first.
+define dso_local spir_kernel void @testNonUniformBroadcastUInts() local_unnamed_addr #3 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func i32 @_Z31sub_group_non_uniform_broadcastjj(i32 0, i32 0) #7
+  %2 = insertelement <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %1, i64 0 ; lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <2 x i32> <i32 0, i32 1> ; narrow: keep lanes 0-1 as the 2-lane argument
+  %4 = tail call spir_func <2 x i32> @_Z31sub_group_non_uniform_broadcastDv2_jj(<2 x i32> %3, i32 0) #7
+  %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen back to 16 lanes; lanes 2-15 are undef
+  %6 = shufflevector <16 x i32> %5, <16 x i32> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; merge: lanes 0-1 from %5, lanes 2-15 from %2 (mask values 18-31 index the second operand)
+  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> ; the same narrow / call / widen / merge steps repeat below for 3, 4 and 8 lanes
+  %8 = tail call spir_func <3 x i32> @_Z31sub_group_non_uniform_broadcastDv3_jj(<3 x i32> %7, i32 0) #7
+  %9 = shufflevector <3 x i32> %8, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %10 = shufflevector <16 x i32> %9, <16 x i32> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %11 = shufflevector <16 x i32> %10, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = tail call spir_func <4 x i32> @_Z31sub_group_non_uniform_broadcastDv4_jj(<4 x i32> %11, i32 0) #7
+  %13 = shufflevector <4 x i32> %12, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = shufflevector <16 x i32> %13, <16 x i32> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %15 = shufflevector <16 x i32> %14, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = tail call spir_func <8 x i32> @_Z31sub_group_non_uniform_broadcastDv8_jj(<8 x i32> %15, i32 0) #7
+  %17 = shufflevector <8 x i32> %16, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %18 = shufflevector <16 x i32> %17, <16 x i32> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %19 = tail call spir_func <16 x i32> @_Z31sub_group_non_uniform_broadcastDv16_jj(<16 x i32> %18, i32 0) #7 ; %18 is already 16 lanes, so it is passed without narrowing
+  %20 = extractelement <16 x i32> %19, i64 0 ; lane 0 of the 16-lane result
+  %21 = tail call spir_func i32 @_Z25sub_group_broadcast_firstj(i32 %20) #7 ; %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_broadcastjj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i32> @_Z31sub_group_non_uniform_broadcastDv2_jj(<2 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i32> @_Z31sub_group_non_uniform_broadcastDv3_jj(<3 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i32> @_Z31sub_group_non_uniform_broadcastDv4_jj(<4 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i32> @_Z31sub_group_non_uniform_broadcastDv8_jj(<8 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i32> @_Z31sub_group_non_uniform_broadcastDv16_jj(<16 x i32>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z25sub_group_broadcast_firstj(i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long2]] [[long2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long2]] {{[0-9]+}} [[ScopeSubgroup]] [[long2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long3]] [[long3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long3]] {{[0-9]+}} [[ScopeSubgroup]] [[long3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long4]] [[long4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long4]] {{[0-9]+}} [[ScopeSubgroup]] [[long4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long8]] [[long8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long8]] {{[0-9]+}} [[ScopeSubgroup]] [[long8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[long16]] [[long16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long16]] {{[0-9]+}} [[ScopeSubgroup]] [[long16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[long]] [[long_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastLongs
+; CHECK-LLVM: call spir_func i64 @_Z31sub_group_non_uniform_broadcastmj(i64 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i64> @_Z31sub_group_non_uniform_broadcastDv2_mj(<2 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i64> @_Z31sub_group_non_uniform_broadcastDv3_mj(<3 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i64> @_Z31sub_group_non_uniform_broadcastDv4_mj(<4 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i64> @_Z31sub_group_non_uniform_broadcastDv8_mj(<8 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i64> @_Z31sub_group_non_uniform_broadcastDv16_mj(<16 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i64 @_Z25sub_group_broadcast_firstm(i64 {{.*}})
+
+; Function Attrs: convergent nounwind -- kernel under test (signed long, mangled 'l'): sub_group_non_uniform_broadcast on a scalar and on 2/3/4/8/16-lane vectors, then sub_group_broadcast_first. NOTE(review): the LLVM-side FileCheck patterns above name the 'm' (unsigned) manglings; presumably signedness is not preserved by the round trip -- confirm.
+define dso_local spir_kernel void @testNonUniformBroadcastLongs() local_unnamed_addr #4 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func i64 @_Z31sub_group_non_uniform_broadcastlj(i64 0, i32 0) #7
+  %2 = insertelement <16 x i64> <i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 %1, i64 0 ; lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i64> %2, <16 x i64> undef, <2 x i32> <i32 0, i32 1> ; narrow: keep lanes 0-1 as the 2-lane argument
+  %4 = tail call spir_func <2 x i64> @_Z31sub_group_non_uniform_broadcastDv2_lj(<2 x i64> %3, i32 0) #7
+  %5 = shufflevector <2 x i64> %4, <2 x i64> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen back to 16 lanes; lanes 2-15 are undef
+  %6 = shufflevector <16 x i64> %5, <16 x i64> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; merge: lanes 0-1 from %5, lanes 2-15 from %2 (mask values 18-31 index the second operand)
+  %7 = shufflevector <16 x i64> %6, <16 x i64> undef, <3 x i32> <i32 0, i32 1, i32 2> ; the same narrow / call / widen / merge steps repeat below for 3, 4 and 8 lanes
+  %8 = tail call spir_func <3 x i64> @_Z31sub_group_non_uniform_broadcastDv3_lj(<3 x i64> %7, i32 0) #7
+  %9 = shufflevector <3 x i64> %8, <3 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %10 = shufflevector <16 x i64> %9, <16 x i64> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %11 = shufflevector <16 x i64> %10, <16 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = tail call spir_func <4 x i64> @_Z31sub_group_non_uniform_broadcastDv4_lj(<4 x i64> %11, i32 0) #7
+  %13 = shufflevector <4 x i64> %12, <4 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = shufflevector <16 x i64> %13, <16 x i64> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %15 = shufflevector <16 x i64> %14, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = tail call spir_func <8 x i64> @_Z31sub_group_non_uniform_broadcastDv8_lj(<8 x i64> %15, i32 0) #7
+  %17 = shufflevector <8 x i64> %16, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %18 = shufflevector <16 x i64> %17, <16 x i64> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %19 = tail call spir_func <16 x i64> @_Z31sub_group_non_uniform_broadcastDv16_lj(<16 x i64> %18, i32 0) #7 ; %18 is already 16 lanes, so it is passed without narrowing
+  %20 = extractelement <16 x i64> %19, i64 0 ; lane 0 of the 16-lane result
+  %21 = tail call spir_func i64 @_Z25sub_group_broadcast_firstl(i64 %20) #7 ; %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z31sub_group_non_uniform_broadcastlj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i64> @_Z31sub_group_non_uniform_broadcastDv2_lj(<2 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i64> @_Z31sub_group_non_uniform_broadcastDv3_lj(<3 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i64> @_Z31sub_group_non_uniform_broadcastDv4_lj(<4 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i64> @_Z31sub_group_non_uniform_broadcastDv8_lj(<8 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i64> @_Z31sub_group_non_uniform_broadcastDv16_lj(<16 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z25sub_group_broadcast_firstl(i64) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long2]] [[long2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long2]] {{[0-9]+}} [[ScopeSubgroup]] [[long2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long3]] [[long3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long3]] {{[0-9]+}} [[ScopeSubgroup]] [[long3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long4]] [[long4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long4]] {{[0-9]+}} [[ScopeSubgroup]] [[long4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long8]] [[long8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long8]] {{[0-9]+}} [[ScopeSubgroup]] [[long8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[long16]] [[long16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[long16]] {{[0-9]+}} [[ScopeSubgroup]] [[long16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[long]] [[long_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastULongs
+; CHECK-LLVM: call spir_func i64 @_Z31sub_group_non_uniform_broadcastmj(i64 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i64> @_Z31sub_group_non_uniform_broadcastDv2_mj(<2 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i64> @_Z31sub_group_non_uniform_broadcastDv3_mj(<3 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i64> @_Z31sub_group_non_uniform_broadcastDv4_mj(<4 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i64> @_Z31sub_group_non_uniform_broadcastDv8_mj(<8 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i64> @_Z31sub_group_non_uniform_broadcastDv16_mj(<16 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func i64 @_Z25sub_group_broadcast_firstm(i64 {{.*}})
+
+; Function Attrs: convergent nounwind -- kernel under test (unsigned long, mangled 'm'): sub_group_non_uniform_broadcast on a scalar and on 2/3/4/8/16-lane vectors, then sub_group_broadcast_first.
+define dso_local spir_kernel void @testNonUniformBroadcastULongs() local_unnamed_addr #4 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func i64 @_Z31sub_group_non_uniform_broadcastmj(i64 0, i32 0) #7
+  %2 = insertelement <16 x i64> <i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 %1, i64 0 ; lane 0 = %1, lanes 1-15 = 0
+  %3 = shufflevector <16 x i64> %2, <16 x i64> undef, <2 x i32> <i32 0, i32 1> ; narrow: keep lanes 0-1 as the 2-lane argument
+  %4 = tail call spir_func <2 x i64> @_Z31sub_group_non_uniform_broadcastDv2_mj(<2 x i64> %3, i32 0) #7
+  %5 = shufflevector <2 x i64> %4, <2 x i64> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen back to 16 lanes; lanes 2-15 are undef
+  %6 = shufflevector <16 x i64> %5, <16 x i64> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; merge: lanes 0-1 from %5, lanes 2-15 from %2 (mask values 18-31 index the second operand)
+  %7 = shufflevector <16 x i64> %6, <16 x i64> undef, <3 x i32> <i32 0, i32 1, i32 2> ; the same narrow / call / widen / merge steps repeat below for 3, 4 and 8 lanes
+  %8 = tail call spir_func <3 x i64> @_Z31sub_group_non_uniform_broadcastDv3_mj(<3 x i64> %7, i32 0) #7
+  %9 = shufflevector <3 x i64> %8, <3 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %10 = shufflevector <16 x i64> %9, <16 x i64> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %11 = shufflevector <16 x i64> %10, <16 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = tail call spir_func <4 x i64> @_Z31sub_group_non_uniform_broadcastDv4_mj(<4 x i64> %11, i32 0) #7
+  %13 = shufflevector <4 x i64> %12, <4 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = shufflevector <16 x i64> %13, <16 x i64> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %15 = shufflevector <16 x i64> %14, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = tail call spir_func <8 x i64> @_Z31sub_group_non_uniform_broadcastDv8_mj(<8 x i64> %15, i32 0) #7
+  %17 = shufflevector <8 x i64> %16, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %18 = shufflevector <16 x i64> %17, <16 x i64> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %19 = tail call spir_func <16 x i64> @_Z31sub_group_non_uniform_broadcastDv16_mj(<16 x i64> %18, i32 0) #7 ; %18 is already 16 lanes, so it is passed without narrowing
+  %20 = extractelement <16 x i64> %19, i64 0 ; lane 0 of the 16-lane result
+  %21 = tail call spir_func i64 @_Z25sub_group_broadcast_firstm(i64 %20) #7 ; %21 is not used afterwards
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z31sub_group_non_uniform_broadcastmj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i64> @_Z31sub_group_non_uniform_broadcastDv2_mj(<2 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i64> @_Z31sub_group_non_uniform_broadcastDv3_mj(<3 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i64> @_Z31sub_group_non_uniform_broadcastDv4_mj(<4 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i64> @_Z31sub_group_non_uniform_broadcastDv8_mj(<8 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i64> @_Z31sub_group_non_uniform_broadcastDv16_mj(<16 x i64>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z25sub_group_broadcast_firstm(i64) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[float]] {{[0-9]+}} [[ScopeSubgroup]] [[float_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float2]] [[float2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[float2]] {{[0-9]+}} [[ScopeSubgroup]] [[float2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float3]] [[float3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[float3]] {{[0-9]+}} [[ScopeSubgroup]] [[float3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float4]] [[float4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[float4]] {{[0-9]+}} [[ScopeSubgroup]] [[float4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float8]] [[float8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[float8]] {{[0-9]+}} [[ScopeSubgroup]] [[float8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[float16]] [[float16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[float16]] {{[0-9]+}} [[ScopeSubgroup]] [[float16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[float]] [[float_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[float]] {{[0-9]+}} [[ScopeSubgroup]] [[float_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastFloats
+; CHECK-LLVM: call spir_func float @_Z31sub_group_non_uniform_broadcastfj(float {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x float> @_Z31sub_group_non_uniform_broadcastDv2_fj(<2 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x float> @_Z31sub_group_non_uniform_broadcastDv3_fj(<3 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x float> @_Z31sub_group_non_uniform_broadcastDv4_fj(<4 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x float> @_Z31sub_group_non_uniform_broadcastDv8_fj(<8 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x float> @_Z31sub_group_non_uniform_broadcastDv16_fj(<16 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func float @_Z25sub_group_broadcast_firstf(float {{.*}})
+
+; Function Attrs: convergent nounwind. Kernel that calls sub_group_non_uniform_broadcast on a float scalar and on 2-, 3-, 4-, 8- and 16-element float vectors, then sub_group_broadcast_first on a float; all results are discarded.
+define dso_local spir_kernel void @testNonUniformBroadcastFloats() local_unnamed_addr #3 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func float @_Z31sub_group_non_uniform_broadcastfj(float 0.000000e+00, i32 0) #7 ; scalar case: constant 0.0 with index operand 0 (no parameters, so the unnamed entry block is %0 and instructions start at %1)
+  %2 = insertelement <16 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %1, i64 0 ; %1 goes into element 0; elements 1-15 are constant 0.0
+  %3 = shufflevector <16 x float> %2, <16 x float> undef, <2 x i32> <i32 0, i32 1> ; low 2 elements of %2
+  %4 = tail call spir_func <2 x float> @_Z31sub_group_non_uniform_broadcastDv2_fj(<2 x float> %3, i32 0) #7
+  %5 = shufflevector <2 x float> %4, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 elements (elements 2-15 undef)
+  %6 = shufflevector <16 x float> %5, <16 x float> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-1 from %5, elements 2-15 from %2 (mask values >= 16 select from the second operand)
+  %7 = shufflevector <16 x float> %6, <16 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; low 3 elements of %6
+  %8 = tail call spir_func <3 x float> @_Z31sub_group_non_uniform_broadcastDv3_fj(<3 x float> %7, i32 0) #7
+  %9 = shufflevector <3 x float> %8, <3 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 elements (elements 3-15 undef)
+  %10 = shufflevector <16 x float> %9, <16 x float> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-2 from %9, elements 3-15 from %6
+  %11 = shufflevector <16 x float> %10, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 elements of %10
+  %12 = tail call spir_func <4 x float> @_Z31sub_group_non_uniform_broadcastDv4_fj(<4 x float> %11, i32 0) #7
+  %13 = shufflevector <4 x float> %12, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 elements (elements 4-15 undef)
+  %14 = shufflevector <16 x float> %13, <16 x float> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-3 from %13, elements 4-15 from %10
+  %15 = shufflevector <16 x float> %14, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; low 8 elements of %14
+  %16 = tail call spir_func <8 x float> @_Z31sub_group_non_uniform_broadcastDv8_fj(<8 x float> %15, i32 0) #7
+  %17 = shufflevector <8 x float> %16, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 elements (elements 8-15 undef)
+  %18 = shufflevector <16 x float> %17, <16 x float> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-7 from %17, elements 8-15 from %14
+  %19 = tail call spir_func <16 x float> @_Z31sub_group_non_uniform_broadcastDv16_fj(<16 x float> %18, i32 0) #7 ; full 16-element case
+  %20 = extractelement <16 x float> %19, i64 0 ; element 0 of the 16-element result feeds the final scalar call
+  %21 = tail call spir_func float @_Z25sub_group_broadcast_firstf(float %20) #7 ; result is unused
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z31sub_group_non_uniform_broadcastfj(float, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x float> @_Z31sub_group_non_uniform_broadcastDv2_fj(<2 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x float> @_Z31sub_group_non_uniform_broadcastDv3_fj(<3 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x float> @_Z31sub_group_non_uniform_broadcastDv4_fj(<4 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x float> @_Z31sub_group_non_uniform_broadcastDv8_fj(<8 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x float> @_Z31sub_group_non_uniform_broadcastDv16_fj(<16 x float>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z25sub_group_broadcast_firstf(float) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[half]] {{[0-9]+}} [[ScopeSubgroup]] [[half_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half2]] [[half2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[half2]] {{[0-9]+}} [[ScopeSubgroup]] [[half2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half3]] [[half3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[half3]] {{[0-9]+}} [[ScopeSubgroup]] [[half3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half4]] [[half4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[half4]] {{[0-9]+}} [[ScopeSubgroup]] [[half4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half8]] [[half8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[half8]] {{[0-9]+}} [[ScopeSubgroup]] [[half8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[half16]] [[half16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[half16]] {{[0-9]+}} [[ScopeSubgroup]] [[half16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[half]] [[half_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[half]] {{[0-9]+}} [[ScopeSubgroup]] [[half_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastHalfs
+; CHECK-LLVM: call spir_func half @_Z31sub_group_non_uniform_broadcastDhj(half {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x half> @_Z31sub_group_non_uniform_broadcastDv2_Dhj(<2 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x half> @_Z31sub_group_non_uniform_broadcastDv3_Dhj(<3 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x half> @_Z31sub_group_non_uniform_broadcastDv4_Dhj(<4 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x half> @_Z31sub_group_non_uniform_broadcastDv8_Dhj(<8 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x half> @_Z31sub_group_non_uniform_broadcastDv16_Dhj(<16 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func half @_Z25sub_group_broadcast_firstDh(half {{.*}})
+
+; Function Attrs: convergent nounwind. Kernel that calls sub_group_non_uniform_broadcast on a half scalar and on 2-, 3-, 4-, 8- and 16-element half vectors, then sub_group_broadcast_first on a half; all results are discarded.
+define dso_local spir_kernel void @testNonUniformBroadcastHalfs() local_unnamed_addr #2 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func half @_Z31sub_group_non_uniform_broadcastDhj(half 0xH0000, i32 0) #7 ; scalar case: constant 0xH0000 with index operand 0 (no parameters, so the unnamed entry block is %0 and instructions start at %1)
+  %2 = insertelement <16 x half> <half undef, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %1, i64 0 ; %1 goes into element 0; elements 1-15 are constant 0xH0000
+  %3 = shufflevector <16 x half> %2, <16 x half> undef, <2 x i32> <i32 0, i32 1> ; low 2 elements of %2
+  %4 = tail call spir_func <2 x half> @_Z31sub_group_non_uniform_broadcastDv2_Dhj(<2 x half> %3, i32 0) #7
+  %5 = shufflevector <2 x half> %4, <2 x half> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 elements (elements 2-15 undef)
+  %6 = shufflevector <16 x half> %5, <16 x half> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-1 from %5, elements 2-15 from %2 (mask values >= 16 select from the second operand)
+  %7 = shufflevector <16 x half> %6, <16 x half> undef, <3 x i32> <i32 0, i32 1, i32 2> ; low 3 elements of %6
+  %8 = tail call spir_func <3 x half> @_Z31sub_group_non_uniform_broadcastDv3_Dhj(<3 x half> %7, i32 0) #7
+  %9 = shufflevector <3 x half> %8, <3 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 elements (elements 3-15 undef)
+  %10 = shufflevector <16 x half> %9, <16 x half> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-2 from %9, elements 3-15 from %6
+  %11 = shufflevector <16 x half> %10, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 elements of %10
+  %12 = tail call spir_func <4 x half> @_Z31sub_group_non_uniform_broadcastDv4_Dhj(<4 x half> %11, i32 0) #7
+  %13 = shufflevector <4 x half> %12, <4 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 elements (elements 4-15 undef)
+  %14 = shufflevector <16 x half> %13, <16 x half> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-3 from %13, elements 4-15 from %10
+  %15 = shufflevector <16 x half> %14, <16 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; low 8 elements of %14
+  %16 = tail call spir_func <8 x half> @_Z31sub_group_non_uniform_broadcastDv8_Dhj(<8 x half> %15, i32 0) #7
+  %17 = shufflevector <8 x half> %16, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 elements (elements 8-15 undef)
+  %18 = shufflevector <16 x half> %17, <16 x half> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-7 from %17, elements 8-15 from %14
+  %19 = tail call spir_func <16 x half> @_Z31sub_group_non_uniform_broadcastDv16_Dhj(<16 x half> %18, i32 0) #7 ; full 16-element case
+  %20 = extractelement <16 x half> %19, i64 0 ; element 0 of the 16-element result feeds the final scalar call
+  %21 = tail call spir_func half @_Z25sub_group_broadcast_firstDh(half %20) #7 ; result is unused
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z31sub_group_non_uniform_broadcastDhj(half, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x half> @_Z31sub_group_non_uniform_broadcastDv2_Dhj(<2 x half>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x half> @_Z31sub_group_non_uniform_broadcastDv3_Dhj(<3 x half>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x half> @_Z31sub_group_non_uniform_broadcastDv4_Dhj(<4 x half>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x half> @_Z31sub_group_non_uniform_broadcastDv8_Dhj(<8 x half>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x half> @_Z31sub_group_non_uniform_broadcastDv16_Dhj(<16 x half>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z25sub_group_broadcast_firstDh(half) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBroadcast [[double]] {{[0-9]+}} [[ScopeSubgroup]] [[double_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double2]] [[double2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[double2]] {{[0-9]+}} [[ScopeSubgroup]] [[double2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double3]] [[double3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[double3]] {{[0-9]+}} [[ScopeSubgroup]] [[double3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double4]] [[double4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[double4]] {{[0-9]+}} [[ScopeSubgroup]] [[double4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double8]] [[double8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[double8]] {{[0-9]+}} [[ScopeSubgroup]] [[double8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[double16]] [[double16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupNonUniformBroadcast [[double16]] {{[0-9]+}} [[ScopeSubgroup]] [[double16_0]] [[int_0]]
+; CHECK-SPIRV: CompositeExtract [[double]] [[double_value:[0-9]+]]
+; CHECK-SPIRV: GroupNonUniformBroadcastFirst [[double]] {{[0-9]+}} [[ScopeSubgroup]] [[double_value]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBroadcastDoubles
+; CHECK-LLVM: call spir_func double @_Z31sub_group_non_uniform_broadcastdj(double {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x double> @_Z31sub_group_non_uniform_broadcastDv2_dj(<2 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x double> @_Z31sub_group_non_uniform_broadcastDv3_dj(<3 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x double> @_Z31sub_group_non_uniform_broadcastDv4_dj(<4 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x double> @_Z31sub_group_non_uniform_broadcastDv8_dj(<8 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x double> @_Z31sub_group_non_uniform_broadcastDv16_dj(<16 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func double @_Z25sub_group_broadcast_firstd(double {{.*}})
+
+; Function Attrs: convergent nounwind. Kernel that calls sub_group_non_uniform_broadcast on a double scalar and on 2-, 3-, 4-, 8- and 16-element double vectors, then sub_group_broadcast_first on a double; all results are discarded.
+define dso_local spir_kernel void @testNonUniformBroadcastDoubles() local_unnamed_addr #4 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func double @_Z31sub_group_non_uniform_broadcastdj(double 0.000000e+00, i32 0) #7 ; scalar case: constant 0.0 with index operand 0 (no parameters, so the unnamed entry block is %0 and instructions start at %1)
+  %2 = insertelement <16 x double> <double undef, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double %1, i64 0 ; %1 goes into element 0; elements 1-15 are constant 0.0
+  %3 = shufflevector <16 x double> %2, <16 x double> undef, <2 x i32> <i32 0, i32 1> ; low 2 elements of %2
+  %4 = tail call spir_func <2 x double> @_Z31sub_group_non_uniform_broadcastDv2_dj(<2 x double> %3, i32 0) #7
+  %5 = shufflevector <2 x double> %4, <2 x double> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 elements (elements 2-15 undef)
+  %6 = shufflevector <16 x double> %5, <16 x double> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-1 from %5, elements 2-15 from %2 (mask values >= 16 select from the second operand)
+  %7 = shufflevector <16 x double> %6, <16 x double> undef, <3 x i32> <i32 0, i32 1, i32 2> ; low 3 elements of %6
+  %8 = tail call spir_func <3 x double> @_Z31sub_group_non_uniform_broadcastDv3_dj(<3 x double> %7, i32 0) #7
+  %9 = shufflevector <3 x double> %8, <3 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 elements (elements 3-15 undef)
+  %10 = shufflevector <16 x double> %9, <16 x double> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-2 from %9, elements 3-15 from %6
+  %11 = shufflevector <16 x double> %10, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low 4 elements of %10
+  %12 = tail call spir_func <4 x double> @_Z31sub_group_non_uniform_broadcastDv4_dj(<4 x double> %11, i32 0) #7
+  %13 = shufflevector <4 x double> %12, <4 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 elements (elements 4-15 undef)
+  %14 = shufflevector <16 x double> %13, <16 x double> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-3 from %13, elements 4-15 from %10
+  %15 = shufflevector <16 x double> %14, <16 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; low 8 elements of %14
+  %16 = tail call spir_func <8 x double> @_Z31sub_group_non_uniform_broadcastDv8_dj(<8 x double> %15, i32 0) #7
+  %17 = shufflevector <8 x double> %16, <8 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 elements (elements 8-15 undef)
+  %18 = shufflevector <16 x double> %17, <16 x double> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; elements 0-7 from %17, elements 8-15 from %14
+  %19 = tail call spir_func <16 x double> @_Z31sub_group_non_uniform_broadcastDv16_dj(<16 x double> %18, i32 0) #7 ; full 16-element case
+  %20 = extractelement <16 x double> %19, i64 0 ; element 0 of the 16-element result feeds the final scalar call
+  %21 = tail call spir_func double @_Z25sub_group_broadcast_firstd(double %20) #7 ; result is unused
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z31sub_group_non_uniform_broadcastdj(double, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x double> @_Z31sub_group_non_uniform_broadcastDv2_dj(<2 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x double> @_Z31sub_group_non_uniform_broadcastDv3_dj(<3 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x double> @_Z31sub_group_non_uniform_broadcastDv4_dj(<4 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x double> @_Z31sub_group_non_uniform_broadcastDv8_dj(<8 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x double> @_Z31sub_group_non_uniform_broadcastDv16_dj(<16 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z25sub_group_broadcast_firstd(double) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBallot [[int4]] [[ballot:[0-9]+]] [[ScopeSubgroup]] [[false]]
+; CHECK-SPIRV: GroupNonUniformInverseBallot [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[ballot]]
+; CHECK-SPIRV: GroupNonUniformBallotBitExtract [[bool]] {{[0-9]+}} [[ScopeSubgroup]] [[ballot]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBallotBitCount [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[ballot]]
+; CHECK-SPIRV: GroupNonUniformBallotBitCount [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[ballot]]
+; CHECK-SPIRV: GroupNonUniformBallotBitCount [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[ballot]]
+; CHECK-SPIRV: GroupNonUniformBallotFindLSB [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[ballot]]
+; CHECK-SPIRV: GroupNonUniformBallotFindMSB [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[ballot]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBallotOperations
+; CHECK-LLVM: %[[ballot:[0-9]+]] = call spir_func <4 x i32> @_Z16sub_group_balloti(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z24sub_group_inverse_ballotDv4_j(<4 x i32> %[[ballot]])
+; CHECK-LLVM: call spir_func i32 @_Z28sub_group_ballot_bit_extractDv4_jj(<4 x i32> %[[ballot]], i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z26sub_group_ballot_bit_countDv4_j(<4 x i32> %[[ballot]])
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_ballot_inclusive_scanDv4_j(<4 x i32> %[[ballot]])
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_ballot_exclusive_scanDv4_j(<4 x i32> %[[ballot]])
+; CHECK-LLVM: call spir_func i32 @_Z25sub_group_ballot_find_lsbDv4_j(<4 x i32> %[[ballot]])
+; CHECK-LLVM: call spir_func i32 @_Z25sub_group_ballot_find_msbDv4_j(<4 x i32> %[[ballot]])
+
+; Function Attrs: convergent nounwind. Kernel that computes one sub_group_ballot value and passes it to seven ballot query builtins, storing each i32 result to consecutive elements 0-6 of the addrspace(1) i32 buffer argument.
+define dso_local spir_kernel void @testBallotOperations(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+  %2 = tail call spir_func <4 x i32> @_Z16sub_group_balloti(i32 0) #7 ; ballot with constant predicate 0; %0 is the unnamed pointer parameter and %1 the unnamed entry block, so instructions start at %2
+  %3 = tail call spir_func i32 @_Z24sub_group_inverse_ballotDv4_j(<4 x i32> %2) #8
+  store i32 %3, i32 addrspace(1)* %0, align 4, !tbaa !8 ; element 0 of the buffer
+  %4 = tail call spir_func i32 @_Z28sub_group_ballot_bit_extractDv4_jj(<4 x i32> %2, i32 0) #8 ; second operand is the constant index 0
+  %5 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %4, i32 addrspace(1)* %5, align 4, !tbaa !8 ; element 1
+  %6 = tail call spir_func i32 @_Z26sub_group_ballot_bit_countDv4_j(<4 x i32> %2) #8
+  %7 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %6, i32 addrspace(1)* %7, align 4, !tbaa !8 ; element 2
+  %8 = tail call spir_func i32 @_Z31sub_group_ballot_inclusive_scanDv4_j(<4 x i32> %2) #7
+  %9 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %8, i32 addrspace(1)* %9, align 4, !tbaa !8 ; element 3
+  %10 = tail call spir_func i32 @_Z31sub_group_ballot_exclusive_scanDv4_j(<4 x i32> %2) #7
+  %11 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 4
+  store i32 %10, i32 addrspace(1)* %11, align 4, !tbaa !8 ; element 4
+  %12 = tail call spir_func i32 @_Z25sub_group_ballot_find_lsbDv4_j(<4 x i32> %2) #7
+  %13 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 5
+  store i32 %12, i32 addrspace(1)* %13, align 4, !tbaa !8 ; element 5
+  %14 = tail call spir_func i32 @_Z25sub_group_ballot_find_msbDv4_j(<4 x i32> %2) #7
+  %15 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 6
+  store i32 %14, i32 addrspace(1)* %15, align 4, !tbaa !8 ; element 6
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i32> @_Z16sub_group_balloti(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func i32 @_Z24sub_group_inverse_ballotDv4_j(<4 x i32>) local_unnamed_addr #5
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func i32 @_Z28sub_group_ballot_bit_extractDv4_jj(<4 x i32>, i32) local_unnamed_addr #5
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func i32 @_Z26sub_group_ballot_bit_countDv4_j(<4 x i32>) local_unnamed_addr #5
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_ballot_inclusive_scanDv4_j(<4 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_ballot_exclusive_scanDv4_j(<4 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z25sub_group_ballot_find_lsbDv4_j(<4 x i32>) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z25sub_group_ballot_find_msbDv4_j(<4 x i32>) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: Load [[int4]] {{[0-9]+}} [[eqMask]]
+; CHECK-SPIRV: Load [[int4]] {{[0-9]+}} [[geMask]]
+; CHECK-SPIRV: Load [[int4]] {{[0-9]+}} [[gtMask]]
+; CHECK-SPIRV: Load [[int4]] {{[0-9]+}} [[leMask]]
+; CHECK-SPIRV: Load [[int4]] {{[0-9]+}} [[ltMask]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testSubgroupMasks
+; CHECK-LLVM: call spir_func <4 x i32> @_Z21get_sub_group_eq_maskv()
+; CHECK-LLVM: call spir_func <4 x i32> @_Z21get_sub_group_ge_maskv()
+; CHECK-LLVM: call spir_func <4 x i32> @_Z21get_sub_group_gt_maskv()
+; CHECK-LLVM: call spir_func <4 x i32> @_Z21get_sub_group_le_maskv()
+; CHECK-LLVM: call spir_func <4 x i32> @_Z21get_sub_group_lt_maskv()
+
+; Function Attrs: convergent nofree nounwind writeonly. Kernel that calls the five get_sub_group_{eq,ge,gt,le,lt}_mask builtins and stores each <4 x i32> result to consecutive elements 0-4 of the addrspace(1) buffer argument.
+define dso_local spir_kernel void @testSubgroupMasks(<4 x i32> addrspace(1)* nocapture) local_unnamed_addr #6 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !12 !kernel_arg_base_type !13 !kernel_arg_type_qual !7 {
+  %2 = tail call spir_func <4 x i32> @_Z21get_sub_group_eq_maskv() #8 ; %0 is the unnamed pointer parameter and %1 the unnamed entry block, so instructions start at %2
+  store <4 x i32> %2, <4 x i32> addrspace(1)* %0, align 16, !tbaa !14 ; eq mask -> element 0 of the buffer
+  %3 = tail call spir_func <4 x i32> @_Z21get_sub_group_ge_maskv() #8
+  %4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %0, i64 1
+  store <4 x i32> %3, <4 x i32> addrspace(1)* %4, align 16, !tbaa !14 ; ge mask -> element 1
+  %5 = tail call spir_func <4 x i32> @_Z21get_sub_group_gt_maskv() #8
+  %6 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %0, i64 2
+  store <4 x i32> %5, <4 x i32> addrspace(1)* %6, align 16, !tbaa !14 ; gt mask -> element 2
+  %7 = tail call spir_func <4 x i32> @_Z21get_sub_group_le_maskv() #8
+  %8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %0, i64 3
+  store <4 x i32> %7, <4 x i32> addrspace(1)* %8, align 16, !tbaa !14 ; le mask -> element 3
+  %9 = tail call spir_func <4 x i32> @_Z21get_sub_group_lt_maskv() #8
+  %10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %0, i64 4
+  store <4 x i32> %9, <4 x i32> addrspace(1)* %10, align 16, !tbaa !14 ; lt mask -> element 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func <4 x i32> @_Z21get_sub_group_eq_maskv() local_unnamed_addr #5
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func <4 x i32> @_Z21get_sub_group_ge_maskv() local_unnamed_addr #5
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func <4 x i32> @_Z21get_sub_group_gt_maskv() local_unnamed_addr #5
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func <4 x i32> @_Z21get_sub_group_le_maskv() local_unnamed_addr #5
+
+; Function Attrs: convergent nounwind readnone
+declare dso_local spir_func <4 x i32> @_Z21get_sub_group_lt_maskv() local_unnamed_addr #5
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="256" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="1024" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { convergent nofree nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #7 = { convergent nounwind }
+attributes #8 = { convergent nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 9.0.1 (https://github.com/llvm/llvm-project.git cb6d58d1dcf36a29ae5dd24ff891d6552f00bac7)"}
+!3 = !{}
+!4 = !{i32 1}
+!5 = !{!"none"}
+!6 = !{!"uint*"}
+!7 = !{!""}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"int", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = !{!"uint4*"}
+!13 = !{!"uint __attribute__((ext_vector_type(4)))*"}
+!14 = !{!10, !10, i64 0}

From 133ad80e85c2e3c44a1b04459609fa036e108279 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Wed, 15 Apr 2020 14:02:16 +0200
Subject: [PATCH 752/770] Add two-way translation test for sub_group_shuffle.

---
 .../test/transcoding/sub_group_shuffle.ll     | 430 ++++++++++++++++++
 1 file changed, 430 insertions(+)
 create mode 100644 llvm-spirv/test/transcoding/sub_group_shuffle.ll

diff --git a/llvm-spirv/test/transcoding/sub_group_shuffle.ll b/llvm-spirv/test/transcoding/sub_group_shuffle.ll
new file mode 100644
index 0000000000000..7b30bbc907e1e
--- /dev/null
+++ b/llvm-spirv/test/transcoding/sub_group_shuffle.ll
@@ -0,0 +1,430 @@
+;; #pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+;; 
+;; kernel void testShuffleChar(global char* dst)
+;; {
+;; 	char v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleUChar(global uchar* dst)
+;; {
+;; 	uchar v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleShort(global short* dst)
+;; {
+;; 	short v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleUShort(global ushort* dst)
+;; {
+;; 	ushort v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleInt(global int* dst)
+;; {
+;; 	int v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleUInt(global uint* dst)
+;; {
+;; 	uint v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleLong(global long* dst)
+;; {
+;; 	long v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleULong(global ulong* dst)
+;; {
+;; 	ulong v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleFloat(global float* dst)
+;; {
+;; 	float v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleHalf(global half* dst)
+;; {
+;; 	half v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleDouble(global double* dst)
+;; {
+;; 	double v = 0;
+;;     dst[0] = sub_group_shuffle( v, 0 );
+;;     dst[1] = sub_group_shuffle_xor( v, 0 );
+;; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; The reverse-translation checks (prefix CHECK-LLVM) are not exercised yet: change DISABLED to RUN on the two lines below once SPIRV->LLVM translation is implemented.
+; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
+; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
+; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
+; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
+; CHECK-SPIRV-DAG: TypeInt   [[long:[0-9]+]]   64 0
+; CHECK-SPIRV-DAG: TypeFloat [[half:[0-9]+]]   16
+; CHECK-SPIRV-DAG: TypeFloat [[float:[0-9]+]]  32
+; CHECK-SPIRV-DAG: TypeFloat [[double:[0-9]+]] 64
+
+; CHECK-SPIRV-DAG: Constant [[int]]    [[ScopeSubgroup:[0-9]+]] 3
+; CHECK-SPIRV-DAG: Constant [[char]]   [[char_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[short]]  [[short_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_0:[0-9]+]]         0
+; CHECK-SPIRV-DAG: Constant [[long]]   [[long_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[half]]   [[half_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[float]]  [[float_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[double]] [[double_0:[0-9]+]]      0
+
+; ModuleID = 'sub_group_shuffle.cl'
+source_filename = "sub_group_shuffle.cl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleChar
+; CHECK-LLVM: call spir_func i8 @_Z17sub_group_shufflecj(i8 0, i32 0)
+; CHECK-LLVM: call spir_func i8 @_Z21sub_group_shuffle_xorcj(i8 0, i32 0)
+
+; Function Attrs: convergent nounwind -- char kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i8 @_Z17sub_group_shufflecj(i8 signext 0, i32 0) #2 ; sub_group_shuffle(char, uint); %1 is the unnamed entry block, so results start at %2
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func signext i8 @_Z21sub_group_shuffle_xorcj(i8 signext 0, i32 0) #2 ; sub_group_shuffle_xor(char, uint)
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z17sub_group_shufflecj(i8 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z21sub_group_shuffle_xorcj(i8 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleUChar
+; CHECK-LLVM: call spir_func i8 @_Z17sub_group_shufflecj(i8 0, i32 0)
+; CHECK-LLVM: call spir_func i8 @_Z21sub_group_shuffle_xorcj(i8 0, i32 0)
+
+; Function Attrs: convergent nounwind -- uchar kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleUChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i8 @_Z17sub_group_shufflehj(i8 zeroext 0, i32 0) #2 ; sub_group_shuffle(uchar, uint); %1 is the unnamed entry block, so results start at %2
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func zeroext i8 @_Z21sub_group_shuffle_xorhj(i8 zeroext 0, i32 0) #2 ; sub_group_shuffle_xor(uchar, uint)
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z17sub_group_shufflehj(i8 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z21sub_group_shuffle_xorhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleShort
+; CHECK-LLVM: call spir_func i16 @_Z17sub_group_shufflesj(i16 0, i32 0)
+; CHECK-LLVM: call spir_func i16 @_Z21sub_group_shuffle_xorsj(i16 0, i32 0)
+
+; Function Attrs: convergent nounwind -- short kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i16 @_Z17sub_group_shufflesj(i16 signext 0, i32 0) #2 ; sub_group_shuffle(short, uint); %1 is the unnamed entry block, so results start at %2
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func signext i16 @_Z21sub_group_shuffle_xorsj(i16 signext 0, i32 0) #2 ; sub_group_shuffle_xor(short, uint)
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z17sub_group_shufflesj(i16 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z21sub_group_shuffle_xorsj(i16 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleUShort
+; CHECK-LLVM: call spir_func i16 @_Z17sub_group_shufflesj(i16 0, i32 0)
+; CHECK-LLVM: call spir_func i16 @_Z21sub_group_shuffle_xorsj(i16 0, i32 0)
+
+; Function Attrs: convergent nounwind -- ushort kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleUShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i16 @_Z17sub_group_shuffletj(i16 zeroext 0, i32 0) #2 ; sub_group_shuffle(ushort, uint); %1 is the unnamed entry block, so results start at %2
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func zeroext i16 @_Z21sub_group_shuffle_xortj(i16 zeroext 0, i32 0) #2 ; sub_group_shuffle_xor(ushort, uint)
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z17sub_group_shuffletj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z21sub_group_shuffle_xortj(i16 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleInt
+; CHECK-LLVM: call spir_func i32 @_Z17sub_group_shuffleij(i32 0, i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z21sub_group_shuffle_xorij(i32 0, i32 0)
+
+; Function Attrs: convergent nounwind -- int kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z17sub_group_shuffleij(i32 0, i32 0) #2 ; sub_group_shuffle(int, uint); %1 is the unnamed entry block, so results start at %2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func i32 @_Z21sub_group_shuffle_xorij(i32 0, i32 0) #2 ; sub_group_shuffle_xor(int, uint)
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z17sub_group_shuffleij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z21sub_group_shuffle_xorij(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleUInt
+; CHECK-LLVM: call spir_func i32 @_Z17sub_group_shuffleij(i32 0, i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z21sub_group_shuffle_xorij(i32 0, i32 0)
+
+; Function Attrs: convergent nounwind -- uint kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleUInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z17sub_group_shufflejj(i32 0, i32 0) #2 ; sub_group_shuffle(uint, uint); %1 is the unnamed entry block, so results start at %2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func i32 @_Z21sub_group_shuffle_xorjj(i32 0, i32 0) #2 ; sub_group_shuffle_xor(uint, uint)
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z17sub_group_shufflejj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z21sub_group_shuffle_xorjj(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleLong
+; CHECK-LLVM: call spir_func i64 @_Z17sub_group_shufflelj(i64 0, i32 0)
+; CHECK-LLVM: call spir_func i64 @_Z21sub_group_shuffle_xorlj(i64 0, i32 0)
+
+; Function Attrs: convergent nounwind -- long kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleLong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i64 @_Z17sub_group_shufflelj(i64 0, i32 0) #2 ; sub_group_shuffle(long, uint); %1 is the unnamed entry block, so results start at %2
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func i64 @_Z21sub_group_shuffle_xorlj(i64 0, i32 0) #2 ; sub_group_shuffle_xor(long, uint)
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z17sub_group_shufflelj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z21sub_group_shuffle_xorlj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleULong
+; CHECK-LLVM: call spir_func i64 @_Z17sub_group_shufflelj(i64 0, i32 0)
+; CHECK-LLVM: call spir_func i64 @_Z21sub_group_shuffle_xorlj(i64 0, i32 0)
+
+; Function Attrs: convergent nounwind -- ulong kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleULong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i64 @_Z17sub_group_shufflemj(i64 0, i32 0) #2 ; sub_group_shuffle(ulong, uint); %1 is the unnamed entry block, so results start at %2
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func i64 @_Z21sub_group_shuffle_xormj(i64 0, i32 0) #2 ; sub_group_shuffle_xor(ulong, uint)
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z17sub_group_shufflemj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z21sub_group_shuffle_xormj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[float]] {{[0-9]+}} [[ScopeSubgroup]] [[float_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[float]] {{[0-9]+}} [[ScopeSubgroup]] [[float_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleFloat
+; CHECK-LLVM: call spir_func float @_Z17sub_group_shufflefj(float 0.000000e+00, i32 0)
+; CHECK-LLVM: call spir_func float @_Z21sub_group_shuffle_xorfj(float 0.000000e+00, i32 0)
+
+; Function Attrs: convergent nounwind -- float kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleFloat(float addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !23 !kernel_arg_base_type !23 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func float @_Z17sub_group_shufflefj(float 0.000000e+00, i32 0) #2 ; sub_group_shuffle(float, uint); %1 is the unnamed entry block, so results start at %2
+  store float %2, float addrspace(1)* %0, align 4, !tbaa !24 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func float @_Z21sub_group_shuffle_xorfj(float 0.000000e+00, i32 0) #2 ; sub_group_shuffle_xor(float, uint)
+  %4 = getelementptr inbounds float, float addrspace(1)* %0, i64 1 ; &dst[1]
+  store float %3, float addrspace(1)* %4, align 4, !tbaa !24 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z17sub_group_shufflefj(float, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z21sub_group_shuffle_xorfj(float, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[half]] {{[0-9]+}} [[ScopeSubgroup]] [[half_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[half]] {{[0-9]+}} [[ScopeSubgroup]] [[half_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleHalf
+; CHECK-LLVM: call spir_func half @_Z17sub_group_shuffleDhj(half 0xH0000, i32 0)
+; CHECK-LLVM: call spir_func half @_Z21sub_group_shuffle_xorDhj(half 0xH0000, i32 0)
+
+; Function Attrs: convergent nounwind -- half kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleHalf(half addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !26 !kernel_arg_base_type !26 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func half @_Z17sub_group_shuffleDhj(half 0xH0000, i32 0) #2 ; sub_group_shuffle(half, uint); %1 is the unnamed entry block, so results start at %2
+  store half %2, half addrspace(1)* %0, align 2, !tbaa !27 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func half @_Z21sub_group_shuffle_xorDhj(half 0xH0000, i32 0) #2 ; sub_group_shuffle_xor(half, uint)
+  %4 = getelementptr inbounds half, half addrspace(1)* %0, i64 1 ; &dst[1]
+  store half %3, half addrspace(1)* %4, align 2, !tbaa !27 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z17sub_group_shuffleDhj(half, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z21sub_group_shuffle_xorDhj(half, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffle [[double]] {{[0-9]+}} [[ScopeSubgroup]] [[double_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleXor [[double]] {{[0-9]+}} [[ScopeSubgroup]] [[double_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleDouble
+; CHECK-LLVM: call spir_func double @_Z17sub_group_shuffledj(double 0.000000e+00, i32 0)
+; CHECK-LLVM: call spir_func double @_Z21sub_group_shuffle_xordj(double 0.000000e+00, i32 0)
+
+; Function Attrs: convergent nounwind -- double kernel: dst[0] = sub_group_shuffle(0, 0); dst[1] = sub_group_shuffle_xor(0, 0).
+define dso_local spir_kernel void @testShuffleDouble(double addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !29 !kernel_arg_base_type !29 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func double @_Z17sub_group_shuffledj(double 0.000000e+00, i32 0) #2 ; sub_group_shuffle(double, uint); %1 is the unnamed entry block, so results start at %2
+  store double %2, double addrspace(1)* %0, align 8, !tbaa !30 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func double @_Z21sub_group_shuffle_xordj(double 0.000000e+00, i32 0) #2 ; sub_group_shuffle_xor(double, uint)
+  %4 = getelementptr inbounds double, double addrspace(1)* %0, i64 1 ; &dst[1]
+  store double %3, double addrspace(1)* %4, align 8, !tbaa !30 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z17sub_group_shuffledj(double, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z21sub_group_shuffle_xordj(double, i32) local_unnamed_addr #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 9.0.1 (https://github.com/llvm/llvm-project.git cb6d58d1dcf36a29ae5dd24ff891d6552f00bac7)"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"char*"}
+!6 = !{!""}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!"uchar*"}
+!11 = !{!"short*"}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"short", !8, i64 0}
+!14 = !{!"ushort*"}
+!15 = !{!"int*"}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !8, i64 0}
+!18 = !{!"uint*"}
+!19 = !{!"long*"}
+!20 = !{!21, !21, i64 0}
+!21 = !{!"long", !8, i64 0}
+!22 = !{!"ulong*"}
+!23 = !{!"float*"}
+!24 = !{!25, !25, i64 0}
+!25 = !{!"float", !8, i64 0}
+!26 = !{!"half*"}
+!27 = !{!28, !28, i64 0}
+!28 = !{!"half", !8, i64 0}
+!29 = !{!"double*"}
+!30 = !{!31, !31, i64 0}
+!31 = !{!"double", !8, i64 0}
+
+

From c820ba04cdacd5c140d4344fddeb8c280a5da9d2 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Wed, 15 Apr 2020 14:29:18 +0200
Subject: [PATCH 753/770] Add two-way translation test for
 sub_group_shuffle_relative.

---
 .../transcoding/sub_group_shuffle_relative.ll | 428 ++++++++++++++++++
 1 file changed, 428 insertions(+)
 create mode 100644 llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll

diff --git a/llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll b/llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll
new file mode 100644
index 0000000000000..cce29017af602
--- /dev/null
+++ b/llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll
@@ -0,0 +1,428 @@
+;; #pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+;; 
+;; kernel void testShuffleRelativeChar(global char* dst)
+;; {
+;; 	char v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeUChar(global uchar* dst)
+;; {
+;; 	uchar v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeShort(global short* dst)
+;; {
+;; 	short v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeUShort(global ushort* dst)
+;; {
+;; 	ushort v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeInt(global int* dst)
+;; {
+;; 	int v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeUInt(global uint* dst)
+;; {
+;; 	uint v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeLong(global long* dst)
+;; {
+;; 	long v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeULong(global ulong* dst)
+;; {
+;; 	ulong v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeFloat(global float* dst)
+;; {
+;; 	float v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeHalf(global half* dst)
+;; {
+;; 	half v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+;; 
+;; kernel void testShuffleRelativeDouble(global double* dst)
+;; {
+;; 	double v = 0;
+;;     dst[0] = sub_group_shuffle_up( v, 0 );
+;;     dst[1] = sub_group_shuffle_down( v, 0 );
+;; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; The reverse-translation checks (prefix CHECK-LLVM) are not exercised yet: change DISABLED to RUN on the two lines below once SPIRV->LLVM translation is implemented.
+; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
+; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
+; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
+; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
+; CHECK-SPIRV-DAG: TypeInt   [[long:[0-9]+]]   64 0
+; CHECK-SPIRV-DAG: TypeFloat [[half:[0-9]+]]   16
+; CHECK-SPIRV-DAG: TypeFloat [[float:[0-9]+]]  32
+; CHECK-SPIRV-DAG: TypeFloat [[double:[0-9]+]] 64
+
+; CHECK-SPIRV-DAG: Constant [[int]]    [[ScopeSubgroup:[0-9]+]] 3
+; CHECK-SPIRV-DAG: Constant [[char]]   [[char_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[short]]  [[short_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_0:[0-9]+]]         0
+; CHECK-SPIRV-DAG: Constant [[long]]   [[long_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[half]]   [[half_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[float]]  [[float_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[double]] [[double_0:[0-9]+]]      0
+
+; ModuleID = 'sub_group_shuffle_relative.cl'
+source_filename = "sub_group_shuffle_relative.cl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeChar
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_shuffle_upcj(i8 0, i32 0)
+; CHECK-LLVM: call spir_func i8 @_Z22sub_group_shuffle_downcj(i8 0, i32 0)
+
+; Function Attrs: convergent nounwind -- char kernel: dst[0] = sub_group_shuffle_up(0, 0); dst[1] = sub_group_shuffle_down(0, 0).
+define dso_local spir_kernel void @testShuffleRelativeChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i8 @_Z20sub_group_shuffle_upcj(i8 signext 0, i32 0) #2 ; sub_group_shuffle_up(char, uint); %1 is the unnamed entry block, so results start at %2
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func signext i8 @_Z22sub_group_shuffle_downcj(i8 signext 0, i32 0) #2 ; sub_group_shuffle_down(char, uint)
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z20sub_group_shuffle_upcj(i8 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z22sub_group_shuffle_downcj(i8 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeUChar
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_shuffle_upcj(i8 0, i32 0)
+; CHECK-LLVM: call spir_func i8 @_Z22sub_group_shuffle_downcj(i8 0, i32 0)
+
+; Function Attrs: convergent nounwind -- uchar kernel: dst[0] = sub_group_shuffle_up(0, 0); dst[1] = sub_group_shuffle_down(0, 0).
+define dso_local spir_kernel void @testShuffleRelativeUChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i8 @_Z20sub_group_shuffle_uphj(i8 zeroext 0, i32 0) #2 ; sub_group_shuffle_up(uchar, uint); %1 is the unnamed entry block, so results start at %2
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func zeroext i8 @_Z22sub_group_shuffle_downhj(i8 zeroext 0, i32 0) #2 ; sub_group_shuffle_down(uchar, uint)
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z20sub_group_shuffle_uphj(i8 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z22sub_group_shuffle_downhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeShort
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_shuffle_upsj(i16 0, i32 0)
+; CHECK-LLVM: call spir_func i16 @_Z22sub_group_shuffle_downsj(i16 0, i32 0)
+
+; Function Attrs: convergent nounwind -- short kernel: dst[0] = sub_group_shuffle_up(0, 0); dst[1] = sub_group_shuffle_down(0, 0).
+define dso_local spir_kernel void @testShuffleRelativeShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i16 @_Z20sub_group_shuffle_upsj(i16 signext 0, i32 0) #2 ; sub_group_shuffle_up(short, uint); %1 is the unnamed entry block, so results start at %2
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func signext i16 @_Z22sub_group_shuffle_downsj(i16 signext 0, i32 0) #2 ; sub_group_shuffle_down(short, uint)
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z20sub_group_shuffle_upsj(i16 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z22sub_group_shuffle_downsj(i16 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeUShort
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_shuffle_upsj(i16 0, i32 0)
+; CHECK-LLVM: call spir_func i16 @_Z22sub_group_shuffle_downsj(i16 0, i32 0)
+
+; Function Attrs: convergent nounwind -- ushort kernel: dst[0] = sub_group_shuffle_up(0, 0); dst[1] = sub_group_shuffle_down(0, 0).
+define dso_local spir_kernel void @testShuffleRelativeUShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i16 @_Z20sub_group_shuffle_uptj(i16 zeroext 0, i32 0) #2 ; sub_group_shuffle_up(ushort, uint); %1 is the unnamed entry block, so results start at %2
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func zeroext i16 @_Z22sub_group_shuffle_downtj(i16 zeroext 0, i32 0) #2 ; sub_group_shuffle_down(ushort, uint)
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z20sub_group_shuffle_uptj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z22sub_group_shuffle_downtj(i16 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeInt
+; CHECK-LLVM: call spir_func i32 @_Z20sub_group_shuffle_upij(i32 0, i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z22sub_group_shuffle_downij(i32 0, i32 0)
+
+; Function Attrs: convergent nounwind -- int kernel: dst[0] = sub_group_shuffle_up(0, 0); dst[1] = sub_group_shuffle_down(0, 0).
+define dso_local spir_kernel void @testShuffleRelativeInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z20sub_group_shuffle_upij(i32 0, i32 0) #2 ; sub_group_shuffle_up(int, uint); %1 is the unnamed entry block, so results start at %2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16 ; dst[0] (%0 is the unnamed pointer argument)
+  %3 = tail call spir_func i32 @_Z22sub_group_shuffle_downij(i32 0, i32 0) #2 ; sub_group_shuffle_down(int, uint)
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16 ; dst[1]
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z20sub_group_shuffle_upij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z22sub_group_shuffle_downij(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeUInt
+; CHECK-LLVM: call spir_func i32 @_Z20sub_group_shuffle_upij(i32 0, i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z22sub_group_shuffle_downij(i32 0, i32 0)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'jj' shuffle-up/down built-ins with (i32 0, i32 0) and stores the results to elements 0 and 1 of %0. NOTE(review): the LLVM-side FileCheck expectations above name the 'ij' manglings instead; presumably signedness is not preserved across the SPIR-V round trip -- confirm before changing either side.
+define dso_local spir_kernel void @testShuffleRelativeUInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z20sub_group_shuffle_upjj(i32 0, i32 0) #2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16
+  %3 = tail call spir_func i32 @_Z22sub_group_shuffle_downjj(i32 0, i32 0) #2
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeUInt.
+declare dso_local spir_func i32 @_Z20sub_group_shuffle_upjj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeUInt.
+declare dso_local spir_func i32 @_Z22sub_group_shuffle_downjj(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeLong
+; CHECK-LLVM: call spir_func i64 @_Z20sub_group_shuffle_uplj(i64 0, i32 0)
+; CHECK-LLVM: call spir_func i64 @_Z22sub_group_shuffle_downlj(i64 0, i32 0)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'lj' shuffle-up and shuffle-down built-ins with (i64 0, i32 0) and stores the two results to elements 0 and 1 of the i64 buffer %0.
+define dso_local spir_kernel void @testShuffleRelativeLong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i64 @_Z20sub_group_shuffle_uplj(i64 0, i32 0) #2
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20
+  %3 = tail call spir_func i64 @_Z22sub_group_shuffle_downlj(i64 0, i32 0) #2
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeLong.
+declare dso_local spir_func i64 @_Z20sub_group_shuffle_uplj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeLong.
+declare dso_local spir_func i64 @_Z22sub_group_shuffle_downlj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeULong
+; CHECK-LLVM: call spir_func i64 @_Z20sub_group_shuffle_uplj(i64 0, i32 0)
+; CHECK-LLVM: call spir_func i64 @_Z22sub_group_shuffle_downlj(i64 0, i32 0)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'mj' shuffle-up/down built-ins with (i64 0, i32 0) and stores the results to elements 0 and 1 of %0. NOTE(review): the LLVM-side FileCheck expectations above name the 'lj' manglings instead; presumably signedness is not preserved across the SPIR-V round trip -- confirm before changing either side.
+define dso_local spir_kernel void @testShuffleRelativeULong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i64 @_Z20sub_group_shuffle_upmj(i64 0, i32 0) #2
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20
+  %3 = tail call spir_func i64 @_Z22sub_group_shuffle_downmj(i64 0, i32 0) #2
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeULong.
+declare dso_local spir_func i64 @_Z20sub_group_shuffle_upmj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeULong.
+declare dso_local spir_func i64 @_Z22sub_group_shuffle_downmj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[float]] {{[0-9]+}} [[ScopeSubgroup]] [[float_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[float]] {{[0-9]+}} [[ScopeSubgroup]] [[float_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeFloat
+; CHECK-LLVM: call spir_func float @_Z20sub_group_shuffle_upfj(float 0.000000e+00, i32 0)
+; CHECK-LLVM: call spir_func float @_Z22sub_group_shuffle_downfj(float 0.000000e+00, i32 0)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'fj' shuffle-up and shuffle-down built-ins with (float 0.0, i32 0) and stores the two results to elements 0 and 1 of the float buffer %0.
+define dso_local spir_kernel void @testShuffleRelativeFloat(float addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !23 !kernel_arg_base_type !23 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func float @_Z20sub_group_shuffle_upfj(float 0.000000e+00, i32 0) #2
+  store float %2, float addrspace(1)* %0, align 4, !tbaa !24
+  %3 = tail call spir_func float @_Z22sub_group_shuffle_downfj(float 0.000000e+00, i32 0) #2
+  %4 = getelementptr inbounds float, float addrspace(1)* %0, i64 1
+  store float %3, float addrspace(1)* %4, align 4, !tbaa !24
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeFloat.
+declare dso_local spir_func float @_Z20sub_group_shuffle_upfj(float, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeFloat.
+declare dso_local spir_func float @_Z22sub_group_shuffle_downfj(float, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[half]] {{[0-9]+}} [[ScopeSubgroup]] [[half_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[half]] {{[0-9]+}} [[ScopeSubgroup]] [[half_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeHalf
+; CHECK-LLVM: call spir_func half @_Z20sub_group_shuffle_upDhj(half 0xH0000, i32 0)
+; CHECK-LLVM: call spir_func half @_Z22sub_group_shuffle_downDhj(half 0xH0000, i32 0)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'Dhj' shuffle-up and shuffle-down built-ins with (half 0xH0000, i32 0) and stores the two results to elements 0 and 1 of the half buffer %0.
+define dso_local spir_kernel void @testShuffleRelativeHalf(half addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !26 !kernel_arg_base_type !26 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func half @_Z20sub_group_shuffle_upDhj(half 0xH0000, i32 0) #2
+  store half %2, half addrspace(1)* %0, align 2, !tbaa !27
+  %3 = tail call spir_func half @_Z22sub_group_shuffle_downDhj(half 0xH0000, i32 0) #2
+  %4 = getelementptr inbounds half, half addrspace(1)* %0, i64 1
+  store half %3, half addrspace(1)* %4, align 2, !tbaa !27
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeHalf.
+declare dso_local spir_func half @_Z20sub_group_shuffle_upDhj(half, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeHalf.
+declare dso_local spir_func half @_Z22sub_group_shuffle_downDhj(half, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformShuffleUp [[double]] {{[0-9]+}} [[ScopeSubgroup]] [[double_0]] [[int_0]]
+; CHECK-SPIRV: GroupNonUniformShuffleDown [[double]] {{[0-9]+}} [[ScopeSubgroup]] [[double_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testShuffleRelativeDouble
+; CHECK-LLVM: call spir_func double @_Z20sub_group_shuffle_updj(double 0.000000e+00, i32 0)
+; CHECK-LLVM: call spir_func double @_Z22sub_group_shuffle_downdj(double 0.000000e+00, i32 0)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'dj' shuffle-up and shuffle-down built-ins with (double 0.0, i32 0) and stores the two results to elements 0 and 1 of the double buffer %0.
+define dso_local spir_kernel void @testShuffleRelativeDouble(double addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !29 !kernel_arg_base_type !29 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func double @_Z20sub_group_shuffle_updj(double 0.000000e+00, i32 0) #2
+  store double %2, double addrspace(1)* %0, align 8, !tbaa !30
+  %3 = tail call spir_func double @_Z22sub_group_shuffle_downdj(double 0.000000e+00, i32 0) #2
+  %4 = getelementptr inbounds double, double addrspace(1)* %0, i64 1
+  store double %3, double addrspace(1)* %4, align 8, !tbaa !30
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeDouble.
+declare dso_local spir_func double @_Z20sub_group_shuffle_updj(double, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testShuffleRelativeDouble.
+declare dso_local spir_func double @_Z22sub_group_shuffle_downdj(double, i32) local_unnamed_addr #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } ; #0: attached to the spir_kernel definitions
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; #1: attached to the built-in declarations
+attributes #2 = { convergent nounwind } ; #2: attached to the call sites inside the kernels
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2} ; shared by !opencl.ocl.version and !opencl.spir.version
+!2 = !{!"clang version 9.0.1 (https://github.com/llvm/llvm-project.git cb6d58d1dcf36a29ae5dd24ff891d6552f00bac7)"}
+!3 = !{i32 1} ; !kernel_arg_addr_space of the kernels
+!4 = !{!"none"} ; !kernel_arg_access_qual of the kernels
+!5 = !{!"char*"}
+!6 = !{!""} ; !kernel_arg_type_qual of the kernels
+!7 = !{!8, !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!"uchar*"}
+!11 = !{!"short*"}
+!12 = !{!13, !13, i64 0} ; !tbaa tag on the i16 stores
+!13 = !{!"short", !8, i64 0}
+!14 = !{!"ushort*"}
+!15 = !{!"int*"} ; !kernel_arg_type and !kernel_arg_base_type of @testShuffleRelativeInt
+!16 = !{!17, !17, i64 0} ; !tbaa tag on the i32 stores
+!17 = !{!"int", !8, i64 0}
+!18 = !{!"uint*"} ; !kernel_arg_type and !kernel_arg_base_type of @testShuffleRelativeUInt
+!19 = !{!"long*"} ; !kernel_arg_type and !kernel_arg_base_type of @testShuffleRelativeLong
+!20 = !{!21, !21, i64 0} ; !tbaa tag on the i64 stores
+!21 = !{!"long", !8, i64 0}
+!22 = !{!"ulong*"} ; !kernel_arg_type and !kernel_arg_base_type of @testShuffleRelativeULong
+!23 = !{!"float*"} ; !kernel_arg_type and !kernel_arg_base_type of @testShuffleRelativeFloat
+!24 = !{!25, !25, i64 0} ; !tbaa tag on the float stores
+!25 = !{!"float", !8, i64 0}
+!26 = !{!"half*"} ; !kernel_arg_type and !kernel_arg_base_type of @testShuffleRelativeHalf
+!27 = !{!28, !28, i64 0} ; !tbaa tag on the half stores
+!28 = !{!"half", !8, i64 0}
+!29 = !{!"double*"} ; !kernel_arg_type and !kernel_arg_base_type of @testShuffleRelativeDouble
+!30 = !{!31, !31, i64 0} ; !tbaa tag on the double stores
+!31 = !{!"double", !8, i64 0}

From b26a18d9e7c7dc2fda22e43a8f1de60ab98c1b07 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Thu, 16 Apr 2020 10:42:27 +0200
Subject: [PATCH 754/770] Add two-way translation test for
 sub_group_clustered_reduce.

---
 .../transcoding/sub_group_clustered_reduce.ll | 998 ++++++++++++++++++
 1 file changed, 998 insertions(+)
 create mode 100644 llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll

diff --git a/llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll b/llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll
new file mode 100644
index 0000000000000..3a70a4a72b901
--- /dev/null
+++ b/llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll
@@ -0,0 +1,998 @@
+;; #pragma OPENCL EXTENSION cl_khr_subgroup_clustered_reduce : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+;; 
+;; kernel void testClusteredArithmeticChar(global char* dst)
+;; {
+;;     char v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticUChar(global uchar* dst)
+;; {
+;;     uchar v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticShort(global short* dst)
+;; {
+;;     short v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticUShort(global ushort* dst)
+;; {
+;;     ushort v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticInt(global int* dst)
+;; {
+;;     int v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticUInt(global uint* dst)
+;; {
+;;     uint v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticLong(global long* dst)
+;; {
+;;     long v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticULong(global ulong* dst)
+;; {
+;;     ulong v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticFloat(global float* dst)
+;; {
+;;     float v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticHalf(global half* dst)
+;; {
+;;     half v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredArithmeticDouble(global double* dst)
+;; {
+;;     double v = 0;
+;;     dst[0] = sub_group_clustered_reduce_add(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_mul(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_min(v, 2);
+;;     dst[3] = sub_group_clustered_reduce_max(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseChar(global char* dst)
+;; {
+;;     char v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseUChar(global uchar* dst)
+;; {
+;;     uchar v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseShort(global short* dst)
+;; {
+;;     short v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseUShort(global ushort* dst)
+;; {
+;;     ushort v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseInt(global int* dst)
+;; {
+;;     int v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseUInt(global uint* dst)
+;; {
+;;     uint v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseLong(global long* dst)
+;; {
+;;     long v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredBitwiseULong(global ulong* dst)
+;; {
+;;     ulong v = 0;
+;;     dst[0] = sub_group_clustered_reduce_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_xor(v, 2);
+;; }
+;; 
+;; kernel void testClusteredLogical(global int* dst)
+;; {
+;;     int v = 0;
+;;     dst[0] = sub_group_clustered_reduce_logical_and(v, 2);
+;;     dst[1] = sub_group_clustered_reduce_logical_or(v, 2);
+;;     dst[2] = sub_group_clustered_reduce_logical_xor(v, 2);
+;; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; Change DISABLED to RUN once SPIRV->LLVM translation is implemented
+; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
+; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV-DAG: TypeBool  [[bool:[0-9]+]]
+; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
+; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
+; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
+; CHECK-SPIRV-DAG: TypeInt   [[long:[0-9]+]]   64 0
+; CHECK-SPIRV-DAG: TypeFloat [[half:[0-9]+]]   16
+; CHECK-SPIRV-DAG: TypeFloat [[float:[0-9]+]]  32
+; CHECK-SPIRV-DAG: TypeFloat [[double:[0-9]+]] 64
+
+; CHECK-SPIRV-DAG: ConstantFalse [[bool]] [[false:[0-9]+]]
+; CHECK-SPIRV-DAG: Constant [[int]]    [[ScopeSubgroup:[0-9]+]] 3
+; CHECK-SPIRV-DAG: Constant [[char]]   [[char_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[short]]  [[short_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_0:[0-9]+]]         0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_2:[0-9]+]]         2
+; CHECK-SPIRV-DAG: Constant [[long]]   [[long_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[half]]   [[half_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[float]]  [[float_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[double]] [[double_0:[0-9]+]]      0
+
+; ModuleID = 'sub_group_clustered_reduce.cl'
+source_filename = "sub_group_clustered_reduce.cl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" ; "e" selects little-endian; the remaining fields give the i64 and vector ABI alignments
+target triple = "spir64"
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticChar
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_addcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_mulcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_mincj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_maxcj(i8 0, i32 2)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'cj' clustered-reduce add, mul, min and max built-ins with (i8 signext 0, i32 2) and stores the four results to elements 0..3 of the i8 buffer %0.
+define dso_local spir_kernel void @testClusteredArithmeticChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i8 @_Z30sub_group_clustered_reduce_addcj(i8 signext 0, i32 2) #2
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7
+  %3 = tail call spir_func signext i8 @_Z30sub_group_clustered_reduce_mulcj(i8 signext 0, i32 2) #2
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7
+  %5 = tail call spir_func signext i8 @_Z30sub_group_clustered_reduce_mincj(i8 signext 0, i32 2) #2
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7
+  %7 = tail call spir_func signext i8 @_Z30sub_group_clustered_reduce_maxcj(i8 signext 0, i32 2) #2
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticChar.
+declare dso_local spir_func signext i8 @_Z30sub_group_clustered_reduce_addcj(i8 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticChar.
+declare dso_local spir_func signext i8 @_Z30sub_group_clustered_reduce_mulcj(i8 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticChar.
+declare dso_local spir_func signext i8 @_Z30sub_group_clustered_reduce_mincj(i8 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticChar.
+declare dso_local spir_func signext i8 @_Z30sub_group_clustered_reduce_maxcj(i8 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticUChar
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_addcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_mulcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_minhj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_maxhj(i8 0, i32 2)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'hj' clustered-reduce add, mul, min and max built-ins with (i8 zeroext 0, i32 2) and stores the results to elements 0..3 of %0. NOTE(review): the (currently disabled) LLVM-side FileCheck expectations above name 'cj' for add/mul but 'hj' for min/max; presumably only the min/max opcodes carry signedness through SPIR-V -- confirm before changing either side.
+define dso_local spir_kernel void @testClusteredArithmeticUChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i8 @_Z30sub_group_clustered_reduce_addhj(i8 zeroext 0, i32 2) #2
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7
+  %3 = tail call spir_func zeroext i8 @_Z30sub_group_clustered_reduce_mulhj(i8 zeroext 0, i32 2) #2
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7
+  %5 = tail call spir_func zeroext i8 @_Z30sub_group_clustered_reduce_minhj(i8 zeroext 0, i32 2) #2
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7
+  %7 = tail call spir_func zeroext i8 @_Z30sub_group_clustered_reduce_maxhj(i8 zeroext 0, i32 2) #2
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUChar.
+declare dso_local spir_func zeroext i8 @_Z30sub_group_clustered_reduce_addhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUChar.
+declare dso_local spir_func zeroext i8 @_Z30sub_group_clustered_reduce_mulhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUChar.
+declare dso_local spir_func zeroext i8 @_Z30sub_group_clustered_reduce_minhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUChar.
+declare dso_local spir_func zeroext i8 @_Z30sub_group_clustered_reduce_maxhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticShort
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_addsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_mulsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_minsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_maxsj(i16 0, i32 2)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'sj' clustered-reduce add, mul, min and max built-ins with (i16 signext 0, i32 2) and stores the four results to elements 0..3 of the i16 buffer %0.
+define dso_local spir_kernel void @testClusteredArithmeticShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i16 @_Z30sub_group_clustered_reduce_addsj(i16 signext 0, i32 2) #2
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12
+  %3 = tail call spir_func signext i16 @_Z30sub_group_clustered_reduce_mulsj(i16 signext 0, i32 2) #2
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12
+  %5 = tail call spir_func signext i16 @_Z30sub_group_clustered_reduce_minsj(i16 signext 0, i32 2) #2
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12
+  %7 = tail call spir_func signext i16 @_Z30sub_group_clustered_reduce_maxsj(i16 signext 0, i32 2) #2
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !12
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticShort.
+declare dso_local spir_func signext i16 @_Z30sub_group_clustered_reduce_addsj(i16 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticShort.
+declare dso_local spir_func signext i16 @_Z30sub_group_clustered_reduce_mulsj(i16 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticShort.
+declare dso_local spir_func signext i16 @_Z30sub_group_clustered_reduce_minsj(i16 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticShort.
+declare dso_local spir_func signext i16 @_Z30sub_group_clustered_reduce_maxsj(i16 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticUShort
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_addsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_mulsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_mintj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_maxtj(i16 0, i32 2)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'tj' clustered-reduce add, mul, min and max built-ins with (i16 zeroext 0, i32 2) and stores the results to elements 0..3 of %0. NOTE(review): the (currently disabled) LLVM-side FileCheck expectations above name 'sj' for add/mul but 'tj' for min/max; presumably only the min/max opcodes carry signedness through SPIR-V -- confirm before changing either side.
+define dso_local spir_kernel void @testClusteredArithmeticUShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i16 @_Z30sub_group_clustered_reduce_addtj(i16 zeroext 0, i32 2) #2
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12
+  %3 = tail call spir_func zeroext i16 @_Z30sub_group_clustered_reduce_multj(i16 zeroext 0, i32 2) #2
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12
+  %5 = tail call spir_func zeroext i16 @_Z30sub_group_clustered_reduce_mintj(i16 zeroext 0, i32 2) #2
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12
+  %7 = tail call spir_func zeroext i16 @_Z30sub_group_clustered_reduce_maxtj(i16 zeroext 0, i32 2) #2
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !12
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUShort.
+declare dso_local spir_func zeroext i16 @_Z30sub_group_clustered_reduce_addtj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUShort.
+declare dso_local spir_func zeroext i16 @_Z30sub_group_clustered_reduce_multj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUShort.
+declare dso_local spir_func zeroext i16 @_Z30sub_group_clustered_reduce_mintj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticUShort.
+declare dso_local spir_func zeroext i16 @_Z30sub_group_clustered_reduce_maxtj(i16 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticInt
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_addij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_mulij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_minij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_maxij(i32 0, i32 2)
+
+; Function Attrs: convergent nounwind -- test kernel: calls the 'ij' clustered-reduce add, mul, min and max built-ins with (i32 0, i32 2) and stores the four results to elements 0..3 of the i32 buffer %0.
+define dso_local spir_kernel void @testClusteredArithmeticInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_addij(i32 0, i32 2) #2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16
+  %3 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_mulij(i32 0, i32 2) #2
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16
+  %5 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_minij(i32 0, i32 2) #2
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16
+  %7 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_maxij(i32 0, i32 2) #2
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %7, i32 addrspace(1)* %8, align 4, !tbaa !16
+  ret void
+}
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticInt.
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_addij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticInt.
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_mulij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticInt.
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_minij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent -- declaration only (no body in this module); callee of @testClusteredArithmeticInt.
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_maxij(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticUInt
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_addij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_mulij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_minjj(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_maxjj(i32 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredArithmeticUInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !6 { ; Test kernel, argument type "uint*" (!18): calls the unsigned-int ('j') overloads of the add/mul/min/max clustered sub-group reduce built-ins on the constant 0 and stores the four results to elements 0..3 of the pointer argument.
+  %2 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_addjj(i32 0, i32 2) #2 ; add(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm. The LLVM round-trip expectation above this kernel names the signed overload (addij) for this call, presumably because the expected SPIR-V op (GroupNonUniformIAdd) carries no signedness -- confirm
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16 ; element 0
+  %3 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_muljj(i32 0, i32 2) #2 ; mul; the round-trip expectation likewise names the signed overload (mulij)
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16 ; element 1
+  %5 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_minjj(i32 0, i32 2) #2 ; min; the round-trip expectation keeps the unsigned overload (minjj), in line with GroupNonUniformUMin in the expected SPIR-V
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16 ; element 2
+  %7 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_maxjj(i32 0, i32 2) #2 ; max; the round-trip expectation keeps the unsigned overload (maxjj), in line with GroupNonUniformUMax in the expected SPIR-V
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %7, i32 addrspace(1)* %8, align 4, !tbaa !16 ; element 3
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_addjj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_muljj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_minjj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_maxjj(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformSMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticLong
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_addlj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_mullj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_minlj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_maxlj(i64 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredArithmeticLong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !6 { ; Test kernel, argument type "long*" (!19): calls the long ('l') overloads of the add/mul/min/max clustered sub-group reduce built-ins on the i64 constant 0 and stores the four results to elements 0..3 of the pointer argument.
+  %2 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_addlj(i64 0, i32 2) #2 ; add(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20 ; element 0
+  %3 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_mullj(i64 0, i32 2) #2 ; mul, same operands
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20 ; element 1
+  %5 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_minlj(i64 0, i32 2) #2 ; min; the expected SPIR-V above this kernel uses the signed op GroupNonUniformSMin
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20 ; element 2
+  %7 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_maxlj(i64 0, i32 2) #2 ; max; the expected SPIR-V above this kernel uses the signed op GroupNonUniformSMax
+  %8 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 3
+  store i64 %7, i64 addrspace(1)* %8, align 8, !tbaa !20 ; element 3
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_addlj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_mullj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_minlj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_maxlj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformUMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticULong
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_addlj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_mullj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_minmj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_maxmj(i64 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredArithmeticULong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !6 { ; Test kernel, argument type "ulong*" (!22): calls the unsigned-long ('m') overloads of the add/mul/min/max clustered sub-group reduce built-ins on the i64 constant 0 and stores the four results to elements 0..3 of the pointer argument.
+  %2 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_addmj(i64 0, i32 2) #2 ; add(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm. The LLVM round-trip expectation above this kernel names the signed overload (addlj) for this call, presumably because the expected SPIR-V op (GroupNonUniformIAdd) carries no signedness -- confirm
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20 ; element 0
+  %3 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_mulmj(i64 0, i32 2) #2 ; mul; the round-trip expectation likewise names the signed overload (mullj)
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20 ; element 1
+  %5 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_minmj(i64 0, i32 2) #2 ; min; the round-trip expectation keeps the unsigned overload (minmj), in line with GroupNonUniformUMin in the expected SPIR-V
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20 ; element 2
+  %7 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_maxmj(i64 0, i32 2) #2 ; max; the round-trip expectation keeps the unsigned overload (maxmj), in line with GroupNonUniformUMax in the expected SPIR-V
+  %8 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 3
+  store i64 %7, i64 addrspace(1)* %8, align 8, !tbaa !20 ; element 3
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_addmj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_mulmj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_minmj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_maxmj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformFAdd [[float]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[float_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMul [[float]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[float_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMin [[float]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[float_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMax [[float]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[float_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticFloat
+; CHECK-LLVM: call spir_func float @_Z30sub_group_clustered_reduce_addfj(float 0.000000e+00, i32 2)
+; CHECK-LLVM: call spir_func float @_Z30sub_group_clustered_reduce_mulfj(float 0.000000e+00, i32 2)
+; CHECK-LLVM: call spir_func float @_Z30sub_group_clustered_reduce_minfj(float 0.000000e+00, i32 2)
+; CHECK-LLVM: call spir_func float @_Z30sub_group_clustered_reduce_maxfj(float 0.000000e+00, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredArithmeticFloat(float addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !23 !kernel_arg_base_type !23 !kernel_arg_type_qual !6 { ; Test kernel, argument type "float*" (!23): calls the float ('f') overloads of the add/mul/min/max clustered sub-group reduce built-ins on the constant 0.0 and stores the four results to elements 0..3 of the pointer argument.
+  %2 = tail call spir_func float @_Z30sub_group_clustered_reduce_addfj(float 0.000000e+00, i32 2) #2 ; add(value 0.0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store float %2, float addrspace(1)* %0, align 4, !tbaa !24 ; element 0
+  %3 = tail call spir_func float @_Z30sub_group_clustered_reduce_mulfj(float 0.000000e+00, i32 2) #2 ; mul, same operands
+  %4 = getelementptr inbounds float, float addrspace(1)* %0, i64 1
+  store float %3, float addrspace(1)* %4, align 4, !tbaa !24 ; element 1
+  %5 = tail call spir_func float @_Z30sub_group_clustered_reduce_minfj(float 0.000000e+00, i32 2) #2 ; min, same operands
+  %6 = getelementptr inbounds float, float addrspace(1)* %0, i64 2
+  store float %5, float addrspace(1)* %6, align 4, !tbaa !24 ; element 2
+  %7 = tail call spir_func float @_Z30sub_group_clustered_reduce_maxfj(float 0.000000e+00, i32 2) #2 ; max, same operands
+  %8 = getelementptr inbounds float, float addrspace(1)* %0, i64 3
+  store float %7, float addrspace(1)* %8, align 4, !tbaa !24 ; element 3
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z30sub_group_clustered_reduce_addfj(float, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z30sub_group_clustered_reduce_mulfj(float, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z30sub_group_clustered_reduce_minfj(float, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z30sub_group_clustered_reduce_maxfj(float, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformFAdd [[half]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[half_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMul [[half]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[half_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMin [[half]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[half_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMax [[half]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[half_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticHalf
+; CHECK-LLVM: call spir_func half @_Z30sub_group_clustered_reduce_addDhj(half 0xH0000, i32 2)
+; CHECK-LLVM: call spir_func half @_Z30sub_group_clustered_reduce_mulDhj(half 0xH0000, i32 2)
+; CHECK-LLVM: call spir_func half @_Z30sub_group_clustered_reduce_minDhj(half 0xH0000, i32 2)
+; CHECK-LLVM: call spir_func half @_Z30sub_group_clustered_reduce_maxDhj(half 0xH0000, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredArithmeticHalf(half addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !26 !kernel_arg_base_type !26 !kernel_arg_type_qual !6 { ; Test kernel, argument type "half*" (!26): calls the half ('Dh') overloads of the add/mul/min/max clustered sub-group reduce built-ins on the half constant 0xH0000 and stores the four results to elements 0..3 of the pointer argument.
+  %2 = tail call spir_func half @_Z30sub_group_clustered_reduce_addDhj(half 0xH0000, i32 2) #2 ; add(value 0xH0000, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store half %2, half addrspace(1)* %0, align 2, !tbaa !27 ; element 0
+  %3 = tail call spir_func half @_Z30sub_group_clustered_reduce_mulDhj(half 0xH0000, i32 2) #2 ; mul, same operands
+  %4 = getelementptr inbounds half, half addrspace(1)* %0, i64 1
+  store half %3, half addrspace(1)* %4, align 2, !tbaa !27 ; element 1
+  %5 = tail call spir_func half @_Z30sub_group_clustered_reduce_minDhj(half 0xH0000, i32 2) #2 ; min, same operands
+  %6 = getelementptr inbounds half, half addrspace(1)* %0, i64 2
+  store half %5, half addrspace(1)* %6, align 2, !tbaa !27 ; element 2
+  %7 = tail call spir_func half @_Z30sub_group_clustered_reduce_maxDhj(half 0xH0000, i32 2) #2 ; max, same operands
+  %8 = getelementptr inbounds half, half addrspace(1)* %0, i64 3
+  store half %7, half addrspace(1)* %8, align 2, !tbaa !27 ; element 3
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z30sub_group_clustered_reduce_addDhj(half, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z30sub_group_clustered_reduce_mulDhj(half, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z30sub_group_clustered_reduce_minDhj(half, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z30sub_group_clustered_reduce_maxDhj(half, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformFAdd [[double]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[double_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMul [[double]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[double_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMin [[double]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[double_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformFMax [[double]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[double_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredArithmeticDouble
+; CHECK-LLVM: call spir_func double @_Z30sub_group_clustered_reduce_adddj(double 0.000000e+00, i32 2)
+; CHECK-LLVM: call spir_func double @_Z30sub_group_clustered_reduce_muldj(double 0.000000e+00, i32 2)
+; CHECK-LLVM: call spir_func double @_Z30sub_group_clustered_reduce_mindj(double 0.000000e+00, i32 2)
+; CHECK-LLVM: call spir_func double @_Z30sub_group_clustered_reduce_maxdj(double 0.000000e+00, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredArithmeticDouble(double addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !29 !kernel_arg_base_type !29 !kernel_arg_type_qual !6 { ; Test kernel, argument type "double*" (!29): calls the double ('d') overloads of the add/mul/min/max clustered sub-group reduce built-ins on the constant 0.0 and stores the four results to elements 0..3 of the pointer argument.
+  %2 = tail call spir_func double @_Z30sub_group_clustered_reduce_adddj(double 0.000000e+00, i32 2) #2 ; add(value 0.0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store double %2, double addrspace(1)* %0, align 8, !tbaa !30 ; element 0
+  %3 = tail call spir_func double @_Z30sub_group_clustered_reduce_muldj(double 0.000000e+00, i32 2) #2 ; mul, same operands
+  %4 = getelementptr inbounds double, double addrspace(1)* %0, i64 1
+  store double %3, double addrspace(1)* %4, align 8, !tbaa !30 ; element 1
+  %5 = tail call spir_func double @_Z30sub_group_clustered_reduce_mindj(double 0.000000e+00, i32 2) #2 ; min, same operands
+  %6 = getelementptr inbounds double, double addrspace(1)* %0, i64 2
+  store double %5, double addrspace(1)* %6, align 8, !tbaa !30 ; element 2
+  %7 = tail call spir_func double @_Z30sub_group_clustered_reduce_maxdj(double 0.000000e+00, i32 2) #2 ; max, same operands
+  %8 = getelementptr inbounds double, double addrspace(1)* %0, i64 3
+  store double %7, double addrspace(1)* %8, align 8, !tbaa !30 ; element 3
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z30sub_group_clustered_reduce_adddj(double, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z30sub_group_clustered_reduce_muldj(double, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z30sub_group_clustered_reduce_mindj(double, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z30sub_group_clustered_reduce_maxdj(double, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseChar
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_andcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z29sub_group_clustered_reduce_orcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_xorcj(i8 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { ; Test kernel, argument type "char*" (!5): calls the char ('c') overloads of the and/or/xor clustered sub-group reduce built-ins on the i8 constant 0 and stores the three results to elements 0..2 of the pointer argument. The LLVM round-trip expectation above this kernel spells these calls without the signext attributes.
+  %2 = tail call spir_func signext i8 @_Z30sub_group_clustered_reduce_andcj(i8 signext 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; element 0
+  %3 = tail call spir_func signext i8 @_Z29sub_group_clustered_reduce_orcj(i8 signext 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; element 1
+  %5 = tail call spir_func signext i8 @_Z30sub_group_clustered_reduce_xorcj(i8 signext 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z30sub_group_clustered_reduce_andcj(i8 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z29sub_group_clustered_reduce_orcj(i8 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z30sub_group_clustered_reduce_xorcj(i8 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[char_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseUChar
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_andcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z29sub_group_clustered_reduce_orcj(i8 0, i32 2)
+; CHECK-LLVM: call spir_func i8 @_Z30sub_group_clustered_reduce_xorcj(i8 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseUChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 { ; Test kernel, argument type "uchar*" (!10): calls the unsigned-char ('h') overloads of the and/or/xor clustered sub-group reduce built-ins on the i8 constant 0 and stores the three results to elements 0..2 of the pointer argument. The LLVM round-trip expectation above this kernel names the signed-char overloads (andcj/orcj/xorcj) without zeroext, presumably because the expected SPIR-V bitwise ops carry no signedness -- confirm.
+  %2 = tail call spir_func zeroext i8 @_Z30sub_group_clustered_reduce_andhj(i8 zeroext 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; element 0
+  %3 = tail call spir_func zeroext i8 @_Z29sub_group_clustered_reduce_orhj(i8 zeroext 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; element 1
+  %5 = tail call spir_func zeroext i8 @_Z30sub_group_clustered_reduce_xorhj(i8 zeroext 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z30sub_group_clustered_reduce_andhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z29sub_group_clustered_reduce_orhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z30sub_group_clustered_reduce_xorhj(i8 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseShort
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_andsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z29sub_group_clustered_reduce_orsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_xorsj(i16 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 { ; Test kernel, argument type "short*" (!11): calls the short ('s') overloads of the and/or/xor clustered sub-group reduce built-ins on the i16 constant 0 and stores the three results to elements 0..2 of the pointer argument. The LLVM round-trip expectation above this kernel spells these calls without the signext attributes.
+  %2 = tail call spir_func signext i16 @_Z30sub_group_clustered_reduce_andsj(i16 signext 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12 ; element 0
+  %3 = tail call spir_func signext i16 @_Z29sub_group_clustered_reduce_orsj(i16 signext 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12 ; element 1
+  %5 = tail call spir_func signext i16 @_Z30sub_group_clustered_reduce_xorsj(i16 signext 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z30sub_group_clustered_reduce_andsj(i16 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z29sub_group_clustered_reduce_orsj(i16 signext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z30sub_group_clustered_reduce_xorsj(i16 signext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[short_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseUShort
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_andsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z29sub_group_clustered_reduce_orsj(i16 0, i32 2)
+; CHECK-LLVM: call spir_func i16 @_Z30sub_group_clustered_reduce_xorsj(i16 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseUShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 { ; Test kernel, argument type "ushort*" (!14): calls the unsigned-short ('t') overloads of the and/or/xor clustered sub-group reduce built-ins on the i16 constant 0 and stores the three results to elements 0..2 of the pointer argument. The LLVM round-trip expectation above this kernel names the signed-short overloads (andsj/orsj/xorsj) without zeroext, presumably because the expected SPIR-V bitwise ops carry no signedness -- confirm.
+  %2 = tail call spir_func zeroext i16 @_Z30sub_group_clustered_reduce_andtj(i16 zeroext 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12 ; element 0
+  %3 = tail call spir_func zeroext i16 @_Z29sub_group_clustered_reduce_ortj(i16 zeroext 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12 ; element 1
+  %5 = tail call spir_func zeroext i16 @_Z30sub_group_clustered_reduce_xortj(i16 zeroext 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z30sub_group_clustered_reduce_andtj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z29sub_group_clustered_reduce_ortj(i16 zeroext, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z30sub_group_clustered_reduce_xortj(i16 zeroext, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseInt
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_andij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z29sub_group_clustered_reduce_orij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_xorij(i32 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 { ; Test kernel, argument type "int*" (!15): calls the int ('i') overloads of the and/or/xor clustered sub-group reduce built-ins on the i32 constant 0 and stores the three results to elements 0..2 of the pointer argument.
+  %2 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_andij(i32 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16 ; element 0
+  %3 = tail call spir_func i32 @_Z29sub_group_clustered_reduce_orij(i32 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16 ; element 1
+  %5 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_xorij(i32 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_andij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z29sub_group_clustered_reduce_orij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_xorij(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[int_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseUInt
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_andij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z29sub_group_clustered_reduce_orij(i32 0, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z30sub_group_clustered_reduce_xorij(i32 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseUInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !6 { ; Test kernel, argument type "uint*" (!18): calls the unsigned-int ('j') overloads of the and/or/xor clustered sub-group reduce built-ins on the i32 constant 0 and stores the three results to elements 0..2 of the pointer argument. The LLVM round-trip expectation above this kernel names the signed-int overloads (andij/orij/xorij), presumably because the expected SPIR-V bitwise ops carry no signedness -- confirm.
+  %2 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_andjj(i32 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16 ; element 0
+  %3 = tail call spir_func i32 @_Z29sub_group_clustered_reduce_orjj(i32 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16 ; element 1
+  %5 = tail call spir_func i32 @_Z30sub_group_clustered_reduce_xorjj(i32 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_andjj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z29sub_group_clustered_reduce_orjj(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z30sub_group_clustered_reduce_xorjj(i32, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseLong
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_andlj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z29sub_group_clustered_reduce_orlj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_xorlj(i64 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseLong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !6 { ; Test kernel, argument type "long*" (!19): calls the long ('l') overloads of the and/or/xor clustered sub-group reduce built-ins on the i64 constant 0 and stores the three results to elements 0..2 of the pointer argument.
+  %2 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_andlj(i64 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20 ; element 0
+  %3 = tail call spir_func i64 @_Z29sub_group_clustered_reduce_orlj(i64 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20 ; element 1
+  %5 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_xorlj(i64 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_andlj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z29sub_group_clustered_reduce_orlj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_xorlj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[long_0]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredBitwiseULong
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_andlj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z29sub_group_clustered_reduce_orlj(i64 0, i32 2)
+; CHECK-LLVM: call spir_func i64 @_Z30sub_group_clustered_reduce_xorlj(i64 0, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredBitwiseULong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !6 { ; Test kernel, argument type "ulong*" (!22): calls the unsigned-long ('m') overloads of the and/or/xor clustered sub-group reduce built-ins on the i64 constant 0 and stores the three results to elements 0..2 of the pointer argument. The LLVM round-trip expectation above this kernel names the signed-long overloads (andlj/orlj/xorlj), presumably because the expected SPIR-V bitwise ops carry no signedness -- confirm.
+  %2 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_andmj(i64 0, i32 2) #2 ; and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20 ; element 0
+  %3 = tail call spir_func i64 @_Z29sub_group_clustered_reduce_ormj(i64 0, i32 2) #2 ; or, same operands
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20 ; element 1
+  %5 = tail call spir_func i64 @_Z30sub_group_clustered_reduce_xormj(i64 0, i32 2) #2 ; xor, same operands
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_andmj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z29sub_group_clustered_reduce_ormj(i64, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z30sub_group_clustered_reduce_xormj(i64, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformLogicalAnd [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[false]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformLogicalOr  [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[false]] [[int_2]]
+; CHECK-SPIRV: GroupNonUniformLogicalXor [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 3 [[false]] [[int_2]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testClusteredLogical
+; CHECK-LLVM: call spir_func i32 @_Z38sub_group_clustered_reduce_logical_andij(i32 {{.*}}, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z37sub_group_clustered_reduce_logical_orij(i32 {{.*}}, i32 2)
+; CHECK-LLVM: call spir_func i32 @_Z38sub_group_clustered_reduce_logical_xorij(i32 {{.*}}, i32 2)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testClusteredLogical(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 { ; Test kernel, argument type "int*" (!15): calls the logical and/or/xor clustered sub-group reduce built-ins (int overloads) on the i32 constant 0 and stores the three results to elements 0..2 of the pointer argument. The expected SPIR-V above this kernel uses bool-typed GroupNonUniformLogical* ops on the false constant, and the LLVM round-trip expectation accepts any i32 first operand.
+  %2 = tail call spir_func i32 @_Z38sub_group_clustered_reduce_logical_andij(i32 0, i32 2) #2 ; logical and(value 0, i32 2); the i32 2 is presumably the cluster size -- confirm against the extension spec
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16 ; element 0
+  %3 = tail call spir_func i32 @_Z37sub_group_clustered_reduce_logical_orij(i32 0, i32 2) #2 ; logical or, same operands
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16 ; element 1
+  %5 = tail call spir_func i32 @_Z38sub_group_clustered_reduce_logical_xorij(i32 0, i32 2) #2 ; logical xor, same operands
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16 ; element 2
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z38sub_group_clustered_reduce_logical_andij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z37sub_group_clustered_reduce_logical_orij(i32, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z38sub_group_clustered_reduce_logical_xorij(i32, i32) local_unnamed_addr #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 9.0.1 (https://github.com/llvm/llvm-project.git cb6d58d1dcf36a29ae5dd24ff891d6552f00bac7)"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"char*"}
+!6 = !{!""}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!"uchar*"}
+!11 = !{!"short*"}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"short", !8, i64 0}
+!14 = !{!"ushort*"}
+!15 = !{!"int*"}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !8, i64 0}
+!18 = !{!"uint*"}
+!19 = !{!"long*"}
+!20 = !{!21, !21, i64 0}
+!21 = !{!"long", !8, i64 0}
+!22 = !{!"ulong*"}
+!23 = !{!"float*"}
+!24 = !{!25, !25, i64 0}
+!25 = !{!"float", !8, i64 0}
+!26 = !{!"half*"}
+!27 = !{!28, !28, i64 0}
+!28 = !{!"half", !8, i64 0}
+!29 = !{!"double*"}
+!30 = !{!31, !31, i64 0}
+!31 = !{!"double", !8, i64 0}

From e1b38cd1196994f8808f7616aa0b27ccb90f3c8f Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Thu, 16 Apr 2020 13:42:46 +0200
Subject: [PATCH 755/770] Add two-way translation test for
 sub_group_extended_types.

---
 .../transcoding/sub_group_extended_types.ll   | 1322 +++++++++++++++++
 1 file changed, 1322 insertions(+)
 create mode 100644 llvm-spirv/test/transcoding/sub_group_extended_types.ll

diff --git a/llvm-spirv/test/transcoding/sub_group_extended_types.ll b/llvm-spirv/test/transcoding/sub_group_extended_types.ll
new file mode 100644
index 0000000000000..6f8dcc0331fbe
--- /dev/null
+++ b/llvm-spirv/test/transcoding/sub_group_extended_types.ll
@@ -0,0 +1,1322 @@
+;; #pragma OPENCL EXTENSION cl_khr_subgroup_extended_types : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+;; 
+;; kernel void testBroadcastChar()
+;; {
+;;     char16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastUChar()
+;; {
+;;     uchar16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastShort()
+;; {
+;;     short16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastUShort()
+;; {
+;;     ushort16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastInt()
+;; {
+;;     int16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastUInt()
+;; {
+;;     uint16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastLong()
+;; {
+;;     long16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastULong()
+;; {
+;;     ulong16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastFloat()
+;; {
+;;     float16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastHalf()
+;; {
+;;     half16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testBroadcastDouble()
+;; {
+;;     double16 v = 0;
+;;     v.s0 = sub_group_broadcast(v.s0, 0);
+;;     v.s01 = sub_group_broadcast(v.s01, 0);
+;;     v.s012 = sub_group_broadcast(v.s012, 0);
+;;     v.s0123 = sub_group_broadcast(v.s0123, 0);
+;;     v.s01234567 = sub_group_broadcast(v.s01234567, 0);
+;;     v = sub_group_broadcast(v, 0);
+;; }
+;; 
+;; kernel void testReduceScanChar(global char* dst)
+;; {
+;;     char v = 0;
+;;     dst[0] = sub_group_reduce_add(v);
+;;     dst[1] = sub_group_reduce_min(v);
+;;     dst[2] = sub_group_reduce_max(v);
+;;     dst[3] = sub_group_scan_inclusive_add(v);
+;;     dst[4] = sub_group_scan_inclusive_min(v);
+;;     dst[5] = sub_group_scan_inclusive_max(v);
+;;     dst[6] = sub_group_scan_exclusive_add(v);
+;;     dst[7] = sub_group_scan_exclusive_min(v);
+;;     dst[8] = sub_group_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testReduceScanUChar(global uchar* dst)
+;; {
+;;     uchar v = 0;
+;;     dst[0] = sub_group_reduce_add(v);
+;;     dst[1] = sub_group_reduce_min(v);
+;;     dst[2] = sub_group_reduce_max(v);
+;;     dst[3] = sub_group_scan_inclusive_add(v);
+;;     dst[4] = sub_group_scan_inclusive_min(v);
+;;     dst[5] = sub_group_scan_inclusive_max(v);
+;;     dst[6] = sub_group_scan_exclusive_add(v);
+;;     dst[7] = sub_group_scan_exclusive_min(v);
+;;     dst[8] = sub_group_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testReduceScanShort(global short* dst)
+;; {
+;;     short v = 0;
+;;     dst[0] = sub_group_reduce_add(v);
+;;     dst[1] = sub_group_reduce_min(v);
+;;     dst[2] = sub_group_reduce_max(v);
+;;     dst[3] = sub_group_scan_inclusive_add(v);
+;;     dst[4] = sub_group_scan_inclusive_min(v);
+;;     dst[5] = sub_group_scan_inclusive_max(v);
+;;     dst[6] = sub_group_scan_exclusive_add(v);
+;;     dst[7] = sub_group_scan_exclusive_min(v);
+;;     dst[8] = sub_group_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testReduceScanUShort(global ushort* dst)
+;; {
+;;     ushort v = 0;
+;;     dst[0] = sub_group_reduce_add(v);
+;;     dst[1] = sub_group_reduce_min(v);
+;;     dst[2] = sub_group_reduce_max(v);
+;;     dst[3] = sub_group_scan_inclusive_add(v);
+;;     dst[4] = sub_group_scan_inclusive_min(v);
+;;     dst[5] = sub_group_scan_inclusive_max(v);
+;;     dst[6] = sub_group_scan_exclusive_add(v);
+;;     dst[7] = sub_group_scan_exclusive_min(v);
+;;     dst[8] = sub_group_scan_exclusive_max(v);
+;; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; The two DISABLED lines below are inactive; change DISABLED to RUN once SPIR-V -> LLVM translation is implemented
+; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
+; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
+; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
+; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
+; CHECK-SPIRV-DAG: TypeInt   [[long:[0-9]+]]   64 0
+; CHECK-SPIRV-DAG: TypeFloat [[half:[0-9]+]]   16
+; CHECK-SPIRV-DAG: TypeFloat [[float:[0-9]+]]  32
+; CHECK-SPIRV-DAG: TypeFloat [[double:[0-9]+]] 64
+
+; CHECK-SPIRV-DAG: TypeVector [[char2:[0-9]+]]  [[char]] 2
+; CHECK-SPIRV-DAG: TypeVector [[char3:[0-9]+]]  [[char]] 3
+; CHECK-SPIRV-DAG: TypeVector [[char4:[0-9]+]]  [[char]] 4
+; CHECK-SPIRV-DAG: TypeVector [[char8:[0-9]+]]  [[char]] 8
+; CHECK-SPIRV-DAG: TypeVector [[char16:[0-9]+]] [[char]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[short2:[0-9]+]]  [[short]] 2
+; CHECK-SPIRV-DAG: TypeVector [[short3:[0-9]+]]  [[short]] 3
+; CHECK-SPIRV-DAG: TypeVector [[short4:[0-9]+]]  [[short]] 4
+; CHECK-SPIRV-DAG: TypeVector [[short8:[0-9]+]]  [[short]] 8
+; CHECK-SPIRV-DAG: TypeVector [[short16:[0-9]+]] [[short]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[int2:[0-9]+]]  [[int]] 2
+; CHECK-SPIRV-DAG: TypeVector [[int3:[0-9]+]]  [[int]] 3
+; CHECK-SPIRV-DAG: TypeVector [[int4:[0-9]+]]  [[int]] 4
+; CHECK-SPIRV-DAG: TypeVector [[int8:[0-9]+]]  [[int]] 8
+; CHECK-SPIRV-DAG: TypeVector [[int16:[0-9]+]] [[int]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[long2:[0-9]+]]  [[long]] 2
+; CHECK-SPIRV-DAG: TypeVector [[long3:[0-9]+]]  [[long]] 3
+; CHECK-SPIRV-DAG: TypeVector [[long4:[0-9]+]]  [[long]] 4
+; CHECK-SPIRV-DAG: TypeVector [[long8:[0-9]+]]  [[long]] 8
+; CHECK-SPIRV-DAG: TypeVector [[long16:[0-9]+]] [[long]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[float2:[0-9]+]]  [[float]] 2
+; CHECK-SPIRV-DAG: TypeVector [[float3:[0-9]+]]  [[float]] 3
+; CHECK-SPIRV-DAG: TypeVector [[float4:[0-9]+]]  [[float]] 4
+; CHECK-SPIRV-DAG: TypeVector [[float8:[0-9]+]]  [[float]] 8
+; CHECK-SPIRV-DAG: TypeVector [[float16:[0-9]+]] [[float]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[half2:[0-9]+]]  [[half]] 2
+; CHECK-SPIRV-DAG: TypeVector [[half3:[0-9]+]]  [[half]] 3
+; CHECK-SPIRV-DAG: TypeVector [[half4:[0-9]+]]  [[half]] 4
+; CHECK-SPIRV-DAG: TypeVector [[half8:[0-9]+]]  [[half]] 8
+; CHECK-SPIRV-DAG: TypeVector [[half16:[0-9]+]] [[half]] 16
+
+; CHECK-SPIRV-DAG: TypeVector [[double2:[0-9]+]]  [[double]] 2
+; CHECK-SPIRV-DAG: TypeVector [[double3:[0-9]+]]  [[double]] 3
+; CHECK-SPIRV-DAG: TypeVector [[double4:[0-9]+]]  [[double]] 4
+; CHECK-SPIRV-DAG: TypeVector [[double8:[0-9]+]]  [[double]] 8
+; CHECK-SPIRV-DAG: TypeVector [[double16:[0-9]+]] [[double]] 16
+
+; CHECK-SPIRV-DAG: Constant [[int]]    [[ScopeSubgroup:[0-9]+]] 3
+; CHECK-SPIRV-DAG: Constant [[char]]   [[char_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[short]]  [[short_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_0:[0-9]+]]         0
+; CHECK-SPIRV-DAG: Constant [[long]]   [[long_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[half]]   [[half_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[float]]  [[float_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[double]] [[double_0:[0-9]+]]      0
+
+; ModuleID = 'sub_group_extended_types.cl'
+source_filename = "sub_group_extended_types.cl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char2]] [[char2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char2]] {{[0-9]+}} [[ScopeSubgroup]] [[char2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char3]] [[char3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char3]] {{[0-9]+}} [[ScopeSubgroup]] [[char3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char4]] [[char4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char4]] {{[0-9]+}} [[ScopeSubgroup]] [[char4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char8]] [[char8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char8]] {{[0-9]+}} [[ScopeSubgroup]] [[char8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[char16]] [[char16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char16]] {{[0-9]+}} [[ScopeSubgroup]] [[char16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastChar
+; CHECK-LLVM: call spir_func i8 @_Z19sub_group_broadcasthj(i8 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i8> @_Z19sub_group_broadcastDv2_hj(<2 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i8> @_Z19sub_group_broadcastDv3_hj(<3 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i8> @_Z19sub_group_broadcastDv4_hj(<4 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i8> @_Z19sub_group_broadcastDv8_hj(<8 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i8> @_Z19sub_group_broadcastDv16_hj(<16 x i8> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastChar() local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; kernel testBroadcastChar from the ';;' OpenCL listing above: char16 v = 0, then one sub_group_broadcast per width (1, 2, 3, 4, 8, 16)
+  %1 = tail call spir_func signext i8 @_Z19sub_group_broadcastcj(i8 signext 0, i32 0) #6 ; v.s0 = sub_group_broadcast(v.s0, 0): scalar call on the constant 0
+  %2 = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %1, i64 0 ; put the result in lane 0; lanes 1-15 remain 0
+  %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 (v.s01)
+  %4 = tail call spir_func <2 x i8> @_Z19sub_group_broadcastDv2_cj(<2 x i8> %3, i32 0) #6 ; 2-element broadcast
+  %5 = shufflevector <2 x i8> %4, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 2-element result to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i8> %5, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-1 from %5, lanes 2-15 from %2 (mask indices 18-31)
+  %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 (v.s012)
+  %8 = tail call spir_func <3 x i8> @_Z19sub_group_broadcastDv3_cj(<3 x i8> %7, i32 0) #6 ; 3-element broadcast
+  %9 = shufflevector <3 x i8> %8, <3 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i8> %9, <16 x i8> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i8> %10, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 (v.s0123)
+  %12 = tail call spir_func <4 x i8> @_Z19sub_group_broadcastDv4_cj(<4 x i8> %11, i32 0) #6 ; 4-element broadcast
+  %13 = shufflevector <4 x i8> %12, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i8> %13, <16 x i8> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i8> %14, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 (v.s01234567)
+  %16 = tail call spir_func <8 x i8> @_Z19sub_group_broadcastDv8_cj(<8 x i8> %15, i32 0) #6 ; 8-element broadcast
+  %17 = shufflevector <8 x i8> %16, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i8> %17, <16 x i8> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i8> @_Z19sub_group_broadcastDv16_cj(<16 x i8> %18, i32 0) #6 ; v = sub_group_broadcast(v, 0): full 16-element broadcast; %19 has no further use
+  ret void ; nothing is stored or returned
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z19sub_group_broadcastcj(i8 signext, i32) local_unnamed_addr #1 ; Itanium-mangled sub_group_broadcast(char, unsigned int): c = char, j = unsigned int
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i8> @_Z19sub_group_broadcastDv2_cj(<2 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 2 x char vector (Dv2_c)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i8> @_Z19sub_group_broadcastDv3_cj(<3 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 3 x char vector (Dv3_c)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i8> @_Z19sub_group_broadcastDv4_cj(<4 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 4 x char vector (Dv4_c)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i8> @_Z19sub_group_broadcastDv8_cj(<8 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for an 8 x char vector (Dv8_c)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i8> @_Z19sub_group_broadcastDv16_cj(<16 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 16 x char vector (Dv16_c)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[char]] {{[0-9]+}} [[ScopeSubgroup]] [[char_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char2]] [[char2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char2]] {{[0-9]+}} [[ScopeSubgroup]] [[char2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char3]] [[char3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char3]] {{[0-9]+}} [[ScopeSubgroup]] [[char3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char4]] [[char4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char4]] {{[0-9]+}} [[ScopeSubgroup]] [[char4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char8]] [[char8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char8]] {{[0-9]+}} [[ScopeSubgroup]] [[char8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[char16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[char16]] [[char16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[char16]] {{[0-9]+}} [[ScopeSubgroup]] [[char16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastUChar
+; CHECK-LLVM: call spir_func i8 @_Z19sub_group_broadcasthj(i8 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i8> @_Z19sub_group_broadcastDv2_hj(<2 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i8> @_Z19sub_group_broadcastDv3_hj(<3 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i8> @_Z19sub_group_broadcastDv4_hj(<4 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i8> @_Z19sub_group_broadcastDv8_hj(<8 x i8> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i8> @_Z19sub_group_broadcastDv16_hj(<16 x i8> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastUChar() local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; kernel testBroadcastUChar from the ';;' OpenCL listing above: uchar16 v = 0, then one sub_group_broadcast per width (1, 2, 3, 4, 8, 16)
+  %1 = tail call spir_func zeroext i8 @_Z19sub_group_broadcasthj(i8 zeroext 0, i32 0) #6 ; v.s0 = sub_group_broadcast(v.s0, 0): scalar call on the constant 0
+  %2 = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %1, i64 0 ; put the result in lane 0; lanes 1-15 remain 0
+  %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 (v.s01)
+  %4 = tail call spir_func <2 x i8> @_Z19sub_group_broadcastDv2_hj(<2 x i8> %3, i32 0) #6 ; 2-element broadcast
+  %5 = shufflevector <2 x i8> %4, <2 x i8> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 2-element result to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i8> %5, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-1 from %5, lanes 2-15 from %2 (mask indices 18-31)
+  %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 (v.s012)
+  %8 = tail call spir_func <3 x i8> @_Z19sub_group_broadcastDv3_hj(<3 x i8> %7, i32 0) #6 ; 3-element broadcast
+  %9 = shufflevector <3 x i8> %8, <3 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i8> %9, <16 x i8> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i8> %10, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 (v.s0123)
+  %12 = tail call spir_func <4 x i8> @_Z19sub_group_broadcastDv4_hj(<4 x i8> %11, i32 0) #6 ; 4-element broadcast
+  %13 = shufflevector <4 x i8> %12, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i8> %13, <16 x i8> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i8> %14, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 (v.s01234567)
+  %16 = tail call spir_func <8 x i8> @_Z19sub_group_broadcastDv8_hj(<8 x i8> %15, i32 0) #6 ; 8-element broadcast
+  %17 = shufflevector <8 x i8> %16, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i8> %17, <16 x i8> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i8> @_Z19sub_group_broadcastDv16_hj(<16 x i8> %18, i32 0) #6 ; v = sub_group_broadcast(v, 0): full 16-element broadcast; %19 has no further use
+  ret void ; nothing is stored or returned
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z19sub_group_broadcasthj(i8 zeroext, i32) local_unnamed_addr #1 ; Itanium-mangled sub_group_broadcast(unsigned char, unsigned int): h = unsigned char, j = unsigned int
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i8> @_Z19sub_group_broadcastDv2_hj(<2 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 2 x unsigned char vector (Dv2_h)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i8> @_Z19sub_group_broadcastDv3_hj(<3 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 3 x unsigned char vector (Dv3_h)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i8> @_Z19sub_group_broadcastDv4_hj(<4 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 4 x unsigned char vector (Dv4_h)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i8> @_Z19sub_group_broadcastDv8_hj(<8 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for an 8 x unsigned char vector (Dv8_h)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i8> @_Z19sub_group_broadcastDv16_hj(<16 x i8>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 16 x unsigned char vector (Dv16_h)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short2]] [[short2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short2]] {{[0-9]+}} [[ScopeSubgroup]] [[short2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short3]] [[short3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short3]] {{[0-9]+}} [[ScopeSubgroup]] [[short3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short4]] [[short4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short4]] {{[0-9]+}} [[ScopeSubgroup]] [[short4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short8]] [[short8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short8]] {{[0-9]+}} [[ScopeSubgroup]] [[short8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[short16]] [[short16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short16]] {{[0-9]+}} [[ScopeSubgroup]] [[short16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastShort
+; CHECK-LLVM: call spir_func i16 @_Z19sub_group_broadcasttj(i16 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i16> @_Z19sub_group_broadcastDv2_tj(<2 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i16> @_Z19sub_group_broadcastDv3_tj(<3 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i16> @_Z19sub_group_broadcastDv4_tj(<4 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i16> @_Z19sub_group_broadcastDv8_tj(<8 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i16> @_Z19sub_group_broadcastDv16_tj(<16 x i16> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastShort() local_unnamed_addr #2 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; kernel testBroadcastShort from the ';;' OpenCL listing above: short16 v = 0, then one sub_group_broadcast per width (1, 2, 3, 4, 8, 16)
+  %1 = tail call spir_func signext i16 @_Z19sub_group_broadcastsj(i16 signext 0, i32 0) #6 ; v.s0 = sub_group_broadcast(v.s0, 0): scalar call on the constant 0
+  %2 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %1, i64 0 ; put the result in lane 0; lanes 1-15 remain 0
+  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 (v.s01)
+  %4 = tail call spir_func <2 x i16> @_Z19sub_group_broadcastDv2_sj(<2 x i16> %3, i32 0) #6 ; 2-element broadcast
+  %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 2-element result to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i16> %5, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-1 from %5, lanes 2-15 from %2 (mask indices 18-31)
+  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 (v.s012)
+  %8 = tail call spir_func <3 x i16> @_Z19sub_group_broadcastDv3_sj(<3 x i16> %7, i32 0) #6 ; 3-element broadcast
+  %9 = shufflevector <3 x i16> %8, <3 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i16> %9, <16 x i16> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i16> %10, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 (v.s0123)
+  %12 = tail call spir_func <4 x i16> @_Z19sub_group_broadcastDv4_sj(<4 x i16> %11, i32 0) #6 ; 4-element broadcast
+  %13 = shufflevector <4 x i16> %12, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i16> %13, <16 x i16> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i16> %14, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 (v.s01234567)
+  %16 = tail call spir_func <8 x i16> @_Z19sub_group_broadcastDv8_sj(<8 x i16> %15, i32 0) #6 ; 8-element broadcast
+  %17 = shufflevector <8 x i16> %16, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i16> %17, <16 x i16> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i16> @_Z19sub_group_broadcastDv16_sj(<16 x i16> %18, i32 0) #6 ; v = sub_group_broadcast(v, 0): full 16-element broadcast; %19 has no further use
+  ret void ; nothing is stored or returned
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z19sub_group_broadcastsj(i16 signext, i32) local_unnamed_addr #1 ; Itanium-mangled sub_group_broadcast(short, unsigned int): s = short, j = unsigned int
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i16> @_Z19sub_group_broadcastDv2_sj(<2 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 2 x short vector (Dv2_s)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i16> @_Z19sub_group_broadcastDv3_sj(<3 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 3 x short vector (Dv3_s)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i16> @_Z19sub_group_broadcastDv4_sj(<4 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 4 x short vector (Dv4_s)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i16> @_Z19sub_group_broadcastDv8_sj(<8 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for an 8 x short vector (Dv8_s)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i16> @_Z19sub_group_broadcastDv16_sj(<16 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 16 x short vector (Dv16_s)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[short]] {{[0-9]+}} [[ScopeSubgroup]] [[short_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short2]] [[short2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short2]] {{[0-9]+}} [[ScopeSubgroup]] [[short2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short3]] [[short3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short3]] {{[0-9]+}} [[ScopeSubgroup]] [[short3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short4]] [[short4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short4]] {{[0-9]+}} [[ScopeSubgroup]] [[short4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short8]] [[short8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short8]] {{[0-9]+}} [[ScopeSubgroup]] [[short8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[short16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[short16]] [[short16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[short16]] {{[0-9]+}} [[ScopeSubgroup]] [[short16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastUShort
+; CHECK-LLVM: call spir_func i16 @_Z19sub_group_broadcasttj(i16 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i16> @_Z19sub_group_broadcastDv2_tj(<2 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i16> @_Z19sub_group_broadcastDv3_tj(<3 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i16> @_Z19sub_group_broadcastDv4_tj(<4 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i16> @_Z19sub_group_broadcastDv8_tj(<8 x i16> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i16> @_Z19sub_group_broadcastDv16_tj(<16 x i16> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastUShort() local_unnamed_addr #2 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; kernel testBroadcastUShort from the ';;' OpenCL listing above: ushort16 v = 0, then one sub_group_broadcast per width (1, 2, 3, 4, 8, 16)
+  %1 = tail call spir_func zeroext i16 @_Z19sub_group_broadcasttj(i16 zeroext 0, i32 0) #6 ; v.s0 = sub_group_broadcast(v.s0, 0): scalar call on the constant 0
+  %2 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %1, i64 0 ; put the result in lane 0; lanes 1-15 remain 0
+  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 (v.s01)
+  %4 = tail call spir_func <2 x i16> @_Z19sub_group_broadcastDv2_tj(<2 x i16> %3, i32 0) #6 ; 2-element broadcast
+  %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the 2-element result to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i16> %5, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-1 from %5, lanes 2-15 from %2 (mask indices 18-31)
+  %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 (v.s012)
+  %8 = tail call spir_func <3 x i16> @_Z19sub_group_broadcastDv3_tj(<3 x i16> %7, i32 0) #6 ; 3-element broadcast
+  %9 = shufflevector <3 x i16> %8, <3 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i16> %9, <16 x i16> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i16> %10, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 (v.s0123)
+  %12 = tail call spir_func <4 x i16> @_Z19sub_group_broadcastDv4_tj(<4 x i16> %11, i32 0) #6 ; 4-element broadcast
+  %13 = shufflevector <4 x i16> %12, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i16> %13, <16 x i16> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i16> %14, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 (v.s01234567)
+  %16 = tail call spir_func <8 x i16> @_Z19sub_group_broadcastDv8_tj(<8 x i16> %15, i32 0) #6 ; 8-element broadcast
+  %17 = shufflevector <8 x i16> %16, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i16> %17, <16 x i16> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; blend: lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i16> @_Z19sub_group_broadcastDv16_tj(<16 x i16> %18, i32 0) #6 ; v = sub_group_broadcast(v, 0): full 16-element broadcast; %19 has no further use
+  ret void ; nothing is stored or returned
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z19sub_group_broadcasttj(i16 zeroext, i32) local_unnamed_addr #1 ; Itanium-mangled sub_group_broadcast(unsigned short, unsigned int): t = unsigned short, j = unsigned int
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i16> @_Z19sub_group_broadcastDv2_tj(<2 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 2 x unsigned short vector (Dv2_t)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i16> @_Z19sub_group_broadcastDv3_tj(<3 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 3 x unsigned short vector (Dv3_t)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i16> @_Z19sub_group_broadcastDv4_tj(<4 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 4 x unsigned short vector (Dv4_t)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i16> @_Z19sub_group_broadcastDv8_tj(<8 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for an 8 x unsigned short vector (Dv8_t)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i16> @_Z19sub_group_broadcastDv16_tj(<16 x i16>, i32) local_unnamed_addr #1 ; sub_group_broadcast overload for a 16 x unsigned short vector (Dv16_t)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int2]] [[int2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int2]] {{[0-9]+}} [[ScopeSubgroup]] [[int2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int3]] [[int3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int3]] {{[0-9]+}} [[ScopeSubgroup]] [[int3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int4]] [[int4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int4]] {{[0-9]+}} [[ScopeSubgroup]] [[int4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int8]] [[int8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int8]] {{[0-9]+}} [[ScopeSubgroup]] [[int8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[int16]] [[int16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int16]] {{[0-9]+}} [[ScopeSubgroup]] [[int16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastInt
+; CHECK-LLVM: call spir_func i32 @_Z19sub_group_broadcastjj(i32 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i32> @_Z19sub_group_broadcastDv2_jj(<2 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i32> @_Z19sub_group_broadcastDv3_jj(<3 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i32> @_Z19sub_group_broadcastDv4_jj(<4 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i32> @_Z19sub_group_broadcastDv8_jj(<8 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i32> @_Z19sub_group_broadcastDv16_jj(<16 x i32> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastInt() local_unnamed_addr #3 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; Kernel: calls the signed-int ('i'-mangled) sub_group_broadcast on an i32 scalar and on 2/3/4/8/16-wide i32 vectors, always passing constant 0 as the second argument
+  %1 = tail call spir_func i32 @_Z19sub_group_broadcastij(i32 0, i32 0) #6 ; scalar broadcast of a zero constant
+  %2 = insertelement <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %1, i64 0 ; place %1 in lane 0 of a 16-lane vector whose other lanes are zero
+  %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 of %2
+  %4 = tail call spir_func <2 x i32> @_Z19sub_group_broadcastDv2_ij(<2 x i32> %3, i32 0) #6 ; 2-wide broadcast
+  %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i32> %5, <16 x i32> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 select the second operand)
+  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 of %6
+  %8 = tail call spir_func <3 x i32> @_Z19sub_group_broadcastDv3_ij(<3 x i32> %7, i32 0) #6 ; 3-wide broadcast
+  %9 = shufflevector <3 x i32> %8, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i32> %9, <16 x i32> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i32> %10, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 of %10
+  %12 = tail call spir_func <4 x i32> @_Z19sub_group_broadcastDv4_ij(<4 x i32> %11, i32 0) #6 ; 4-wide broadcast
+  %13 = shufflevector <4 x i32> %12, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i32> %13, <16 x i32> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i32> %14, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 of %14
+  %16 = tail call spir_func <8 x i32> @_Z19sub_group_broadcastDv8_ij(<8 x i32> %15, i32 0) #6 ; 8-wide broadcast
+  %17 = shufflevector <8 x i32> %16, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i32> %17, <16 x i32> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i32> @_Z19sub_group_broadcastDv16_ij(<16 x i32> %18, i32 0) #6 ; 16-wide broadcast; the result is not used
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z19sub_group_broadcastij(i32, i32) local_unnamed_addr #1 ; external builtin; Itanium-mangled name decodes to sub_group_broadcast(int, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i32> @_Z19sub_group_broadcastDv2_ij(<2 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(2-wide int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i32> @_Z19sub_group_broadcastDv3_ij(<3 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(3-wide int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i32> @_Z19sub_group_broadcastDv4_ij(<4 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(4-wide int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i32> @_Z19sub_group_broadcastDv8_ij(<8 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(8-wide int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i32> @_Z19sub_group_broadcastDv16_ij(<16 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(16-wide int vector, unsigned int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[int]] {{[0-9]+}} [[ScopeSubgroup]] [[int_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int2]] [[int2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int2]] {{[0-9]+}} [[ScopeSubgroup]] [[int2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int3]] [[int3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int3]] {{[0-9]+}} [[ScopeSubgroup]] [[int3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int4]] [[int4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int4]] {{[0-9]+}} [[ScopeSubgroup]] [[int4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int8]] [[int8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int8]] {{[0-9]+}} [[ScopeSubgroup]] [[int8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[int16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[int16]] [[int16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[int16]] {{[0-9]+}} [[ScopeSubgroup]] [[int16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastUInt
+; CHECK-LLVM: call spir_func i32 @_Z19sub_group_broadcastjj(i32 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i32> @_Z19sub_group_broadcastDv2_jj(<2 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i32> @_Z19sub_group_broadcastDv3_jj(<3 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i32> @_Z19sub_group_broadcastDv4_jj(<4 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i32> @_Z19sub_group_broadcastDv8_jj(<8 x i32> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i32> @_Z19sub_group_broadcastDv16_jj(<16 x i32> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastUInt() local_unnamed_addr #3 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; Kernel: calls the unsigned-int ('j'-mangled) sub_group_broadcast on an i32 scalar and on 2/3/4/8/16-wide i32 vectors, always passing constant 0 as the second argument
+  %1 = tail call spir_func i32 @_Z19sub_group_broadcastjj(i32 0, i32 0) #6 ; scalar broadcast of a zero constant
+  %2 = insertelement <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %1, i64 0 ; place %1 in lane 0 of a 16-lane vector whose other lanes are zero
+  %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 of %2
+  %4 = tail call spir_func <2 x i32> @_Z19sub_group_broadcastDv2_jj(<2 x i32> %3, i32 0) #6 ; 2-wide broadcast
+  %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i32> %5, <16 x i32> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 select the second operand)
+  %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 of %6
+  %8 = tail call spir_func <3 x i32> @_Z19sub_group_broadcastDv3_jj(<3 x i32> %7, i32 0) #6 ; 3-wide broadcast
+  %9 = shufflevector <3 x i32> %8, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i32> %9, <16 x i32> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i32> %10, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 of %10
+  %12 = tail call spir_func <4 x i32> @_Z19sub_group_broadcastDv4_jj(<4 x i32> %11, i32 0) #6 ; 4-wide broadcast
+  %13 = shufflevector <4 x i32> %12, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i32> %13, <16 x i32> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i32> %14, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 of %14
+  %16 = tail call spir_func <8 x i32> @_Z19sub_group_broadcastDv8_jj(<8 x i32> %15, i32 0) #6 ; 8-wide broadcast
+  %17 = shufflevector <8 x i32> %16, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i32> %17, <16 x i32> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i32> @_Z19sub_group_broadcastDv16_jj(<16 x i32> %18, i32 0) #6 ; 16-wide broadcast; the result is not used
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z19sub_group_broadcastjj(i32, i32) local_unnamed_addr #1 ; external builtin; Itanium-mangled name decodes to sub_group_broadcast(unsigned int, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i32> @_Z19sub_group_broadcastDv2_jj(<2 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(2-wide unsigned int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i32> @_Z19sub_group_broadcastDv3_jj(<3 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(3-wide unsigned int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i32> @_Z19sub_group_broadcastDv4_jj(<4 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(4-wide unsigned int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i32> @_Z19sub_group_broadcastDv8_jj(<8 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(8-wide unsigned int vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i32> @_Z19sub_group_broadcastDv16_jj(<16 x i32>, i32) local_unnamed_addr #1 ; sub_group_broadcast(16-wide unsigned int vector, unsigned int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long2]] [[long2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long2]] {{[0-9]+}} [[ScopeSubgroup]] [[long2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long3]] [[long3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long3]] {{[0-9]+}} [[ScopeSubgroup]] [[long3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long4]] [[long4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long4]] {{[0-9]+}} [[ScopeSubgroup]] [[long4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long8]] [[long8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long8]] {{[0-9]+}} [[ScopeSubgroup]] [[long8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[long16]] [[long16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long16]] {{[0-9]+}} [[ScopeSubgroup]] [[long16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastLong
+; CHECK-LLVM: call spir_func i64 @_Z19sub_group_broadcastmj(i64 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i64> @_Z19sub_group_broadcastDv2_mj(<2 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i64> @_Z19sub_group_broadcastDv3_mj(<3 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i64> @_Z19sub_group_broadcastDv4_mj(<4 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i64> @_Z19sub_group_broadcastDv8_mj(<8 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i64> @_Z19sub_group_broadcastDv16_mj(<16 x i64> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastLong() local_unnamed_addr #4 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; Kernel: calls the signed-long ('l'-mangled) sub_group_broadcast on an i64 scalar and on 2/3/4/8/16-wide i64 vectors, always passing constant 0 as the second argument
+  %1 = tail call spir_func i64 @_Z19sub_group_broadcastlj(i64 0, i32 0) #6 ; scalar broadcast of a zero constant
+  %2 = insertelement <16 x i64> <i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 %1, i64 0 ; place %1 in lane 0 of a 16-lane vector whose other lanes are zero
+  %3 = shufflevector <16 x i64> %2, <16 x i64> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 of %2
+  %4 = tail call spir_func <2 x i64> @_Z19sub_group_broadcastDv2_lj(<2 x i64> %3, i32 0) #6 ; 2-wide broadcast
+  %5 = shufflevector <2 x i64> %4, <2 x i64> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i64> %5, <16 x i64> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 select the second operand)
+  %7 = shufflevector <16 x i64> %6, <16 x i64> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 of %6
+  %8 = tail call spir_func <3 x i64> @_Z19sub_group_broadcastDv3_lj(<3 x i64> %7, i32 0) #6 ; 3-wide broadcast
+  %9 = shufflevector <3 x i64> %8, <3 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i64> %9, <16 x i64> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i64> %10, <16 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 of %10
+  %12 = tail call spir_func <4 x i64> @_Z19sub_group_broadcastDv4_lj(<4 x i64> %11, i32 0) #6 ; 4-wide broadcast
+  %13 = shufflevector <4 x i64> %12, <4 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i64> %13, <16 x i64> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i64> %14, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 of %14
+  %16 = tail call spir_func <8 x i64> @_Z19sub_group_broadcastDv8_lj(<8 x i64> %15, i32 0) #6 ; 8-wide broadcast
+  %17 = shufflevector <8 x i64> %16, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i64> %17, <16 x i64> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i64> @_Z19sub_group_broadcastDv16_lj(<16 x i64> %18, i32 0) #6 ; 16-wide broadcast; the result is not used
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z19sub_group_broadcastlj(i64, i32) local_unnamed_addr #1 ; external builtin; Itanium-mangled name decodes to sub_group_broadcast(long, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i64> @_Z19sub_group_broadcastDv2_lj(<2 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(2-wide long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i64> @_Z19sub_group_broadcastDv3_lj(<3 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(3-wide long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i64> @_Z19sub_group_broadcastDv4_lj(<4 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(4-wide long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i64> @_Z19sub_group_broadcastDv8_lj(<8 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(8-wide long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i64> @_Z19sub_group_broadcastDv16_lj(<16 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(16-wide long vector, unsigned int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[long]] {{[0-9]+}} [[ScopeSubgroup]] [[long_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long2]] [[long2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long2]] {{[0-9]+}} [[ScopeSubgroup]] [[long2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long3]] [[long3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long3]] {{[0-9]+}} [[ScopeSubgroup]] [[long3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long4]] [[long4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long4]] {{[0-9]+}} [[ScopeSubgroup]] [[long4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long8]] [[long8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long8]] {{[0-9]+}} [[ScopeSubgroup]] [[long8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[long16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[long16]] [[long16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[long16]] {{[0-9]+}} [[ScopeSubgroup]] [[long16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastULong
+; CHECK-LLVM: call spir_func i64 @_Z19sub_group_broadcastmj(i64 {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x i64> @_Z19sub_group_broadcastDv2_mj(<2 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x i64> @_Z19sub_group_broadcastDv3_mj(<3 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x i64> @_Z19sub_group_broadcastDv4_mj(<4 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x i64> @_Z19sub_group_broadcastDv8_mj(<8 x i64> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x i64> @_Z19sub_group_broadcastDv16_mj(<16 x i64> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastULong() local_unnamed_addr #4 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; Kernel: calls the unsigned-long ('m'-mangled) sub_group_broadcast on an i64 scalar and on 2/3/4/8/16-wide i64 vectors, always passing constant 0 as the second argument
+  %1 = tail call spir_func i64 @_Z19sub_group_broadcastmj(i64 0, i32 0) #6 ; scalar broadcast of a zero constant
+  %2 = insertelement <16 x i64> <i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 %1, i64 0 ; place %1 in lane 0 of a 16-lane vector whose other lanes are zero
+  %3 = shufflevector <16 x i64> %2, <16 x i64> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 of %2
+  %4 = tail call spir_func <2 x i64> @_Z19sub_group_broadcastDv2_mj(<2 x i64> %3, i32 0) #6 ; 2-wide broadcast
+  %5 = shufflevector <2 x i64> %4, <2 x i64> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x i64> %5, <16 x i64> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 select the second operand)
+  %7 = shufflevector <16 x i64> %6, <16 x i64> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 of %6
+  %8 = tail call spir_func <3 x i64> @_Z19sub_group_broadcastDv3_mj(<3 x i64> %7, i32 0) #6 ; 3-wide broadcast
+  %9 = shufflevector <3 x i64> %8, <3 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x i64> %9, <16 x i64> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x i64> %10, <16 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 of %10
+  %12 = tail call spir_func <4 x i64> @_Z19sub_group_broadcastDv4_mj(<4 x i64> %11, i32 0) #6 ; 4-wide broadcast
+  %13 = shufflevector <4 x i64> %12, <4 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x i64> %13, <16 x i64> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x i64> %14, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 of %14
+  %16 = tail call spir_func <8 x i64> @_Z19sub_group_broadcastDv8_mj(<8 x i64> %15, i32 0) #6 ; 8-wide broadcast
+  %17 = shufflevector <8 x i64> %16, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x i64> %17, <16 x i64> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x i64> @_Z19sub_group_broadcastDv16_mj(<16 x i64> %18, i32 0) #6 ; 16-wide broadcast; the result is not used
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z19sub_group_broadcastmj(i64, i32) local_unnamed_addr #1 ; external builtin; Itanium-mangled name decodes to sub_group_broadcast(unsigned long, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x i64> @_Z19sub_group_broadcastDv2_mj(<2 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(2-wide unsigned long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x i64> @_Z19sub_group_broadcastDv3_mj(<3 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(3-wide unsigned long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x i64> @_Z19sub_group_broadcastDv4_mj(<4 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(4-wide unsigned long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x i64> @_Z19sub_group_broadcastDv8_mj(<8 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(8-wide unsigned long vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x i64> @_Z19sub_group_broadcastDv16_mj(<16 x i64>, i32) local_unnamed_addr #1 ; sub_group_broadcast(16-wide unsigned long vector, unsigned int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[float]] {{[0-9]+}} [[ScopeSubgroup]] [[float_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float2]] [[float2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[float2]] {{[0-9]+}} [[ScopeSubgroup]] [[float2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float3]] [[float3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[float3]] {{[0-9]+}} [[ScopeSubgroup]] [[float3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float4]] [[float4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[float4]] {{[0-9]+}} [[ScopeSubgroup]] [[float4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float8]] [[float8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[float8]] {{[0-9]+}} [[ScopeSubgroup]] [[float8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[float16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[float16]] [[float16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[float16]] {{[0-9]+}} [[ScopeSubgroup]] [[float16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastFloat
+; CHECK-LLVM: call spir_func float @_Z19sub_group_broadcastfj(float {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x float> @_Z19sub_group_broadcastDv2_fj(<2 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x float> @_Z19sub_group_broadcastDv3_fj(<3 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x float> @_Z19sub_group_broadcastDv4_fj(<4 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x float> @_Z19sub_group_broadcastDv8_fj(<8 x float> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x float> @_Z19sub_group_broadcastDv16_fj(<16 x float> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastFloat() local_unnamed_addr #3 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; Kernel: calls the float ('f'-mangled) sub_group_broadcast on a float scalar and on 2/3/4/8/16-wide float vectors, always passing constant 0 as the second argument
+  %1 = tail call spir_func float @_Z19sub_group_broadcastfj(float 0.000000e+00, i32 0) #6 ; scalar broadcast of a zero constant
+  %2 = insertelement <16 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %1, i64 0 ; place %1 in lane 0 of a 16-lane vector whose other lanes are zero
+  %3 = shufflevector <16 x float> %2, <16 x float> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 of %2
+  %4 = tail call spir_func <2 x float> @_Z19sub_group_broadcastDv2_fj(<2 x float> %3, i32 0) #6 ; 2-wide broadcast
+  %5 = shufflevector <2 x float> %4, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x float> %5, <16 x float> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 select the second operand)
+  %7 = shufflevector <16 x float> %6, <16 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 of %6
+  %8 = tail call spir_func <3 x float> @_Z19sub_group_broadcastDv3_fj(<3 x float> %7, i32 0) #6 ; 3-wide broadcast
+  %9 = shufflevector <3 x float> %8, <3 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x float> %9, <16 x float> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x float> %10, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 of %10
+  %12 = tail call spir_func <4 x float> @_Z19sub_group_broadcastDv4_fj(<4 x float> %11, i32 0) #6 ; 4-wide broadcast
+  %13 = shufflevector <4 x float> %12, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x float> %13, <16 x float> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x float> %14, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 of %14
+  %16 = tail call spir_func <8 x float> @_Z19sub_group_broadcastDv8_fj(<8 x float> %15, i32 0) #6 ; 8-wide broadcast
+  %17 = shufflevector <8 x float> %16, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x float> %17, <16 x float> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x float> @_Z19sub_group_broadcastDv16_fj(<16 x float> %18, i32 0) #6 ; 16-wide broadcast; the result is not used
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z19sub_group_broadcastfj(float, i32) local_unnamed_addr #1 ; external builtin; Itanium-mangled name decodes to sub_group_broadcast(float, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x float> @_Z19sub_group_broadcastDv2_fj(<2 x float>, i32) local_unnamed_addr #1 ; sub_group_broadcast(2-wide float vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x float> @_Z19sub_group_broadcastDv3_fj(<3 x float>, i32) local_unnamed_addr #1 ; sub_group_broadcast(3-wide float vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x float> @_Z19sub_group_broadcastDv4_fj(<4 x float>, i32) local_unnamed_addr #1 ; sub_group_broadcast(4-wide float vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x float> @_Z19sub_group_broadcastDv8_fj(<8 x float>, i32) local_unnamed_addr #1 ; sub_group_broadcast(8-wide float vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x float> @_Z19sub_group_broadcastDv16_fj(<16 x float>, i32) local_unnamed_addr #1 ; sub_group_broadcast(16-wide float vector, unsigned int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[half]] {{[0-9]+}} [[ScopeSubgroup]] [[half_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half2]] [[half2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[half2]] {{[0-9]+}} [[ScopeSubgroup]] [[half2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half3]] [[half3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[half3]] {{[0-9]+}} [[ScopeSubgroup]] [[half3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half4]] [[half4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[half4]] {{[0-9]+}} [[ScopeSubgroup]] [[half4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half8]] [[half8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[half8]] {{[0-9]+}} [[ScopeSubgroup]] [[half8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[half16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[half16]] [[half16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[half16]] {{[0-9]+}} [[ScopeSubgroup]] [[half16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastHalf
+; CHECK-LLVM: call spir_func half @_Z19sub_group_broadcastDhj(half {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x half> @_Z19sub_group_broadcastDv2_Dhj(<2 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x half> @_Z19sub_group_broadcastDv3_Dhj(<3 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x half> @_Z19sub_group_broadcastDv4_Dhj(<4 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x half> @_Z19sub_group_broadcastDv8_Dhj(<8 x half> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x half> @_Z19sub_group_broadcastDv16_Dhj(<16 x half> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastHalf() local_unnamed_addr #2 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 { ; Kernel: calls the half ('Dh'-mangled) sub_group_broadcast on a half scalar and on 2/3/4/8/16-wide half vectors, always passing constant 0 as the second argument
+  %1 = tail call spir_func half @_Z19sub_group_broadcastDhj(half 0xH0000, i32 0) #6 ; scalar broadcast of a zero constant
+  %2 = insertelement <16 x half> <half undef, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %1, i64 0 ; place %1 in lane 0 of a 16-lane vector whose other lanes are zero
+  %3 = shufflevector <16 x half> %2, <16 x half> undef, <2 x i32> <i32 0, i32 1> ; extract lanes 0-1 of %2
+  %4 = tail call spir_func <2 x half> @_Z19sub_group_broadcastDv2_Dhj(<2 x half> %3, i32 0) #6 ; 2-wide broadcast
+  %5 = shufflevector <2 x half> %4, <2 x half> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %4 to 16 lanes (lanes 2-15 undef)
+  %6 = shufflevector <16 x half> %5, <16 x half> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-1 from %5, lanes 2-15 from %2 (mask indices >= 16 select the second operand)
+  %7 = shufflevector <16 x half> %6, <16 x half> undef, <3 x i32> <i32 0, i32 1, i32 2> ; extract lanes 0-2 of %6
+  %8 = tail call spir_func <3 x half> @_Z19sub_group_broadcastDv3_Dhj(<3 x half> %7, i32 0) #6 ; 3-wide broadcast
+  %9 = shufflevector <3 x half> %8, <3 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %8 to 16 lanes (lanes 3-15 undef)
+  %10 = shufflevector <16 x half> %9, <16 x half> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-2 from %9, lanes 3-15 from %6
+  %11 = shufflevector <16 x half> %10, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; extract lanes 0-3 of %10
+  %12 = tail call spir_func <4 x half> @_Z19sub_group_broadcastDv4_Dhj(<4 x half> %11, i32 0) #6 ; 4-wide broadcast
+  %13 = shufflevector <4 x half> %12, <4 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %12 to 16 lanes (lanes 4-15 undef)
+  %14 = shufflevector <16 x half> %13, <16 x half> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-3 from %13, lanes 4-15 from %10
+  %15 = shufflevector <16 x half> %14, <16 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; extract lanes 0-7 of %14
+  %16 = tail call spir_func <8 x half> @_Z19sub_group_broadcastDv8_Dhj(<8 x half> %15, i32 0) #6 ; 8-wide broadcast
+  %17 = shufflevector <8 x half> %16, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen %16 to 16 lanes (lanes 8-15 undef)
+  %18 = shufflevector <16 x half> %17, <16 x half> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-7 from %17, lanes 8-15 from %14
+  %19 = tail call spir_func <16 x half> @_Z19sub_group_broadcastDv16_Dhj(<16 x half> %18, i32 0) #6 ; 16-wide broadcast; the result is not used
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z19sub_group_broadcastDhj(half, i32) local_unnamed_addr #1 ; external builtin; Itanium-mangled name decodes to sub_group_broadcast(half, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x half> @_Z19sub_group_broadcastDv2_Dhj(<2 x half>, i32) local_unnamed_addr #1 ; sub_group_broadcast(2-wide half vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x half> @_Z19sub_group_broadcastDv3_Dhj(<3 x half>, i32) local_unnamed_addr #1 ; sub_group_broadcast(3-wide half vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x half> @_Z19sub_group_broadcastDv4_Dhj(<4 x half>, i32) local_unnamed_addr #1 ; sub_group_broadcast(4-wide half vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x half> @_Z19sub_group_broadcastDv8_Dhj(<8 x half>, i32) local_unnamed_addr #1 ; sub_group_broadcast(8-wide half vector, unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x half> @_Z19sub_group_broadcastDv16_Dhj(<16 x half>, i32) local_unnamed_addr #1 ; sub_group_broadcast(16-wide half vector, unsigned int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupBroadcast [[double]] {{[0-9]+}} [[ScopeSubgroup]] [[double_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double2]] [[double2_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[double2]] {{[0-9]+}} [[ScopeSubgroup]] [[double2_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double3]] [[double3_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[double3]] {{[0-9]+}} [[ScopeSubgroup]] [[double3_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double4]] [[double4_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[double4]] {{[0-9]+}} [[ScopeSubgroup]] [[double4_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double8]] [[double8_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[double8]] {{[0-9]+}} [[ScopeSubgroup]] [[double8_0]] [[int_0]]
+; CHECK-SPIRV: VectorShuffle [[double16]] {{[0-9]+}}
+; CHECK-SPIRV: VectorShuffle [[double16]] [[double16_0:[0-9]+]] 
+; CHECK-SPIRV: GroupBroadcast [[double16]] {{[0-9]+}} [[ScopeSubgroup]] [[double16_0]] [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testBroadcastDouble
+; CHECK-LLVM: call spir_func double @_Z19sub_group_broadcastdj(double {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <2 x double> @_Z19sub_group_broadcastDv2_dj(<2 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <3 x double> @_Z19sub_group_broadcastDv3_dj(<3 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <4 x double> @_Z19sub_group_broadcastDv4_dj(<4 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <8 x double> @_Z19sub_group_broadcastDv8_dj(<8 x double> {{.*}}, i32 0)
+; CHECK-LLVM: call spir_func <16 x double> @_Z19sub_group_broadcastDv16_dj(<16 x double> {{.*}}, i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testBroadcastDouble() local_unnamed_addr #4 !kernel_arg_addr_space !3 !kernel_arg_access_qual !3 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !3 {
+  %1 = tail call spir_func double @_Z19sub_group_broadcastdj(double 0.000000e+00, i32 0) #6
+  %2 = insertelement <16 x double> <double undef, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double %1, i64 0
+  %3 = shufflevector <16 x double> %2, <16 x double> undef, <2 x i32> <i32 0, i32 1>
+  %4 = tail call spir_func <2 x double> @_Z19sub_group_broadcastDv2_dj(<2 x double> %3, i32 0) #6
+  %5 = shufflevector <2 x double> %4, <2 x double> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %6 = shufflevector <16 x double> %5, <16 x double> %2, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %7 = shufflevector <16 x double> %6, <16 x double> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %8 = tail call spir_func <3 x double> @_Z19sub_group_broadcastDv3_dj(<3 x double> %7, i32 0) #6
+  %9 = shufflevector <3 x double> %8, <3 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %10 = shufflevector <16 x double> %9, <16 x double> %6, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %11 = shufflevector <16 x double> %10, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = tail call spir_func <4 x double> @_Z19sub_group_broadcastDv4_dj(<4 x double> %11, i32 0) #6
+  %13 = shufflevector <4 x double> %12, <4 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = shufflevector <16 x double> %13, <16 x double> %10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %15 = shufflevector <16 x double> %14, <16 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = tail call spir_func <8 x double> @_Z19sub_group_broadcastDv8_dj(<8 x double> %15, i32 0) #6
+  %17 = shufflevector <8 x double> %16, <8 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %18 = shufflevector <16 x double> %17, <16 x double> %14, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %19 = tail call spir_func <16 x double> @_Z19sub_group_broadcastDv16_dj(<16 x double> %18, i32 0) #6
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z19sub_group_broadcastdj(double, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <2 x double> @_Z19sub_group_broadcastDv2_dj(<2 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <3 x double> @_Z19sub_group_broadcastDv3_dj(<3 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <4 x double> @_Z19sub_group_broadcastDv4_dj(<4 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <8 x double> @_Z19sub_group_broadcastDv8_dj(<8 x double>, i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func <16 x double> @_Z19sub_group_broadcastDv16_dj(<16 x double>, i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupSMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupSMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupSMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupSMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupSMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupSMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testReduceScanChar
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_reduce_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_reduce_minc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_reduce_maxc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_inclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_inclusive_minc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_inclusive_maxc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_exclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_exclusive_minc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_exclusive_maxc(i8 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testReduceScanChar(i8 addrspace(1)* nocapture) local_unnamed_addr #5 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+  %2 = tail call spir_func signext i8 @_Z20sub_group_reduce_addc(i8 signext 0) #6
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !8
+  %3 = tail call spir_func signext i8 @_Z20sub_group_reduce_minc(i8 signext 0) #6
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !8
+  %5 = tail call spir_func signext i8 @_Z20sub_group_reduce_maxc(i8 signext 0) #6
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !8
+  %7 = tail call spir_func signext i8 @_Z28sub_group_scan_inclusive_addc(i8 signext 0) #6
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !8
+  %9 = tail call spir_func signext i8 @_Z28sub_group_scan_inclusive_minc(i8 signext 0) #6
+  %10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 4
+  store i8 %9, i8 addrspace(1)* %10, align 1, !tbaa !8
+  %11 = tail call spir_func signext i8 @_Z28sub_group_scan_inclusive_maxc(i8 signext 0) #6
+  %12 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 5
+  store i8 %11, i8 addrspace(1)* %12, align 1, !tbaa !8
+  %13 = tail call spir_func signext i8 @_Z28sub_group_scan_exclusive_addc(i8 signext 0) #6
+  %14 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 6
+  store i8 %13, i8 addrspace(1)* %14, align 1, !tbaa !8
+  %15 = tail call spir_func signext i8 @_Z28sub_group_scan_exclusive_minc(i8 signext 0) #6
+  %16 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 7
+  store i8 %15, i8 addrspace(1)* %16, align 1, !tbaa !8
+  %17 = tail call spir_func signext i8 @_Z28sub_group_scan_exclusive_maxc(i8 signext 0) #6
+  %18 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+  store i8 %17, i8 addrspace(1)* %18, align 1, !tbaa !8
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z20sub_group_reduce_addc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z20sub_group_reduce_minc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z20sub_group_reduce_maxc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z28sub_group_scan_inclusive_addc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z28sub_group_scan_inclusive_minc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z28sub_group_scan_inclusive_maxc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z28sub_group_scan_exclusive_addc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z28sub_group_scan_exclusive_minc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z28sub_group_scan_exclusive_maxc(i8 signext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupUMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupUMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupUMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupUMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupUMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupUMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testReduceScanUChar
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_reduce_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_reduce_minh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z20sub_group_reduce_maxh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_inclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_inclusive_minh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_inclusive_maxh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_exclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_exclusive_minh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z28sub_group_scan_exclusive_maxh(i8 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testReduceScanUChar(i8 addrspace(1)* nocapture) local_unnamed_addr #5 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !7 {
+  %2 = tail call spir_func zeroext i8 @_Z20sub_group_reduce_addh(i8 zeroext 0) #6
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !8
+  %3 = tail call spir_func zeroext i8 @_Z20sub_group_reduce_minh(i8 zeroext 0) #6
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !8
+  %5 = tail call spir_func zeroext i8 @_Z20sub_group_reduce_maxh(i8 zeroext 0) #6
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !8
+  %7 = tail call spir_func zeroext i8 @_Z28sub_group_scan_inclusive_addh(i8 zeroext 0) #6
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !8
+  %9 = tail call spir_func zeroext i8 @_Z28sub_group_scan_inclusive_minh(i8 zeroext 0) #6
+  %10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 4
+  store i8 %9, i8 addrspace(1)* %10, align 1, !tbaa !8
+  %11 = tail call spir_func zeroext i8 @_Z28sub_group_scan_inclusive_maxh(i8 zeroext 0) #6
+  %12 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 5
+  store i8 %11, i8 addrspace(1)* %12, align 1, !tbaa !8
+  %13 = tail call spir_func zeroext i8 @_Z28sub_group_scan_exclusive_addh(i8 zeroext 0) #6
+  %14 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 6
+  store i8 %13, i8 addrspace(1)* %14, align 1, !tbaa !8
+  %15 = tail call spir_func zeroext i8 @_Z28sub_group_scan_exclusive_minh(i8 zeroext 0) #6
+  %16 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 7
+  store i8 %15, i8 addrspace(1)* %16, align 1, !tbaa !8
+  %17 = tail call spir_func zeroext i8 @_Z28sub_group_scan_exclusive_maxh(i8 zeroext 0) #6
+  %18 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+  store i8 %17, i8 addrspace(1)* %18, align 1, !tbaa !8
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z20sub_group_reduce_addh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z20sub_group_reduce_minh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z20sub_group_reduce_maxh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z28sub_group_scan_inclusive_addh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z28sub_group_scan_inclusive_minh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z28sub_group_scan_inclusive_maxh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z28sub_group_scan_exclusive_addh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z28sub_group_scan_exclusive_minh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z28sub_group_scan_exclusive_maxh(i8 zeroext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupSMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupSMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupSMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupSMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupSMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupSMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testReduceScanShort
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_reduce_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_reduce_mins(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_reduce_maxs(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_inclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_inclusive_mins(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_inclusive_maxs(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_exclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_exclusive_mins(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_exclusive_maxs(i16 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testReduceScanShort(i16 addrspace(1)* nocapture) local_unnamed_addr #5 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !7 {
+  %2 = tail call spir_func signext i16 @_Z20sub_group_reduce_adds(i16 signext 0) #6
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !13
+  %3 = tail call spir_func signext i16 @_Z20sub_group_reduce_mins(i16 signext 0) #6
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !13
+  %5 = tail call spir_func signext i16 @_Z20sub_group_reduce_maxs(i16 signext 0) #6
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !13
+  %7 = tail call spir_func signext i16 @_Z28sub_group_scan_inclusive_adds(i16 signext 0) #6
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !13
+  %9 = tail call spir_func signext i16 @_Z28sub_group_scan_inclusive_mins(i16 signext 0) #6
+  %10 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 4
+  store i16 %9, i16 addrspace(1)* %10, align 2, !tbaa !13
+  %11 = tail call spir_func signext i16 @_Z28sub_group_scan_inclusive_maxs(i16 signext 0) #6
+  %12 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 5
+  store i16 %11, i16 addrspace(1)* %12, align 2, !tbaa !13
+  %13 = tail call spir_func signext i16 @_Z28sub_group_scan_exclusive_adds(i16 signext 0) #6
+  %14 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 6
+  store i16 %13, i16 addrspace(1)* %14, align 2, !tbaa !13
+  %15 = tail call spir_func signext i16 @_Z28sub_group_scan_exclusive_mins(i16 signext 0) #6
+  %16 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 7
+  store i16 %15, i16 addrspace(1)* %16, align 2, !tbaa !13
+  %17 = tail call spir_func signext i16 @_Z28sub_group_scan_exclusive_maxs(i16 signext 0) #6
+  %18 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 8
+  store i16 %17, i16 addrspace(1)* %18, align 2, !tbaa !13
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z20sub_group_reduce_adds(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z20sub_group_reduce_mins(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z20sub_group_reduce_maxs(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z28sub_group_scan_inclusive_adds(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z28sub_group_scan_inclusive_mins(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z28sub_group_scan_inclusive_maxs(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z28sub_group_scan_exclusive_adds(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z28sub_group_scan_exclusive_mins(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z28sub_group_scan_exclusive_maxs(i16 signext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupUMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupUMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupUMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupUMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupUMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupUMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testReduceScanUShort
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_reduce_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_reduce_mint(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z20sub_group_reduce_maxt(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_inclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_inclusive_mint(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_inclusive_maxt(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_exclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_exclusive_mint(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z28sub_group_scan_exclusive_maxt(i16 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testReduceScanUShort(i16 addrspace(1)* nocapture) local_unnamed_addr #5 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !7 {
+  %2 = tail call spir_func zeroext i16 @_Z20sub_group_reduce_addt(i16 zeroext 0) #6
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !13
+  %3 = tail call spir_func zeroext i16 @_Z20sub_group_reduce_mint(i16 zeroext 0) #6
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !13
+  %5 = tail call spir_func zeroext i16 @_Z20sub_group_reduce_maxt(i16 zeroext 0) #6
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !13
+  %7 = tail call spir_func zeroext i16 @_Z28sub_group_scan_inclusive_addt(i16 zeroext 0) #6
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !13
+  %9 = tail call spir_func zeroext i16 @_Z28sub_group_scan_inclusive_mint(i16 zeroext 0) #6
+  %10 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 4
+  store i16 %9, i16 addrspace(1)* %10, align 2, !tbaa !13
+  %11 = tail call spir_func zeroext i16 @_Z28sub_group_scan_inclusive_maxt(i16 zeroext 0) #6
+  %12 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 5
+  store i16 %11, i16 addrspace(1)* %12, align 2, !tbaa !13
+  %13 = tail call spir_func zeroext i16 @_Z28sub_group_scan_exclusive_addt(i16 zeroext 0) #6
+  %14 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 6
+  store i16 %13, i16 addrspace(1)* %14, align 2, !tbaa !13
+  %15 = tail call spir_func zeroext i16 @_Z28sub_group_scan_exclusive_mint(i16 zeroext 0) #6
+  %16 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 7
+  store i16 %15, i16 addrspace(1)* %16, align 2, !tbaa !13
+  %17 = tail call spir_func zeroext i16 @_Z28sub_group_scan_exclusive_maxt(i16 zeroext 0) #6
+  %18 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 8
+  store i16 %17, i16 addrspace(1)* %18, align 2, !tbaa !13
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z20sub_group_reduce_addt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z20sub_group_reduce_mint(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z20sub_group_reduce_maxt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z28sub_group_scan_inclusive_addt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z28sub_group_scan_inclusive_mint(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z28sub_group_scan_inclusive_maxt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z28sub_group_scan_exclusive_addt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z28sub_group_scan_exclusive_mint(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z28sub_group_scan_exclusive_maxt(i16 zeroext) local_unnamed_addr #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="256" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="1024" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { convergent nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 9.0.1 (https://github.com/llvm/llvm-project.git cb6d58d1dcf36a29ae5dd24ff891d6552f00bac7)"}
+!3 = !{}
+!4 = !{i32 1}
+!5 = !{!"none"}
+!6 = !{!"char*"}
+!7 = !{!""}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C/C++ TBAA"}
+!11 = !{!"uchar*"}
+!12 = !{!"short*"}
+!13 = !{!14, !14, i64 0}
+!14 = !{!"short", !9, i64 0}
+!15 = !{!"ushort*"}
\ No newline at end of file

From c8c211ee846ee107c2aa93738a15ef48833a2057 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Fri, 17 Apr 2020 09:56:17 +0200
Subject: [PATCH 756/770] Remove duplicated values from OCLSPIRVBuiltinMap.

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp |  4 ++--
 llvm-spirv/lib/SPIRV/OCLUtil.h        | 18 ------------------
 2 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 7e68cb86f3592..882cb033fe3c2 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -942,9 +942,9 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
             .StartsWith("ballot", "group_ballot_bit_count_")
             .StartsWith("non_uniform", kSPIRVName::GroupNonUniformPrefix)
             .Default(kSPIRVName::GroupPrefix);
+          // clustered functions are handled with non uniform group opcodes
           StringRef ClusteredOp =
-            FuncName.contains("clustered_") ?
-            "clustered_" : "";
+              FuncName.contains("clustered_") ? "non_uniform_" : "";
           StringRef LogicalOp =
             FuncName.contains("logical_") ?
             "logical_" : "";
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index 23bee5d96c9e4..8c30718fd9183 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -893,24 +893,6 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   // cl_khr_subgroup_shuffle_relative
   _SPIRV_OP(group_shuffle_up, GroupNonUniformShuffleUp)
   _SPIRV_OP(group_shuffle_down, GroupNonUniformShuffleDown)
-  // cl_khr_subgroup_clustered_reduce
-  _SPIRV_OP(group_clustered_iadd, GroupNonUniformIAdd)
-  _SPIRV_OP(group_clustered_iadd, GroupNonUniformIAdd)
-  _SPIRV_OP(group_clustered_fadd, GroupNonUniformFAdd)
-  _SPIRV_OP(group_clustered_imul, GroupNonUniformIMul)
-  _SPIRV_OP(group_clustered_fmul, GroupNonUniformFMul)
-  _SPIRV_OP(group_clustered_smin, GroupNonUniformSMin)
-  _SPIRV_OP(group_clustered_umin, GroupNonUniformUMin)
-  _SPIRV_OP(group_clustered_fmin, GroupNonUniformFMin)
-  _SPIRV_OP(group_clustered_smax, GroupNonUniformSMax)
-  _SPIRV_OP(group_clustered_umax, GroupNonUniformUMax)
-  _SPIRV_OP(group_clustered_fmax, GroupNonUniformFMax)
-  _SPIRV_OP(group_clustered_iand, GroupNonUniformBitwiseAnd)
-  _SPIRV_OP(group_clustered_ior, GroupNonUniformBitwiseOr)
-  _SPIRV_OP(group_clustered_ixor, GroupNonUniformBitwiseXor)
-  _SPIRV_OP(group_clustered_logical_iand, GroupNonUniformLogicalAnd)
-  _SPIRV_OP(group_clustered_logical_ior, GroupNonUniformLogicalOr)
-  _SPIRV_OP(group_clustered_logical_ixor, GroupNonUniformLogicalXor)
 #undef _SPIRV_OP
 }
 

From 0fbfe9c20d2885c4e9fe308b0999beb6d378b5c4 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Fri, 17 Apr 2020 12:08:45 +0200
Subject: [PATCH 757/770] Add two-way translation test for
 sub_group_non_uniform_arithmetic.

---
 .../sub_group_non_uniform_arithmetic.ll       | 2268 +++++++++++++++++
 1 file changed, 2268 insertions(+)
 create mode 100644 llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll

diff --git a/llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll b/llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll
new file mode 100644
index 0000000000000..cbe424666f094
--- /dev/null
+++ b/llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll
@@ -0,0 +1,2268 @@
+;; #pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_arithmetic : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+;; #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+;; 
+;; kernel void testNonUniformArithmeticChar(global char* dst)
+;; {
+;;     char v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticUChar(global uchar* dst)
+;; {
+;;     uchar v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticShort(global short* dst)
+;; {
+;;     short v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticUShort(global ushort* dst)
+;; {
+;;     ushort v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticInt(global int* dst)
+;; {
+;;     int v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticUInt(global uint* dst)
+;; {
+;;     uint v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticLong(global long* dst)
+;; {
+;;     long v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticULong(global ulong* dst)
+;; {
+;;     ulong v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticFloat(global float* dst)
+;; {
+;;     float v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticHalf(global half* dst)
+;; {
+;;     half v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformArithmeticDouble(global double* dst)
+;; {
+;;     double v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_add(v);
+;;     dst[1] = sub_group_non_uniform_reduce_mul(v);
+;;     dst[2] = sub_group_non_uniform_reduce_min(v);
+;;     dst[3] = sub_group_non_uniform_reduce_max(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_add(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_mul(v);
+;;     dst[6] = sub_group_non_uniform_scan_inclusive_min(v);
+;;     dst[7] = sub_group_non_uniform_scan_inclusive_max(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_add(v);
+;;     dst[9] = sub_group_non_uniform_scan_exclusive_mul(v);
+;;     dst[10] = sub_group_non_uniform_scan_exclusive_min(v);
+;;     dst[11] = sub_group_non_uniform_scan_exclusive_max(v);
+;; }
+;; 
+;; kernel void testNonUniformBitwiseChar(global char* dst)
+;; {
+;;     char v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; kernel void testNonUniformBitwiseUChar(global uchar* dst)
+;; {
+;;     uchar v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; kernel void testNonUniformBitwiseShort(global short* dst)
+;; {
+;;     short v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; kernel void testNonUniformBitwiseUShort(global ushort* dst)
+;; {
+;;     ushort v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; kernel void testNonUniformBitwiseInt(global int* dst)
+;; {
+;;     int v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; kernel void testNonUniformBitwiseUInt(global uint* dst)
+;; {
+;;     uint v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; kernel void testNonUniformBitwiseLong(global long* dst)
+;; {
+;;     long v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; kernel void testNonUniformBitwiseULong(global ulong* dst)
+;; {
+;;     ulong v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_xor(v);
+;; }
+;; 
+;; kernel void testNonUniformLogical(global int* dst)
+;; {
+;;     int v = 0;
+;;     dst[0] = sub_group_non_uniform_reduce_logical_and(v);
+;;     dst[1] = sub_group_non_uniform_reduce_logical_or(v);
+;;     dst[2] = sub_group_non_uniform_reduce_logical_xor(v);
+;;     dst[3] = sub_group_non_uniform_scan_inclusive_logical_and(v);
+;;     dst[4] = sub_group_non_uniform_scan_inclusive_logical_or(v);
+;;     dst[5] = sub_group_non_uniform_scan_inclusive_logical_xor(v);
+;;     dst[6] = sub_group_non_uniform_scan_exclusive_logical_and(v);
+;;     dst[7] = sub_group_non_uniform_scan_exclusive_logical_or(v);
+;;     dst[8] = sub_group_non_uniform_scan_exclusive_logical_xor(v);
+;; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; Change DISABLED to RUN once SPIRV->LLVM translation is implemented
+; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
+; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV-DAG: TypeBool  [[bool:[0-9]+]]
+; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
+; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
+; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
+; CHECK-SPIRV-DAG: TypeInt   [[long:[0-9]+]]   64 0
+; CHECK-SPIRV-DAG: TypeFloat [[half:[0-9]+]]   16
+; CHECK-SPIRV-DAG: TypeFloat [[float:[0-9]+]]  32
+; CHECK-SPIRV-DAG: TypeFloat [[double:[0-9]+]] 64
+
+; CHECK-SPIRV-DAG: ConstantFalse [[bool]] [[false:[0-9]+]]
+; CHECK-SPIRV-DAG: Constant [[int]]    [[ScopeSubgroup:[0-9]+]] 3
+; CHECK-SPIRV-DAG: Constant [[char]]   [[char_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[short]]  [[short_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[int]]    [[int_0:[0-9]+]]         0
+; CHECK-SPIRV-DAG: Constant [[long]]   [[long_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[half]]   [[half_0:[0-9]+]]        0
+; CHECK-SPIRV-DAG: Constant [[float]]  [[float_0:[0-9]+]]       0
+; CHECK-SPIRV-DAG: Constant [[double]] [[double_0:[0-9]+]]      0
+
+; ModuleID = 'sub_group_non_uniform_arithmetic.cl'
+source_filename = "sub_group_non_uniform_arithmetic.cl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticChar
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_mulc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_minc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_maxc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_mulc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_minc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_maxc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_mulc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_minc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_maxc(i8 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformArithmeticChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { ; signed-char kernel: %0 is dst; each builtin is called with the constant 0 and its result stored to consecutive dst elements
+  %2 = tail call spir_func signext i8 @_Z32sub_group_non_uniform_reduce_addc(i8 signext 0) #2 ; sub_group_non_uniform_reduce_add(char); expected SPIR-V op GroupNonUniformIAdd, literal 0
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; dst[0] = %2
+  %3 = tail call spir_func signext i8 @_Z32sub_group_non_uniform_reduce_mulc(i8 signext 0) #2 ; sub_group_non_uniform_reduce_mul(char); expected GroupNonUniformIMul, literal 0
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; dst[1] = %3
+  %5 = tail call spir_func signext i8 @_Z32sub_group_non_uniform_reduce_minc(i8 signext 0) #2 ; sub_group_non_uniform_reduce_min(char); expected GroupNonUniformSMin (signed), literal 0
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2 ; &dst[2]
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7 ; dst[2] = %5
+  %7 = tail call spir_func signext i8 @_Z32sub_group_non_uniform_reduce_maxc(i8 signext 0) #2 ; sub_group_non_uniform_reduce_max(char); expected GroupNonUniformSMax (signed), literal 0
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3 ; &dst[3]
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !7 ; dst[3] = %7
+  %9 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_addc(i8 signext 0) #2 ; sub_group_non_uniform_scan_inclusive_add(char); expected GroupNonUniformIAdd, literal 1
+  %10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 4 ; &dst[4]
+  store i8 %9, i8 addrspace(1)* %10, align 1, !tbaa !7 ; dst[4] = %9
+  %11 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_mulc(i8 signext 0) #2 ; sub_group_non_uniform_scan_inclusive_mul(char); expected GroupNonUniformIMul, literal 1
+  %12 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 5 ; &dst[5]
+  store i8 %11, i8 addrspace(1)* %12, align 1, !tbaa !7 ; dst[5] = %11
+  %13 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_minc(i8 signext 0) #2 ; sub_group_non_uniform_scan_inclusive_min(char); expected GroupNonUniformSMin, literal 1
+  %14 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 6 ; &dst[6]
+  store i8 %13, i8 addrspace(1)* %14, align 1, !tbaa !7 ; dst[6] = %13
+  %15 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_maxc(i8 signext 0) #2 ; sub_group_non_uniform_scan_inclusive_max(char); expected GroupNonUniformSMax, literal 1
+  %16 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 7 ; &dst[7]
+  store i8 %15, i8 addrspace(1)* %16, align 1, !tbaa !7 ; dst[7] = %15
+  %17 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_addc(i8 signext 0) #2 ; sub_group_non_uniform_scan_exclusive_add(char); expected GroupNonUniformIAdd, literal 2
+  %18 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8 ; &dst[8]
+  store i8 %17, i8 addrspace(1)* %18, align 1, !tbaa !7 ; dst[8] = %17
+  %19 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_mulc(i8 signext 0) #2 ; sub_group_non_uniform_scan_exclusive_mul(char); expected GroupNonUniformIMul, literal 2
+  %20 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 9 ; &dst[9]
+  store i8 %19, i8 addrspace(1)* %20, align 1, !tbaa !7 ; dst[9] = %19
+  %21 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_minc(i8 signext 0) #2 ; sub_group_non_uniform_scan_exclusive_min(char); expected GroupNonUniformSMin, literal 2
+  %22 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 10 ; &dst[10]
+  store i8 %21, i8 addrspace(1)* %22, align 1, !tbaa !7 ; dst[10] = %21
+  %23 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_maxc(i8 signext 0) #2 ; sub_group_non_uniform_scan_exclusive_max(char); expected GroupNonUniformSMax, literal 2
+  %24 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 11 ; &dst[11]
+  store i8 %23, i8 addrspace(1)* %24, align 1, !tbaa !7 ; dst[11] = %23
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z32sub_group_non_uniform_reduce_addc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z32sub_group_non_uniform_reduce_mulc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z32sub_group_non_uniform_reduce_minc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z32sub_group_non_uniform_reduce_maxc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_addc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_mulc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_minc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_maxc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_addc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_mulc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_minc(i8 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_maxc(i8 signext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticUChar
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_mulc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_minh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_maxh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_mulc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_minh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_maxh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_addc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_mulc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_minh(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_maxh(i8 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformArithmeticUChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 { ; unsigned-char kernel: %0 is dst; each builtin is called with the constant 0 and its result stored to consecutive dst elements
+  %2 = tail call spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_addh(i8 zeroext 0) #2 ; sub_group_non_uniform_reduce_add(uchar); expected SPIR-V op GroupNonUniformIAdd, literal 0
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7 ; dst[0] = %2
+  %3 = tail call spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_mulh(i8 zeroext 0) #2 ; sub_group_non_uniform_reduce_mul(uchar); expected GroupNonUniformIMul, literal 0
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1 ; &dst[1]
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7 ; dst[1] = %3
+  %5 = tail call spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_minh(i8 zeroext 0) #2 ; sub_group_non_uniform_reduce_min(uchar); expected GroupNonUniformUMin (unsigned), literal 0
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2 ; &dst[2]
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7 ; dst[2] = %5
+  %7 = tail call spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_maxh(i8 zeroext 0) #2 ; sub_group_non_uniform_reduce_max(uchar); expected GroupNonUniformUMax (unsigned), literal 0
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3 ; &dst[3]
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !7 ; dst[3] = %7
+  %9 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_addh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_inclusive_add(uchar); expected GroupNonUniformIAdd, literal 1
+  %10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 4 ; &dst[4]
+  store i8 %9, i8 addrspace(1)* %10, align 1, !tbaa !7 ; dst[4] = %9
+  %11 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_mulh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_inclusive_mul(uchar); expected GroupNonUniformIMul, literal 1
+  %12 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 5 ; &dst[5]
+  store i8 %11, i8 addrspace(1)* %12, align 1, !tbaa !7 ; dst[5] = %11
+  %13 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_minh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_inclusive_min(uchar); expected GroupNonUniformUMin, literal 1
+  %14 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 6 ; &dst[6]
+  store i8 %13, i8 addrspace(1)* %14, align 1, !tbaa !7 ; dst[6] = %13
+  %15 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_maxh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_inclusive_max(uchar); expected GroupNonUniformUMax, literal 1
+  %16 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 7 ; &dst[7]
+  store i8 %15, i8 addrspace(1)* %16, align 1, !tbaa !7 ; dst[7] = %15
+  %17 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_addh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_exclusive_add(uchar); expected GroupNonUniformIAdd, literal 2
+  %18 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8 ; &dst[8]
+  store i8 %17, i8 addrspace(1)* %18, align 1, !tbaa !7 ; dst[8] = %17
+  %19 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_mulh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_exclusive_mul(uchar); expected GroupNonUniformIMul, literal 2
+  %20 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 9 ; &dst[9]
+  store i8 %19, i8 addrspace(1)* %20, align 1, !tbaa !7 ; dst[9] = %19
+  %21 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_minh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_exclusive_min(uchar); expected GroupNonUniformUMin, literal 2
+  %22 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 10 ; &dst[10]
+  store i8 %21, i8 addrspace(1)* %22, align 1, !tbaa !7 ; dst[10] = %21
+  %23 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_maxh(i8 zeroext 0) #2 ; sub_group_non_uniform_scan_exclusive_max(uchar); expected GroupNonUniformUMax, literal 2
+  %24 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 11 ; &dst[11]
+  store i8 %23, i8 addrspace(1)* %24, align 1, !tbaa !7 ; dst[11] = %23
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_addh(i8 zeroext) local_unnamed_addr #1 ; First of 12 bodiless declarations: {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max}, each taking and returning i8 zeroext. In the Itanium-mangled names "32"/"40" is the length of the builtin name and the trailing "h" encodes unsigned char.
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_mulh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_minh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_maxh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_addh(i8 zeroext) local_unnamed_addr #1 ; inclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_mulh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_minh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_maxh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_addh(i8 zeroext) local_unnamed_addr #1 ; exclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_mulh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_minh(i8 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_maxh(i8 zeroext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticShort
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_muls(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_mins(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_maxs(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_muls(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_mins(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_maxs(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_muls(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_mins(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_maxs(i16 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformArithmeticShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 { ; Kernel under test: calls each {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max} builtin once on i16 signext (mangled suffix "s" = short), always with the constant 0, and stores the 12 results to elements 0..11 of %0.
+  %2 = tail call spir_func signext i16 @_Z32sub_group_non_uniform_reduce_adds(i16 signext 0) #2 ; reduce add -> %0[0]
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12
+  %3 = tail call spir_func signext i16 @_Z32sub_group_non_uniform_reduce_muls(i16 signext 0) #2 ; reduce mul -> %0[1]
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12
+  %5 = tail call spir_func signext i16 @_Z32sub_group_non_uniform_reduce_mins(i16 signext 0) #2 ; reduce min -> %0[2]
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12
+  %7 = tail call spir_func signext i16 @_Z32sub_group_non_uniform_reduce_maxs(i16 signext 0) #2 ; reduce max -> %0[3]
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !12
+  %9 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_adds(i16 signext 0) #2 ; inclusive scan add -> %0[4]
+  %10 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 4
+  store i16 %9, i16 addrspace(1)* %10, align 2, !tbaa !12
+  %11 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_muls(i16 signext 0) #2 ; inclusive scan mul -> %0[5]
+  %12 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 5
+  store i16 %11, i16 addrspace(1)* %12, align 2, !tbaa !12
+  %13 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_mins(i16 signext 0) #2 ; inclusive scan min -> %0[6]
+  %14 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 6
+  store i16 %13, i16 addrspace(1)* %14, align 2, !tbaa !12
+  %15 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_maxs(i16 signext 0) #2 ; inclusive scan max -> %0[7]
+  %16 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 7
+  store i16 %15, i16 addrspace(1)* %16, align 2, !tbaa !12
+  %17 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_adds(i16 signext 0) #2 ; exclusive scan add -> %0[8]
+  %18 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 8
+  store i16 %17, i16 addrspace(1)* %18, align 2, !tbaa !12
+  %19 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_muls(i16 signext 0) #2 ; exclusive scan mul -> %0[9]
+  %20 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 9
+  store i16 %19, i16 addrspace(1)* %20, align 2, !tbaa !12
+  %21 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_mins(i16 signext 0) #2 ; exclusive scan min -> %0[10]
+  %22 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 10
+  store i16 %21, i16 addrspace(1)* %22, align 2, !tbaa !12
+  %23 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_maxs(i16 signext 0) #2 ; exclusive scan max -> %0[11]
+  %24 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 11
+  store i16 %23, i16 addrspace(1)* %24, align 2, !tbaa !12
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z32sub_group_non_uniform_reduce_adds(i16 signext) local_unnamed_addr #1 ; First of 12 bodiless declarations for the callees of @testNonUniformArithmeticShort: {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max}, each taking and returning i16 signext. The trailing "s" in the mangled names encodes short.
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z32sub_group_non_uniform_reduce_muls(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z32sub_group_non_uniform_reduce_mins(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z32sub_group_non_uniform_reduce_maxs(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_adds(i16 signext) local_unnamed_addr #1 ; inclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_muls(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_mins(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_maxs(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_adds(i16 signext) local_unnamed_addr #1 ; exclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_muls(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_mins(i16 signext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_maxs(i16 signext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticUShort
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_muls(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_mint(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_maxt(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_muls(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_mint(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_maxt(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_adds(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_muls(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_mint(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_maxt(i16 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformArithmeticUShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 { ; Kernel under test: calls each {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max} builtin once on i16 zeroext (mangled suffix "t" = unsigned short), always with the constant 0, and stores the 12 results to elements 0..11 of %0. Note that the expected-output lines preceding this kernel list add/mul with the "s" suffix and only min/max with "t".
+  %2 = tail call spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_addt(i16 zeroext 0) #2 ; reduce add -> %0[0]
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12
+  %3 = tail call spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_mult(i16 zeroext 0) #2 ; reduce mul -> %0[1]
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12
+  %5 = tail call spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_mint(i16 zeroext 0) #2 ; reduce min -> %0[2]
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12
+  %7 = tail call spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_maxt(i16 zeroext 0) #2 ; reduce max -> %0[3]
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !12
+  %9 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_addt(i16 zeroext 0) #2 ; inclusive scan add -> %0[4]
+  %10 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 4
+  store i16 %9, i16 addrspace(1)* %10, align 2, !tbaa !12
+  %11 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_mult(i16 zeroext 0) #2 ; inclusive scan mul -> %0[5]
+  %12 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 5
+  store i16 %11, i16 addrspace(1)* %12, align 2, !tbaa !12
+  %13 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_mint(i16 zeroext 0) #2 ; inclusive scan min -> %0[6]
+  %14 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 6
+  store i16 %13, i16 addrspace(1)* %14, align 2, !tbaa !12
+  %15 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_maxt(i16 zeroext 0) #2 ; inclusive scan max -> %0[7]
+  %16 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 7
+  store i16 %15, i16 addrspace(1)* %16, align 2, !tbaa !12
+  %17 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_addt(i16 zeroext 0) #2 ; exclusive scan add -> %0[8]
+  %18 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 8
+  store i16 %17, i16 addrspace(1)* %18, align 2, !tbaa !12
+  %19 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_mult(i16 zeroext 0) #2 ; exclusive scan mul -> %0[9]
+  %20 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 9
+  store i16 %19, i16 addrspace(1)* %20, align 2, !tbaa !12
+  %21 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_mint(i16 zeroext 0) #2 ; exclusive scan min -> %0[10]
+  %22 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 10
+  store i16 %21, i16 addrspace(1)* %22, align 2, !tbaa !12
+  %23 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_maxt(i16 zeroext 0) #2 ; exclusive scan max -> %0[11]
+  %24 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 11
+  store i16 %23, i16 addrspace(1)* %24, align 2, !tbaa !12
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_addt(i16 zeroext) local_unnamed_addr #1 ; First of 12 bodiless declarations for the callees of @testNonUniformArithmeticUShort: {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max}, each taking and returning i16 zeroext. The trailing "t" in the mangled names encodes unsigned short.
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_mult(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_mint(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_maxt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_addt(i16 zeroext) local_unnamed_addr #1 ; inclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_mult(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_mint(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_maxt(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_addt(i16 zeroext) local_unnamed_addr #1 ; exclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_mult(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_mint(i16 zeroext) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_maxt(i16 zeroext) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticInt
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_addi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_muli(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_mini(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_maxi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_addi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_muli(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_mini(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_maxi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_addi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_muli(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_mini(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_maxi(i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformArithmeticInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 { ; Kernel under test: calls each {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max} builtin once on i32 (mangled suffix "i" = int), always with the constant 0, and stores the 12 results to elements 0..11 of %0.
+  %2 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_addi(i32 0) #2 ; reduce add -> %0[0]
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16
+  %3 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_muli(i32 0) #2 ; reduce mul -> %0[1]
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16
+  %5 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_mini(i32 0) #2 ; reduce min -> %0[2]
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16
+  %7 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_maxi(i32 0) #2 ; reduce max -> %0[3]
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %7, i32 addrspace(1)* %8, align 4, !tbaa !16
+  %9 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_addi(i32 0) #2 ; inclusive scan add -> %0[4]
+  %10 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 4
+  store i32 %9, i32 addrspace(1)* %10, align 4, !tbaa !16
+  %11 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_muli(i32 0) #2 ; inclusive scan mul -> %0[5]
+  %12 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 5
+  store i32 %11, i32 addrspace(1)* %12, align 4, !tbaa !16
+  %13 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_mini(i32 0) #2 ; inclusive scan min -> %0[6]
+  %14 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 6
+  store i32 %13, i32 addrspace(1)* %14, align 4, !tbaa !16
+  %15 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_maxi(i32 0) #2 ; inclusive scan max -> %0[7]
+  %16 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 7
+  store i32 %15, i32 addrspace(1)* %16, align 4, !tbaa !16
+  %17 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_addi(i32 0) #2 ; exclusive scan add -> %0[8]
+  %18 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 8
+  store i32 %17, i32 addrspace(1)* %18, align 4, !tbaa !16
+  %19 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_muli(i32 0) #2 ; exclusive scan mul -> %0[9]
+  %20 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 9
+  store i32 %19, i32 addrspace(1)* %20, align 4, !tbaa !16
+  %21 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_mini(i32 0) #2 ; exclusive scan min -> %0[10]
+  %22 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 10
+  store i32 %21, i32 addrspace(1)* %22, align 4, !tbaa !16
+  %23 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_maxi(i32 0) #2 ; exclusive scan max -> %0[11]
+  %24 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 11
+  store i32 %23, i32 addrspace(1)* %24, align 4, !tbaa !16
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_addi(i32) local_unnamed_addr #1 ; First of 12 bodiless declarations for the callees of @testNonUniformArithmeticInt: {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max}, each taking and returning i32. The trailing "i" in the mangled names encodes int.
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_muli(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_mini(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_maxi(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_addi(i32) local_unnamed_addr #1 ; inclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_muli(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_mini(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_maxi(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_addi(i32) local_unnamed_addr #1 ; exclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_muli(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_mini(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_maxi(i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticUInt
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_addi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_muli(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_minj(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_maxj(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_addi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_muli(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_minj(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_maxj(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_addi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_muli(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_minj(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_maxj(i32 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformArithmeticUInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !6 { ; Kernel under test: calls each {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max} builtin once on i32 (mangled suffix "j" = unsigned int), always with the constant 0, and stores the 12 results to elements 0..11 of %0. Note that the expected-output lines preceding this kernel list add/mul with the "i" suffix and only min/max with "j".
+  %2 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_addj(i32 0) #2 ; reduce add -> %0[0]
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16
+  %3 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_mulj(i32 0) #2 ; reduce mul -> %0[1]
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16
+  %5 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_minj(i32 0) #2 ; reduce min -> %0[2]
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16
+  %7 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_maxj(i32 0) #2 ; reduce max -> %0[3]
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %7, i32 addrspace(1)* %8, align 4, !tbaa !16
+  %9 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_addj(i32 0) #2 ; inclusive scan add -> %0[4]
+  %10 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 4
+  store i32 %9, i32 addrspace(1)* %10, align 4, !tbaa !16
+  %11 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_mulj(i32 0) #2 ; inclusive scan mul -> %0[5]
+  %12 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 5
+  store i32 %11, i32 addrspace(1)* %12, align 4, !tbaa !16
+  %13 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_minj(i32 0) #2 ; inclusive scan min -> %0[6]
+  %14 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 6
+  store i32 %13, i32 addrspace(1)* %14, align 4, !tbaa !16
+  %15 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_maxj(i32 0) #2 ; inclusive scan max -> %0[7]
+  %16 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 7
+  store i32 %15, i32 addrspace(1)* %16, align 4, !tbaa !16
+  %17 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_addj(i32 0) #2 ; exclusive scan add -> %0[8]
+  %18 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 8
+  store i32 %17, i32 addrspace(1)* %18, align 4, !tbaa !16
+  %19 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_mulj(i32 0) #2 ; exclusive scan mul -> %0[9]
+  %20 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 9
+  store i32 %19, i32 addrspace(1)* %20, align 4, !tbaa !16
+  %21 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_minj(i32 0) #2 ; exclusive scan min -> %0[10]
+  %22 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 10
+  store i32 %21, i32 addrspace(1)* %22, align 4, !tbaa !16
+  %23 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_maxj(i32 0) #2 ; exclusive scan max -> %0[11]
+  %24 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 11
+  store i32 %23, i32 addrspace(1)* %24, align 4, !tbaa !16
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_addj(i32) local_unnamed_addr #1 ; First of 12 bodiless declarations for the callees of @testNonUniformArithmeticUInt: {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max}, each taking and returning i32. The trailing "j" in the mangled names encodes unsigned int.
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_mulj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_minj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_maxj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_addj(i32) local_unnamed_addr #1 ; inclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_mulj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_minj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_maxj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_addj(i32) local_unnamed_addr #1 ; exclusive-scan declarations start here
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_mulj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_minj(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_maxj(i32) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformSMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformSMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticLong
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_addl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_mull(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_minl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_maxl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_addl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_mull(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_minl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_maxl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_addl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_mull(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_minl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_maxl(i64 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformArithmeticLong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !6 { ; Kernel under test: calls each {reduce, scan_inclusive, scan_exclusive} x {add, mul, min, max} builtin once on i64 (mangled suffix "l" = long), always with the constant 0, and stores the 12 results to elements 0..11 of %0.
+  %2 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_addl(i64 0) #2 ; reduce add -> %0[0]
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20
+  %3 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_mull(i64 0) #2 ; reduce mul -> %0[1]
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20
+  %5 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_minl(i64 0) #2 ; reduce min -> %0[2]
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20
+  %7 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_maxl(i64 0) #2 ; reduce max -> %0[3]
+  %8 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 3
+  store i64 %7, i64 addrspace(1)* %8, align 8, !tbaa !20
+  %9 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_addl(i64 0) #2 ; inclusive scan add -> %0[4]
+  %10 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 4
+  store i64 %9, i64 addrspace(1)* %10, align 8, !tbaa !20
+  %11 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_mull(i64 0) #2 ; inclusive scan mul -> %0[5]
+  %12 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 5
+  store i64 %11, i64 addrspace(1)* %12, align 8, !tbaa !20
+  %13 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_minl(i64 0) #2 ; inclusive scan min -> %0[6]
+  %14 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 6
+  store i64 %13, i64 addrspace(1)* %14, align 8, !tbaa !20
+  %15 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_maxl(i64 0) #2 ; inclusive scan max -> %0[7]
+  %16 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 7
+  store i64 %15, i64 addrspace(1)* %16, align 8, !tbaa !20
+  %17 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_addl(i64 0) #2 ; exclusive scan add -> %0[8]
+  %18 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 8
+  store i64 %17, i64 addrspace(1)* %18, align 8, !tbaa !20
+  %19 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_mull(i64 0) #2 ; exclusive scan mul -> %0[9]
+  %20 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 9
+  store i64 %19, i64 addrspace(1)* %20, align 8, !tbaa !20
+  %21 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_minl(i64 0) #2 ; exclusive scan min -> %0[10]
+  %22 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 10
+  store i64 %21, i64 addrspace(1)* %22, align 8, !tbaa !20
+  %23 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_maxl(i64 0) #2 ; exclusive scan max -> %0[11]
+  %24 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 11
+  store i64 %23, i64 addrspace(1)* %24, align 8, !tbaa !20
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_addl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_mull(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_minl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_maxl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_addl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_mull(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_minl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_maxl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_addl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_mull(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_minl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_maxl(i64) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIAdd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformIMul [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformUMin [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformUMax [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticULong
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_addl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_mull(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_minm(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_maxm(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_addl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_mull(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_minm(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_maxm(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_addl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_mull(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_minm(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_maxm(i64 0)
+
+; Function Attrs: convergent nounwind. Kernel: calls each 'm'-suffixed (Itanium code for unsigned long) sub-group non-uniform add/mul/min/max built-in on constant 0 and stores the k-th result at %0[k].
+define dso_local spir_kernel void @testNonUniformArithmeticULong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_addm(i64 0) #2 ; reduce_{add,mul,min,max} -> %0[0..3]
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20
+  %3 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_mulm(i64 0) #2
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20
+  %5 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_minm(i64 0) #2
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20
+  %7 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_maxm(i64 0) #2
+  %8 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 3
+  store i64 %7, i64 addrspace(1)* %8, align 8, !tbaa !20
+  %9 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_addm(i64 0) #2 ; scan_inclusive_{add,mul,min,max} -> %0[4..7]
+  %10 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 4
+  store i64 %9, i64 addrspace(1)* %10, align 8, !tbaa !20
+  %11 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_mulm(i64 0) #2
+  %12 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 5
+  store i64 %11, i64 addrspace(1)* %12, align 8, !tbaa !20
+  %13 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_minm(i64 0) #2
+  %14 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 6
+  store i64 %13, i64 addrspace(1)* %14, align 8, !tbaa !20
+  %15 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_maxm(i64 0) #2
+  %16 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 7
+  store i64 %15, i64 addrspace(1)* %16, align 8, !tbaa !20
+  %17 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_addm(i64 0) #2 ; scan_exclusive_{add,mul,min,max} -> %0[8..11]
+  %18 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 8
+  store i64 %17, i64 addrspace(1)* %18, align 8, !tbaa !20
+  %19 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_mulm(i64 0) #2
+  %20 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 9
+  store i64 %19, i64 addrspace(1)* %20, align 8, !tbaa !20
+  %21 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_minm(i64 0) #2
+  %22 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 10
+  store i64 %21, i64 addrspace(1)* %22, align 8, !tbaa !20
+  %23 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_maxm(i64 0) #2
+  %24 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 11
+  store i64 %23, i64 addrspace(1)* %24, align 8, !tbaa !20
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_addm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_mulm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_minm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_maxm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_addm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_mulm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_minm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_maxm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_addm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_mulm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_minm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_maxm(i64) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformFAdd [[float]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[float]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[float]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[float]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFAdd [[float]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[float]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[float]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[float]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFAdd [[float]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[float]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[float]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[float_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[float]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[float_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticFloat
+; CHECK-LLVM: call spir_func float @_Z32sub_group_non_uniform_reduce_addf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z32sub_group_non_uniform_reduce_mulf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z32sub_group_non_uniform_reduce_minf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z32sub_group_non_uniform_reduce_maxf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_addf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_mulf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_minf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_maxf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_addf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_mulf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_minf(float 0.000000e+00)
+; CHECK-LLVM: call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_maxf(float 0.000000e+00)
+
+; Function Attrs: convergent nounwind. Kernel: calls each 'f'-suffixed (Itanium code for float) sub-group non-uniform add/mul/min/max built-in on constant 0.0 and stores the k-th result at %0[k].
+define dso_local spir_kernel void @testNonUniformArithmeticFloat(float addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !23 !kernel_arg_base_type !23 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func float @_Z32sub_group_non_uniform_reduce_addf(float 0.000000e+00) #2 ; reduce_{add,mul,min,max} -> %0[0..3]
+  store float %2, float addrspace(1)* %0, align 4, !tbaa !24
+  %3 = tail call spir_func float @_Z32sub_group_non_uniform_reduce_mulf(float 0.000000e+00) #2
+  %4 = getelementptr inbounds float, float addrspace(1)* %0, i64 1
+  store float %3, float addrspace(1)* %4, align 4, !tbaa !24
+  %5 = tail call spir_func float @_Z32sub_group_non_uniform_reduce_minf(float 0.000000e+00) #2
+  %6 = getelementptr inbounds float, float addrspace(1)* %0, i64 2
+  store float %5, float addrspace(1)* %6, align 4, !tbaa !24
+  %7 = tail call spir_func float @_Z32sub_group_non_uniform_reduce_maxf(float 0.000000e+00) #2
+  %8 = getelementptr inbounds float, float addrspace(1)* %0, i64 3
+  store float %7, float addrspace(1)* %8, align 4, !tbaa !24
+  %9 = tail call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_addf(float 0.000000e+00) #2 ; scan_inclusive_{add,mul,min,max} -> %0[4..7]
+  %10 = getelementptr inbounds float, float addrspace(1)* %0, i64 4
+  store float %9, float addrspace(1)* %10, align 4, !tbaa !24
+  %11 = tail call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_mulf(float 0.000000e+00) #2
+  %12 = getelementptr inbounds float, float addrspace(1)* %0, i64 5
+  store float %11, float addrspace(1)* %12, align 4, !tbaa !24
+  %13 = tail call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_minf(float 0.000000e+00) #2
+  %14 = getelementptr inbounds float, float addrspace(1)* %0, i64 6
+  store float %13, float addrspace(1)* %14, align 4, !tbaa !24
+  %15 = tail call spir_func float @_Z40sub_group_non_uniform_scan_inclusive_maxf(float 0.000000e+00) #2
+  %16 = getelementptr inbounds float, float addrspace(1)* %0, i64 7
+  store float %15, float addrspace(1)* %16, align 4, !tbaa !24
+  %17 = tail call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_addf(float 0.000000e+00) #2 ; scan_exclusive_{add,mul,min,max} -> %0[8..11]
+  %18 = getelementptr inbounds float, float addrspace(1)* %0, i64 8
+  store float %17, float addrspace(1)* %18, align 4, !tbaa !24
+  %19 = tail call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_mulf(float 0.000000e+00) #2
+  %20 = getelementptr inbounds float, float addrspace(1)* %0, i64 9
+  store float %19, float addrspace(1)* %20, align 4, !tbaa !24
+  %21 = tail call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_minf(float 0.000000e+00) #2
+  %22 = getelementptr inbounds float, float addrspace(1)* %0, i64 10
+  store float %21, float addrspace(1)* %22, align 4, !tbaa !24
+  %23 = tail call spir_func float @_Z40sub_group_non_uniform_scan_exclusive_maxf(float 0.000000e+00) #2
+  %24 = getelementptr inbounds float, float addrspace(1)* %0, i64 11
+  store float %23, float addrspace(1)* %24, align 4, !tbaa !24
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z32sub_group_non_uniform_reduce_addf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z32sub_group_non_uniform_reduce_mulf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z32sub_group_non_uniform_reduce_minf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z32sub_group_non_uniform_reduce_maxf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_inclusive_addf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_inclusive_mulf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_inclusive_minf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_inclusive_maxf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_exclusive_addf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_exclusive_mulf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_exclusive_minf(float) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func float @_Z40sub_group_non_uniform_scan_exclusive_maxf(float) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformFAdd [[half]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[half]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[half]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[half]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFAdd [[half]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[half]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[half]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[half]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFAdd [[half]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[half]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[half]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[half_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[half]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[half_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticHalf
+; CHECK-LLVM: call spir_func half @_Z32sub_group_non_uniform_reduce_addDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z32sub_group_non_uniform_reduce_mulDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z32sub_group_non_uniform_reduce_minDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z32sub_group_non_uniform_reduce_maxDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_addDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_mulDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_minDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_maxDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_addDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_mulDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_minDh(half 0xH0000)
+; CHECK-LLVM: call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_maxDh(half 0xH0000)
+
+; Function Attrs: convergent nounwind. Kernel: calls each 'Dh'-suffixed (Itanium code for half precision) sub-group non-uniform add/mul/min/max built-in on constant 0xH0000 and stores the k-th result at %0[k].
+define dso_local spir_kernel void @testNonUniformArithmeticHalf(half addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !26 !kernel_arg_base_type !26 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func half @_Z32sub_group_non_uniform_reduce_addDh(half 0xH0000) #2 ; reduce_{add,mul,min,max} -> %0[0..3]
+  store half %2, half addrspace(1)* %0, align 2, !tbaa !27
+  %3 = tail call spir_func half @_Z32sub_group_non_uniform_reduce_mulDh(half 0xH0000) #2
+  %4 = getelementptr inbounds half, half addrspace(1)* %0, i64 1
+  store half %3, half addrspace(1)* %4, align 2, !tbaa !27
+  %5 = tail call spir_func half @_Z32sub_group_non_uniform_reduce_minDh(half 0xH0000) #2
+  %6 = getelementptr inbounds half, half addrspace(1)* %0, i64 2
+  store half %5, half addrspace(1)* %6, align 2, !tbaa !27
+  %7 = tail call spir_func half @_Z32sub_group_non_uniform_reduce_maxDh(half 0xH0000) #2
+  %8 = getelementptr inbounds half, half addrspace(1)* %0, i64 3
+  store half %7, half addrspace(1)* %8, align 2, !tbaa !27
+  %9 = tail call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_addDh(half 0xH0000) #2 ; scan_inclusive_{add,mul,min,max} -> %0[4..7]
+  %10 = getelementptr inbounds half, half addrspace(1)* %0, i64 4
+  store half %9, half addrspace(1)* %10, align 2, !tbaa !27
+  %11 = tail call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_mulDh(half 0xH0000) #2
+  %12 = getelementptr inbounds half, half addrspace(1)* %0, i64 5
+  store half %11, half addrspace(1)* %12, align 2, !tbaa !27
+  %13 = tail call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_minDh(half 0xH0000) #2
+  %14 = getelementptr inbounds half, half addrspace(1)* %0, i64 6
+  store half %13, half addrspace(1)* %14, align 2, !tbaa !27
+  %15 = tail call spir_func half @_Z40sub_group_non_uniform_scan_inclusive_maxDh(half 0xH0000) #2
+  %16 = getelementptr inbounds half, half addrspace(1)* %0, i64 7
+  store half %15, half addrspace(1)* %16, align 2, !tbaa !27
+  %17 = tail call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_addDh(half 0xH0000) #2 ; scan_exclusive_{add,mul,min,max} -> %0[8..11]
+  %18 = getelementptr inbounds half, half addrspace(1)* %0, i64 8
+  store half %17, half addrspace(1)* %18, align 2, !tbaa !27
+  %19 = tail call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_mulDh(half 0xH0000) #2
+  %20 = getelementptr inbounds half, half addrspace(1)* %0, i64 9
+  store half %19, half addrspace(1)* %20, align 2, !tbaa !27
+  %21 = tail call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_minDh(half 0xH0000) #2
+  %22 = getelementptr inbounds half, half addrspace(1)* %0, i64 10
+  store half %21, half addrspace(1)* %22, align 2, !tbaa !27
+  %23 = tail call spir_func half @_Z40sub_group_non_uniform_scan_exclusive_maxDh(half 0xH0000) #2
+  %24 = getelementptr inbounds half, half addrspace(1)* %0, i64 11
+  store half %23, half addrspace(1)* %24, align 2, !tbaa !27
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z32sub_group_non_uniform_reduce_addDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z32sub_group_non_uniform_reduce_mulDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z32sub_group_non_uniform_reduce_minDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z32sub_group_non_uniform_reduce_maxDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_inclusive_addDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_inclusive_mulDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_inclusive_minDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_inclusive_maxDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_exclusive_addDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_exclusive_mulDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_exclusive_minDh(half) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func half @_Z40sub_group_non_uniform_scan_exclusive_maxDh(half) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformFAdd [[double]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[double]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[double]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[double]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFAdd [[double]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[double]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[double]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[double]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFAdd [[double]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMul [[double]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMin [[double]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[double_0]]
+; CHECK-SPIRV: GroupNonUniformFMax [[double]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[double_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformArithmeticDouble
+; CHECK-LLVM: call spir_func double @_Z32sub_group_non_uniform_reduce_addd(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z32sub_group_non_uniform_reduce_muld(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z32sub_group_non_uniform_reduce_mind(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z32sub_group_non_uniform_reduce_maxd(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_addd(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_muld(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_mind(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_maxd(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_addd(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_muld(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_mind(double 0.000000e+00)
+; CHECK-LLVM: call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_maxd(double 0.000000e+00)
+
+; Function Attrs: convergent nounwind. Kernel: calls each 'd'-suffixed (Itanium code for double) sub-group non-uniform add/mul/min/max built-in on constant 0.0 and stores the k-th result at %0[k].
+define dso_local spir_kernel void @testNonUniformArithmeticDouble(double addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !29 !kernel_arg_base_type !29 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func double @_Z32sub_group_non_uniform_reduce_addd(double 0.000000e+00) #2 ; reduce_{add,mul,min,max} -> %0[0..3]
+  store double %2, double addrspace(1)* %0, align 8, !tbaa !30
+  %3 = tail call spir_func double @_Z32sub_group_non_uniform_reduce_muld(double 0.000000e+00) #2
+  %4 = getelementptr inbounds double, double addrspace(1)* %0, i64 1
+  store double %3, double addrspace(1)* %4, align 8, !tbaa !30
+  %5 = tail call spir_func double @_Z32sub_group_non_uniform_reduce_mind(double 0.000000e+00) #2
+  %6 = getelementptr inbounds double, double addrspace(1)* %0, i64 2
+  store double %5, double addrspace(1)* %6, align 8, !tbaa !30
+  %7 = tail call spir_func double @_Z32sub_group_non_uniform_reduce_maxd(double 0.000000e+00) #2
+  %8 = getelementptr inbounds double, double addrspace(1)* %0, i64 3
+  store double %7, double addrspace(1)* %8, align 8, !tbaa !30
+  %9 = tail call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_addd(double 0.000000e+00) #2 ; scan_inclusive_{add,mul,min,max} -> %0[4..7]
+  %10 = getelementptr inbounds double, double addrspace(1)* %0, i64 4
+  store double %9, double addrspace(1)* %10, align 8, !tbaa !30
+  %11 = tail call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_muld(double 0.000000e+00) #2
+  %12 = getelementptr inbounds double, double addrspace(1)* %0, i64 5
+  store double %11, double addrspace(1)* %12, align 8, !tbaa !30
+  %13 = tail call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_mind(double 0.000000e+00) #2
+  %14 = getelementptr inbounds double, double addrspace(1)* %0, i64 6
+  store double %13, double addrspace(1)* %14, align 8, !tbaa !30
+  %15 = tail call spir_func double @_Z40sub_group_non_uniform_scan_inclusive_maxd(double 0.000000e+00) #2
+  %16 = getelementptr inbounds double, double addrspace(1)* %0, i64 7
+  store double %15, double addrspace(1)* %16, align 8, !tbaa !30
+  %17 = tail call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_addd(double 0.000000e+00) #2 ; scan_exclusive_{add,mul,min,max} -> %0[8..11]
+  %18 = getelementptr inbounds double, double addrspace(1)* %0, i64 8
+  store double %17, double addrspace(1)* %18, align 8, !tbaa !30
+  %19 = tail call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_muld(double 0.000000e+00) #2
+  %20 = getelementptr inbounds double, double addrspace(1)* %0, i64 9
+  store double %19, double addrspace(1)* %20, align 8, !tbaa !30
+  %21 = tail call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_mind(double 0.000000e+00) #2
+  %22 = getelementptr inbounds double, double addrspace(1)* %0, i64 10
+  store double %21, double addrspace(1)* %22, align 8, !tbaa !30
+  %23 = tail call spir_func double @_Z40sub_group_non_uniform_scan_exclusive_maxd(double 0.000000e+00) #2
+  %24 = getelementptr inbounds double, double addrspace(1)* %0, i64 11
+  store double %23, double addrspace(1)* %24, align 8, !tbaa !30
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z32sub_group_non_uniform_reduce_addd(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z32sub_group_non_uniform_reduce_muld(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z32sub_group_non_uniform_reduce_mind(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z32sub_group_non_uniform_reduce_maxd(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_inclusive_addd(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_inclusive_muld(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_inclusive_mind(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_inclusive_maxd(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_exclusive_addd(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_exclusive_muld(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_exclusive_mind(double) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func double @_Z40sub_group_non_uniform_scan_exclusive_maxd(double) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseChar
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_andc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z31sub_group_non_uniform_reduce_orc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_xorc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_andc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z39sub_group_non_uniform_scan_inclusive_orc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_xorc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_andc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z39sub_group_non_uniform_scan_exclusive_orc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_xorc(i8 0)
+
+; Function Attrs: convergent nounwind. Kernel: calls each 'c'-suffixed (Itanium code for char) sub-group non-uniform and/or/xor built-in on constant 0 and stores the k-th result at %0[k].
+define dso_local spir_kernel void @testNonUniformBitwiseChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i8 @_Z32sub_group_non_uniform_reduce_andc(i8 signext 0) #2 ; reduce_{and,or,xor} -> %0[0..2]
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7
+  %3 = tail call spir_func signext i8 @_Z31sub_group_non_uniform_reduce_orc(i8 signext 0) #2
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7
+  %5 = tail call spir_func signext i8 @_Z32sub_group_non_uniform_reduce_xorc(i8 signext 0) #2
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7
+  %7 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_andc(i8 signext 0) #2 ; scan_inclusive_{and,or,xor} -> %0[3..5]
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !7
+  %9 = tail call spir_func signext i8 @_Z39sub_group_non_uniform_scan_inclusive_orc(i8 signext 0) #2
+  %10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 4
+  store i8 %9, i8 addrspace(1)* %10, align 1, !tbaa !7
+  %11 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_xorc(i8 signext 0) #2
+  %12 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 5
+  store i8 %11, i8 addrspace(1)* %12, align 1, !tbaa !7
+  %13 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_andc(i8 signext 0) #2 ; scan_exclusive_{and,or,xor} -> %0[6..8]
+  %14 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 6
+  store i8 %13, i8 addrspace(1)* %14, align 1, !tbaa !7
+  %15 = tail call spir_func signext i8 @_Z39sub_group_non_uniform_scan_exclusive_orc(i8 signext 0) #2
+  %16 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 7
+  store i8 %15, i8 addrspace(1)* %16, align 1, !tbaa !7
+  %17 = tail call spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_xorc(i8 signext 0) #2
+  %18 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+  store i8 %17, i8 addrspace(1)* %18, align 1, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent -- external declarations of the nine char ('c'-mangled) functions called by @testNonUniformBitwiseChar follow; the demangled signature is given after each.
+declare dso_local spir_func signext i8 @_Z32sub_group_non_uniform_reduce_andc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_and(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z31sub_group_non_uniform_reduce_orc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_or(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z32sub_group_non_uniform_reduce_xorc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_xor(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_andc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_and(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z39sub_group_non_uniform_scan_inclusive_orc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_or(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_inclusive_xorc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_xor(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_andc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_and(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z39sub_group_non_uniform_scan_exclusive_orc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_or(char)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i8 @_Z40sub_group_non_uniform_scan_exclusive_xorc(i8 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_xor(char)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[char]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[char_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseUChar
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_andc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z31sub_group_non_uniform_reduce_orc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z32sub_group_non_uniform_reduce_xorc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_andc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z39sub_group_non_uniform_scan_inclusive_orc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_inclusive_xorc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_andc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z39sub_group_non_uniform_scan_exclusive_orc(i8 0)
+; CHECK-LLVM: call spir_func i8 @_Z40sub_group_non_uniform_scan_exclusive_xorc(i8 0)
+
+; Function Attrs: convergent nounwind. Kernel summary: calls nine unsigned char ('h'-mangled) sub_group_non_uniform_{reduce,scan_inclusive,scan_exclusive}_{and,or,xor} functions, each once with constant 0, and stores the i8 results to elements 0-8 of the pointer argument %0. NOTE(review): the LLVM-side expectations listed before this kernel name the 'c'-suffixed functions rather than the 'h' ones called here; presumably signedness is not carried through SPIR-V -- confirm.
+define dso_local spir_kernel void @testNonUniformBitwiseUChar(i8 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_andh(i8 zeroext 0) #2 ; reduce_and, stored to %0[0]
+  store i8 %2, i8 addrspace(1)* %0, align 1, !tbaa !7
+  %3 = tail call spir_func zeroext i8 @_Z31sub_group_non_uniform_reduce_orh(i8 zeroext 0) #2 ; reduce_or, stored to %0[1]
+  %4 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 1
+  store i8 %3, i8 addrspace(1)* %4, align 1, !tbaa !7
+  %5 = tail call spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_xorh(i8 zeroext 0) #2 ; reduce_xor, stored to %0[2]
+  %6 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 2
+  store i8 %5, i8 addrspace(1)* %6, align 1, !tbaa !7
+  %7 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_andh(i8 zeroext 0) #2 ; scan_inclusive_and, stored to %0[3]
+  %8 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 3
+  store i8 %7, i8 addrspace(1)* %8, align 1, !tbaa !7
+  %9 = tail call spir_func zeroext i8 @_Z39sub_group_non_uniform_scan_inclusive_orh(i8 zeroext 0) #2 ; scan_inclusive_or, stored to %0[4]
+  %10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 4
+  store i8 %9, i8 addrspace(1)* %10, align 1, !tbaa !7
+  %11 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_xorh(i8 zeroext 0) #2 ; scan_inclusive_xor, stored to %0[5]
+  %12 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 5
+  store i8 %11, i8 addrspace(1)* %12, align 1, !tbaa !7
+  %13 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_andh(i8 zeroext 0) #2 ; scan_exclusive_and, stored to %0[6]
+  %14 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 6
+  store i8 %13, i8 addrspace(1)* %14, align 1, !tbaa !7
+  %15 = tail call spir_func zeroext i8 @_Z39sub_group_non_uniform_scan_exclusive_orh(i8 zeroext 0) #2 ; scan_exclusive_or, stored to %0[7]
+  %16 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 7
+  store i8 %15, i8 addrspace(1)* %16, align 1, !tbaa !7
+  %17 = tail call spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_xorh(i8 zeroext 0) #2 ; scan_exclusive_xor, stored to %0[8]
+  %18 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+  store i8 %17, i8 addrspace(1)* %18, align 1, !tbaa !7
+  ret void
+}
+
+; Function Attrs: convergent -- external declarations of the nine unsigned char ('h'-mangled) functions called by @testNonUniformBitwiseUChar follow; the demangled signature is given after each.
+declare dso_local spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_andh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_and(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z31sub_group_non_uniform_reduce_orh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_or(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z32sub_group_non_uniform_reduce_xorh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_xor(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_andh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_and(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z39sub_group_non_uniform_scan_inclusive_orh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_or(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_inclusive_xorh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_xor(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_andh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_and(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z39sub_group_non_uniform_scan_exclusive_orh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_or(unsigned char)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i8 @_Z40sub_group_non_uniform_scan_exclusive_xorh(i8 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_xor(unsigned char)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseShort
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_ands(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z31sub_group_non_uniform_reduce_ors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_xors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_ands(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z39sub_group_non_uniform_scan_inclusive_ors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_xors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_ands(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z39sub_group_non_uniform_scan_exclusive_ors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_xors(i16 0)
+
+; Function Attrs: convergent nounwind. Kernel summary: calls nine short ('s'-mangled) sub_group_non_uniform_{reduce,scan_inclusive,scan_exclusive}_{and,or,xor} functions, each once with constant 0, and stores the i16 results to elements 0-8 of the pointer argument %0.
+define dso_local spir_kernel void @testNonUniformBitwiseShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func signext i16 @_Z32sub_group_non_uniform_reduce_ands(i16 signext 0) #2 ; reduce_and, stored to %0[0]
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12
+  %3 = tail call spir_func signext i16 @_Z31sub_group_non_uniform_reduce_ors(i16 signext 0) #2 ; reduce_or, stored to %0[1]
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12
+  %5 = tail call spir_func signext i16 @_Z32sub_group_non_uniform_reduce_xors(i16 signext 0) #2 ; reduce_xor, stored to %0[2]
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12
+  %7 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_ands(i16 signext 0) #2 ; scan_inclusive_and, stored to %0[3]
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !12
+  %9 = tail call spir_func signext i16 @_Z39sub_group_non_uniform_scan_inclusive_ors(i16 signext 0) #2 ; scan_inclusive_or, stored to %0[4]
+  %10 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 4
+  store i16 %9, i16 addrspace(1)* %10, align 2, !tbaa !12
+  %11 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_xors(i16 signext 0) #2 ; scan_inclusive_xor, stored to %0[5]
+  %12 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 5
+  store i16 %11, i16 addrspace(1)* %12, align 2, !tbaa !12
+  %13 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_ands(i16 signext 0) #2 ; scan_exclusive_and, stored to %0[6]
+  %14 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 6
+  store i16 %13, i16 addrspace(1)* %14, align 2, !tbaa !12
+  %15 = tail call spir_func signext i16 @_Z39sub_group_non_uniform_scan_exclusive_ors(i16 signext 0) #2 ; scan_exclusive_or, stored to %0[7]
+  %16 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 7
+  store i16 %15, i16 addrspace(1)* %16, align 2, !tbaa !12
+  %17 = tail call spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_xors(i16 signext 0) #2 ; scan_exclusive_xor, stored to %0[8]
+  %18 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 8
+  store i16 %17, i16 addrspace(1)* %18, align 2, !tbaa !12
+  ret void
+}
+
+; Function Attrs: convergent -- external declarations of the nine short ('s'-mangled) functions called by @testNonUniformBitwiseShort follow; the demangled signature is given after each.
+declare dso_local spir_func signext i16 @_Z32sub_group_non_uniform_reduce_ands(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_and(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z31sub_group_non_uniform_reduce_ors(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_or(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z32sub_group_non_uniform_reduce_xors(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_xor(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_ands(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_and(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z39sub_group_non_uniform_scan_inclusive_ors(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_or(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_inclusive_xors(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_xor(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_ands(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_and(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z39sub_group_non_uniform_scan_exclusive_ors(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_or(short)
+
+; Function Attrs: convergent
+declare dso_local spir_func signext i16 @_Z40sub_group_non_uniform_scan_exclusive_xors(i16 signext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_xor(short)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[short]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[short_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseUShort
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_ands(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z31sub_group_non_uniform_reduce_ors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z32sub_group_non_uniform_reduce_xors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_ands(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z39sub_group_non_uniform_scan_inclusive_ors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_inclusive_xors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_ands(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z39sub_group_non_uniform_scan_exclusive_ors(i16 0)
+; CHECK-LLVM: call spir_func i16 @_Z40sub_group_non_uniform_scan_exclusive_xors(i16 0)
+
+; Function Attrs: convergent nounwind. Kernel summary: calls nine unsigned short ('t'-mangled) sub_group_non_uniform_{reduce,scan_inclusive,scan_exclusive}_{and,or,xor} functions, each once with constant 0, and stores the i16 results to elements 0-8 of the pointer argument %0. NOTE(review): the LLVM-side expectations listed before this kernel name the 's'-suffixed functions rather than the 't' ones called here; presumably signedness is not carried through SPIR-V -- confirm.
+define dso_local spir_kernel void @testNonUniformBitwiseUShort(i16 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_andt(i16 zeroext 0) #2 ; reduce_and, stored to %0[0]
+  store i16 %2, i16 addrspace(1)* %0, align 2, !tbaa !12
+  %3 = tail call spir_func zeroext i16 @_Z31sub_group_non_uniform_reduce_ort(i16 zeroext 0) #2 ; reduce_or, stored to %0[1]
+  %4 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 1
+  store i16 %3, i16 addrspace(1)* %4, align 2, !tbaa !12
+  %5 = tail call spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_xort(i16 zeroext 0) #2 ; reduce_xor, stored to %0[2]
+  %6 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 2
+  store i16 %5, i16 addrspace(1)* %6, align 2, !tbaa !12
+  %7 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_andt(i16 zeroext 0) #2 ; scan_inclusive_and, stored to %0[3]
+  %8 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 3
+  store i16 %7, i16 addrspace(1)* %8, align 2, !tbaa !12
+  %9 = tail call spir_func zeroext i16 @_Z39sub_group_non_uniform_scan_inclusive_ort(i16 zeroext 0) #2 ; scan_inclusive_or, stored to %0[4]
+  %10 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 4
+  store i16 %9, i16 addrspace(1)* %10, align 2, !tbaa !12
+  %11 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_xort(i16 zeroext 0) #2 ; scan_inclusive_xor, stored to %0[5]
+  %12 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 5
+  store i16 %11, i16 addrspace(1)* %12, align 2, !tbaa !12
+  %13 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_andt(i16 zeroext 0) #2 ; scan_exclusive_and, stored to %0[6]
+  %14 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 6
+  store i16 %13, i16 addrspace(1)* %14, align 2, !tbaa !12
+  %15 = tail call spir_func zeroext i16 @_Z39sub_group_non_uniform_scan_exclusive_ort(i16 zeroext 0) #2 ; scan_exclusive_or, stored to %0[7]
+  %16 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 7
+  store i16 %15, i16 addrspace(1)* %16, align 2, !tbaa !12
+  %17 = tail call spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_xort(i16 zeroext 0) #2 ; scan_exclusive_xor, stored to %0[8]
+  %18 = getelementptr inbounds i16, i16 addrspace(1)* %0, i64 8
+  store i16 %17, i16 addrspace(1)* %18, align 2, !tbaa !12
+  ret void
+}
+
+; Function Attrs: convergent -- external declarations of the nine unsigned short ('t'-mangled) functions called by @testNonUniformBitwiseUShort follow; the demangled signature is given after each.
+declare dso_local spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_andt(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_and(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z31sub_group_non_uniform_reduce_ort(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_or(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z32sub_group_non_uniform_reduce_xort(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_xor(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_andt(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_and(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z39sub_group_non_uniform_scan_inclusive_ort(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_or(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_inclusive_xort(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_xor(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_andt(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_and(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z39sub_group_non_uniform_scan_exclusive_ort(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_or(unsigned short)
+
+; Function Attrs: convergent
+declare dso_local spir_func zeroext i16 @_Z40sub_group_non_uniform_scan_exclusive_xort(i16 zeroext) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_xor(unsigned short)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseInt
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_andi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_reduce_ori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_xori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_andi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z39sub_group_non_uniform_scan_inclusive_ori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_xori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_andi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z39sub_group_non_uniform_scan_exclusive_ori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_xori(i32 0)
+
+; Function Attrs: convergent nounwind. Kernel summary: calls nine int ('i'-mangled) sub_group_non_uniform_{reduce,scan_inclusive,scan_exclusive}_{and,or,xor} functions, each once with constant 0, and stores the i32 results to elements 0-8 of the pointer argument %0.
+define dso_local spir_kernel void @testNonUniformBitwiseInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_andi(i32 0) #2 ; reduce_and, stored to %0[0]
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16
+  %3 = tail call spir_func i32 @_Z31sub_group_non_uniform_reduce_ori(i32 0) #2 ; reduce_or, stored to %0[1]
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16
+  %5 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_xori(i32 0) #2 ; reduce_xor, stored to %0[2]
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16
+  %7 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_andi(i32 0) #2 ; scan_inclusive_and, stored to %0[3]
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %7, i32 addrspace(1)* %8, align 4, !tbaa !16
+  %9 = tail call spir_func i32 @_Z39sub_group_non_uniform_scan_inclusive_ori(i32 0) #2 ; scan_inclusive_or, stored to %0[4]
+  %10 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 4
+  store i32 %9, i32 addrspace(1)* %10, align 4, !tbaa !16
+  %11 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_xori(i32 0) #2 ; scan_inclusive_xor, stored to %0[5]
+  %12 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 5
+  store i32 %11, i32 addrspace(1)* %12, align 4, !tbaa !16
+  %13 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_andi(i32 0) #2 ; scan_exclusive_and, stored to %0[6]
+  %14 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 6
+  store i32 %13, i32 addrspace(1)* %14, align 4, !tbaa !16
+  %15 = tail call spir_func i32 @_Z39sub_group_non_uniform_scan_exclusive_ori(i32 0) #2 ; scan_exclusive_or, stored to %0[7]
+  %16 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 7
+  store i32 %15, i32 addrspace(1)* %16, align 4, !tbaa !16
+  %17 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_xori(i32 0) #2 ; scan_exclusive_xor, stored to %0[8]
+  %18 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 8
+  store i32 %17, i32 addrspace(1)* %18, align 4, !tbaa !16
+  ret void
+}
+
+; Function Attrs: convergent -- external declarations of the nine int ('i'-mangled) functions called by @testNonUniformBitwiseInt follow; the demangled signature is given after each.
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_andi(i32) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_and(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_reduce_ori(i32) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_or(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_xori(i32) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_xor(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_andi(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_and(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z39sub_group_non_uniform_scan_inclusive_ori(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_or(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_xori(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_xor(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_andi(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_and(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z39sub_group_non_uniform_scan_exclusive_ori(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_or(int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_xori(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_xor(int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[int]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[int_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseUInt
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_andi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z31sub_group_non_uniform_reduce_ori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z32sub_group_non_uniform_reduce_xori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_andi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z39sub_group_non_uniform_scan_inclusive_ori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_xori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_andi(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z39sub_group_non_uniform_scan_exclusive_ori(i32 0)
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_xori(i32 0)
+
+; Function Attrs: convergent nounwind. Kernel summary: calls nine unsigned int ('j'-mangled) sub_group_non_uniform_{reduce,scan_inclusive,scan_exclusive}_{and,or,xor} functions, each once with constant 0, and stores the i32 results to elements 0-8 of the pointer argument %0. NOTE(review): the LLVM-side expectations listed before this kernel name the 'i'-suffixed functions rather than the 'j' ones called here; presumably signedness is not carried through SPIR-V -- confirm.
+define dso_local spir_kernel void @testNonUniformBitwiseUInt(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_andj(i32 0) #2 ; reduce_and, stored to %0[0]
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16
+  %3 = tail call spir_func i32 @_Z31sub_group_non_uniform_reduce_orj(i32 0) #2 ; reduce_or, stored to %0[1]
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16
+  %5 = tail call spir_func i32 @_Z32sub_group_non_uniform_reduce_xorj(i32 0) #2 ; reduce_xor, stored to %0[2]
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16
+  %7 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_andj(i32 0) #2 ; scan_inclusive_and, stored to %0[3]
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %7, i32 addrspace(1)* %8, align 4, !tbaa !16
+  %9 = tail call spir_func i32 @_Z39sub_group_non_uniform_scan_inclusive_orj(i32 0) #2 ; scan_inclusive_or, stored to %0[4]
+  %10 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 4
+  store i32 %9, i32 addrspace(1)* %10, align 4, !tbaa !16
+  %11 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_xorj(i32 0) #2 ; scan_inclusive_xor, stored to %0[5]
+  %12 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 5
+  store i32 %11, i32 addrspace(1)* %12, align 4, !tbaa !16
+  %13 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_andj(i32 0) #2 ; scan_exclusive_and, stored to %0[6]
+  %14 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 6
+  store i32 %13, i32 addrspace(1)* %14, align 4, !tbaa !16
+  %15 = tail call spir_func i32 @_Z39sub_group_non_uniform_scan_exclusive_orj(i32 0) #2 ; scan_exclusive_or, stored to %0[7]
+  %16 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 7
+  store i32 %15, i32 addrspace(1)* %16, align 4, !tbaa !16
+  %17 = tail call spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_xorj(i32 0) #2 ; scan_exclusive_xor, stored to %0[8]
+  %18 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 8
+  store i32 %17, i32 addrspace(1)* %18, align 4, !tbaa !16
+  ret void
+}
+
+; Function Attrs: convergent -- external declarations of the nine unsigned int ('j'-mangled) functions called by @testNonUniformBitwiseUInt follow; the demangled signature is given after each.
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_andj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_and(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z31sub_group_non_uniform_reduce_orj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_or(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z32sub_group_non_uniform_reduce_xorj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_reduce_xor(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_andj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_and(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z39sub_group_non_uniform_scan_inclusive_orj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_or(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_inclusive_xorj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_inclusive_xor(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_andj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_and(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z39sub_group_non_uniform_scan_exclusive_orj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_or(unsigned int)
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_scan_exclusive_xorj(i32) local_unnamed_addr #1 ; sub_group_non_uniform_scan_exclusive_xor(unsigned int)
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseLong
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_andl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z31sub_group_non_uniform_reduce_orl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_xorl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_andl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z39sub_group_non_uniform_scan_inclusive_orl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_xorl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_andl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z39sub_group_non_uniform_scan_exclusive_orl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_xorl(i64 0)
+
+; Function Attrs: convergent nounwind. Kernel summary: calls nine long ('l'-mangled) sub_group_non_uniform_{reduce,scan_inclusive,scan_exclusive}_{and,or,xor} functions, each once with constant 0, and stores the i64 results to elements 0-8 of the pointer argument %0.
+define dso_local spir_kernel void @testNonUniformBitwiseLong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_andl(i64 0) #2 ; reduce_and, stored to %0[0]
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20
+  %3 = tail call spir_func i64 @_Z31sub_group_non_uniform_reduce_orl(i64 0) #2 ; reduce_or, stored to %0[1]
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20
+  %5 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_xorl(i64 0) #2 ; reduce_xor, stored to %0[2]
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20
+  %7 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_andl(i64 0) #2 ; scan_inclusive_and, stored to %0[3]
+  %8 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 3
+  store i64 %7, i64 addrspace(1)* %8, align 8, !tbaa !20
+  %9 = tail call spir_func i64 @_Z39sub_group_non_uniform_scan_inclusive_orl(i64 0) #2 ; scan_inclusive_or, stored to %0[4]
+  %10 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 4
+  store i64 %9, i64 addrspace(1)* %10, align 8, !tbaa !20
+  %11 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_xorl(i64 0) #2 ; scan_inclusive_xor, stored to %0[5]
+  %12 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 5
+  store i64 %11, i64 addrspace(1)* %12, align 8, !tbaa !20
+  %13 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_andl(i64 0) #2 ; scan_exclusive_and, stored to %0[6]
+  %14 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 6
+  store i64 %13, i64 addrspace(1)* %14, align 8, !tbaa !20
+  %15 = tail call spir_func i64 @_Z39sub_group_non_uniform_scan_exclusive_orl(i64 0) #2 ; scan_exclusive_or, stored to %0[7]
+  %16 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 7
+  store i64 %15, i64 addrspace(1)* %16, align 8, !tbaa !20
+  %17 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_xorl(i64 0) #2 ; scan_exclusive_xor, stored to %0[8]
+  %18 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 8
+  store i64 %17, i64 addrspace(1)* %18, align 8, !tbaa !20
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_andl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z31sub_group_non_uniform_reduce_orl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_xorl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_andl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z39sub_group_non_uniform_scan_inclusive_orl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_xorl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_andl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z39sub_group_non_uniform_scan_exclusive_orl(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_xorl(i64) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseAnd [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseOr  [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: GroupNonUniformBitwiseXor [[long]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[long_0]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformBitwiseULong
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_andl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z31sub_group_non_uniform_reduce_orl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z32sub_group_non_uniform_reduce_xorl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_andl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z39sub_group_non_uniform_scan_inclusive_orl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_xorl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_andl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z39sub_group_non_uniform_scan_exclusive_orl(i64 0)
+; CHECK-LLVM: call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_xorl(i64 0)
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformBitwiseULong(i64 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_andm(i64 0) #2
+  store i64 %2, i64 addrspace(1)* %0, align 8, !tbaa !20
+  %3 = tail call spir_func i64 @_Z31sub_group_non_uniform_reduce_orm(i64 0) #2
+  %4 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 1
+  store i64 %3, i64 addrspace(1)* %4, align 8, !tbaa !20
+  %5 = tail call spir_func i64 @_Z32sub_group_non_uniform_reduce_xorm(i64 0) #2
+  %6 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 2
+  store i64 %5, i64 addrspace(1)* %6, align 8, !tbaa !20
+  %7 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_andm(i64 0) #2
+  %8 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 3
+  store i64 %7, i64 addrspace(1)* %8, align 8, !tbaa !20
+  %9 = tail call spir_func i64 @_Z39sub_group_non_uniform_scan_inclusive_orm(i64 0) #2
+  %10 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 4
+  store i64 %9, i64 addrspace(1)* %10, align 8, !tbaa !20
+  %11 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_xorm(i64 0) #2
+  %12 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 5
+  store i64 %11, i64 addrspace(1)* %12, align 8, !tbaa !20
+  %13 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_andm(i64 0) #2
+  %14 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 6
+  store i64 %13, i64 addrspace(1)* %14, align 8, !tbaa !20
+  %15 = tail call spir_func i64 @_Z39sub_group_non_uniform_scan_exclusive_orm(i64 0) #2
+  %16 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 7
+  store i64 %15, i64 addrspace(1)* %16, align 8, !tbaa !20
+  %17 = tail call spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_xorm(i64 0) #2
+  %18 = getelementptr inbounds i64, i64 addrspace(1)* %0, i64 8
+  store i64 %17, i64 addrspace(1)* %18, align 8, !tbaa !20
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_andm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z31sub_group_non_uniform_reduce_orm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z32sub_group_non_uniform_reduce_xorm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_andm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z39sub_group_non_uniform_scan_inclusive_orm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_inclusive_xorm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_andm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z39sub_group_non_uniform_scan_exclusive_orm(i64) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i64 @_Z40sub_group_non_uniform_scan_exclusive_xorm(i64) local_unnamed_addr #1
+
+; CHECK-SPIRV-LABEL: 5 Function
+; CHECK-SPIRV: GroupNonUniformLogicalAnd [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalOr  [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalXor [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 0 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalAnd [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalOr  [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalXor [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 1 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalAnd [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalOr  [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[false]]
+; CHECK-SPIRV: GroupNonUniformLogicalXor [[bool]] {{[0-9]+}} [[ScopeSubgroup]] 2 [[false]]
+; CHECK-SPIRV: FunctionEnd
+
+; CHECK-LLVM-LABEL: @testNonUniformLogical
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_reduce_logical_andi(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z39sub_group_non_uniform_reduce_logical_ori(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z40sub_group_non_uniform_reduce_logical_xori(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z48sub_group_non_uniform_scan_inclusive_logical_andi(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z47sub_group_non_uniform_scan_inclusive_logical_ori(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z48sub_group_non_uniform_scan_inclusive_logical_xori(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z48sub_group_non_uniform_scan_exclusive_logical_andi(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z47sub_group_non_uniform_scan_exclusive_logical_ori(i32 {{.*}})
+; CHECK-LLVM: call spir_func i32 @_Z48sub_group_non_uniform_scan_exclusive_logical_xori(i32 {{.*}})
+
+; Function Attrs: convergent nounwind
+define dso_local spir_kernel void @testNonUniformLogical(i32 addrspace(1)* nocapture) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 {
+  %2 = tail call spir_func i32 @_Z40sub_group_non_uniform_reduce_logical_andi(i32 0) #2
+  store i32 %2, i32 addrspace(1)* %0, align 4, !tbaa !16
+  %3 = tail call spir_func i32 @_Z39sub_group_non_uniform_reduce_logical_ori(i32 0) #2
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  store i32 %3, i32 addrspace(1)* %4, align 4, !tbaa !16
+  %5 = tail call spir_func i32 @_Z40sub_group_non_uniform_reduce_logical_xori(i32 0) #2
+  %6 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  store i32 %5, i32 addrspace(1)* %6, align 4, !tbaa !16
+  %7 = tail call spir_func i32 @_Z48sub_group_non_uniform_scan_inclusive_logical_andi(i32 0) #2
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  store i32 %7, i32 addrspace(1)* %8, align 4, !tbaa !16
+  %9 = tail call spir_func i32 @_Z47sub_group_non_uniform_scan_inclusive_logical_ori(i32 0) #2
+  %10 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 4
+  store i32 %9, i32 addrspace(1)* %10, align 4, !tbaa !16
+  %11 = tail call spir_func i32 @_Z48sub_group_non_uniform_scan_inclusive_logical_xori(i32 0) #2
+  %12 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 5
+  store i32 %11, i32 addrspace(1)* %12, align 4, !tbaa !16
+  %13 = tail call spir_func i32 @_Z48sub_group_non_uniform_scan_exclusive_logical_andi(i32 0) #2
+  %14 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 6
+  store i32 %13, i32 addrspace(1)* %14, align 4, !tbaa !16
+  %15 = tail call spir_func i32 @_Z47sub_group_non_uniform_scan_exclusive_logical_ori(i32 0) #2
+  %16 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 7
+  store i32 %15, i32 addrspace(1)* %16, align 4, !tbaa !16
+  %17 = tail call spir_func i32 @_Z48sub_group_non_uniform_scan_exclusive_logical_xori(i32 0) #2
+  %18 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 8
+  store i32 %17, i32 addrspace(1)* %18, align 4, !tbaa !16
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_reduce_logical_andi(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z39sub_group_non_uniform_reduce_logical_ori(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z40sub_group_non_uniform_reduce_logical_xori(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z48sub_group_non_uniform_scan_inclusive_logical_andi(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z47sub_group_non_uniform_scan_inclusive_logical_ori(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z48sub_group_non_uniform_scan_inclusive_logical_xori(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z48sub_group_non_uniform_scan_exclusive_logical_andi(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z47sub_group_non_uniform_scan_exclusive_logical_ori(i32) local_unnamed_addr #1
+
+; Function Attrs: convergent
+declare dso_local spir_func i32 @_Z48sub_group_non_uniform_scan_exclusive_logical_xori(i32) local_unnamed_addr #1
+
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 9.0.1 (https://github.com/llvm/llvm-project.git cb6d58d1dcf36a29ae5dd24ff891d6552f00bac7)"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"char*"}
+!6 = !{!""}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!"uchar*"}
+!11 = !{!"short*"}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"short", !8, i64 0}
+!14 = !{!"ushort*"}
+!15 = !{!"int*"}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !8, i64 0}
+!18 = !{!"uint*"}
+!19 = !{!"long*"}
+!20 = !{!21, !21, i64 0}
+!21 = !{!"long", !8, i64 0}
+!22 = !{!"ulong*"}
+!23 = !{!"float*"}
+!24 = !{!25, !25, i64 0}
+!25 = !{!"float", !8, i64 0}
+!26 = !{!"half*"}
+!27 = !{!28, !28, i64 0}
+!28 = !{!"half", !8, i64 0}
+!29 = !{!"double*"}
+!30 = !{!31, !31, i64 0}
+!31 = !{!"double", !8, i64 0}
\ No newline at end of file

From d41ea9932b02ce75b83929b9ac51964cd7746ccc Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Tue, 21 Apr 2020 23:00:45 -0700
Subject: [PATCH 758/770] formatted with clang-format

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp   |  36 ++---
 llvm-spirv/lib/SPIRV/OCLUtil.h          |   2 +-
 llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp | 179 ++++++++++++++++++++++++
 3 files changed, 198 insertions(+), 19 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 882cb033fe3c2..e48a760776f5d 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -938,23 +938,23 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
           if (!FuncName.startswith(S))
             return true; // continue
           PreOps.push_back(G);
-          StringRef Op = StringSwitch<StringRef>(FuncName)
-            .StartsWith("ballot", "group_ballot_bit_count_")
-            .StartsWith("non_uniform", kSPIRVName::GroupNonUniformPrefix)
-            .Default(kSPIRVName::GroupPrefix);
+          StringRef Op =
+              StringSwitch<StringRef>(FuncName)
+                  .StartsWith("ballot", "group_ballot_bit_count_")
+                  .StartsWith("non_uniform", kSPIRVName::GroupNonUniformPrefix)
+                  .Default(kSPIRVName::GroupPrefix);
           // clustered functions are handled with non uniform group opcodes
           StringRef ClusteredOp =
               FuncName.contains("clustered_") ? "non_uniform_" : "";
-          StringRef LogicalOp =
-            FuncName.contains("logical_") ?
-            "logical_" : "";
+          StringRef LogicalOp = FuncName.contains("logical_") ? "logical_" : "";
           StringRef GroupOp = StringSwitch<StringRef>(FuncName)
-            .Case("ballot_bit_count", "add")
-            .Case("ballot_inclusive_scan", "add")
-            .Case("ballot_exclusive_scan", "add")
-            .Default(FuncName.take_back(3));    // assumes op is three characters
+                                  .Case("ballot_bit_count", "add")
+                                  .Case("ballot_inclusive_scan", "add")
+                                  .Case("ballot_exclusive_scan", "add")
+                                  .Default(FuncName.take_back(
+                                      3)); // assumes op is three characters
           if (GroupOp.startswith("_"))
-            GroupOp = GroupOp.take_back(2);     // when op is two characters
+            GroupOp = GroupOp.take_back(2); // when op is two characters
           assert(!GroupOp.empty() && "Invalid OpenCL group builtin function");
           char OpTyC = 0;
           auto OpTy = F->getReturnType();
@@ -968,10 +968,9 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
               // clustered reduce args are (type, uint)
               // other operation args are (type)
               auto mangledName = F->getName();
-              auto mangledTyC =
-                ClusteredOp.empty() ?
-                mangledName.back() :
-                mangledName.take_back(2).front();
+              auto mangledTyC = ClusteredOp.empty()
+                                    ? mangledName.back()
+                                    : mangledName.take_back(2).front();
               if (isMangledTypeSigned(mangledTyC))
                 OpTyC = 's';
               else
@@ -981,7 +980,7 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
             llvm_unreachable("Invalid OpenCL group builtin argument type");
 
           DemangledName = Op.str() + ClusteredOp.str() + LogicalOp.str() +
-            OpTyC + GroupOp.str();
+                          OpTyC + GroupOp.str();
           return false; // break out of loop
         });
   }
@@ -996,7 +995,8 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
   const bool IsLogical = DemangledName.find("_logical") != std::string::npos;
 
   const bool HasBoolReturnType = IsElect || IsAllOrAny || IsAllEqual ||
-      IsInverseBallot || IsBallotBitExtract || IsLogical;
+                                 IsInverseBallot || IsBallotBitExtract ||
+                                 IsLogical;
   const bool HasBoolArg = (IsAllOrAny && !IsAllEqual) || IsBallot || IsLogical;
 
   auto Consts = getInt32(M, PreOps);
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index 8c30718fd9183..d91316e0a310f 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -736,7 +736,7 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   _SPIRV_OP(max, SMax)
   _SPIRV_OP(and, And)
   _SPIRV_OP(or, Or)
-  _SPIRV_OP (xor, Xor)
+  _SPIRV_OP(xor, Xor)
 #undef _SPIRV_OP
 #define _SPIRV_OP(x, y) add("atomic_" #x, Op##y);
   // CL 2.0 atomic builtins
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp b/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp
index f432a85c80177..cb4cacb7c0c61 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp
@@ -432,6 +432,7 @@ enum Decoration {
   DecorationNonWritable = 24,
   DecorationNonReadable = 25,
   DecorationUniform = 26,
+  DecorationUniformId = 27,
   DecorationSaturatedConversion = 28,
   DecorationStream = 29,
   DecorationLocation = 30,
@@ -450,15 +451,32 @@ enum Decoration {
   DecorationInputAttachmentIndex = 43,
   DecorationAlignment = 44,
   DecorationMaxByteOffset = 45,
+  DecorationAlignmentId = 46,
+  DecorationMaxByteOffsetId = 47,
   DecorationNoSignedWrap = 4469,
   DecorationNoUnsignedWrap = 4470,
+  DecorationExplicitInterpAMD = 4999,
   DecorationOverrideCoverageNV = 5248,
   DecorationPassthroughNV = 5250,
   DecorationViewportRelativeNV = 5252,
   DecorationSecondaryViewportRelativeNV = 5256,
+  DecorationPerPrimitiveNV = 5271,
+  DecorationPerViewNV = 5272,
+  DecorationPerTaskNV = 5273,
+  DecorationPerVertexNV = 5285,
+  DecorationNonUniform = 5300,
+  DecorationNonUniformEXT = 5300,
+  DecorationRestrictPointer = 5355,
+  DecorationRestrictPointerEXT = 5355,
+  DecorationAliasedPointer = 5356,
+  DecorationAliasedPointerEXT = 5356,
   DecorationReferencedIndirectlyINTEL = 5602,
   DecorationSideEffectsINTEL = 5608,
+  DecorationCounterBuffer = 5634,
+  DecorationHlslCounterBufferGOOGLE = 5634,
+  DecorationHlslSemanticGOOGLE = 5635,
   DecorationUserSemantic = 5635,
+  DecorationUserTypeGOOGLE = 5636,
   DecorationRegisterINTEL = 5825,
   DecorationMemoryINTEL = 5826,
   DecorationNumbanksINTEL = 5827,
@@ -809,6 +827,16 @@ enum Capability {
   CapabilitySubgroupDispatch = 58,
   CapabilityNamedBarrier = 59,
   CapabilityPipeStorage = 60,
+  CapabilityGroupNonUniform = 61,
+  CapabilityGroupNonUniformVote = 62,
+  CapabilityGroupNonUniformArithmetic = 63,
+  CapabilityGroupNonUniformBallot = 64,
+  CapabilityGroupNonUniformShuffle = 65,
+  CapabilityGroupNonUniformShuffleRelative = 66,
+  CapabilityGroupNonUniformClustered = 67,
+  CapabilityGroupNonUniformQuad = 68,
+  CapabilityShaderLayer = 69,
+  CapabilityShaderViewportIndex = 70,
   CapabilitySubgroupBallotKHR = 4423,
   CapabilityDrawParameters = 4427,
   CapabilitySubgroupVoteKHR = 4431,
@@ -822,16 +850,80 @@ enum Capability {
   CapabilityMultiView = 4439,
   CapabilityVariablePointersStorageBuffer = 4441,
   CapabilityVariablePointers = 4442,
+  CapabilityAtomicStorageOps = 4445,
+  CapabilitySampleMaskPostDepthCoverage = 4447,
+  CapabilityStorageBuffer8BitAccess = 4448,
+  CapabilityUniformAndStorageBuffer8BitAccess = 4449,
+  CapabilityStoragePushConstant8 = 4450,
+  CapabilityDenormPreserve = 4464,
+  CapabilityDenormFlushToZero = 4465,
+  CapabilitySignedZeroInfNanPreserve = 4466,
+  CapabilityRoundingModeRTE = 4467,
+  CapabilityRoundingModeRTZ = 4468,
+  CapabilityFloat16ImageAMD = 5008,
+  CapabilityImageGatherBiasLodAMD = 5009,
+  CapabilityFragmentMaskAMD = 5010,
+  CapabilityStencilExportEXT = 5013,
+  CapabilityImageReadWriteLodAMD = 5015,
+  CapabilityShaderClockKHR = 5055,
   CapabilitySampleMaskOverrideCoverageNV = 5249,
   CapabilityGeometryShaderPassthroughNV = 5251,
+  CapabilityShaderViewportIndexLayerEXT = 5254,
   CapabilityShaderViewportIndexLayerNV = 5254,
   CapabilityShaderViewportMaskNV = 5255,
   CapabilityShaderStereoViewNV = 5259,
   CapabilityPerViewAttributesNV = 5260,
+  CapabilityFragmentFullyCoveredEXT = 5265,
+  CapabilityMeshShadingNV = 5266,
+  CapabilityImageFootprintNV = 5282,
+  CapabilityFragmentBarycentricNV = 5284,
+  CapabilityComputeDerivativeGroupQuadsNV = 5288,
+  CapabilityFragmentDensityEXT = 5291,
+  CapabilityShadingRateNV = 5291,
+  CapabilityGroupNonUniformPartitionedNV = 5297,
+  CapabilityShaderNonUniform = 5301,
+  CapabilityShaderNonUniformEXT = 5301,
+  CapabilityRuntimeDescriptorArray = 5302,
+  CapabilityRuntimeDescriptorArrayEXT = 5302,
+  CapabilityInputAttachmentArrayDynamicIndexing = 5303,
+  CapabilityInputAttachmentArrayDynamicIndexingEXT = 5303,
+  CapabilityUniformTexelBufferArrayDynamicIndexing = 5304,
+  CapabilityUniformTexelBufferArrayDynamicIndexingEXT = 5304,
+  CapabilityStorageTexelBufferArrayDynamicIndexing = 5305,
+  CapabilityStorageTexelBufferArrayDynamicIndexingEXT = 5305,
+  CapabilityUniformBufferArrayNonUniformIndexing = 5306,
+  CapabilityUniformBufferArrayNonUniformIndexingEXT = 5306,
+  CapabilitySampledImageArrayNonUniformIndexing = 5307,
+  CapabilitySampledImageArrayNonUniformIndexingEXT = 5307,
+  CapabilityStorageBufferArrayNonUniformIndexing = 5308,
+  CapabilityStorageBufferArrayNonUniformIndexingEXT = 5308,
+  CapabilityStorageImageArrayNonUniformIndexing = 5309,
+  CapabilityStorageImageArrayNonUniformIndexingEXT = 5309,
+  CapabilityInputAttachmentArrayNonUniformIndexing = 5310,
+  CapabilityInputAttachmentArrayNonUniformIndexingEXT = 5310,
+  CapabilityUniformTexelBufferArrayNonUniformIndexing = 5311,
+  CapabilityUniformTexelBufferArrayNonUniformIndexingEXT = 5311,
+  CapabilityStorageTexelBufferArrayNonUniformIndexing = 5312,
+  CapabilityStorageTexelBufferArrayNonUniformIndexingEXT = 5312,
+  CapabilityRayTracingNV = 5340,
+  CapabilityVulkanMemoryModel = 5345,
+  CapabilityVulkanMemoryModelKHR = 5345,
+  CapabilityVulkanMemoryModelDeviceScope = 5346,
+  CapabilityVulkanMemoryModelDeviceScopeKHR = 5346,
+  CapabilityPhysicalStorageBufferAddresses = 5347,
+  CapabilityPhysicalStorageBufferAddressesEXT = 5347,
+  CapabilityComputeDerivativeGroupLinearNV = 5350,
+  CapabilityCooperativeMatrixNV = 5357,
+  CapabilityFragmentShaderSampleInterlockEXT = 5363,
+  CapabilityFragmentShaderShadingRateInterlockEXT = 5372,
+  CapabilityShaderSMBuiltinsNV = 5373,
+  CapabilityFragmentShaderPixelInterlockEXT = 5378,
+  CapabilityDemoteToHelperInvocationEXT = 5379,
   CapabilitySubgroupShuffleINTEL = 5568,
   CapabilitySubgroupBufferBlockIOINTEL = 5569,
   CapabilitySubgroupImageBlockIOINTEL = 5570,
   CapabilitySubgroupImageMediaBlockIOINTEL = 5579,
+  CapabilityIntegerFunctions2INTEL = 5584,
   CapabilityFunctionPointersINTEL = 5603,
   CapabilityIndirectReferencesINTEL = 5604,
   CapabilityAsmINTEL = 5606,
@@ -1156,6 +1248,46 @@ enum Op {
   OpNamedBarrierInitialize = 328,
   OpMemoryNamedBarrier = 329,
   OpModuleProcessed = 330,
+  OpExecutionModeId = 331,
+  OpDecorateId = 332,
+  OpGroupNonUniformElect = 333,
+  OpGroupNonUniformAll = 334,
+  OpGroupNonUniformAny = 335,
+  OpGroupNonUniformAllEqual = 336,
+  OpGroupNonUniformBroadcast = 337,
+  OpGroupNonUniformBroadcastFirst = 338,
+  OpGroupNonUniformBallot = 339,
+  OpGroupNonUniformInverseBallot = 340,
+  OpGroupNonUniformBallotBitExtract = 341,
+  OpGroupNonUniformBallotBitCount = 342,
+  OpGroupNonUniformBallotFindLSB = 343,
+  OpGroupNonUniformBallotFindMSB = 344,
+  OpGroupNonUniformShuffle = 345,
+  OpGroupNonUniformShuffleXor = 346,
+  OpGroupNonUniformShuffleUp = 347,
+  OpGroupNonUniformShuffleDown = 348,
+  OpGroupNonUniformIAdd = 349,
+  OpGroupNonUniformFAdd = 350,
+  OpGroupNonUniformIMul = 351,
+  OpGroupNonUniformFMul = 352,
+  OpGroupNonUniformSMin = 353,
+  OpGroupNonUniformUMin = 354,
+  OpGroupNonUniformFMin = 355,
+  OpGroupNonUniformSMax = 356,
+  OpGroupNonUniformUMax = 357,
+  OpGroupNonUniformFMax = 358,
+  OpGroupNonUniformBitwiseAnd = 359,
+  OpGroupNonUniformBitwiseOr = 360,
+  OpGroupNonUniformBitwiseXor = 361,
+  OpGroupNonUniformLogicalAnd = 362,
+  OpGroupNonUniformLogicalOr = 363,
+  OpGroupNonUniformLogicalXor = 364,
+  OpGroupNonUniformQuadBroadcast = 365,
+  OpGroupNonUniformQuadSwap = 366,
+  OpCopyLogical = 400,
+  OpPtrEqual = 401,
+  OpPtrNotEqual = 402,
+  OpPtrDiff = 403,
   OpForward = 1024, /* internal use only */
   OpSubgroupBallotKHR = 4421,
   OpSubgroupFirstInvocationKHR = 4422,
@@ -1163,6 +1295,35 @@ enum Op {
   OpSubgroupAnyKHR = 4429,
   OpSubgroupAllEqualKHR = 4430,
   OpSubgroupReadInvocationKHR = 4432,
+  OpGroupIAddNonUniformAMD = 5000,
+  OpGroupFAddNonUniformAMD = 5001,
+  OpGroupFMinNonUniformAMD = 5002,
+  OpGroupUMinNonUniformAMD = 5003,
+  OpGroupSMinNonUniformAMD = 5004,
+  OpGroupFMaxNonUniformAMD = 5005,
+  OpGroupUMaxNonUniformAMD = 5006,
+  OpGroupSMaxNonUniformAMD = 5007,
+  OpFragmentMaskFetchAMD = 5011,
+  OpFragmentFetchAMD = 5012,
+  OpReadClockKHR = 5056,
+  OpImageSampleFootprintNV = 5283,
+  OpGroupNonUniformPartitionNV = 5296,
+  OpWritePackedPrimitiveIndices4x8NV = 5299,
+  OpReportIntersectionNV = 5334,
+  OpIgnoreIntersectionNV = 5335,
+  OpTerminateRayNV = 5336,
+  OpTraceNV = 5337,
+  OpTypeAccelerationStructureNV = 5341,
+  OpExecuteCallableNV = 5344,
+  OpTypeCooperativeMatrixNV = 5358,
+  OpCooperativeMatrixLoadNV = 5359,
+  OpCooperativeMatrixStoreNV = 5360,
+  OpCooperativeMatrixMulAddNV = 5361,
+  OpCooperativeMatrixLengthNV = 5362,
+  OpBeginInvocationInterlockEXT = 5364,
+  OpEndInvocationInterlockEXT = 5365,
+  OpDemoteToHelperInvocationEXT = 5380,
+  OpIsHelperInvocationEXT = 5381,
   OpSubgroupShuffleINTEL = 5571,
   OpSubgroupShuffleDownINTEL = 5572,
   OpSubgroupShuffleUpINTEL = 5573,
@@ -1173,6 +1334,20 @@ enum Op {
   OpSubgroupImageBlockWriteINTEL = 5578,
   OpSubgroupImageMediaBlockReadINTEL = 5580,
   OpSubgroupImageMediaBlockWriteINTEL = 5581,
+  OpUCountLeadingZerosINTEL = 5585,
+  OpUCountTrailingZerosINTEL = 5586,
+  OpAbsISubINTEL = 5587,
+  OpAbsUSubINTEL = 5588,
+  OpIAddSatINTEL = 5589,
+  OpUAddSatINTEL = 5590,
+  OpIAverageINTEL = 5591,
+  OpUAverageINTEL = 5592,
+  OpIAverageRoundedINTEL = 5593,
+  OpUAverageRoundedINTEL = 5594,
+  OpISubSatINTEL = 5595,
+  OpUSubSatINTEL = 5596,
+  OpIMul32x16INTEL = 5597,
+  OpUMul32x16INTEL = 5598,
   OpFunctionPointerINTEL = 5600,
   OpFunctionPointerCallINTEL = 5601,
   OpAsmTargetINTEL = 5609,
@@ -1180,6 +1355,10 @@ enum Op {
   OpAsmCallINTEL = 5611,
   OpAssumeTrueINTEL = 5630,
   OpExpectINTEL = 5631,
+  OpDecorateString = 5632,
+  OpDecorateStringGOOGLE = 5632,
+  OpMemberDecorateString = 5633,
+  OpMemberDecorateStringGOOGLE = 5633,
   OpVmeImageINTEL = 5699,
   OpTypeVmeImageINTEL = 5700,
   OpTypeAvcImePayloadINTEL = 5701,

From 6d3b3ef7e3f655e67950e3be792a0bac8c51f545 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Tue, 21 Apr 2020 11:08:26 +0200
Subject: [PATCH 759/770] Handle capabilities related to
 cl_khr_subgroup_extensions.

---
 .../lib/SPIRV/libSPIRV/SPIRVInstruction.h     | 92 ++++++++++++++++++-
 .../lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h     |  8 ++
 .../test/transcoding/sub_group_ballot.ll      |  2 +
 .../transcoding/sub_group_clustered_reduce.ll |  2 +
 .../sub_group_non_uniform_arithmetic.ll       |  2 +
 .../transcoding/sub_group_non_uniform_vote.ll |  3 +
 .../test/transcoding/sub_group_shuffle.ll     |  2 +
 .../transcoding/sub_group_shuffle_relative.ll |  2 +
 8 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 73e681e6521dd..375d08b83ff93 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -2516,10 +2516,47 @@ _SPIRV_OP(GroupReserveReadPipePackets, true, 8)
 _SPIRV_OP(GroupReserveWritePipePackets, true, 8)
 _SPIRV_OP(GroupCommitReadPipe, false, 6)
 _SPIRV_OP(GroupCommitWritePipe, false, 6)
+#undef _SPIRV_OP
+
+class SPIRVGroupNonUniformElectInst : public SPIRVInstTemplateBase {
+public:
+  SPIRVCapVec getRequiredCapability() const override {
+    return getVec(CapabilityGroupNonUniform);
+  }
+};
+
+#define _SPIRV_OP(x, ...)                                                      \
+  typedef SPIRVInstTemplate<SPIRVGroupNonUniformElectInst, Op##x, __VA_ARGS__> \
+      SPIRV##x;
 _SPIRV_OP(GroupNonUniformElect, true, 4)
+#undef _SPIRV_OP
+
+class SPIRVGroupNonUniformVoteInst : public SPIRVInstTemplateBase {
+public:
+  SPIRVCapVec getRequiredCapability() const override {
+    return getVec(CapabilityGroupNonUniformVote);
+  }
+};
+
+#define _SPIRV_OP(x, ...)                                                      \
+  typedef SPIRVInstTemplate<SPIRVGroupNonUniformVoteInst, Op##x, __VA_ARGS__>  \
+      SPIRV##x;
 _SPIRV_OP(GroupNonUniformAll, true, 5)
 _SPIRV_OP(GroupNonUniformAny, true, 5)
 _SPIRV_OP(GroupNonUniformAllEqual, true, 5)
+#undef _SPIRV_OP
+
+class SPIRVGroupNonUniformBallotInst : public SPIRVInstTemplateBase {
+public:
+  SPIRVCapVec getRequiredCapability() const override {
+    return getVec(CapabilityGroupNonUniformBallot);
+  }
+};
+
+#define _SPIRV_OP(x, ...)                                                      \
+  typedef SPIRVInstTemplate<SPIRVGroupNonUniformBallotInst, Op##x,             \
+                            __VA_ARGS__>                                       \
+      SPIRV##x;
 _SPIRV_OP(GroupNonUniformBroadcast, true, 6)
 _SPIRV_OP(GroupNonUniformBroadcastFirst, true, 5)
 _SPIRV_OP(GroupNonUniformBallot, true, 5)
@@ -2528,10 +2565,28 @@ _SPIRV_OP(GroupNonUniformBallotBitExtract, true, 6)
 _SPIRV_OP(GroupNonUniformBallotBitCount, true, 6, false, 1)
 _SPIRV_OP(GroupNonUniformBallotFindLSB, true, 5)
 _SPIRV_OP(GroupNonUniformBallotFindMSB, true, 5)
-_SPIRV_OP(GroupNonUniformShuffle, true, 6)
-_SPIRV_OP(GroupNonUniformShuffleXor, true, 6)
-_SPIRV_OP(GroupNonUniformShuffleUp, true, 6)
-_SPIRV_OP(GroupNonUniformShuffleDown, true, 6)
+#undef _SPIRV_OP
+
+class SPIRVGroupNonUniformArithmeticInst : public SPIRVInstTemplateBase {
+public:
+  void setOpWords(const std::vector<SPIRVWord> &Ops) override {
+    SPIRVInstTemplateBase::setOpWords(Ops);
+    SPIRVGroupOperationKind GroupOp;
+    if (getSPIRVGroupOperation(GroupOp)) {
+      if (GroupOp == GroupOperationClusteredReduce)
+        Module->addCapability(CapabilityGroupNonUniformClustered);
+      else
+        Module->addCapability(CapabilityGroupNonUniformArithmetic);
+    } else
+      llvm_unreachable(
+          "GroupNonUniformArithmeticInst has no group operation operand!");
+  }
+};
+
+#define _SPIRV_OP(x, ...)                                                      \
+  typedef SPIRVInstTemplate<SPIRVGroupNonUniformArithmeticInst, Op##x,         \
+                            __VA_ARGS__>                                       \
+      SPIRV##x;
 _SPIRV_OP(GroupNonUniformIAdd, true, 6, true, 1)
 _SPIRV_OP(GroupNonUniformFAdd, true, 6, true, 1)
 _SPIRV_OP(GroupNonUniformIMul, true, 6, true, 1)
@@ -2548,7 +2603,36 @@ _SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, true, 1)
 _SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, true, 1)
 _SPIRV_OP(GroupNonUniformLogicalOr, true, 6, true, 1)
 _SPIRV_OP(GroupNonUniformLogicalXor, true, 6, true, 1)
+#undef _SPIRV_OP
+
+class SPIRVGroupNonUniformShuffleInst : public SPIRVInstTemplateBase {
+public:
+  SPIRVCapVec getRequiredCapability() const override {
+    return getVec(CapabilityGroupNonUniformShuffle);
+  }
+};
+
+#define _SPIRV_OP(x, ...)                                                      \
+  typedef SPIRVInstTemplate<SPIRVGroupNonUniformShuffleInst, Op##x,            \
+                            __VA_ARGS__>                                       \
+      SPIRV##x;
+_SPIRV_OP(GroupNonUniformShuffle, true, 6)
+_SPIRV_OP(GroupNonUniformShuffleXor, true, 6)
+#undef _SPIRV_OP
+
+class SPIRVGroupNonUniformShuffleRelativeInst : public SPIRVInstTemplateBase {
+public:
+  SPIRVCapVec getRequiredCapability() const override {
+    return getVec(CapabilityGroupNonUniformShuffleRelative);
+  }
+};
 
+#define _SPIRV_OP(x, ...)                                                      \
+  typedef SPIRVInstTemplate<SPIRVGroupNonUniformShuffleRelativeInst, Op##x,    \
+                            __VA_ARGS__>                                       \
+      SPIRV##x;
+_SPIRV_OP(GroupNonUniformShuffleUp, true, 6)
+_SPIRV_OP(GroupNonUniformShuffleDown, true, 6)
 #undef _SPIRV_OP
 
 class SPIRVBlockingPipesIntelInst : public SPIRVInstTemplateBase {
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
index 723cc7b9b1af7..3fa66434bd721 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
@@ -515,6 +515,14 @@ template <> inline void SPIRVMap<Capability, std::string>::init() {
   add(CapabilityFPGAKernelAttributesINTEL, "FPGAKernelAttributesINTEL");
   add(CapabilityIOPipeINTEL, "IOPipeINTEL");
   add(CapabilityOptimizationHintsINTEL, "OptimizationHintsINTEL");
+  add(CapabilityGroupNonUniform, "GroupNonUniform");
+  add(CapabilityGroupNonUniformVote, "GroupNonUniformVote");
+  add(CapabilityGroupNonUniformArithmetic, "GroupNonUniformArithmetic");
+  add(CapabilityGroupNonUniformBallot, "GroupNonUniformBallot");
+  add(CapabilityGroupNonUniformShuffle, "GroupNonUniformShuffle");
+  add(CapabilityGroupNonUniformShuffleRelative,
+      "GroupNonUniformShuffleRelative");
+  add(CapabilityGroupNonUniformClustered, "GroupNonUniformClustered");
 }
 SPIRV_DEF_NAMEMAP(Capability, SPIRVCapabilityNameMap)
 
diff --git a/llvm-spirv/test/transcoding/sub_group_ballot.ll b/llvm-spirv/test/transcoding/sub_group_ballot.ll
index ede9b6f5cbf8e..2092f9544912e 100644
--- a/llvm-spirv/test/transcoding/sub_group_ballot.ll
+++ b/llvm-spirv/test/transcoding/sub_group_ballot.ll
@@ -169,6 +169,8 @@ source_filename = "subgroup_ballot.cl"
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
 target triple = "spir64"
 
+; CHECK-SPIRV-DAG: {{[0-9]*}} Capability GroupNonUniformBallot
+
 ; CHECK-SPIRV-DAG: Decorate [[eqMask:[0-9]+]] BuiltIn 4416
 ; CHECK-SPIRV-DAG: Decorate [[geMask:[0-9]+]] BuiltIn 4417
 ; CHECK-SPIRV-DAG: Decorate [[gtMask:[0-9]+]] BuiltIn 4418
diff --git a/llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll b/llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll
index 3a70a4a72b901..ca687768360d2 100644
--- a/llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll
+++ b/llvm-spirv/test/transcoding/sub_group_clustered_reduce.ll
@@ -182,6 +182,8 @@
 ; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
 ; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
+; CHECK-SPIRV-DAG: {{[0-9]*}} Capability GroupNonUniformClustered
+
 ; CHECK-SPIRV-DAG: TypeBool  [[bool:[0-9]+]]
 ; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
 ; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
diff --git a/llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll b/llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll
index cbe424666f094..82a59bdbbdb19 100644
--- a/llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll
+++ b/llvm-spirv/test/transcoding/sub_group_non_uniform_arithmetic.ll
@@ -317,6 +317,8 @@
 ; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
 ; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
+; CHECK-SPIRV-DAG: {{[0-9]*}} Capability GroupNonUniformArithmetic
+
 ; CHECK-SPIRV-DAG: TypeBool  [[bool:[0-9]+]]
 ; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
 ; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
diff --git a/llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll b/llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll
index 6210a5a26f995..5174eae28502b 100644
--- a/llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll
+++ b/llvm-spirv/test/transcoding/sub_group_non_uniform_vote.ll
@@ -75,6 +75,9 @@ source_filename = "sub_group_non_uniform_vote.cl"
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
 target triple = "spir64"
 
+; CHECK-SPIRV-DAG: {{[0-9]*}} Capability GroupNonUniform
+; CHECK-SPIRV-DAG: {{[0-9]*}} Capability GroupNonUniformVote
+
 ; CHECK-SPIRV-DAG: TypeBool  [[bool:[0-9]+]]
 ; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
 ; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
diff --git a/llvm-spirv/test/transcoding/sub_group_shuffle.ll b/llvm-spirv/test/transcoding/sub_group_shuffle.ll
index 7b30bbc907e1e..b25281562af03 100644
--- a/llvm-spirv/test/transcoding/sub_group_shuffle.ll
+++ b/llvm-spirv/test/transcoding/sub_group_shuffle.ll
@@ -88,6 +88,8 @@
 ; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
 ; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
+; CHECK-SPIRV-DAG: {{[0-9]*}} Capability GroupNonUniformShuffle
+
 ; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
 ; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
 ; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0
diff --git a/llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll b/llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll
index cce29017af602..dd30aa65e11d2 100644
--- a/llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll
+++ b/llvm-spirv/test/transcoding/sub_group_shuffle_relative.ll
@@ -88,6 +88,8 @@
 ; DISABLED: llvm-spirv -r %t.spv -o %t.rev.bc
 ; DISABLED: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
+; CHECK-SPIRV-DAG: {{[0-9]*}} Capability GroupNonUniformShuffleRelative
+
 ; CHECK-SPIRV-DAG: TypeInt   [[char:[0-9]+]]   8  0
 ; CHECK-SPIRV-DAG: TypeInt   [[short:[0-9]+]]  16 0
 ; CHECK-SPIRV-DAG: TypeInt   [[int:[0-9]+]]    32 0

From a9b9001100931ecb9526f3ebc7edae90b8009772 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Tue, 21 Apr 2020 23:45:08 -0700
Subject: [PATCH 760/770] don't add enums for SPIR-V 1.4 or 1.5 yet update
 version-controls-negative tests

---
 llvm-spirv/include/LLVMSPIRVOpts.h                    | 8 +++-----
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp         | 6 ++++++
 llvm-spirv/test/spirv-version-controls-negative-1.spt | 2 +-
 llvm-spirv/test/spirv-version-controls-negative-2.spt | 2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h
index d5ce801de4af8..62831d0c4b6b7 100644
--- a/llvm-spirv/include/LLVMSPIRVOpts.h
+++ b/llvm-spirv/include/LLVMSPIRVOpts.h
@@ -53,12 +53,10 @@ enum class VersionNumber : uint32_t {
   SPIRV_1_1 = 0x00010100,
   SPIRV_1_2 = 0x00010200,
   SPIRV_1_3 = 0x00010300,
-  SPIRV_1_4 = 0x00010400,
-  SPIRV_1_5 = 0x00010500,
-  // TODO: populate this enum with the latest versions (up to 1.4) once
-  // translator get support of correponding features
+  // TODO: populate this enum with the latest versions (up to 1.5) once
+  // translator get support of corresponding features
   MinimumVersion = SPIRV_1_0,
-  MaximumVersion = SPIRV_1_5
+  MaximumVersion = SPIRV_1_3
 };
 
 enum class ExtensionID : uint32_t {
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
index 2cc1120f4e226..88e0a91d9984d 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
@@ -1758,6 +1758,12 @@ static std::string to_string(uint32_t Version) {
   case static_cast<uint32_t>(VersionNumber::SPIRV_1_1):
     Res = "1.1";
     break;
+  case static_cast<uint32_t>(VersionNumber::SPIRV_1_2):
+    Res = "1.2";
+    break;
+  case static_cast<uint32_t>(VersionNumber::SPIRV_1_3):
+    Res = "1.3";
+    break;
   default:
     Res = "unknown";
   }
diff --git a/llvm-spirv/test/spirv-version-controls-negative-1.spt b/llvm-spirv/test/spirv-version-controls-negative-1.spt
index f47dac3000f2f..778015c754371 100644
--- a/llvm-spirv/test/spirv-version-controls-negative-1.spt
+++ b/llvm-spirv/test/spirv-version-controls-negative-1.spt
@@ -29,5 +29,5 @@
 
 ; RUN: not --crash llvm-spirv %s -to-binary -o - 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
 ;
-; CHECK-ERROR: Invalid SPIR-V module: unsupported SPIR-V version number 'unknown (66560)'. Range of supported/known SPIR-V versions is 1.0 (65536) - 1.1 (65792)
+; CHECK-ERROR: Invalid SPIR-V module: unsupported SPIR-V version number 'unknown (66560)'. Range of supported/known SPIR-V versions is 1.0 (65536) - 1.3 (66304)
 
diff --git a/llvm-spirv/test/spirv-version-controls-negative-2.spt b/llvm-spirv/test/spirv-version-controls-negative-2.spt
index ce0d9deb7a8d6..fc7b8c36fc670 100644
--- a/llvm-spirv/test/spirv-version-controls-negative-2.spt
+++ b/llvm-spirv/test/spirv-version-controls-negative-2.spt
@@ -29,6 +29,6 @@
 
 ; RUN: not --crash llvm-spirv %s -to-binary -o - 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
 ;
-; CHECK-ERROR: Invalid SPIR-V module: unsupported SPIR-V version number 'unknown (1024)'. Range of supported/known SPIR-V versions is 1.0 (65536) - 1.1 (65792)
+; CHECK-ERROR: Invalid SPIR-V module: unsupported SPIR-V version number 'unknown (1024)'. Range of supported/known SPIR-V versions is 1.0 (65536) - 1.3 (66304)
 
 

From fc5f453409a5c22bca38fd47651476cc3929a635 Mon Sep 17 00:00:00 2001
From: Piotr Fusik <piotr.fusik@intel.com>
Date: Mon, 4 May 2020 14:43:32 +0200
Subject: [PATCH 761/770] Fix case style.

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index e48a760776f5d..2a696ccd48eb2 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -967,11 +967,11 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
             else {
               // clustered reduce args are (type, uint)
               // other operation args are (type)
-              auto mangledName = F->getName();
-              auto mangledTyC = ClusteredOp.empty()
-                                    ? mangledName.back()
-                                    : mangledName.take_back(2).front();
-              if (isMangledTypeSigned(mangledTyC))
+              auto MangledName = F->getName();
+              auto MangledTyC = ClusteredOp.empty()
+                                    ? MangledName.back()
+                                    : MangledName.take_back(2).front();
+              if (isMangledTypeSigned(MangledTyC))
                 OpTyC = 's';
               else
                 OpTyC = 'u';

From f3f18864ae86b9224c733164e3f94173d3d5b5a1 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Tue, 5 May 2020 12:47:16 -0400
Subject: [PATCH 762/770] extend spirv-max-version option to versions 1.2 and
 1.3

---
 llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp      | 3 +--
 llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
index 2a696ccd48eb2..b51f8e2dcbf6b 100644
--- a/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -953,8 +953,7 @@ void OCL20ToSPIRV::visitCallGroupBuiltin(CallInst *CI,
                                   .Case("ballot_exclusive_scan", "add")
                                   .Default(FuncName.take_back(
                                       3)); // assumes op is three characters
-          if (GroupOp.startswith("_"))
-            GroupOp = GroupOp.take_back(2); // when op is two characters
+          GroupOp.consume_front("_");      // when op is two characters
           assert(!GroupOp.empty() && "Invalid OpenCL group builtin function");
           char OpTyC = 0;
           auto OpTy = F->getReturnType();
diff --git a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp
index 11d7b9533f832..ce4cd190dda3b 100644
--- a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp
+++ b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp
@@ -102,7 +102,9 @@ static cl::opt<VersionNumber> MaxSPIRVVersion(
     "spirv-max-version",
     cl::desc("Choose maximum SPIR-V version which can be emitted"),
     cl::values(clEnumValN(VersionNumber::SPIRV_1_0, "1.0", "SPIR-V 1.0"),
-               clEnumValN(VersionNumber::SPIRV_1_1, "1.1", "SPIR-V 1.1")),
+               clEnumValN(VersionNumber::SPIRV_1_1, "1.1", "SPIR-V 1.1"),
+               clEnumValN(VersionNumber::SPIRV_1_2, "1.2", "SPIR-V 1.2"),
+               clEnumValN(VersionNumber::SPIRV_1_3, "1.3", "SPIR-V 1.3")),
     cl::init(VersionNumber::MaximumVersion));
 
 static cl::list<std::string>

From e8994b2f822533373a959010601979b4b693f7f7 Mon Sep 17 00:00:00 2001
From: Andrew Savonichev <andrew.savonichev@intel.com>
Date: Mon, 25 May 2020 12:59:09 +0300
Subject: [PATCH 763/770] Fix a warning from "Enable strict rules to set
 ContractionOFF (#521)"

Signed-off-by: Andrew Savonichev <andrew.savonichev@intel.com>
---
 llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index b5528a25457f3..f08a74b495d99 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -2764,6 +2764,7 @@ bool LLVMToSPIRV::joinFPContract(Function *F, FPContract C) {
   case FPContract::DISABLED:
     return false;
   }
+  llvm_unreachable("Unhandled FPContract value.");
 }
 
 } // namespace SPIRV

From 0676db245571f0b7e9e64ce543570d4703d50b66 Mon Sep 17 00:00:00 2001
From: Nikita Rudenko <nikita.rudenko@intel.com>
Date: Wed, 27 May 2020 12:45:50 +0300
Subject: [PATCH 764/770] Add infrastructure for SPV_KHR_float_controls
 extension (#540)

Added support for the `SPV_KHR_float_controls` extension to the SPIR-V generator part.
Fixed required extensions not being emitted for capabilities.
---
 llvm-spirv/include/LLVMSPIRVExtensions.inc    |  1 +
 llvm-spirv/lib/SPIRV/SPIRVWriter.cpp          | 13 +++
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp  |  5 +
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h    | 61 +++++++++++-
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h     |  6 ++
 .../lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h     |  5 +
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp | 11 ++-
 .../lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h     | 10 ++
 .../test/exec_mode_float_control_khr.ll       | 94 +++++++++++++++++++
 9 files changed, 203 insertions(+), 3 deletions(-)
 create mode 100755 llvm-spirv/test/exec_mode_float_control_khr.ll

diff --git a/llvm-spirv/include/LLVMSPIRVExtensions.inc b/llvm-spirv/include/LLVMSPIRVExtensions.inc
index 8fcaaeaed1a58..a0cbcd97aa57f 100644
--- a/llvm-spirv/include/LLVMSPIRVExtensions.inc
+++ b/llvm-spirv/include/LLVMSPIRVExtensions.inc
@@ -4,6 +4,7 @@
 #endif
 
 EXT(SPV_KHR_no_integer_wrap_decoration)
+EXT(SPV_KHR_float_controls)
 EXT(SPV_INTEL_subgroups)
 EXT(SPV_INTEL_media_block_io)
 EXT(SPV_INTEL_device_side_avc_motion_estimation)
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index f08a74b495d99..ddab3f2002cac 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -2481,6 +2481,19 @@ bool LLVMToSPIRV::transExecutionMode() {
           BM->addCapability(CapabilityFPGAKernelAttributesINTEL);
         }
       } break;
+
+      case spv::ExecutionModeDenormPreserve:
+      case spv::ExecutionModeDenormFlushToZero:
+      case spv::ExecutionModeSignedZeroInfNanPreserve:
+      case spv::ExecutionModeRoundingModeRTE:
+      case spv::ExecutionModeRoundingModeRTZ: {
+        if (!BM->isAllowedToUseExtension(ExtensionID::SPV_KHR_float_controls))
+          break;
+        unsigned TargetWidth;
+        N.get(TargetWidth);
+        BF->addExecutionMode(BM->add(new SPIRVExecutionMode(
+            BF, static_cast<ExecutionMode>(EMode), TargetWidth)));
+      } break;
       default:
         llvm_unreachable("invalid execution mode");
       }
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp
index 28bacfdb46d20..d5609726be143 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp
@@ -482,6 +482,11 @@ void SPIRVExecutionMode::decode(std::istream &I) {
   case ExecutionModeInvocations:
   case ExecutionModeOutputVertices:
   case ExecutionModeVecTypeHint:
+  case ExecutionModeDenormPreserve:
+  case ExecutionModeDenormFlushToZero:
+  case ExecutionModeSignedZeroInfNanPreserve:
+  case ExecutionModeRoundingModeRTE:
+  case ExecutionModeRoundingModeRTZ:
   case ExecutionModeSubgroupSize:
   case ExecutionModeMaxWorkDimINTEL:
   case ExecutionModeNumSIMDWorkitemsINTEL:
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
index 17da5f0f1bf84..a734f10379290 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
@@ -665,12 +665,52 @@ class SPIRVExecutionMode : public SPIRVAnnotation<OpExecutionMode> {
 };
 
 class SPIRVComponentExecutionModes {
-  typedef std::map<SPIRVExecutionModeKind, SPIRVExecutionMode *>
+  typedef std::multimap<SPIRVExecutionModeKind, SPIRVExecutionMode *>
       SPIRVExecutionModeMap;
+  typedef std::pair<SPIRVExecutionModeMap::const_iterator,
+                    SPIRVExecutionModeMap::const_iterator>
+      SPIRVExecutionModeRange;
 
 public:
   void addExecutionMode(SPIRVExecutionMode *ExecMode) {
-    ExecModes[ExecMode->getExecutionMode()] = ExecMode;
+    // There should not be more than 1 execution mode kind except the ones
+    // mentioned in SPV_KHR_float_controls.
+#ifndef NDEBUG
+    auto IsDenorm = [](auto EMK) {
+      return EMK == ExecutionModeDenormPreserve ||
+             EMK == ExecutionModeDenormFlushToZero;
+    };
+    auto IsRoundingMode = [](auto EMK) {
+      return EMK == ExecutionModeRoundingModeRTE ||
+             EMK == ExecutionModeRoundingModeRTZ;
+    };
+    auto IsOtherFP = [](auto EMK) {
+      return EMK == ExecutionModeSignedZeroInfNanPreserve;
+    };
+    auto IsFloatControl = [&](auto EMK) {
+      return IsDenorm(EMK) || IsRoundingMode(EMK) || IsOtherFP(EMK);
+    };
+    auto IsCompatible = [&](SPIRVExecutionMode *EM0, SPIRVExecutionMode *EM1) {
+      if (EM0->getTargetId() != EM1->getTargetId())
+        return true;
+      auto EMK0 = EM0->getExecutionMode();
+      auto EMK1 = EM1->getExecutionMode();
+      if (!IsFloatControl(EMK0) || !IsFloatControl(EMK1))
+        return EMK0 != EMK1;
+      auto TW0 = EM0->getLiterals().at(0);
+      auto TW1 = EM1->getLiterals().at(0);
+      if (TW0 != TW1)
+        return true;
+      return !(IsDenorm(EMK0) && IsDenorm(EMK1)) &&
+             !(IsRoundingMode(EMK0) && IsRoundingMode(EMK1));
+    };
+    for (auto I = ExecModes.begin(); I != ExecModes.end(); ++I) {
+      assert(IsCompatible(ExecMode, (*I).second) &&
+             "Found incompatible execution modes");
+    }
+#endif // !NDEBUG
+    SPIRVExecutionModeKind EMK = ExecMode->getExecutionMode();
+    ExecModes.emplace(EMK, ExecMode);
   }
   SPIRVExecutionMode *getExecutionMode(SPIRVExecutionModeKind EMK) const {
     auto Loc = ExecModes.find(EMK);
@@ -678,6 +718,10 @@ class SPIRVComponentExecutionModes {
       return nullptr;
     return Loc->second;
   }
+  SPIRVExecutionModeRange
+  getExecutionModeRange(SPIRVExecutionModeKind EMK) const {
+    return ExecModes.equal_range(EMK);
+  }
 
 protected:
   SPIRVExecutionModeMap ExecModes;
@@ -762,6 +806,19 @@ class SPIRVCapability : public SPIRVEntryNoId<OpCapability> {
     }
   }
 
+  SPIRVExtSet getRequiredExtensions() const override {
+    switch (Kind) {
+    case CapabilityDenormPreserve:
+    case CapabilityDenormFlushToZero:
+    case CapabilitySignedZeroInfNanPreserve:
+    case CapabilityRoundingModeRTE:
+    case CapabilityRoundingModeRTZ:
+      return getSet(ExtensionID::SPV_KHR_float_controls);
+    default:
+      return SPIRVExtSet();
+    }
+  }
+
 private:
   SPIRVCapabilityKind Kind;
 };
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
index 3de9929617622..95555240d9dbf 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
@@ -237,6 +237,12 @@ template <> inline void SPIRVMap<SPIRVExecutionModeKind, SPIRVCapVec>::init() {
   ADD_VEC_INIT(ExecutionModeOutputTriangleStrip, {CapabilityGeometry});
   ADD_VEC_INIT(ExecutionModeVecTypeHint, {CapabilityKernel});
   ADD_VEC_INIT(ExecutionModeContractionOff, {CapabilityKernel});
+  ADD_VEC_INIT(ExecutionModeDenormPreserve, {CapabilityDenormPreserve});
+  ADD_VEC_INIT(ExecutionModeDenormFlushToZero, {CapabilityDenormFlushToZero});
+  ADD_VEC_INIT(ExecutionModeSignedZeroInfNanPreserve,
+               {CapabilitySignedZeroInfNanPreserve});
+  ADD_VEC_INIT(ExecutionModeRoundingModeRTE, {CapabilityRoundingModeRTE});
+  ADD_VEC_INIT(ExecutionModeRoundingModeRTZ, {CapabilityRoundingModeRTZ});
 }
 
 template <> inline void SPIRVMap<SPIRVMemoryModelKind, SPIRVCapVec>::init() {
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
index 121a6522007f2..a6000908b2a94 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
@@ -144,6 +144,11 @@ inline bool isValid(spv::ExecutionMode V) {
   case ExecutionModeNoGlobalOffsetINTEL:
   case ExecutionModeMaxWorkDimINTEL:
   case ExecutionModeNumSIMDWorkitemsINTEL:
+  case ExecutionModeDenormPreserve:
+  case ExecutionModeDenormFlushToZero:
+  case ExecutionModeSignedZeroInfNanPreserve:
+  case ExecutionModeRoundingModeRTE:
+  case ExecutionModeRoundingModeRTZ:
     return true;
   default:
     return false;
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
index 88e0a91d9984d..de2f3b574647d 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
@@ -600,7 +600,16 @@ void SPIRVModuleImpl::addCapability(SPIRVCapabilityKind Cap) {
   if (hasCapability(Cap))
     return;
 
-  CapMap.insert(std::make_pair(Cap, new SPIRVCapability(this, Cap)));
+  auto *CapObj = new SPIRVCapability(this, Cap);
+  if (AutoAddExtensions) {
+    // While we are reading existing SPIR-V we need to read it as-is and don't
+    // add required extensions for each entry automatically
+    for (auto &E : CapObj->getRequiredExtensions()) {
+      addExtension(E);
+    }
+  }
+
+  CapMap.insert(std::make_pair(Cap, CapObj));
 }
 
 void SPIRVModuleImpl::addCapabilityInternal(SPIRVCapabilityKind Cap) {
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
index 3fa66434bd721..9ef434659a843 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
@@ -123,6 +123,11 @@ template <> inline void SPIRVMap<ExecutionMode, std::string>::init() {
   add(ExecutionModeNoGlobalOffsetINTEL, "NoGlobalOffsetINTEL");
   add(ExecutionModeMaxWorkDimINTEL, "MaxWorkDimINTEL");
   add(ExecutionModeNumSIMDWorkitemsINTEL, "NumSIMDWorkitemsINTEL");
+  add(ExecutionModeDenormPreserve, "DenormPreserve");
+  add(ExecutionModeDenormFlushToZero, "DenormFlushToZero");
+  add(ExecutionModeSignedZeroInfNanPreserve, "SignedZeroInfNanPreserve");
+  add(ExecutionModeRoundingModeRTE, "RoundingModeRTE");
+  add(ExecutionModeRoundingModeRTZ, "RoundingModeRTZ");
 }
 SPIRV_DEF_NAMEMAP(ExecutionMode, SPIRVExecutionModeNameMap)
 
@@ -490,6 +495,11 @@ template <> inline void SPIRVMap<Capability, std::string>::init() {
   add(CapabilityStorageImageWriteWithoutFormat,
       "StorageImageWriteWithoutFormat");
   add(CapabilityMultiViewport, "MultiViewport");
+  add(CapabilityDenormPreserve, "DenormPreserve");
+  add(CapabilityDenormFlushToZero, "DenormFlushToZero");
+  add(CapabilitySignedZeroInfNanPreserve, "SignedZeroInfNanPreserve");
+  add(CapabilityRoundingModeRTE, "RoundingModeRTE");
+  add(CapabilityRoundingModeRTZ, "RoundingModeRTZ");
   add(CapabilitySubgroupShuffleINTEL, "SubgroupShuffleINTEL");
   add(CapabilitySubgroupBufferBlockIOINTEL, "SubgroupBufferBlockIOINTEL");
   add(CapabilitySubgroupImageBlockIOINTEL, "SubgroupImageBlockIOINTEL");
diff --git a/llvm-spirv/test/exec_mode_float_control_khr.ll b/llvm-spirv/test/exec_mode_float_control_khr.ll
new file mode 100755
index 0000000000000..e1b1768995e4e
--- /dev/null
+++ b/llvm-spirv/test/exec_mode_float_control_khr.ll
@@ -0,0 +1,94 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv --spirv-ext=+SPV_KHR_float_controls
+; RUN: llvm-spirv %t.spv -o %t.spt --to-text
+; RUN: FileCheck %s --input-file %t.spt -check-prefix=SPV
+
+; ModuleID = 'float_control.bc'
+source_filename = "float_control.cpp"
+target datalayout = "e-p:64:64-i64:64-n8:16:32"
+target triple = "spir"
+
+; Function Attrs: noinline norecurse nounwind readnone
+define dso_local dllexport spir_kernel void @k_float_controls_0(i32 %ibuf, i32 %obuf) local_unnamed_addr {
+entry:
+  ret void
+}
+
+; Function Attrs: noinline norecurse nounwind readnone
+define dso_local dllexport spir_kernel void @k_float_controls_1(i32 %ibuf, i32 %obuf) local_unnamed_addr {
+entry:
+  ret void
+}
+
+; Function Attrs: noinline norecurse nounwind readnone
+define dso_local dllexport spir_kernel void @k_float_controls_2(i32 %ibuf, i32 %obuf) local_unnamed_addr {
+entry:
+  ret void
+}
+
+; Function Attrs: noinline norecurse nounwind readnone
+define dso_local dllexport spir_kernel void @k_float_controls_3(i32 %ibuf, i32 %obuf) local_unnamed_addr {
+entry:
+  ret void
+}
+
+; Function Attrs: noinline norecurse nounwind readnone
+define dso_local dllexport spir_kernel void @k_float_controls_4(i32 %ibuf, i32 %obuf) local_unnamed_addr {
+entry:
+  ret void
+}
+
+
+!llvm.module.flags = !{!12}
+!llvm.ident = !{!13}
+!spirv.EntryPoint = !{}
+!spirv.ExecutionMode = !{!15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29}
+
+; SPV-DAG: EntryPoint {{[0-9]+}} [[KERNEL0:[0-9]+]] "k_float_controls_0"
+; SPV-DAG: EntryPoint {{[0-9]+}} [[KERNEL1:[0-9]+]] "k_float_controls_1"
+; SPV-DAG: EntryPoint {{[0-9]+}} [[KERNEL2:[0-9]+]] "k_float_controls_2"
+; SPV-DAG: EntryPoint {{[0-9]+}} [[KERNEL3:[0-9]+]] "k_float_controls_3"
+; SPV-DAG: EntryPoint {{[0-9]+}} [[KERNEL4:[0-9]+]] "k_float_controls_4"
+!0 = !{void (i32, i32)* @k_float_controls_0, !"k_float_controls_0", !1, i32 0, !2, !3, !4, i32 0, i32 0}
+!1 = !{i32 2, i32 2}
+!2 = !{i32 32, i32 36}
+!3 = !{i32 0, i32 0}
+!4 = !{!"", !""}
+!12 = !{i32 1, !"wchar_size", i32 4}
+!13 = !{!"clang version 8.0.1"}
+!14 = !{i32 1, i32 0}
+
+; SPV-DAG: ExecutionMode [[KERNEL0]] 4459 64
+!15 = !{void (i32, i32)* @k_float_controls_0, i32 4459, i32 64}
+; SPV-DAG: ExecutionMode [[KERNEL0]] 4459 32
+!16 = !{void (i32, i32)* @k_float_controls_0, i32 4459, i32 32}
+; SPV-DAG: ExecutionMode [[KERNEL0]] 4459 16
+!17 = !{void (i32, i32)* @k_float_controls_0, i32 4459, i32 16}
+
+; SPV-DAG: ExecutionMode [[KERNEL1]] 4460 64
+!18 = !{void (i32, i32)* @k_float_controls_1, i32 4460, i32 64}
+; SPV-DAG: ExecutionMode [[KERNEL1]] 4460 32
+!19 = !{void (i32, i32)* @k_float_controls_1, i32 4460, i32 32}
+; SPV-DAG: ExecutionMode [[KERNEL1]] 4460 16
+!20 = !{void (i32, i32)* @k_float_controls_1, i32 4460, i32 16}
+
+; SPV-DAG: ExecutionMode [[KERNEL2]] 4461 64
+!21 = !{void (i32, i32)* @k_float_controls_2, i32 4461, i32 64}
+; SPV-DAG: ExecutionMode [[KERNEL2]] 4461 32
+!22 = !{void (i32, i32)* @k_float_controls_2, i32 4461, i32 32}
+; SPV-DAG: ExecutionMode [[KERNEL2]] 4461 16
+!23 = !{void (i32, i32)* @k_float_controls_2, i32 4461, i32 16}
+
+; SPV-DAG: ExecutionMode [[KERNEL3]] 4462 64
+!24 = !{void (i32, i32)* @k_float_controls_3, i32 4462, i32 64}
+; SPV-DAG: ExecutionMode [[KERNEL3]] 4462 32
+!25 = !{void (i32, i32)* @k_float_controls_3, i32 4462, i32 32}
+; SPV-DAG: ExecutionMode [[KERNEL3]] 4462 16
+!26 = !{void (i32, i32)* @k_float_controls_3, i32 4462, i32 16}
+
+; SPV-DAG: ExecutionMode [[KERNEL4]] 4463 64
+!27 = !{void (i32, i32)* @k_float_controls_4, i32 4463, i32 64}
+; SPV-DAG: ExecutionMode [[KERNEL4]] 4463 32
+!28 = !{void (i32, i32)* @k_float_controls_4, i32 4463, i32 32}
+; SPV-DAG: ExecutionMode [[KERNEL4]] 4463 16
+!29 = !{void (i32, i32)* @k_float_controls_4, i32 4463, i32 16}

From cc90c6072e923f4efc0c3eee9cdbda42e210c6bd Mon Sep 17 00:00:00 2001
From: Nikita Rudenko <nikita.rudenko@intel.com>
Date: Wed, 27 May 2020 22:56:22 +0300
Subject: [PATCH 765/770] Minor changes and cleanups (#553)

* Refined condition for CapabilityVector16
* Removed redundant assert from SPIRVVectorShuffle

  This assert is redundant because the allowed size of Components
  is checked by Type. Moreover it can interfere with capabilities
  allowing non-standard sizes.

* Added required capabilities for ExecutionModeSubgroupSize
* Added missing float control capabilities to SPIRVIsValidEnum.h
---
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h        | 1 +
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h | 1 -
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h | 5 +++++
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h | 1 +
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h        | 2 +-
 llvm-spirv/test/transcoding/ReqdSubgroupSize.ll  | 1 +
 6 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
index 95555240d9dbf..6d24877a0230c 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h
@@ -237,6 +237,7 @@ template <> inline void SPIRVMap<SPIRVExecutionModeKind, SPIRVCapVec>::init() {
   ADD_VEC_INIT(ExecutionModeOutputTriangleStrip, {CapabilityGeometry});
   ADD_VEC_INIT(ExecutionModeVecTypeHint, {CapabilityKernel});
   ADD_VEC_INIT(ExecutionModeContractionOff, {CapabilityKernel});
+  ADD_VEC_INIT(ExecutionModeSubgroupSize, {CapabilitySubgroupDispatch});
   ADD_VEC_INIT(ExecutionModeDenormPreserve, {CapabilityDenormPreserve});
   ADD_VEC_INIT(ExecutionModeDenormFlushToZero, {CapabilityDenormFlushToZero});
   ADD_VEC_INIT(ExecutionModeSignedZeroInfNanPreserve,
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 375d08b83ff93..c20ad86fd3c9f 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -2275,7 +2275,6 @@ class SPIRVVectorShuffle : public SPIRVInstruction {
       return;
     assert(getValueType(Vector1) == getValueType(Vector2));
     assert(Components.size() == Type->getVectorComponentCount());
-    assert(Components.size() > 1);
   }
   SPIRVId Vector1;
   SPIRVId Vector2;
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
index a6000908b2a94..3db6bb9f8a59f 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
@@ -581,6 +581,11 @@ inline bool isValid(spv::Capability V) {
   case CapabilityGroupNonUniformShuffleRelative:
   case CapabilityGroupNonUniformClustered:
   case CapabilityGroupNonUniformQuad:
+  case CapabilityDenormPreserve:
+  case CapabilityDenormFlushToZero:
+  case CapabilitySignedZeroInfNanPreserve:
+  case CapabilityRoundingModeRTE:
+  case CapabilityRoundingModeRTZ:
   case CapabilityFPGAMemoryAttributesINTEL:
   case CapabilityArbitraryPrecisionIntegersINTEL:
   case CapabilityFPGALoopControlsINTEL:
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
index 9ef434659a843..0a62838dff765 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
@@ -495,6 +495,7 @@ template <> inline void SPIRVMap<Capability, std::string>::init() {
   add(CapabilityStorageImageWriteWithoutFormat,
       "StorageImageWriteWithoutFormat");
   add(CapabilityMultiViewport, "MultiViewport");
+  add(CapabilitySubgroupDispatch, "CapabilitySubgroupDispatch");
   add(CapabilityDenormPreserve, "DenormPreserve");
   add(CapabilityDenormFlushToZero, "DenormFlushToZero");
   add(CapabilitySignedZeroInfNanPreserve, "SignedZeroInfNanPreserve");
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
index c717bc005f1bf..1ce1d3058fcf6 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
@@ -305,7 +305,7 @@ class SPIRVTypeVector : public SPIRVType {
     SPIRVCapVec V(getComponentType()->getRequiredCapability());
     // Even though the capability name is "Vector16", it describes
     // usage of 8-component or 16-component vectors.
-    if (CompCount >= 8)
+    if (CompCount == 8 || CompCount == 16)
       V.push_back(CapabilityVector16);
     return V;
   }
diff --git a/llvm-spirv/test/transcoding/ReqdSubgroupSize.ll b/llvm-spirv/test/transcoding/ReqdSubgroupSize.ll
index 703a87f5e673d..6978c3d42425c 100644
--- a/llvm-spirv/test/transcoding/ReqdSubgroupSize.ll
+++ b/llvm-spirv/test/transcoding/ReqdSubgroupSize.ll
@@ -7,6 +7,7 @@
 ; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 ; RUN: llvm-spirv %t.spv -r -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK-LLVM
 
+; CHECK-SPIRV: Capability CapabilitySubgroupDispatch
 ; CHECK-SPIRV: EntryPoint 6 [[kernel:[0-9]+]] "foo"
 ; CHECK-SPIRV: ExecutionMode [[kernel]] 35 8
 

From e648d0b3af14604127418347e5e5d5d3bf4d255d Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov@intel.com>
Date: Mon, 25 May 2020 17:18:21 +0300
Subject: [PATCH 766/770] Fix handling of function pointers going through
 select

---
 llvm-spirv/lib/SPIRV/SPIRVWriter.cpp          |  27 +++-
 .../SPV_INTEL_function_pointers/select.ll     | 147 ++++++++++++++++++
 2 files changed, 170 insertions(+), 4 deletions(-)
 create mode 100644 llvm-spirv/test/transcoding/SPV_INTEL_function_pointers/select.ll

diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index ddab3f2002cac..9506133540f73 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -1079,11 +1079,30 @@ SPIRVValue *LLVMToSPIRV::transValueWithoutDecoration(Value *V,
     return mapValue(V, BI);
   }
 
-  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+  if (SelectInst *Sel = dyn_cast<SelectInst>(V)) {
+    SPIRVValue *TrueValue = nullptr;
+    SPIRVValue *FalseValue = nullptr;
+    if (isa<Function>(Sel->getTrueValue())) {
+      if (!BM->checkExtension(ExtensionID::SPV_INTEL_function_pointers,
+                              SPIRVEC_FunctionPointers, toString(Sel)))
+        return nullptr;
+
+      // select with function pointers
+      auto *TrueF = cast<Function>(Sel->getTrueValue());
+      TrueValue = BM->addFunctionPointerINTELInst(
+          transType(TrueF->getType()),
+          static_cast<SPIRVFunction *>(transValue(TrueF, BB)), BB);
+      auto *FalseF = cast<Function>(Sel->getFalseValue());
+      FalseValue = BM->addFunctionPointerINTELInst(
+          transType(FalseF->getType()),
+          static_cast<SPIRVFunction *>(transValue(FalseF, BB)), BB);
+    } else {
+      TrueValue = transValue(Sel->getTrueValue(), BB);
+      FalseValue = transValue(Sel->getFalseValue(), BB);
+    }
     return mapValue(V, BM->addSelectInst(transValue(Sel->getCondition(), BB),
-                                         transValue(Sel->getTrueValue(), BB),
-                                         transValue(Sel->getFalseValue(), BB),
-                                         BB));
+                                         TrueValue, FalseValue, BB));
+  }
 
   if (AllocaInst *Alc = dyn_cast<AllocaInst>(V))
     return mapValue(
diff --git a/llvm-spirv/test/transcoding/SPV_INTEL_function_pointers/select.ll b/llvm-spirv/test/transcoding/SPV_INTEL_function_pointers/select.ll
new file mode 100644
index 0000000000000..163fc17e78d3c
--- /dev/null
+++ b/llvm-spirv/test/transcoding/SPV_INTEL_function_pointers/select.ll
@@ -0,0 +1,147 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_function_pointers -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv -r %t.spv -o %t.r.bc
+; RUN: llvm-dis %t.r.bc -o %t.r.ll
+; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV: EntryPoint 6 [[#KERNEL_ID:]] "_ZTS6kernel"
+; CHECK-SPIRV-DAG: Name [[#BAR:]] "_Z3barii"
+; CHECK-SPIRV-DAG: Name [[#BAZ:]] "_Z3bazii"
+; CHECK-SPIRV: TypeInt [[#INT32:]] 32
+; CHECK-SPIRV: TypeFunction [[#FUNC_TYPE:]] [[#INT32]] [[#INT32]]
+; CHECK-SPIRV: TypePointer [[#FUNC_PTR_TYPE:]] [[#]] [[#FUNC_TYPE]]
+; CHECK-SPIRV: TypePointer [[#FUNC_PTR_ALLOCA_TYPE:]] [[#]] [[#FUNC_PTR_TYPE]]
+; CHECK-SPIRV: Function [[#]] [[#KERNEL_ID]]
+; CHECK-SPIRV: Variable [[#FUNC_PTR_ALLOCA_TYPE]] [[#FPTR:]]
+; CHECK-SPIRV-DAG: FunctionPointerINTEL [[#FUNC_PTR_TYPE]] [[#BARPTR:]] [[#BAR]]
+; CHECK-SPIRV-DAG: FunctionPointerINTEL [[#FUNC_PTR_TYPE]] [[#BAZPTR:]] [[#BAZ]]
+; CHECK-SPIRV: Select [[#FUNC_PTR_TYPE]] [[#SELECT:]] [[#]] [[#BARPTR]] [[#BAZPTR]]
+; CHECK-SPIRV: Store [[#FPTR]] [[#SELECT]]
+; CHECK-SPIRV: Load [[#FUNC_PTR_TYPE]] [[#LOAD:]] [[#FPTR]]
+; CHECK-SPIRV: FunctionPointerCallINTEL [[#]] [[#]] [[#LOAD]]
+
+; CHECK-LLVM: define spir_kernel void @_ZTS6kernel
+; CHECK-LLVM: %[[FPTR_ALLOCA:.*]] = alloca i32 (i32, i32)*
+; CHECK-LLVM: %[[SELECT:.*]] = select i1 %{{.*}}, i32 (i32, i32)* @_Z3barii, i32 (i32, i32)* @_Z3bazii
+; CHECK-LLVM: store i32 (i32, i32)* %[[SELECT]], i32 (i32, i32)** %[[FPTR_ALLOCA]]
+; CHECK-LLVM: %[[FPTR:.*]] = load i32 (i32, i32)*, i32 (i32, i32)** %[[FPTR_ALLOCA]]
+; CHECK-LLVM: call spir_func i32 %[[FPTR]](
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+%"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range" = type { %"class._ZTSN2cl4sycl6detail5arrayILi1EEE.cl::sycl::detail::array" }
+%"class._ZTSN2cl4sycl6detail5arrayILi1EEE.cl::sycl::detail::array" = type { [1 x i64] }
+%"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id" = type { %"class._ZTSN2cl4sycl6detail5arrayILi1EEE.cl::sycl::detail::array" }
+
+$_ZTS6kernel = comdat any
+
+@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
+
+; Function Attrs: norecurse
+define weak_odr dso_local spir_kernel void @_ZTS6kernel(i32 addrspace(1)* %_arg_, %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range"* byval(%"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range") align 8 %_arg_1, %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range"* byval(%"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range") align 8 %_arg_2, %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id"* byval(%"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id") align 8 %_arg_3) local_unnamed_addr #0 comdat !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+entry:
+  %fptr.alloca = alloca i32 (i32, i32)*, align 8
+  %ref.tmp.i = alloca %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id", align 8
+  %agg.tmp2.i = alloca %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range", align 8
+  %agg.tmp3.i = alloca %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range", align 8
+  %agg.tmp6 = alloca %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id", align 8
+  %0 = bitcast %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range"* %agg.tmp2.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0)
+  %1 = bitcast %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range"* %agg.tmp3.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1)
+  %2 = addrspacecast %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range"* %agg.tmp2.i to %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range" addrspace(4)*
+  %ptrint4.i = ptrtoint %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range" addrspace(4)* %2 to i64
+  %maskedptr5.i = and i64 %ptrint4.i, 7
+  %maskcond6.i = icmp eq i64 %maskedptr5.i, 0
+  %3 = addrspacecast %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range"* %agg.tmp3.i to %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range" addrspace(4)*
+  %ptrint.i = ptrtoint %"class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range" addrspace(4)* %3 to i64
+  %maskedptr.i = and i64 %ptrint.i, 7
+  %maskcond.i = icmp eq i64 %maskedptr.i, 0
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1)
+  %4 = getelementptr inbounds %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id", %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id"* %_arg_3, i64 0, i32 0, i32 0, i64 0
+  %5 = load i64, i64* %4, align 8
+  %add.ptr.i = getelementptr inbounds i32, i32 addrspace(1)* %_arg_, i64 %5
+  %6 = addrspacecast %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id"* %agg.tmp6 to %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id" addrspace(4)*
+  %ptrint = ptrtoint %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id" addrspace(4)* %6 to i64
+  %maskedptr = and i64 %ptrint, 7
+  %maskcond = icmp eq i64 %maskedptr, 0
+  %7 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast (<3 x i64> addrspace(1)* @__spirv_BuiltInGlobalInvocationId to <3 x i64> addrspace(4)*), align 32, !noalias !8
+  %8 = extractelement <3 x i64> %7, i64 0
+  %arrayinit.begin.i.i.i.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id", %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id" addrspace(4)* %6, i64 0, i32 0, i32 0, i64 0
+  store i64 %8, i64 addrspace(4)* %arrayinit.begin.i.i.i.i.i, align 8, !tbaa !15, !alias.scope !8
+  %9 = bitcast %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id"* %ref.tmp.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %9) #4
+  %10 = addrspacecast %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id"* %ref.tmp.i to %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id" addrspace(4)*
+  %ptrint.i2 = ptrtoint %"class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id" addrspace(4)* %10 to i64
+  %maskedptr.i3 = and i64 %ptrint.i2, 7
+  %maskcond.i4 = icmp eq i64 %maskedptr.i3, 0
+  %rem.i.i = and i64 %8, 1
+  %cmp.i.i = icmp eq i64 %rem.i.i, 0
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %9) #4
+  %_Z3barii._Z3bazii.i = select i1 %cmp.i.i, i32 (i32, i32)* @_Z3barii, i32 (i32, i32)* @_Z3bazii
+  store i32 (i32, i32)* %_Z3barii._Z3bazii.i, i32 (i32, i32)** %fptr.alloca, align 8
+  %fptr = load i32 (i32, i32)*, i32 (i32, i32)** %fptr.alloca, align 8
+  %call4.i = call spir_func i32 %fptr(i32 10, i32 10), !callees !19
+  %arrayidx.i3.i = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr.i, i64 %8
+  %arrayidx.ascast.i.i = addrspacecast i32 addrspace(1)* %arrayidx.i3.i to i32 addrspace(4)*
+  store i32 %call4.i, i32 addrspace(4)* %arrayidx.ascast.i.i, align 4, !tbaa !20
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
+
+; Function Attrs: norecurse nounwind readnone
+define dso_local spir_func i32 @_Z3barii(i32 %a, i32 %b) local_unnamed_addr #2 {
+entry:
+  %add = add nsw i32 %b, %a
+  ret i32 %add
+}
+
+; Function Attrs: norecurse nounwind readnone
+define dso_local spir_func i32 @_Z3bazii(i32 %a, i32 %b) local_unnamed_addr #2 {
+entry:
+  %sub = sub nsw i32 %a, %b
+  ret i32 %sub
+}
+
+attributes #0 = { norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "sycl-module-id"="f.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind willreturn }
+attributes #2 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind willreturn }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.spir.version = !{!1}
+!spirv.Source = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{i32 4, i32 100000}
+!3 = !{!"clang version 11.0.0 "}
+!4 = !{i32 1, i32 0, i32 0, i32 0}
+!5 = !{!"none", !"none", !"none", !"none"}
+!6 = !{!"int*", !"cl::sycl::range<1>", !"cl::sycl::range<1>", !"cl::sycl::id<1>"}
+!7 = !{!"", !"", !"", !""}
+!8 = !{!9, !11, !13}
+!9 = distinct !{!9, !10, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN2cl4sycl2idILi1EEEE8initSizeEv: %agg.result"}
+!10 = distinct !{!10, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN2cl4sycl2idILi1EEEE8initSizeEv"}
+!11 = distinct !{!11, !12, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN2cl4sycl2idILi1EEEEET0_v: %agg.result"}
+!12 = distinct !{!12, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN2cl4sycl2idILi1EEEEET0_v"}
+!13 = distinct !{!13, !14, !"_ZN2cl4sycl6detail7Builder5getIdILi1EEEKNS0_2idIXT_EEEv: %agg.result"}
+!14 = distinct !{!14, !"_ZN2cl4sycl6detail7Builder5getIdILi1EEEKNS0_2idIXT_EEEv"}
+!15 = !{!16, !16, i64 0}
+!16 = !{!"long", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C++ TBAA"}
+!19 = !{i32 (i32, i32)* @_Z3barii, i32 (i32, i32)* @_Z3bazii}
+!20 = !{!21, !21, i64 0}
+!21 = !{!"int", !17, i64 0}

From b0b8f1b76826ff8db516e5f9c8a115816d9c7a00 Mon Sep 17 00:00:00 2001
From: Zahira Ammarguellat <zahira.ammarguellat@intel.com>
Date: Tue, 26 May 2020 09:39:11 -0700
Subject: [PATCH 767/770] Ignore llvm.trap intrinsic

Clang, due to some ABI constraints, needs to generate llvm.trap
intrinsics. According to the LangRef documentation a trap instruction
is lowered to a target's trap instruction or to an abort() function
if the target doesn't have a trap instruction. SPIRV has neither a
trap nor an abort instruction and no current opcode has the semantics
of an abort/trap.
Currently the IR to SPIRV translator is crashing when it finds an
llvm.trap intrinsic. The solution will require some thoughts on the
SPIRV side to decide how to implement an abort instruction.
This patch changes the translator so it doesn't crash. This will be
revised when a decision is taken.

NOTE: clang could eventually not generate an llvm.trap instruction in
the current case (non-base destructor of an abstract class needs to be
emitted), but keep in mind that clang might generate an llvm.trap
intrinsic in some other way and we might stumble into this issue again.
An alternative to this could be to add an LLVM pass that will get rid
of the llvm.trap intrinsics in the code before the SPIRV translator.
But having the translator solve this issue is a preferred solution.
---
 llvm-spirv/lib/SPIRV/SPIRVWriter.cpp |  3 +++
 llvm-spirv/test/trap.ll              | 40 ++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 llvm-spirv/test/trap.ll

diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index 9506133540f73..0e1d0a5ffdad6 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -1923,6 +1923,9 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II,
   case Intrinsic::invariant_start:
   case Intrinsic::invariant_end:
   case Intrinsic::dbg_label:
+  case Intrinsic::trap:
+    // llvm.trap intrinsic is not implemented. But for now don't crash. This
+    // change is pending the trap/abort intrinsic implementation.
     return nullptr;
   default:
     if (SPIRVAllowUnknownIntrinsics)
diff --git a/llvm-spirv/test/trap.ll b/llvm-spirv/test/trap.ll
new file mode 100644
index 0000000000000..7366d32985160
--- /dev/null
+++ b/llvm-spirv/test/trap.ll
@@ -0,0 +1,40 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -spirv-text -o %t
+; RUN: FileCheck < %t %s
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: spirv-val %t.spv
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+; Function Attrs: nounwind
+; CHECK: Capability Addresses
+; CHECK: "foo"
+
+; Function Attrs: cold noreturn nounwind
+declare void @llvm.trap() #8
+
+define spir_kernel void @foo(i32 addrspace(1)* %a) #0 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+entry:
+  %a.addr = alloca i32 addrspace(1)*, align 4
+  store i32 addrspace(1)* %a, i32 addrspace(1)** %a.addr, align 4
+  call void @llvm.trap() #12
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #12 = { noreturn nounwind }
+
+!opencl.enable.FP_CONTRACT = !{}
+!opencl.spir.version = !{!6}
+!opencl.ocl.version = !{!6}
+!opencl.used.extensions = !{!7}
+!opencl.used.optional.core.features = !{!7}
+!opencl.compiler.options = !{!7}
+
+!1 = !{i32 1}
+!2 = !{!"none"}
+!3 = !{!"int*"}
+!4 = !{!"int*"}
+!5 = !{!""}
+!6 = !{i32 1, i32 2}
+!7 = !{}

From b90645a7f1aaad0b42b5b37c7797bad32216467d Mon Sep 17 00:00:00 2001
From: Alexey Sotkin <alexey.sotkin@intel.com>
Date: Fri, 29 May 2020 09:40:59 +0300
Subject: [PATCH 768/770] Update DebugInfo test after LLVM changes D80197

Update after LLVM commit llvm/llvm-project@d20bf5a
"[DebugInfo] Upgrade DISubrange to support Fortran dynamic arrays"

Signed-off-by: Alexey Sotkin <alexey.sotkin@intel.com>
---
 llvm-spirv/test/DebugInfo/X86/default-subrange-array.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm-spirv/test/DebugInfo/X86/default-subrange-array.ll b/llvm-spirv/test/DebugInfo/X86/default-subrange-array.ll
index 6509c92cf7072..0222363439a2e 100644
--- a/llvm-spirv/test/DebugInfo/X86/default-subrange-array.ll
+++ b/llvm-spirv/test/DebugInfo/X86/default-subrange-array.ll
@@ -31,7 +31,7 @@ source_filename = "test/DebugInfo/X86/default-subrange-array.ll"
 ; CHECK-NEXT:         DW_AT_type
 ; CHECK:            DW_TAG_subrange_type
 ; CHECK-NEXT:         DW_AT_type
-; DWARF4-NEXT:        DW_AT_lower_bound [DW_FORM_data1] (0x00)
+; DWARF4-NEXT:        DW_AT_lower_bound [DW_FORM_sdata] (0)
 ; CHECK-NEXT:         DW_AT_count [DW_FORM_data1]       (0x2a)
 ; DWARF5-NOT:         DW_AT_lower_bound
 

From 55957ad0fa31985f4b03941042418e11dd1a0267 Mon Sep 17 00:00:00 2001
From: nrudenko <nikita.rudenko@intel.com>
Date: Wed, 27 May 2020 20:45:05 +0300
Subject: [PATCH 769/770] Add translation of memory model from module metadata

---
 llvm-spirv/lib/SPIRV/SPIRVReader.cpp |  5 +++++
 llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 31 ++++++++++++++++++++++++++--
 llvm-spirv/lib/SPIRV/SPIRVWriter.h   |  3 ++-
 llvm-spirv/test/memory_model_md.ll   | 31 ++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+), 3 deletions(-)
 create mode 100755 llvm-spirv/test/memory_model_md.ll

diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
index 54353dfc3dd99..d0c8a8ea64b10 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
@@ -3345,6 +3345,11 @@ bool SPIRVToLLVM::transMetadata() {
                      getMDNodeStringIntVec(Context, EM->getLiterals()));
     }
   }
+  NamedMDNode *MemoryModelMD =
+      M->getOrInsertNamedMetadata(kSPIRVMD::MemoryModel);
+  MemoryModelMD->addOperand(
+      getMDTwoInt(Context, static_cast<unsigned>(BM->getAddressingModel()),
+                  static_cast<unsigned>(BM->getMemoryModel())));
   return true;
 }
 
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index 0e1d0a5ffdad6..b19a11665e740 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -105,6 +105,22 @@ static void foreachKernelArgMD(
   }
 }
 
+static SPIRVMemoryModelKind getMemoryModel(Module &M) {
+  auto *MemoryModelMD = M.getNamedMetadata(kSPIRVMD::MemoryModel);
+  if (MemoryModelMD && (MemoryModelMD->getNumOperands() > 0)) {
+    auto *Ref0 = MemoryModelMD->getOperand(0);
+    if (Ref0 && Ref0->getNumOperands() > 1) {
+      auto &&ModelOp = Ref0->getOperand(1);
+      auto *ModelCI = mdconst::dyn_extract<ConstantInt>(ModelOp);
+      if (ModelCI && (ModelCI->getValue().getActiveBits() <= 64)) {
+        auto Model = static_cast<SPIRVMemoryModelKind>(ModelCI->getZExtValue());
+        return Model;
+      }
+    }
+  }
+  return SPIRVMemoryModelKind::MemoryModelMax;
+}
+
 LLVMToSPIRV::LLVMToSPIRV(SPIRVModule *SMod)
     : ModulePass(ID), M(nullptr), Ctx(nullptr), BM(SMod), SrcLang(0),
       SrcLangVer(0) {
@@ -2362,7 +2378,7 @@ bool LLVMToSPIRV::translate() {
   for (auto I : Defs)
     transFunction(I);
 
-  if (!transOCLKernelMetadata())
+  if (!transMetadata())
     return false;
   if (!transExecutionMode())
     return false;
@@ -2565,7 +2581,18 @@ void LLVMToSPIRV::transFPContract() {
   }
 }
 
-bool LLVMToSPIRV::transOCLKernelMetadata() {
+bool LLVMToSPIRV::transMetadata() {
+  if (!transOCLMetadata())
+    return false;
+
+  auto Model = getMemoryModel(*M);
+  if (Model != SPIRVMemoryModelKind::MemoryModelMax)
+    BM->setMemoryModel(static_cast<SPIRVMemoryModelKind>(Model));
+
+  return true;
+}
+
+bool LLVMToSPIRV::transOCLMetadata() {
   for (auto &F : *M) {
     if (F.getCallingConv() != CallingConv::SPIR_KERNEL)
       continue;
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.h b/llvm-spirv/lib/SPIRV/SPIRVWriter.h
index 68bf0229513be..743044ee3ecb8 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.h
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.h
@@ -173,7 +173,8 @@ class LLVMToSPIRV : public ModulePass {
                                SPIRVWord *EntryPoint = nullptr,
                                SmallVectorImpl<std::string> *Dec = nullptr);
   bool oclIsKernel(Function *F);
-  bool transOCLKernelMetadata();
+  bool transMetadata();
+  bool transOCLMetadata();
   SPIRVInstruction *transBuiltinToInst(StringRef DemangledName, CallInst *CI,
                                        SPIRVBasicBlock *BB);
   SPIRVValue *transBuiltinToConstant(StringRef DemangledName, CallInst *CI);
diff --git a/llvm-spirv/test/memory_model_md.ll b/llvm-spirv/test/memory_model_md.ll
new file mode 100755
index 0000000000000..a90354ca245fb
--- /dev/null
+++ b/llvm-spirv/test/memory_model_md.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv -o %t.spt --to-text
+; RUN: llvm-spirv -r %t.spv -o %t.bc
+; RUN: llvm-dis %t.bc -o %t.ll
+; RUN: FileCheck %s --input-file %t.ll  -check-prefix=LLVM
+; RUN: FileCheck %s --input-file %t.spt -check-prefix=SPV
+
+; ModuleID = 'float_control_empty.bc'
+source_filename = "float_control_empty.cpp"
+target datalayout = "e-p:64:64-i64:64-n8:16:32"
+target triple = "spir"
+
+; LLVM: !spirv.MemoryModel = !{![[MMMD:[0-9]+]]}
+; LLVM: ![[MMMD]] = !{i32 {{[0-9]+}}, i32 0}
+; SPV: MemoryModel 1 0
+; Function Attrs: noinline norecurse nounwind readnone
+define dso_local dllexport void @k_no_fc(i32 %ibuf, i32 %obuf) local_unnamed_addr #16 {
+entry:
+  ret void
+}
+
+attributes #16 = { noinline norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!1}
+!llvm.ident = !{!2}
+!spirv.MemoryModel = !{!0}
+
+!0 = !{i32 1, i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{!"clang version 8.0.1"}

From a4f4fa9ba38d1cea98b2fef011a23f3680d8b78d Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Tue, 2 Jun 2020 23:08:29 +0300
Subject: [PATCH 770/770] [SYCL] Disable FP16 support check for SYCL CUDA BE

Disable check for SYCL CUDA BE until FP16 support is properly
reported there (issue#1799)
---
 clang/lib/Sema/Sema.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 882048d766b0c..6b6b5a239dbf0 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1753,7 +1753,12 @@ void Sema::checkDeviceDecl(const ValueDecl *D, SourceLocation Loc) {
     if (Ty->isDependentType())
       return;
 
-    if ((Ty->isFloat16Type() && !Context.getTargetInfo().hasFloat16Type()) ||
+    auto IsSYCLDeviceCuda = getLangOpts().SYCLIsDevice &&
+                            Context.getTargetInfo().getTriple().isNVPTX();
+    if ((Ty->isFloat16Type() && !Context.getTargetInfo().hasFloat16Type() &&
+         // Disable check for SYCL CUDA BE until FP16 support is properly
+         // reported there (issue#1799)
+         !IsSYCLDeviceCuda) ||
         ((Ty->isFloat128Type() ||
           (Ty->isRealFloatingType() && Context.getTypeSize(Ty) == 128)) &&
          !Context.getTargetInfo().hasFloat128Type()) ||